Reorganize contrib op schemas (#10494)

2026-06-23 02:38:28 +00:00 · 2022-02-09 09:31:58 -08:00 · 2022-02-09 09:31:58 -08:00 · 7a2bf3c24c
commit 7a2bf3c24c
parent 399ffc9700
12 changed files with 2975 additions and 2869 deletions
--- a/cmake/onnxruntime_graph.cmake
+++ b/cmake/onnxruntime_graph.cmake
@ -17,6 +17,7 @@ if (onnxruntime_MINIMAL_BUILD)
    "${ONNXRUNTIME_ROOT}/core/graph/schema_registry.cc"
    "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/*defs.h"
    "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/*defs.cc"
+    "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/onnx_deprecated_operators.cc"
    "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/onnx_function_util.h"
    "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/onnx_function_util.cc"
  )
--- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc
@ -0,0 +1,571 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/graph/constants.h"
+#include "core/graph/contrib_ops/contrib_defs.h"
+#include "core/graph/contrib_ops/quantization_defs.h"
+#include "core/graph/contrib_ops/onnx_function_util.h"
+
+using namespace ::ONNX_NAMESPACE;
+
+namespace onnxruntime {
+namespace contrib {
+// Type and shape inference shared by EmbedLayerNormalization and QEmbedLayerNormalization.
+//
+// Element types: output 0 is taken from input 2 (word embedding), output 1 (mask_index) from
+// input 0 (input_ids), and the optional output 2 (embedding_sum) from input 2.
+// NOTE(review): QEmbedLayerNormalization declares input 2 as int8/uint8 but output 0 as float,
+// so propagating input 2 -> output 0 looks questionable for the quantized op - confirm.
+//
+// Shapes: with input_ids of shape (batch_size, sequence_length) and hidden_size read from the
+// last dimension of the word embedding, outputs 0 and 2 are
+// (batch_size, sequence_length, hidden_size) and output 1 is (batch_size).
+// Shape inference fails when a rank is wrong or when position/segment embedding, gamma or beta
+// do not carry the same statically known hidden_size as the word embedding.
+void embedLayerNormalizationShapeInference(InferenceContext& ctx) {
+  propagateElemTypeFromInputToOutput(ctx, 2, 0);
+  propagateElemTypeFromInputToOutput(ctx, 0, 1);
+  if (!hasInputShape(ctx, 0)) {
+    // TODO(kreeger): In this case update the output to (?, ?, hidden_size).
+    return;
+  }
+
+  auto& input_ids_shape = getInputShape(ctx, 0);
+  auto& input_ids_dims = input_ids_shape.dim();
+
+  // Note that both batch size and sequence length could be symbolic.
+  // So we only check dimension size here.
+  if (input_ids_dims.size() != 2) {
+    fail_shape_inference("input_ids shall be 2 dimensions");
+  }
+
+  // segment_ids (input 1) is optional; segment_embedding (input 4) is only validated with it.
+  bool has_segment = hasInputShape(ctx, 1);
+  if (has_segment) {
+    // Ensure that segment_ids has the same rank as input_ids.
+    auto& segment_ids_shape = getInputShape(ctx, 1);
+    auto& segment_ids_dims = segment_ids_shape.dim();
+    if (segment_ids_dims.size() != 2) {
+      fail_shape_inference("segment_ids input shall be 2 dimensions");
+    }
+  }
+
+  // get hidden_size from the last dimension of embedding
+  auto& word_embedding_shape = getInputShape(ctx, 2);
+  auto& word_embedding_dims = word_embedding_shape.dim();
+  if (word_embedding_dims.size() != 2 ||
+      !word_embedding_dims[1].has_dim_value() ||
+      word_embedding_shape.dim(1).dim_value() <= 0) {
+    fail_shape_inference("word_embedding should have 2 dimensions and dimension size is known.");
+  }
+  int64_t hidden_size = word_embedding_shape.dim(1).dim_value();
+
+  // Ensure that all embeddings + the gamma/beta tensors have the same hidden_size:
+  auto& position_embedding_shape = getInputShape(ctx, 3);
+  auto& position_embedding_dims = position_embedding_shape.dim();
+  if (position_embedding_dims.size() != 2 ||
+      !position_embedding_dims[1].has_dim_value() ||
+      position_embedding_shape.dim(1).dim_value() != hidden_size) {
+    fail_shape_inference(
+        "position_embedding should have 2 dimensions, dimension size known, "
+        "and same hidden size as word_embedding.");
+  }
+
+  if (has_segment) {
+    auto& segment_embedding_shape = getInputShape(ctx, 4);
+    auto& segment_embedding_dims = segment_embedding_shape.dim();
+    if (segment_embedding_dims.size() != 2 ||
+        !segment_embedding_dims[1].has_dim_value() ||
+        segment_embedding_shape.dim(1).dim_value() != hidden_size) {
+      fail_shape_inference(
+          "segment_embedding should have 2 dimensions, dimension size known, "
+          "and same hidden size as word_embedding.");
+    }
+  }
+
+  auto& gamma_shape = getInputShape(ctx, 5);
+  auto& gamma_dims = gamma_shape.dim();
+  if (gamma_dims.size() != 1 ||
+      !gamma_dims[0].has_dim_value() ||
+      gamma_shape.dim(0).dim_value() != hidden_size) {
+    // The check above requires rank 1, so the message must say 1 dimension.
+    fail_shape_inference(
+        "gamma should have 1 dimension, dimension size known, "
+        "and same hidden size as word_embedding.");
+  }
+
+  auto& beta_shape = getInputShape(ctx, 6);
+  // Validate beta's own dimensions (reading gamma's here would skip the rank check for beta
+  // and then index into a possibly empty dimension list).
+  auto& beta_dims = beta_shape.dim();
+  if (beta_dims.size() != 1 ||
+      !beta_dims[0].has_dim_value() ||
+      beta_shape.dim(0).dim_value() != hidden_size) {
+    fail_shape_inference(
+        "beta should have 1 dimension, dimension size known, "
+        "and same hidden size as word_embedding.");
+  }
+
+  // input shape is (batch_size, sequence_length), output shape is (batch_size, sequence_length, hidden_size)
+  ONNX_NAMESPACE::TensorShapeProto output_shape;
+  *output_shape.add_dim() = input_ids_dims[0];
+  *output_shape.add_dim() = input_ids_dims[1];
+
+  output_shape.add_dim();
+  output_shape.mutable_dim(2)->set_dim_value(hidden_size);
+
+  updateOutputShape(ctx, 0, output_shape);
+
+  // mask_index shape is (batch_size)
+  ONNX_NAMESPACE::TensorShapeProto mask_index_shape;
+  *mask_index_shape.add_dim() = input_ids_dims[0];
+  updateOutputShape(ctx, 1, mask_index_shape);
+
+  if (ctx.getNumOutputs() > 2) {
+    // embedding_sum is declared with the float type "T", so its element type comes from the
+    // word embedding (input 2), not from the int32 input_ids (input 0).
+    updateOutputShape(ctx, 2, output_shape);
+    propagateElemTypeFromInputToOutput(ctx, 2, 2);
+  }
+}
+// Type and shape inference shared by the Attention and QAttention schemas.
+//
+// ctx              - inference context of the node.
+// past_input_index - index of the optional "past" state input (4 for Attention, 8 for QAttention).
+//
+// Both outputs take the element type of the bias (input 2). When the shapes of input 0 and the
+// bias are available, output 0 is (input dim 0, input dim 1, output_hidden_size), where
+// output_hidden_size is qkv_hidden_sizes[2] if that attribute is set, otherwise a third of the
+// bias length. When the optional "present" output exists and the past shape is known, present
+// copies the past shape with dimension 3 grown by the input sequence length.
+void AttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int past_input_index) {
+  // Type inference
+  ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 2, 0);
+  if (ctx.getNumOutputs() > 1) {
+    ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 2, 1);
+  }
+
+  // Shape inference
+  if (hasInputShape(ctx, 0) && hasInputShape(ctx, 2)) {
+    auto& input_shape = getInputShape(ctx, 0);
+    auto& input_dims = input_shape.dim();
+    if (input_dims.size() != 3) {
+      fail_shape_inference("Inputs 0 shall be 3 dimensions");
+    }
+
+    auto& bias_shape = getInputShape(ctx, 2);
+    auto& bias_dims = bias_shape.dim();
+    if (bias_dims.size() != 1) {
+      fail_shape_inference("Invalid bias shape");
+    }
+
+    std::vector<int64_t> qkv_hidden_sizes;
+    getRepeatedAttribute(ctx, "qkv_hidden_sizes", qkv_hidden_sizes);
+
+    // Batch and sequence dimensions are carried over from the input (they may be symbolic).
+    ONNX_NAMESPACE::TensorShapeProto output_shape;
+    *output_shape.add_dim() = input_dims[0];
+    *output_shape.add_dim() = input_dims[1];
+
+    // The hidden dimension starts out unknown and is only given a value when one is available.
+    auto* output_hidden_dim = output_shape.add_dim();
+    if (qkv_hidden_sizes.size() != 0) {
+      if (qkv_hidden_sizes.size() != 3) {
+        fail_shape_inference("qkv_hidden_sizes should have 3 elements");
+      }
+      output_hidden_dim->set_dim_value(qkv_hidden_sizes[2]);
+    } else if (bias_dims[0].has_dim_value()) {
+      output_hidden_dim->set_dim_value(bias_dims[0].dim_value() / 3);
+    }
+    // Otherwise the bias length is symbolic: leave the dimension unset rather than recording a
+    // bogus static size of 0 (dim_value() of a dimension without a value is 0).
+
+    updateOutputShape(ctx, 0, output_shape);
+
+    // TODO does the extra output need any changes?
+    if (ctx.getNumOutputs() > 1) {
+      if (hasInputShape(ctx, past_input_index)) {
+        auto& past_shape = getInputShape(ctx, past_input_index);
+        auto& past_dims = past_shape.dim();
+        if (past_dims.size() != 5) {
+          // Report the actual index of the past input; it differs between Attention and QAttention.
+          fail_shape_inference("Inputs ", past_input_index, " shall be 5 dimensions");
+        }
+
+        // present = past with the sequence axis (dim 3) extended by the new tokens; this is only
+        // computable when both lengths are static.
+        if (past_dims[3].has_dim_value() && input_dims[1].has_dim_value()) {
+          auto all_sequence_length = past_shape.dim(3).dim_value() + input_shape.dim(1).dim_value();
+
+          ONNX_NAMESPACE::TensorShapeProto present_shape;
+          for (auto& dim : past_dims) {
+            *present_shape.add_dim() = dim;
+          }
+          present_shape.mutable_dim(3)->set_dim_value(all_sequence_length);
+
+          updateOutputShape(ctx, 1, present_shape);
+        }
+      }
+    }
+  }
+}
+
+// Type and shape inference for the DecoderAttention schema.
+//
+// All outputs take the element type of input 0 (query). Output 0 reuses the query shape when it
+// is known. When the cache outputs exist and inputs 6 and 7 (key/value cache) both have shapes,
+// outputs 1 and 2 get a 4-D shape that copies dimensions 0, 1 and 3 of input 6 and leaves
+// dimension 2 unset - but only if all four dimensions of input 6 are static.
+void DecoderAttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx) {
+  const bool has_cache_outputs = ctx.getNumOutputs() > 1;
+
+  // Type inference
+  ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
+  if (has_cache_outputs) {
+    ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 1);
+    ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 2);
+  }
+
+  // Shape inference: the attention output has the same shape as the query.
+  if (hasInputShape(ctx, 0)) {
+    updateOutputShape(ctx, 0, getInputShape(ctx, 0));
+  }
+
+  // Nothing more to infer without cache outputs or without both cache input shapes.
+  if (!has_cache_outputs || !hasInputShape(ctx, 6) || !hasInputShape(ctx, 7)) {
+    return;
+  }
+
+  auto& key_cache_shape = getInputShape(ctx, 6);
+  auto& key_cache_dims = key_cache_shape.dim();
+  if (key_cache_dims.size() != 4) {
+    fail_shape_inference("key and value cache shall be 4 dimensions");
+  }
+
+  // has_dim_value() will return false if value is dynamic
+  if (!key_cache_dims[0].has_dim_value() ||
+      !key_cache_dims[1].has_dim_value() ||
+      !key_cache_dims[2].has_dim_value() ||
+      !key_cache_dims[3].has_dim_value()) {
+    return;
+  }
+
+  ONNX_NAMESPACE::TensorShapeProto updated_cache_shape;
+  *updated_cache_shape.add_dim() = key_cache_dims[0];
+  *updated_cache_shape.add_dim() = key_cache_dims[1];
+  updated_cache_shape.add_dim();  // dimension 2 is deliberately left without a value
+  *updated_cache_shape.add_dim() = key_cache_dims[3];
+
+  updateOutputShape(ctx, 1, updated_cache_shape);
+  updateOutputShape(ctx, 2, updated_cache_shape);
+}
+
+// Operator documentation attached to the Attention schema below.
+constexpr const char* Attention_ver1_doc = R"DOC(
+Multi-Head Self Attention that can be either unidirectional (like GPT-2) or bidirectional (like BERT).
+The mask_index input is optional. Besides raw attention mask with shape (batch_size, past_sequence_length + sequence_length)
+or (batch_size, sequence_length, past_sequence_length + sequence_length) with value 0 for masked and 1 otherwise,
+we also support other two formats: When input has right-side padding, mask_index is one dimension with shape (batch_size),
+where value of each element is the end position, or valid length of actual sequence excluding padding. When input has
+left-side padding, mask_index has shape (2 * batch_size), where the values are the exclusive end positions followed by
+the inclusive start positions. When unidirectional is 1, and each token only attend to previous tokens. For GPT-2, both past
+and present state are optional. Present state could appear in output even when past state is not in input.
+)DOC";
+
+// Schema of the Attention contrib op, version 1. Type and shape inference is delegated to
+// AttentionTypeAndShapeInference with the index of the optional "past" input (4).
+ONNX_MS_OPERATOR_SET_SCHEMA(Attention, 1,
+                            OpSchema()
+                                .SetDoc(Attention_ver1_doc)
+                                .Attr("num_heads", "Number of attention heads", AttributeProto::INT)
+                                .Attr("unidirectional",
+                                      "Whether every token can only attend to previous tokens. Default value is 0.",
+                                      AttributeProto::INT,
+                                      static_cast<int64_t>(0))
+                                .Attr("qkv_hidden_sizes",
+                                      "Hidden layer sizes of Q, K, V paths in Attention",
+                                      AttributeProto::INTS,
+                                      OPTIONAL_VALUE)
+                                .Input(0, "input", "3D input tensor with shape (batch_size, sequence_length, input_hidden_size)", "T")
+                                .Input(1, "weight", "2D input tensor with shape (input_hidden_size, 3 * hidden_size), where hidden_size = num_heads * head_size", "T")
+                                .Input(2, "bias", "1D input tensor with shape (3 * hidden_size)", "T")
+                                // The two literals below are concatenated, so the first one must end with a
+                                // space to keep "...sequence_length) or (batch_size..." readable.
+                                .Input(3, "mask_index",
+                                       "Attention mask with shape (batch_size, 1, max_sequence_length, max_sequence_length), (batch_size, past_sequence_length + sequence_length) "
+                                       "or (batch_size, sequence_length, past_sequence_length + sequence_length), or index with shape (batch_size) or (2 * batch_size).",
+                                       "M", OpSchema::Optional)
+                                .Input(4, "past", "past state for key and value with shape (2, batch_size, num_heads, past_sequence_length, head_size).", "T", OpSchema::Optional)
+                                .Input(5, "extra_add", "additional add to QxK' with shape (batch_size, num_heads, sequence_length, sequence_length).", "T", OpSchema::Optional)
+                                .Output(0, "output", "3D output tensor with shape (batch_size, sequence_length, hidden_size)", "T")
+                                .Output(1, "present", "present state for key and value with shape (2, batch_size, num_heads, past_sequence_length + sequence_length, head_size)", "T", OpSchema::Optional)
+                                .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float tensors.")
+                                .TypeConstraint("M", {"tensor(int32)"}, "Constrain mask index to integer types")
+                                .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+                                  constexpr int past_input_index = 4;
+                                  AttentionTypeAndShapeInference(ctx, past_input_index);
+                                }));
+
+// Schema of the QAttention contrib op (quantized Attention), version 1.
+// T1/T2 are the 8-bit types of the input and weight, T3 is the float type of bias, scales, past
+// and the outputs, T4 is the int32 mask index. Type and shape inference is delegated to
+// AttentionTypeAndShapeInference with the index of the optional "past" input (8).
+ONNX_MS_OPERATOR_SET_SCHEMA(QAttention, 1,
+                            OpSchema()
+                                .SetDoc("Quantization of Multi-Head Self Attention.")
+                                .Attr("num_heads", "Number of attention heads", AttributeProto::INT)
+                                .Attr("unidirectional",
+                                      "Whether every token can only attend to previous tokens. Default value is 0.",
+                                      AttributeProto::INT,
+                                      static_cast<int64_t>(0))
+                                .Input(
+                                    0,
+                                    "input",
+                                    "3D input tensor with shape (batch_size, sequence_length, input_hidden_size)",
+                                    "T1")
+                                .Input(
+                                    1,
+                                    "weight",
+                                    "2D input tensor with shape (input_hidden_size, 3 * hidden_size), hidden_size = num_heads * head_size",
+                                    "T2")
+                                .Input(
+                                    2,
+                                    "bias",
+                                    "1D input tensor with shape (3 * hidden_size)",
+                                    "T3")
+                                .Input(
+                                    3,
+                                    "input_scale",
+                                    "scale of quantized input tensor. It's a scalar, which means a per-tensor/layer quantization.",
+                                    "T3")
+                                // Adjacent literals are concatenated: the first one ends with a space so the
+                                // two sentences do not run together.
+                                .Input(
+                                    4,
+                                    "weight_scale",
+                                    "scale of weight scale. It's a scalar or a 1D tensor, which means a per-tensor/per-column quantization. "
+                                    "Its size should be 3 * hidden_size if it is per-column quantization",
+                                    "T3")
+                                .Input(
+                                    5,
+                                    "mask_index",
+                                    "Attention mask index with shape (batch_size)",
+                                    "T4",
+                                    OpSchema::Optional)
+                                .Input(
+                                    6,
+                                    "input_zero_point",
+                                    "zero point of quantized input tensor. It's a scalar, which means a per-tensor/layer quantization.",
+                                    "T1",
+                                    OpSchema::Optional)
+                                .Input(
+                                    7,
+                                    "weight_zero_point",
+                                    "zero point of quantized weight tensor. It's a scalar or a 1D tensor, which means a per-tensor/per-column quantization. "
+                                    "Its size should be 3 * hidden_size if it is per-column quantization",
+                                    "T2",
+                                    OpSchema::Optional)
+                                .Input(
+                                    8,
+                                    "past",
+                                    "past state for key and value with shape (2, batch_size, num_heads, past_sequence_length, head_size).",
+                                    "T3",
+                                    OpSchema::Optional)
+                                .Output(
+                                    0,
+                                    "output",
+                                    "3D output tensor with shape (batch_size, sequence_length, hidden_size)",
+                                    "T3")
+                                .Output(
+                                    1,
+                                    "present",
+                                    "present state for key and value with shape (2, batch_size, num_heads, past_sequence_length + sequence_length, head_size)",
+                                    "T3",
+                                    OpSchema::Optional)
+                                .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "Constrain input and output types to int8 tensors.")
+                                .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "Constrain input and output types to int8 tensors.")
+                                .TypeConstraint("T3", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float tensors.")
+                                .TypeConstraint("T4", {"tensor(int32)"}, "Constrain mask index to integer types")
+                                .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+                                  constexpr int past_input_index = 8;
+
+                                  AttentionTypeAndShapeInference(ctx, past_input_index);
+                                }));
+
+// Operator documentation attached to the LongformerAttention schema below.
+constexpr const char* Longformer_Attention_doc = R"DOC(
+Longformer Self Attention with a local context and a global context. Tokens attend locally: Each token
+attends to its W previous tokens and W succeding tokens with W being the window length. A selected few tokens
+attend globally to all other tokens.
+
+The attention mask is of shape (batch_size, sequence_length), where sequence_length is a multiple of 2W after padding.
+Mask value < 0 (like -10000.0) means the token is masked, 0 otherwise.
+
+Global attention flags have value 1 for the tokens attend globally and 0 otherwise.
+)DOC";
+
+// Schema of the LongformerAttention contrib op, version 1.
+// The single output takes both its type and its shape from the first input.
+// NOTE(review): this is the only schema here that calls SetDomain/SinceVersion explicitly;
+// presumably the registration macro already sets both - confirm before removing.
+ONNX_MS_OPERATOR_SET_SCHEMA(
+    LongformerAttention, 1,
+    OpSchema()
+        .SetDomain(kMSDomain)
+        .SinceVersion(1)
+        .SetDoc(Longformer_Attention_doc)
+        // Attributes.
+        .Attr("num_heads", "Number of attention heads", AttributeProto::INT)
+        .Attr("window",
+              "One sided attention windows length W, or half of total window length",
+              AttributeProto::INT)
+        // Local attention inputs.
+        .Input(0, "input",
+               "3D input tensor with shape (batch_size, sequence_length, hidden_size), hidden_size = num_heads * head_size",
+               "T")
+        .Input(1, "weight",
+               "2D input tensor with shape (hidden_size, 3 * hidden_size)",
+               "T")
+        .Input(2, "bias",
+               "1D input tensor with shape (3 * hidden_size)",
+               "T")
+        .Input(3, "mask",
+               "Attention mask with shape (batch_size, sequence_length)",
+               "T")
+        // Global attention inputs.
+        .Input(4, "global_weight",
+               "2D input tensor with shape (hidden_size, 3 * hidden_size)",
+               "T")
+        .Input(5, "global_bias",
+               "1D input tensor with shape (3 * hidden_size)",
+               "T")
+        .Input(6, "global",
+               "Global attention flags with shape (batch_size, sequence_length)",
+               "G")
+        .Output(0, "output",
+                "3D output tensor with shape (batch_size, sequence_length, hidden_size)",
+                "T")
+        .TypeConstraint("T",
+                        {"tensor(float)", "tensor(float16)"},
+                        "Constrain input and output types to float tensors.")
+        .TypeConstraint("G",
+                        {"tensor(int32)"},
+                        "Constrain to integer types")
+        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// Operator documentation attached to the DecoderAttention schema below.
+constexpr const char* Decoder_Attention_doc = R"DOC(
+This DecoderAttention supports self attention and cross attention, key and value cache, and key_padding_mask. The attention mask is not support at the moment.
+Some boolean parameters are passed by runtime input for generic purpose
+)DOC";
+
+// Schema of the DecoderAttention contrib op, version 1.
+// Inputs 8-11 are boolean flags supplied as runtime tensors; the key/value cache inputs and
+// outputs are optional and used for both self and cross attention.
+ONNX_MS_OPERATOR_SET_SCHEMA(
+    DecoderAttention, 1,
+    OpSchema()
+        .SetDoc(Decoder_Attention_doc)
+        .Attr("num_heads", "Number of attention heads", AttributeProto::INT)
+        .Input(0, "query",
+               "3D input tensor with shape (sequence_length, batch_size, hidden_size), hidden_size = num_heads * head_size",
+               "T")
+        .Input(1, "key",
+               "3D input tensor with shape (total_sequence_length, batch_size, hidden_size)",
+               "T")
+        .Input(2, "q_weight",
+               "2D input tensor with shape (hidden_size, hidden_size)",
+               "T")
+        .Input(3, "kv_weight",
+               "2D input tensor with shape (hidden_size, 2 * hidden_size)",
+               "T")
+        .Input(4, "bias",
+               "1D input tensor with shape (3 * hidden_size)",
+               "T")
+        .Input(5, "key_padding_mask",
+               "2D input tensor with shape (batch_size, total_sequence_length)",
+               "B", OpSchema::Optional)
+        // Caches: self & cross attention.
+        .Input(6, "key_cache",
+               "input tensor with shape (batch_size, num_heads, sequence_length or total_sequence_length, head_size)",
+               "T", OpSchema::Optional)
+        .Input(7, "value_cache",
+               "input tensor with shape (batch_size, num_heads, sequence_length or total_sequence_length, head_size)",
+               "T", OpSchema::Optional)
+        // Runtime flags.
+        .Input(8, "static_kv",
+               "If static_kv = true, cross-attention; else self-attention",
+               "B")
+        .Input(9, "use_past",
+               "If use_past = true, use cache; else no cache",
+               "B")
+        .Input(10, "has_layer_state",
+               "If has_layer_state = true, layer_state = {} or [a,b]; else layer_state = None",
+               "B")
+        .Input(11, "has_key_padding_mask",
+               "has_key_padding_mask or not",
+               "B")
+        .Output(0, "output",
+                "3D output tensor with shape (sequence_length, batch_size, hidden_size)",
+                "T")
+        // Updated caches: self & cross attention.
+        .Output(1, "new_key_cache",
+                "output tensor with shape (batch_size, num_heads, new sequence_length, head_size)",
+                "T", OpSchema::Optional)
+        .Output(2, "new_value_cache",
+                "output tensor with shape (batch_size, num_heads, new sequence_length, head_size)",
+                "T", OpSchema::Optional)
+        .TypeConstraint("T",
+                        {"tensor(float)", "tensor(float16)"},
+                        "Constrain input and output types to float and float16 tensors.")
+        .TypeConstraint("B",
+                        {"tensor(bool)"},
+                        "Constrain key_padding_mask to bool tensors.")
+        // The inference routine already has the required signature, so it is passed directly.
+        .TypeAndShapeInferenceFunction(DecoderAttentionTypeAndShapeInference));
+
+// Operator documentation attached to the EmbedLayerNormalization schema below.
+constexpr const char* EmbedLayerNormalization_ver1_doc = R"DOC(
+EmbedLayerNormalization is the fusion of embedding layer in BERT model, with optional mask processing.
+The embedding layer takes input_ids (word IDs) and segment_ids (sentence IDs) to look up word_embedding, position_embedding,
+and segment_emedding; the embeddings are added then applied layer normalization using gamma and beta tensors.
+The last input mask is optional. If mask is provided, mask index (that is position of first 0 in mask, or number of words)
+will be calculated.)DOC";
+
+// Schema of the EmbedLayerNormalization contrib op, version 1.
+// T1 is the int32 type of the id/mask tensors, T the float type of embeddings and outputs.
+// Type and shape inference is done by embedLayerNormalizationShapeInference.
+ONNX_MS_OPERATOR_SET_SCHEMA(
+    EmbedLayerNormalization, 1,
+    OpSchema()
+        .SetDoc(EmbedLayerNormalization_ver1_doc)
+        .Attr("epsilon",
+              "The epsilon value to use to avoid division by zero.",
+              AttributeProto::FLOAT,
+              kDefaultEmbedLayerNormEpsilon)
+        .Input(0, "input_ids",
+               "2D words IDs with shape (batch_size, sequence_length)",
+               "T1")
+        .Input(1, "segment_ids",
+               "2D segment IDs with shape (batch_size, sequence_length)",
+               "T1", OpSchema::Optional)
+        .Input(2, "word_embedding",
+               "2D with shape (,hidden_size)",
+               "T")
+        .Input(3, "position_embedding",
+               "2D with shape (, hidden_size)",
+               "T")
+        .Input(4, "segment_embedding",
+               "2D with shape (, hidden_size)",
+               "T", OpSchema::Optional)
+        .Input(5, "gamma",
+               "1D gamma tensor for layer normalization with shape (hidden_size)",
+               "T")
+        .Input(6, "beta",
+               "1D beta tensor for layer normalization  with shape (hidden_size)",
+               "T")
+        .Input(7, "mask",
+               "2D attention mask with shape (batch_size, sequence_length)",
+               "T1", OpSchema::Optional)
+        .Input(8, "position_ids",
+               "2D position ids with shape (batch_size, sequence_length)",
+               "T1", OpSchema::Optional)
+        .Output(0, "output",
+                "3D output tensor with shape (batch_size, sequence_length, hidden_size)",
+                "T")
+        .Output(1, "mask_index",
+                "1D mask_index tensor with shape (batch_size)",
+                "T1")
+        .Output(2, "embedding_sum",
+                "sum of word_embedding and position_embedding without layer normalization",
+                "T", OpSchema::Optional)
+        .TypeConstraint("T1",
+                        {"tensor(int32)"},
+                        "Constrain input and output integer tensors types")
+        .TypeConstraint("T",
+                        {"tensor(float)", "tensor(float16)"},
+                        "Constrain input and output float tensors types.")
+        .TypeAndShapeInferenceFunction(embedLayerNormalizationShapeInference));
+
+// Operator documentation attached to the QEmbedLayerNormalization schema below.
+constexpr const char* QEmbedLayerNormalization_ver1_doc = R"DOC(
+QEmbedLayerNormalization is the quantized fusion of embedding layer in BERT model, with optional mask processing.
+The embedding layer takes input_ids (word IDs) and segment_ids (sentence IDs) to look up word_embedding, position_embedding,
+and segment_emedding; the embeddings are added then applied layer normalization using gamma and beta tensors. The input_ids
+and segment_ids remain int32. All embeddings, gamma, and beta tensors are converted to int8/uint8. The last input mask is optional.
+If mask is provided, mask index (that is position of first 0 in mask, or number of words will be calculated.)DOC";
+
+// Schema of the experimental QEmbedLayerNormalization contrib op, version 1.
+// Inputs 0-7 mirror EmbedLayerNormalization with quantized (T2) tensors, inputs 8-12 are the
+// float scales and inputs 13-17 the zero points. It shares
+// embedLayerNormalizationShapeInference with the non-quantized op.
+ONNX_MS_OPERATOR_SET_SCHEMA(
+    QEmbedLayerNormalization, 1,
+    OpSchema()
+        .SetSupportLevel(OpSchema::SupportType::EXPERIMENTAL)
+        .SetDoc(QEmbedLayerNormalization_ver1_doc)
+        .Attr("epsilon",
+              "The epsilon value to use to avoid division by zero.",
+              AttributeProto::FLOAT,
+              kDefaultEmbedLayerNormEpsilon)
+        // Ids, quantized embeddings and layer-norm parameters.
+        .Input(0, "input_ids",
+               "2D words IDs with shape (batch_size, sequence_length)",
+               "T1")
+        .Input(1, "segment_ids",
+               "2D segment IDs with shape (batch_size, sequence_length)",
+               "T1", OpSchema::Optional)
+        .Input(2, "word_embedding_quant",
+               "2D with shape (,hidden_size)",
+               "T2")
+        .Input(3, "position_embedding_quant",
+               "2D with shape (, hidden_size)",
+               "T2")
+        .Input(4, "segment_embedding",
+               "2D with shape (, hidden_size)",
+               "T2", OpSchema::Optional)
+        .Input(5, "gamma_quant",
+               "1D gamma tensor for layer normalization with shape (hidden_size)",
+               "T2")
+        .Input(6, "beta_quant",
+               "1D beta tensor for layer normalization  with shape (hidden_size)",
+               "T2")
+        .Input(7, "mask", "Mask", "T1", OpSchema::Optional)
+        // Scales.
+        .Input(8, "word_embedding_scale", "Scale for word embeddings", "T")
+        .Input(9, "position_embedding_scale", "Scale for position embeddings", "T")
+        .Input(10, "segment_embedding_scale", "Scale for segment embeddings", "T", OpSchema::Optional)
+        .Input(11, "gamma_scale", "Scale for 1D gamma tensor", "T")
+        .Input(12, "beta_scale", "Scale for 1D beta tensor", "T")
+        // Zero points.
+        .Input(13, "word_embedding_zero_point", "Zero point for word embeddings", "T2")
+        .Input(14, "position_embedding_zero_point", "Zero point for position embeddings", "T2")
+        .Input(15, "segment_embedding_zero_point", "Zero Point for segment embeddings", "T2", OpSchema::Optional)
+        .Input(16, "gamma_zero_point", "Zero Point for 1D gamma tensor", "T2")
+        .Input(17, "beta_zero_point", "Zero Point for 1D beta tensor", "T2")
+        .Output(0, "layernorm_out", "LayerNorm Output", "T")
+        .Output(1, "mask_index_out", "Mask Index Output", "T1")
+        .TypeConstraint("T1",
+                        {"tensor(int32)"},
+                        "Constrain mask index to integer types")
+        .TypeConstraint("T2",
+                        {"tensor(int8)", "tensor(uint8)"},
+                        "Constrain input and output types to int8 tensors.")
+        .TypeConstraint("T",
+                        {"tensor(float)"},
+                        "Constrain input and output types to float32 tensors.")
+        .TypeAndShapeInferenceFunction(embedLayerNormalizationShapeInference));
+
+// Operator documentation attached to the FastGelu schema below.
+constexpr const char* FastGelu_ver1_doc = R"DOC(
+GELU (Gaussian Error Linear Unit) approximation: Y=0.5*X*(1+tanh(0.797885*X+0.035677*X*X*X)) with an optional input of bias that will be added to X before GELU.)DOC";
+
+// Schema of the FastGelu contrib op, version 1.
+// Besides type/shape propagation from the first input, the schema provides a context-dependent
+// function body that expresses the op with Add/Identity, Mul and Tanh nodes.
+ONNX_MS_OPERATOR_SET_SCHEMA(
+    FastGelu, 1,
+    OpSchema()
+        .SetDoc(FastGelu_ver1_doc)
+        .Input(0, "X", "input tensor", "T")
+        .Input(1, "bias", "bias tensor", "T", OpSchema::Optional)
+        .Output(0, "Y", "output tensor", "T")
+        .TypeConstraint("T",
+                        {"tensor(float)", "tensor(float16)", "tensor(bfloat16)"},
+                        "Constrain input and output types to float or half tensors.")
+        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)
+        .SetContextDependentFunctionBodyBuilder(
+            [](const FunctionBodyBuildContext& ctx, const OpSchema& schema, FunctionProto& function_proto) {
+              // fastgelu(x) = a * x * (one + tanh(x * (b + c * x * x))), with x = X (+ bias).
+              // The constants are created with the element type of input 0, so the body cannot
+              // be built when that type is unknown or is not a tensor type.
+              const auto* input_type = ctx.getInputType(0);
+              if (input_type == nullptr || !input_type->has_tensor_type()) {
+                return false;
+              }
+              const auto elem_type = input_type->tensor_type().elem_type();
+
+              // Optional input 1 indicates a bias to be added to input 0.
+              const auto has_bias = ctx.hasInput(1);
+
+              FunctionBuilder body_builder(function_proto);
+              body_builder
+                  .AddOpset("", 13)
+                  .Const("a", 0.5, elem_type)
+                  .Const("b", 0.797885, elem_type)
+                  .Const("c", 0.035677, elem_type)
+                  .Const("one", 1.0, elem_type)
+                  .Add(has_bias ? "X_bias = Add (X, bias)" : "X_bias = Identity (X)")
+                  .Add(R"(
+                T1 = Mul (X_bias, X_bias)
+                T2 = Mul (c, T1)
+                T3 = Add (b, T2)
+                T4 = Mul (X_bias, T3)
+                T5 = Tanh (T4)
+                T6 = Add (one, T5)
+                T7 = Mul (X_bias, T6)
+                Y = Mul (a, T7)
+            )");
+
+              schema.BuildFunction(function_proto);
+              return true;
+            }));
+
+// SkipLayerNormalization, version 1, Microsoft domain.
+// Inputs: input and skip (3D, same documented shape), gamma (1D), plus optional beta and bias (1D).
+// Outputs: the 3D result, plus optional "mean" and "inv_std_var" tensors constrained to float (type U).
+// The epsilon attribute defaults to kDefaultSkipLayerNormEpsilon.
+// NOTE(review): judging by its name, the inference helper only propagates type/shape from input 0;
+// the optional outputs are presumably left uninferred -- confirm against the ONNX helper.
+ONNX_MS_OPERATOR_SET_SCHEMA(SkipLayerNormalization, 1,
+                            OpSchema()
+                                .SetDoc("Skip and Layer Normalization Fusion")
+                                .Attr("epsilon", "The epsilon value to use to avoid division by zero.", AttributeProto::FLOAT, kDefaultSkipLayerNormEpsilon)
+                                .Input(0, "input", "3D input tensor with shape (batch_size, sequence_length, hidden_size)", "T")
+                                .Input(1, "skip", "3D skip tensor with shape (batch_size, sequence_length, hidden_size)", "T")
+                                .Input(2, "gamma", "1D input tensor with shape (hidden_size)", "T")
+                                .Input(3, "beta", "1D beta tensor with shape (hidden_size)", "T", OpSchema::Optional)
+                                .Input(4, "bias", "1D bias tensor with shape (hidden_size)", "T", OpSchema::Optional)
+                                .Output(0, "output", "3D output tensor with shape (batch_size, sequence_length, hidden_size)", "T")
+                                .Output(1, "mean", "Saved mean used during training to speed up gradient computation", "U", OpSchema::Optional)
+                                .Output(2, "inv_std_var", "Saved inverse standard variance used during training to speed up gradient computation.", "U", OpSchema::Optional)
+                                .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float or half tensors.")
+                                .TypeConstraint("U", {"tensor(float)"}, "Constrain mean and inv_std_var to float tensors.")
+                                .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+constexpr const char* NGramRepeatBlock_ver1_doc = R"DOC(
+Enforce no repetition of n-grams. Scores are set to `-inf` for tokens that form a repeated n-gram if added to the back of the input_ids.
+)DOC";
+
+// NGramRepeatBlock, version 1, Microsoft domain.
+// "ngram_size" is declared without a default value. The single output mirrors the "scores" input (input 1).
+ONNX_MS_OPERATOR_SET_SCHEMA(NGramRepeatBlock, 1,
+                            OpSchema()
+                                .SetDoc(NGramRepeatBlock_ver1_doc)
+                                .Attr("ngram_size", "The NGram size.", AttributeProto::INT)
+                                .Input(0, "input_ids", "2D input tensor with shape (batch_size, sequence_length)", "Tid")
+                                .Input(1, "scores", "2D input tensor with shape (batch_size, vocab_size)", "T")
+                                .Output(0, "scores_out", "2D output tensor with shape (batch_size, vocab_size)", "T")
+                                .TypeConstraint("Tid", {"tensor(int64)"}, "Constrain indices to integer types")
+                                .TypeConstraint("T", {"tensor(float)"}, "Constrain scores input and output types to float tensors.")
+                                .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+                                  // The element type is always taken from "scores"; the shape is copied only
+                                  // when "scores" actually has one.
+                                  propagateElemTypeFromInputToOutput(ctx, 1, 0);
+                                  if (hasInputShape(ctx, 1)) {
+                                    propagateShapeFromInputToOutput(ctx, 1, 0);
+                                  }
+                                }));
+
+// Operator documentation attached to the BifurcationDetector schema via SetDoc().
+constexpr const char* BifurcationDetector_ver1_doc = R"DOC(
+Component for aggressive decoding. Find the bifurcation index of predicted tokens, between source tokens,
+starting from previous suffix match index, and predicted tokens.
+Concat predicted tokens, starting from bifurcation index, to the back
+of current tokens. This forms the output tokens.
+Detect suffix match index in source tokens, between source tokens and output tokens.
+Detection is based on finding the appearances of last n-gram in output tokens
+in source tokens.
+A match is considered found if source tokens contain a single matching n-gram.
+Return the index of the start of the n-gram in source tokens.
+No match is found if src tokens contain multiple or zero matching n-grams. Return -1.
+)DOC";
+
+// BifurcationDetector, version 1, Microsoft domain. All inputs and outputs share type T (int64 tensors);
+// pred_tokens is the only optional input. The n-gram size attributes default to 1 (min) and 3 (max).
+ONNX_MS_OPERATOR_SET_SCHEMA(BifurcationDetector, 1,
+                            OpSchema()
+                                .SetDoc(BifurcationDetector_ver1_doc)
+                                .Attr("min_ngram_size", "The minimum NGram size for suffix matching.", AttributeProto::INT, static_cast<int64_t>(1))
+                                .Attr("max_ngram_size", "The maximum NGram size for suffix matching.", AttributeProto::INT, static_cast<int64_t>(3))
+                                .Input(0, "src_tokens", "Encoder input ids.", "T")
+                                .Input(1, "cur_tokens", "Decoder input ids.", "T")
+                                .Input(2, "prev_suffix_match_idx", "Previous suffix match index", "T")
+                                .Input(3, "pred_tokens", "Predicted token ids from aggressive decoding", "T", OpSchema::Optional)
+                                .Output(0, "tokens", "Decoder input ids after merging predicted tokens", "T")
+                                .Output(1, "suffix_match_idx", "new suffix match index", "T")
+                                .TypeConstraint("T", {"tensor(int64)"}, "Constrain to integer types.")
+                                .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+                                  // Output 0 (tokens) takes its element type from cur_tokens; output 1
+                                  // (suffix_match_idx) takes both element type and, when known, shape from
+                                  // prev_suffix_match_idx.
+                                  propagateElemTypeFromInputToOutput(ctx, 1, 0);
+                                  propagateElemTypeFromInputToOutput(ctx, 2, 1);
+                                  if (hasInputShape(ctx, 2)) {
+                                    propagateShapeFromInputToOutput(ctx, 2, 1);
+                                  }
+                                  // No shape is set for output 0: the length of the output tokens is dynamic, as it
+                                  // depends on the bifurcation index of predicted tokens and source tokens,
+                                  // and on the current tokens length.
+                                  // tokens_length = cur_tokens_length + bifurcation_index + 1.
+                                }));
+}
+}  // namespace onnxruntime
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.h
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.h
@ -5,12 +5,32 @@

 #if !defined(ORT_MINIMAL_BUILD)
 #include "onnx/defs/schema.h"
+#include "core/graph/contrib_ops/ms_schema.h"
 #else
 #include "onnx/defs/data_type_utils.h"
 #endif

+// Defines version `ver` of operator `name` in the Microsoft domain (::onnxruntime::kMSDomain).
+// `impl` is the OpSchema expression describing the operator.
+#define ONNX_MS_OPERATOR_SET_SCHEMA(name, ver, impl) \
+  ONNX_OPERATOR_SET_SCHEMA_EX(name, Microsoft, ::onnxruntime::kMSDomain, ver, true, impl)
+
+// Counterpart of ONNX_MS_OPERATOR_SET_SCHEMA for operators that belong to the ONNX domain
+// but whose schemas are defined in this code base.
+#define ONNX_CONTRIB_OPERATOR_SET_SCHEMA(name, ver, impl) \
+  ONNX_OPERATOR_SET_SCHEMA_EX(name, Onnx, ::ONNX_NAMESPACE::ONNX_DOMAIN, ver, true, impl)
+
 namespace onnxruntime {
 namespace contrib {
+namespace utils {
+// Returns true when the dimension's value_case is kDimValue, i.e. it carries a concrete dim_value.
+inline bool HasDimValue(const ONNX_NAMESPACE::TensorShapeProto_Dimension& dim) {
+  return dim.value_case() == ONNX_NAMESPACE::TensorShapeProto_Dimension::kDimValue;
+}
+// Returns true when the tensor has a defined data type and its raw_data field is set.
+inline bool HasRawData(const ONNX_NAMESPACE::TensorProto& ten_proto) {
+  // The data type can be neither UNDEFINED nor STRING here; the STRING case is not checked
+  // in this function because callers usually test for it separately in order to return an error.
+  return ten_proto.data_type() != ONNX_NAMESPACE::TensorProto::UNDEFINED &&
+         ten_proto.has_raw_data();  // XXX: Figure out how to do in proto3
+}
+}  // namespace utils
+
 #define ONNX_CONTRIB_OPERATOR_SCHEMA(name) \
  ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_HELPER(__COUNTER__, name)
 #define ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_HELPER(Counter, name) \
@ -31,7 +51,6 @@ namespace contrib {

 void RegisterContribSchemas();
 void RegisterNchwcSchemas();
-void RegisterNhwcSchemas();
 void RegisterQuantizationSchemas();

 constexpr const float kDefaultSkipLayerNormEpsilon = 1e-12f;
--- a/onnxruntime/core/graph/contrib_ops/ms_opset.h
+++ b/onnxruntime/core/graph/contrib_ops/ms_opset.h
@ -0,0 +1,144 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "onnx/defs/schema.h"
+#include "core/graph/contrib_ops/ms_schema.h"
+
+namespace onnxruntime {
+namespace contrib {
+// Forward declarations of the schema classes for version 1 of the Microsoft domain.
+// Each class declared here is also passed to GetOpSchema<>() in OpSet_Microsoft_ver1::ForEachSchema.
+
+// NHWC ops
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, NhwcMaxPool);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearGlobalAveragePool);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearAveragePool);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearConv);
+
+// Quantization ops
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, DequantizeLinear);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, DynamicQuantizeLSTM);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, DynamicQuantizeMatMul);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, MatMulIntegerToFloat);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, MulInteger);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QGemm);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearAdd);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearConcat);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearLeakyRelu);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearMul);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearReduceMean);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearSigmoid);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QuantizeLinear);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ReduceSumInteger);
+
+// Others
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Attention);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BeamSearch);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BiasDropout);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BiasGelu);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BiasSoftmax);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BifurcationDetector);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, CDist);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ComplexMul);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ComplexMulConj);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ConvTransposeWithDynamicPads);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, CropAndResize);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, DecoderAttention);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, EmbedLayerNormalization);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ExpandDims);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, FastGelu);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, FusedConv);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, FusedGemm);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, FusedMatMul);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GatherND);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Gelu);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GridSample);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Inverse);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Irfft);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, IsAllFinite);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, LongformerAttention);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, MatMulInteger16);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, MaxpoolWithMask);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, MurmurHash3);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, NGramRepeatBlock);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Pad);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QAttention);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QEmbedLayerNormalization);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Rfft);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, SampleOp);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, SkipLayerNormalization);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, SparseToDenseMatMul);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Tokenizer);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, TorchEmbedding);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, TransposeMatMul);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Trilu);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Unique);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, WordConvEmbedding);
+
+// Enumerates the version-1 operator schemas of the Microsoft domain whose schema classes are
+// forward-declared in this header.
+class OpSet_Microsoft_ver1 {
+ public:
+  // Calls `fn` once per operator, handing it the OpSchema returned by GetOpSchema<>() for that
+  // operator's schema class. The calls are grouped as NHWC ops, quantization ops, then all others.
+  static void ForEachSchema(std::function<void(ONNX_NAMESPACE::OpSchema&&)> fn) {
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, NhwcMaxPool)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearGlobalAveragePool)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearAveragePool)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearConv)>());
+
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, DequantizeLinear)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, DynamicQuantizeLSTM)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, DynamicQuantizeMatMul)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, MatMulIntegerToFloat)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, MulInteger)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QGemm)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearAdd)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearConcat)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearLeakyRelu)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearMul)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearReduceMean)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QLinearSigmoid)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QuantizeLinear)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ReduceSumInteger)>());
+
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Attention)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BeamSearch)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BiasDropout)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BiasGelu)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BiasSoftmax)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BifurcationDetector)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, CDist)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ComplexMul)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ComplexMulConj)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ConvTransposeWithDynamicPads)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, CropAndResize)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, DecoderAttention)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, EmbedLayerNormalization)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, ExpandDims)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, FastGelu)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, FusedConv)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, FusedGemm)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, FusedMatMul)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GatherND)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Gelu)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GridSample)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Inverse)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Irfft)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, IsAllFinite)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, LongformerAttention)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, MatMulInteger16)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, MaxpoolWithMask)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, MurmurHash3)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, NGramRepeatBlock)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Pad)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QAttention)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QEmbedLayerNormalization)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Rfft)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, SampleOp)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, SkipLayerNormalization)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, SparseToDenseMatMul)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Tokenizer)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, TorchEmbedding)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, TransposeMatMul)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Trilu)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Unique)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, WordConvEmbedding)>());
+  }
+};
+}  // namespace contrib
+}  // namespace onnxruntime
--- a/onnxruntime/core/graph/contrib_ops/ms_schema.h
+++ b/onnxruntime/core/graph/contrib_ops/ms_schema.h
@ -0,0 +1,15 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "onnx/defs/schema.h"
+
+namespace onnxruntime {
+namespace contrib {
+// The ONNX namespace has a function template with the same name. It is re-declared here so that explicit
+// specializations can be provided inside the onnxruntime::contrib namespace; otherwise a lot of our code
+// would have to live in the ONNX namespace.
+// Only the declaration appears here; the template parameter T is an operator schema class.
+template <typename T>
+::ONNX_NAMESPACE::OpSchema GetOpSchema();
+}  // namespace contrib
+}  // namespace onnxruntime
--- a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc
@ -1,21 +1,16 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.

-#include "core/framework/tensorprotoutils.h"
 #include "core/graph/constants.h"
 #include "core/graph/contrib_ops/contrib_defs.h"
 #include "core/graph/contrib_ops/quantization_defs.h"

 namespace ONNX_NAMESPACE {
-void convPoolShapeInference(
-    InferenceContext& ctx,
-    bool use_dilation,
-    bool require_kernel_shape,
-    int input1Idx,
-    int input2Idx);
+void convPoolShapeInference(InferenceContext& ctx, bool use_dilation, bool require_kernel_shape, int input1Idx,
+                            int input2Idx);
 }  // namespace ONNX_NAMESPACE

-using namespace ONNX_NAMESPACE;
+using namespace ::ONNX_NAMESPACE;

 namespace onnxruntime {
 namespace contrib {
@ -72,8 +67,7 @@ class NhwcInferenceContext : public InferenceContext {
    return (index == 0) ? &input_type_ : ctx_.getInputType(index);
  }

-  const TensorProto* getInputData(size_t index) const override {
-    ORT_UNUSED_PARAMETER(index);
+  const TensorProto* getInputData(size_t) const override {
    return nullptr;
  }

@ -85,8 +79,7 @@ class NhwcInferenceContext : public InferenceContext {
    return (index == 0) ? &output_type_ : ctx_.getOutputType(index);
  }

-  GraphInferencer* getGraphAttributeInferencer(const std::string& attribute_name) override {
-    ORT_UNUSED_PARAMETER(attribute_name);
+  GraphInferencer* getGraphAttributeInferencer(const std::string&) override {
    return nullptr;
  }

@ -104,12 +97,9 @@ class NhwcInferenceContext : public InferenceContext {
  TypeProto output_type_;
 };

-void convPoolShapeInferenceNhwc(
-    InferenceContext& ctx,
-    bool use_dilation,
-    bool require_kernel_shape,
-    int input1Idx,
-    int input2Idx) {
+
+void convPoolShapeInferenceNhwc(InferenceContext& ctx, bool use_dilation, bool require_kernel_shape, int input1Idx,
+                                int input2Idx) {
  // Reuse the NCHW implementation by transposing the input/output tensor using
  // a local inference context.
  NhwcInferenceContext nhwc_ctx(ctx);
@ -118,161 +108,80 @@ void convPoolShapeInferenceNhwc(
  nhwc_ctx.TransposeOutputShape();
 }

-void RegisterNhwcSchemas() {
-  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearConv)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
-      .Input(0, "x", "", "T1")
-      .Input(1, "x_scale", "", "tensor(float)")
-      .Input(2, "x_zero_point", "", "T1")
-      .Input(3, "w", "", "T2")
-      .Input(4, "w_scale", "", "tensor(float)")
-      .Input(5, "w_zero_point", "", "T2")
-      .Input(6, "y_scale", "", "tensor(float)")
-      .Input(7, "y_zero_point", "", "T3")
-      .Input(8, "B", "", "T4", OpSchema::Optional)
-      .Output(0, "y", "", "T3")
-      .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "")
-      .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "")
-      .TypeConstraint("T3", {"tensor(int8)", "tensor(uint8)"}, "")
-      .TypeConstraint("T4", {"tensor(int32)"}, "")
-      .Attr("auto_pad", "", AttributeProto::STRING, std::string("NOTSET"))
-      .Attr("kernel_shape", "", AttributeProto::INTS, OPTIONAL_VALUE)
-      .Attr("dilations", "", AttributeProto::INTS, OPTIONAL_VALUE)
-      .Attr("strides", "", AttributeProto::INTS, OPTIONAL_VALUE)
-      .Attr("pads", "", AttributeProto::INTS, OPTIONAL_VALUE)
-      .Attr("group", "", AttributeProto::INT, static_cast<int64_t>(1))
-      .Attr("channels_last", "", AttributeProto::INT, static_cast<int64_t>(0))
-      .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
-        auto x_type = ctx.getInputType(0);
-        auto w_type = ctx.getInputType(3);
-        if (nullptr == x_type || nullptr == w_type ||
-            x_type->value_case() != TypeProto::kTensorType ||
-            w_type->value_case() != TypeProto::kTensorType) {
-          fail_type_inference("inputs are expected to have tensor type.");
-        }
+// NhwcMaxPool, version 1, Microsoft domain: one int8/uint8 input and one output of the same type.
+// "kernel_shape" is declared without a default value; the other attributes are optional or defaulted.
+ONNX_MS_OPERATOR_SET_SCHEMA(NhwcMaxPool, 1,
+                            OpSchema()
+                                .Input(0, "x", "", "T")
+                                .Output(0, "y", "", "T")
+                                .TypeConstraint("T", {"tensor(int8)", "tensor(uint8)"}, "")
+                                .Attr("auto_pad", "", AttributeProto::STRING, std::string("NOTSET"))
+                                .Attr("kernel_shape", "", AttributeProto::INTS)
+                                .Attr("dilations", "", AttributeProto::INTS, OPTIONAL_VALUE)
+                                .Attr("strides", "", AttributeProto::INTS, OPTIONAL_VALUE)
+                                .Attr("pads", "", AttributeProto::INTS, OPTIONAL_VALUE)
+                                .Attr("ceil_mode", "", AttributeProto::INT, static_cast<int64_t>(0))
+                                .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
+                                  propagateElemTypeFromInputToOutput(ctx, 0, 0);
+                                  // Arguments: use_dilation = true, require_kernel_shape = true,
+                                  // input1Idx = 0, input2Idx = 1.
+                                  // NOTE(review): the schema declares only input 0; presumably input2Idx is not
+                                  // consulted when the kernel shape is required -- confirm in convPoolShapeInference.
+                                  ::onnxruntime::contrib::convPoolShapeInferenceNhwc(ctx, true, true, 0, 1);
+                                }));

-        auto x_zero_point_type = ctx.getInputType(2);
-        if (nullptr == x_zero_point_type ||
-            x_zero_point_type->tensor_type().elem_type() !=
-                x_type->tensor_type().elem_type()) {
-          fail_type_inference(
-              "input and zero_point pair is expected to have be same type.");
-        }
-
-        auto w_zero_point_type = ctx.getInputType(5);
-        if (nullptr == w_zero_point_type ||
-            w_zero_point_type->tensor_type().elem_type() !=
-                w_type->tensor_type().elem_type()) {
-          fail_type_inference(
-              "weight and zero_point pair is expected to have same type.");
-        }
-
-        propagateElemTypeFromInputToOutput(ctx, 7, 0);
-
-        if (getAttribute(ctx, "channels_last", 0) == 0) {
-          convPoolShapeInference(ctx, true, false, 0, 3);
-        } else {
-          convPoolShapeInferenceNhwc(ctx, true, false, 0, 3);
-        }
-      });
-
-  ONNX_CONTRIB_OPERATOR_SCHEMA(NhwcMaxPool)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
-      .Input(0, "x", "", "T")
-      .Output(0, "y", "", "T")
-      .TypeConstraint("T", {"tensor(int8)", "tensor(uint8)"}, "")
-      .Attr("auto_pad", "", AttributeProto::STRING, std::string("NOTSET"))
-      .Attr("kernel_shape", "", AttributeProto::INTS)
-      .Attr("dilations", "", AttributeProto::INTS, OPTIONAL_VALUE)
-      .Attr("strides", "", AttributeProto::INTS, OPTIONAL_VALUE)
-      .Attr("pads", "", AttributeProto::INTS, OPTIONAL_VALUE)
-      .Attr("ceil_mode", "", AttributeProto::INT, static_cast<int64_t>(0))
-      .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
-        propagateElemTypeFromInputToOutput(ctx, 0, 0);
-        convPoolShapeInferenceNhwc(ctx, true, true, 0, 1);
-      });
-
-  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearGlobalAveragePool)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
-      .SetDoc(R"DOC(
+ONNX_MS_OPERATOR_SET_SCHEMA(QLinearGlobalAveragePool, 1,
+                            OpSchema()
+                                .SetDoc(R"DOC(
 QLinearGlobalAveragePool consumes an input tensor X and applies Average pooling across
 the values in the same channel. This is equivalent to AveragePool with kernel size
 equal to the spatial dimension of input tensor. Input is of type uint8_t or int8_t.
 )DOC")
-      .Attr("channels_last", "", AttributeProto::INT, static_cast<int64_t>(0))
-      .Input(
-          0,
-          "X",
-          "Input data tensor from the previous operator; According to channels_last, "
-          "dimensions for image case are (N x C x H x W), or (N x H x W x C) "
-          "where N is the batch size, C is the number of "
-          "channels, and H and W are the height and the width "
-          "of the data. For non image case, the dimensions are "
-          "in the form of (N x C x D1 x D2 ... Dn), or (N x D1 X D2 ... Dn x C) "
-          "where N is the batch size.",
-          "T")
-      .Input(
-          1,
-          "x_scale",
-          "Scale of quantized input 'X'. It must be a scalar.",
-          "tensor(float)")
-      .Input(
-          2,
-          "x_zero_point",
-          "Zero point tensor for input 'X'. It must be a scalar.",
-          "T")
-      .Input(
-          3,
-          "y_scale",
-          "Scale of quantized output 'Y'. It must be a scalar.",
-          "tensor(float)")
-      .Input(
-          4,
-          "y_zero_point",
-          "Zero point tensor for output 'Y'. It must be a scalar.",
-          "T")
-      .Output(
-          0,
-          "Y",
-          "Output data tensor from pooling across the input "
-          "tensor. The output tensor has the same rank as the input. "
-          "with the N and C value keep it value, while the other"
-          "dimensions are all 1.",
-          "T")
-      .TypeConstraint(
-          "T",
-          {"tensor(uint8)", "tensor(int8)"},
-          "Constrain input and output types to singed/unsigned int8 tensors.")
-      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
-        propagateElemTypeFromInputToOutput(ctx, 0, 0);
+                                .Attr("channels_last", "", AttributeProto::INT, static_cast<int64_t>(0))
+                                .Input(0, "X",
+                                       "Input data tensor from the previous operator; According to channels_last, "
+                                       "dimensions for image case are (N x C x H x W), or (N x H x W x C) "
+                                       "where N is the batch size, C is the number of "
+                                       "channels, and H and W are the height and the width "
+                                       "of the data. For non image case, the dimensions are "
+                                       "in the form of (N x C x D1 x D2 ... Dn), or (N x D1 X D2 ... Dn x C) "
+                                       "where N is the batch size.",
+                                       "T")
+                                .Input(1, "x_scale", "Scale of quantized input 'X'. It must be a scalar.",
+                                       "tensor(float)")
+                                .Input(2, "x_zero_point", "Zero point tensor for input 'X'. It must be a scalar.", "T")
+                                .Input(3, "y_scale", "Scale of quantized output 'Y'. It must be a scalar.",
+                                       "tensor(float)")
+                                .Input(4, "y_zero_point", "Zero point tensor for output 'Y'. It must be a scalar.", "T")
+                                .Output(0, "Y",
+                                        "Output data tensor from pooling across the input "
+                                        "tensor. The output tensor has the same rank as the input. "
+                                        "with the N and C value keep it value, while the other"
+                                        "dimensions are all 1.",
+                                        "T")
+                                .TypeConstraint("T", {"tensor(uint8)", "tensor(int8)"},
+                                                "Constrain input and output types to singed/unsigned int8 tensors.")
+                                .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+                                  propagateElemTypeFromInputToOutput(ctx, 0, 0);

-        int64_t channel_last = getAttribute(ctx, "channels_last", 0);
+                                  int64_t channel_last = getAttribute(ctx, "channels_last", 0);

-        // needs at least one input with shape.
-        if (!hasNInputShapes(ctx, 1)) {
-          return;
-        }
+                                  // needs at least one input with shape.
+                                  if (!hasNInputShapes(ctx, 1)) {
+                                    return;
+                                  }

-        auto input_shape = ctx.getInputType(0)->tensor_type().shape();
-        if (input_shape.dim_size() < 2) {
-          return;
-        }
+                                  auto input_shape = ctx.getInputType(0)->tensor_type().shape();
+                                  if (input_shape.dim_size() < 2) {
+                                    return;
+                                  }

-        // (N, C, 1, 1, ..., 1) or (N, 1, 1, ..., 1, C)
-        auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
-        output_shape->CopyFrom(input_shape);
-        int image_dim_index = (channel_last ? 1 : 2);
-        for (auto n_hw_dims = input_shape.dim_size() - 2; n_hw_dims > 0; --n_hw_dims) {
-          output_shape->mutable_dim(image_dim_index)->clear_dim_param();
-          output_shape->mutable_dim(image_dim_index)->set_dim_value(1);
-          ++image_dim_index;
-        }
-      });
+                                  // (N, C, 1, 1, ..., 1) or (N, 1, 1, ..., 1, C)
+                                  auto output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
+                                  output_shape->CopyFrom(input_shape);
+                                  int image_dim_index = (channel_last ? 1 : 2);
+                                  for (auto n_hw_dims = input_shape.dim_size() - 2; n_hw_dims > 0; --n_hw_dims) {
+                                    output_shape->mutable_dim(image_dim_index)->clear_dim_param();
+                                    output_shape->mutable_dim(image_dim_index)->set_dim_value(1);
+                                    ++image_dim_index;
+                                  }
+                                }));

-  const char* QLinearAveragePoolDoc_ver1 = R"DOC(
+constexpr const char* QLinearAveragePoolDoc_ver1 = R"DOC(
 QLinearAveragePool consumes an input tensor X and applies average pooling across
 the tensor according to kernel sizes, stride sizes, and pad lengths.
 average pooling consisting of computing the average on all values of a
@ -307,121 +216,143 @@ Input and output scales and zero points are used to convert the output to a new
 Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
 )DOC";

-  static const char* contrib_ops_pads_doc =
-      "Padding for the beginning and ending along each spatial axis, it can take any value greater "
-      "than or equal to 0. The value represent the number of pixels added to the beginning "
-      "and end part of the corresponding axis. `pads` format should be as follow "
-      "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
-      "added at the beginning of axis `i` and xi_end, the number of pixels added at "
-      "the end of axis `i`. This attribute cannot be used simultaneously with "
-      "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
-  static const char* contrib_ops_auto_pad_doc =
-      "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
-      "default value is NOTSET, which means explicit padding is used. "
-      "SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input."
-      "In case of odd number add the extra padding at the end for SAME_UPPER and at the "
-      "beginning for SAME_LOWER. VALID mean no padding.";
+// Description text for the `pads` attribute; passed to .Attr("pads", ...) in the
+// QLinearAveragePool schema in this file.
+constexpr const char* contrib_ops_pads_doc =
+    "Padding for the beginning and ending along each spatial axis, it can take any value greater "
+    "than or equal to 0. The value represent the number of pixels added to the beginning "
+    "and end part of the corresponding axis. `pads` format should be as follow "
+    "[x1_begin, x2_begin...x1_end, x2_end,...], where xi_begin the number of pixels "
+    "added at the beginning of axis `i` and xi_end, the number of pixels added at "
+    "the end of axis `i`. This attribute cannot be used simultaneously with "
+    "auto_pad attribute. If not present, the padding defaults to 0 along start and end of each spatial axis.";
+// Description text for the `auto_pad` attribute; passed to .Attr("auto_pad", ...) in the
+// QLinearAveragePool schema in this file.
+// Note: adjacent string literals are concatenated verbatim, so every piece that ends a
+// sentence must carry its own trailing space.
+constexpr const char* contrib_ops_auto_pad_doc =
+    "auto_pad must be either NOTSET, SAME_UPPER, SAME_LOWER or VALID. Where "
+    "default value is NOTSET, which means explicit padding is used. "
+    "SAME_UPPER or SAME_LOWER mean pad the input so that the output spatial size match the input. "
+    "In case of odd number add the extra padding at the end for SAME_UPPER and at the "
+    "beginning for SAME_LOWER. VALID mean no padding.";

-  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAveragePool)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
-      .SetDoc(QLinearAveragePoolDoc_ver1)
-      .Attr(
-          "count_include_pad",
-          "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include pad.",
-          AttributeProto::INT,
-          static_cast<int64_t>(0))
-      .Attr(
-          "kernel_shape",
-          "The size of the kernel along each axis.",
-          AttributeProto::INTS)
-      .Attr(
-          "strides",
-          "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
-          AttributeProto::INTS,
-          OPTIONAL_VALUE)
-      .Attr(
-          "auto_pad",
-          contrib_ops_auto_pad_doc,
-          AttributeProto::STRING,
-          std::string("NOTSET"))
-      .Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE)
-      .Attr(
-          "ceil_mode",
-          "Whether to use ceil or floor (default) to compute the output shape.",
-          AttributeProto::INT,
-          static_cast<int64_t>(0))
-      .Attr("channels_last", "Works on NHWC layout or not? Default not.", AttributeProto::INT, static_cast<int64_t>(0))
-      .Input(
-          0,
-          "X",
-          "Input data tensor from the previous operator; "
-          "dimensions for image case are (N x C x H x W), "
-          "where N is the batch size, C is the number of "
-          "channels, and H and W are the height and the "
-          "width of the data. For non image case, the "
-          "dimensions are in the form of "
-          "(N x C x D1 x D2 ... Dn), where N is the batch "
-          "size. Optionally, if dimension denotation is "
-          "in effect, the operation expects the input "
-          "data tensor to arrive with the dimension denotation "
-          "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
-          "T")
-      .Input(
-          1,
-          "x_scale",
-          "Input scale. It's a scalar, which means a per-tensor/layer quantization.",
-          "tensor(float)")
-      .Input(
-          2,
-          "x_zero_point",
-          "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
-          "T",
-          OpSchema::Optional)
-      .Input(
-          3,
-          "y_scale",
-          "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
-          "tensor(float)")
-      .Input(
-          4,
-          "y_zero_point",
-          "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
-          "T",
-          OpSchema::Optional)
-      .Output(
-          0,
-          "Y",
-          "Output data tensor from average or max pooling across "
-          "the input tensor. Dimensions will vary based "
-          "on various kernel, stride, and pad sizes. Floor value of "
-          "the dimension is used",
-          "T")
-      .TypeConstraint(
-          "T",
-          {"tensor(uint8)", "tensor(int8)"},
-          "Constrain input and output types to 8 bit tensors.")
-      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
-        ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
+// QLinearAveragePool, version 1, registered through ONNX_MS_OPERATOR_SET_SCHEMA.
+// Inputs: X plus x_scale / x_zero_point / y_scale / y_zero_point; both zero points are optional.
+// The inference function copies X's element type to Y, validates the scale and zero-point
+// inputs, and then delegates shape inference to a pooling helper chosen by `channels_last`.
+ONNX_MS_OPERATOR_SET_SCHEMA(
+    QLinearAveragePool, 1,
+    OpSchema()
+        .SetDoc(QLinearAveragePoolDoc_ver1)
+        .Attr("count_include_pad",
+              "Whether include pad pixels when calculating values for the edges. Default is 0, doesn't count include "
+              "pad.",
+              AttributeProto::INT, static_cast<int64_t>(0))
+        .Attr("kernel_shape", "The size of the kernel along each axis.", AttributeProto::INTS)
+        .Attr("strides",
+              "Stride along each spatial axis. If not present, the stride defaults to 1 along each spatial axis.",
+              AttributeProto::INTS, OPTIONAL_VALUE)
+        .Attr("auto_pad", contrib_ops_auto_pad_doc, AttributeProto::STRING, std::string("NOTSET"))
+        .Attr("pads", contrib_ops_pads_doc, AttributeProto::INTS, OPTIONAL_VALUE)
+        .Attr("ceil_mode", "Whether to use ceil or floor (default) to compute the output shape.", AttributeProto::INT,
+              static_cast<int64_t>(0))
+        .Attr("channels_last", "Works on NHWC layout or not? Default not.", AttributeProto::INT,
+              static_cast<int64_t>(0))
+        .Input(0, "X",
+               "Input data tensor from the previous operator; "
+               "dimensions for image case are (N x C x H x W), "
+               "where N is the batch size, C is the number of "
+               "channels, and H and W are the height and the "
+               "width of the data. For non image case, the "
+               "dimensions are in the form of "
+               "(N x C x D1 x D2 ... Dn), where N is the batch "
+               "size. Optionally, if dimension denotation is "
+               "in effect, the operation expects the input "
+               "data tensor to arrive with the dimension denotation "
+               "of [DATA_BATCH, DATA_CHANNEL, DATA_FEATURE, DATA_FEATURE ...].",
+               "T")
+        .Input(1, "x_scale", "Input scale. It's a scalar, which means a per-tensor/layer quantization.",
+               "tensor(float)")
+        .Input(2, "x_zero_point",
+               "Input zero point. Default value is 0 if it's not specified. It's a scalar, which means a "
+               "per-tensor/layer quantization.",
+               "T", OpSchema::Optional)
+        .Input(3, "y_scale", "Output scale. It's a scalar, which means a per-tensor/layer quantization.",
+               "tensor(float)")
+        .Input(4, "y_zero_point",
+               "Output zero point. Default value is 0 if it's not specified. It's a scalar, which means a "
+               "per-tensor/layer quantization.",
+               "T", OpSchema::Optional)
+        .Output(0, "Y",
+                "Output data tensor from average or max pooling across "
+                "the input tensor. Dimensions will vary based "
+                "on various kernel, stride, and pad sizes. Floor value of "
+                "the dimension is used",
+                "T")
+        .TypeConstraint("T", {"tensor(uint8)", "tensor(int8)"}, "Constrain input and output types to 8 bit tensors.")
+        .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+          ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);

-        auto data_type = ctx.getInputType(0);
-        if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
-          fail_type_inference("inputs are expected to have tensor type.");
-        }
+          auto data_type = ctx.getInputType(0);
+          if (nullptr == data_type || data_type->value_case() != ONNX_NAMESPACE::TypeProto::kTensorType) {
+            fail_type_inference("inputs are expected to have tensor type.");
+          }

-        // validate scale and zero points
-        ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
-        ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
-        ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
-        ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);
+          // Validate scale and zero points: scales (inputs 1 and 3) are checked against FLOAT,
+          // zero points (inputs 2 and 4) against the element type of X.
+          onnxruntime::contrib::ValidateTypeAndShapeForScaleAndZP(ctx, 1, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+          onnxruntime::contrib::ValidateTypeAndShapeForScaleAndZP(ctx, 2, data_type->tensor_type().elem_type(), true);
+          onnxruntime::contrib::ValidateTypeAndShapeForScaleAndZP(ctx, 3, ONNX_NAMESPACE::TensorProto::FLOAT, true);
+          onnxruntime::contrib::ValidateTypeAndShapeForScaleAndZP(ctx, 4, data_type->tensor_type().elem_type(), true);

-        if (getAttribute(ctx, "channels_last", 0) == 0) {
-          ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
-        } else {
-          convPoolShapeInferenceNhwc(ctx, false, true, 0, 5);
-        }
-      });
-}
+          // channels_last == 0 (the default) uses the ONNX helper; any other value uses the
+          // contrib NHWC variant with the same arguments.
+          if (getAttribute(ctx, "channels_last", 0) == 0) {
+            ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
+          } else {
+            onnxruntime::contrib::convPoolShapeInferenceNhwc(ctx, false, true, 0, 5);
+          }
+        }));

+// QLinearConv, version 1, registered through ONNX_MS_OPERATOR_SET_SCHEMA.
+// Inputs: x and w, each with a scale and zero point, the output scale / zero point, and an
+// optional int32 bias B. The inference function:
+//   1. requires x and w to be tensors,
+//   2. requires each zero point to have the same element type as the tensor it belongs to,
+//   3. takes the output element type from y_zero_point (input 7),
+//   4. delegates shape inference to the ONNX conv helper, or to the contrib NHWC variant
+//      when the `channels_last` attribute is non-zero.
+ONNX_MS_OPERATOR_SET_SCHEMA(QLinearConv, 1,
+                            OpSchema()
+                                .Input(0, "x", "", "T1")
+                                .Input(1, "x_scale", "", "tensor(float)")
+                                .Input(2, "x_zero_point", "", "T1")
+                                .Input(3, "w", "", "T2")
+                                .Input(4, "w_scale", "", "tensor(float)")
+                                .Input(5, "w_zero_point", "", "T2")
+                                .Input(6, "y_scale", "", "tensor(float)")
+                                .Input(7, "y_zero_point", "", "T3")
+                                .Input(8, "B", "", "T4", OpSchema::Optional)
+                                .Output(0, "y", "", "T3")
+                                .TypeConstraint("T1", {"tensor(int8)", "tensor(uint8)"}, "")
+                                .TypeConstraint("T2", {"tensor(int8)", "tensor(uint8)"}, "")
+                                .TypeConstraint("T3", {"tensor(int8)", "tensor(uint8)"}, "")
+                                .TypeConstraint("T4", {"tensor(int32)"}, "")
+                                .Attr("auto_pad", "", AttributeProto::STRING, std::string("NOTSET"))
+                                .Attr("kernel_shape", "", AttributeProto::INTS, OPTIONAL_VALUE)
+                                .Attr("dilations", "", AttributeProto::INTS, OPTIONAL_VALUE)
+                                .Attr("strides", "", AttributeProto::INTS, OPTIONAL_VALUE)
+                                .Attr("pads", "", AttributeProto::INTS, OPTIONAL_VALUE)
+                                .Attr("group", "", AttributeProto::INT, static_cast<int64_t>(1))
+                                .Attr("channels_last", "", AttributeProto::INT, static_cast<int64_t>(0))
+                                .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
+                                  // Both the activation (input 0) and the weight (input 3) must be tensors.
+                                  auto x_type = ctx.getInputType(0);
+                                  auto w_type = ctx.getInputType(3);
+                                  if (nullptr == x_type || nullptr == w_type ||
+                                      x_type->value_case() != TypeProto::kTensorType ||
+                                      w_type->value_case() != TypeProto::kTensorType) {
+                                    fail_type_inference("inputs are expected to have tensor type.");
+                                  }
+
+                                  // x_zero_point (input 2) must match the element type of x.
+                                  auto x_zero_point_type = ctx.getInputType(2);
+                                  if (nullptr == x_zero_point_type || x_zero_point_type->tensor_type().elem_type() !=
+                                                                          x_type->tensor_type().elem_type()) {
+                                    fail_type_inference("input and zero_point pair is expected to have same type.");
+                                  }
+
+                                  // w_zero_point (input 5) must match the element type of w.
+                                  auto w_zero_point_type = ctx.getInputType(5);
+                                  if (nullptr == w_zero_point_type || w_zero_point_type->tensor_type().elem_type() !=
+                                                                          w_type->tensor_type().elem_type()) {
+                                    fail_type_inference("weight and zero_point pair is expected to have same type.");
+                                  }
+
+                                  // The output element type follows y_zero_point (input 7).
+                                  propagateElemTypeFromInputToOutput(ctx, 7, 0);
+
+                                  if (getAttribute(ctx, "channels_last", 0) == 0) {
+                                    convPoolShapeInference(ctx, true, false, 0, 3);
+                                  } else {
+                                    onnxruntime::contrib::convPoolShapeInferenceNhwc(ctx, true, false, 0, 3);
+                                  }
+                                }));
 }  // namespace contrib
-}  // namespace onnxruntime
+}  // namespace onnxruntime
--- a/onnxruntime/core/graph/contrib_ops/onnx_deprecated_operators.cc
+++ b/onnxruntime/core/graph/contrib_ops/onnx_deprecated_operators.cc
@ -0,0 +1,500 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "onnx/defs/schema.h"
+#include "onnx/defs/shape_inference.h"
+#include "onnx/defs/tensor_proto_util.h"
+
+// Register removed experimental ops for backward compatibility.
+// Experimental operators do not have version history. However, Windows 10 1809 (RS5) took a bunch of
+// experimental operators as production ops. In order to maintain backward compatibility when the
+// experimental ops are removed from ONNX, they need to be added in onnxruntime as contrib ops.
+// This file maintains the old version history of the ONNX experimental ops
+// (Affine, Crop, ParametricSoftplus, ImageScaler, ThresholdedRelu, DynamicSlice, ScaledTanh, MVN).
+// See: https://github.com/onnx/onnx/pull/1909
+
+#include "core/graph/contrib_ops/contrib_defs.h"
+using namespace ONNX_NAMESPACE;
+namespace onnxruntime {
+namespace contrib {
+// Doc string attached to the Affine schema below.
+constexpr const char* Affine_ver1_doc = R"DOC(
+Affine takes one input data (Tensor<T>) and produces one output data
+(Tensor<T>) where the affine function, y = alpha * x + beta,
+is applied to the tensor elementwise.
+)DOC";
+
+// Affine, version 1: one input X, one output Y of the same type constraint T, with float
+// attributes alpha (default 1.0) and beta (default 0.0). Shape and type of Y are propagated
+// from the first input.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    Affine, 1,
+    OpSchema()
+        .Input(0, "X", "1D input tensor", "T")
+        .Output(0, "Y", "1D output tensor", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .Attr("alpha", "Value of alpha", AttributeProto::FLOAT, 1.0f)
+        .Attr("beta", "Value of beta", AttributeProto::FLOAT, 0.0f)
+        .SetDoc(Affine_ver1_doc)
+        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// Doc string attached to the ParametricSoftplus schema below.
+constexpr const char* ParametricSoftplus_ver1_doc = R"DOC(
+ParametricSoftplus takes one input data (Tensor<T>) and produces one output data
+(Tensor<T>) where the softplus function, y = alpha * ln(exp(beta * x) + 1), is applied to
+the tensor elementwise.
+)DOC";
+
+// ParametricSoftplus, version 1: one input X, one output Y, optional float attributes
+// alpha and beta (no defaults are declared here). Shape and type of Y are propagated from
+// the first input.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    ParametricSoftplus, 1,
+    OpSchema()
+        .SetDoc(ParametricSoftplus_ver1_doc)
+        .Attr("alpha", "Value of alpha", AttributeProto::FLOAT, OPTIONAL_VALUE)
+        .Attr("beta", "Value of beta", AttributeProto::FLOAT, OPTIONAL_VALUE)
+        .Input(0, "X", "1D input tensor", "T")
+        // Y is the output; its description previously said "input" by mistake.
+        .Output(0, "Y", "1D output tensor", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// Doc string attached to the ImageScaler schema below.
+constexpr const char* ImageScaler_ver1_doc =
+    R"DOC(Scale and bias the input image. Bias values are stored in
+the same ordering as the image pixel format.)DOC";
+
+// ImageScaler, version 1: one input, one output, an optional per-channel `bias` list and a
+// float `scale` (default 1.0). Shape and type of the output are propagated from the first input.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    ImageScaler, 1,
+    OpSchema()
+        .Input(0, "input", "Input tensor of shape [N,C,H,W]", "T")
+        .Output(0, "output", "Result, has same shape and type as input", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .Attr("scale", "The scale to apply.", AttributeProto::FLOAT, 1.0f)
+        .Attr("bias", "Bias applied to each channel, same size as C.", AttributeProto::FLOATS, OPTIONAL_VALUE)
+        .SetDoc(ImageScaler_ver1_doc)
+        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// Doc string attached to the Crop schema below.
+constexpr const char* Crop_ver1_doc =
+    R"DOC(Crop an image to the specified spatial dimensions. If scale is given,
+then optionally start the crop offset by the left/top border amounts.
+If scale is not provided, crop the borders as provided.)DOC";
+
+// Crop, version 1: one input, one output, optional integer-list attributes `border` and
+// `scale`. No type/shape inference function is registered for this schema.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    Crop, 1,
+    OpSchema()
+        .SetDoc(Crop_ver1_doc)
+        .Attr("border", "A 1-D values of (leftBorder, topBorder, rightBorder, bottomBorder).", AttributeProto::INTS,
+              OPTIONAL_VALUE)
+        .Attr("scale", "A 1-D values of (height, width).", AttributeProto::INTS, OPTIONAL_VALUE)
+        .Input(0, "input", "Input tensor of shape [N,C,H,W]", "T")
+        .Output(0, "output", "Result, has same type as input, with H and W dimensions reduced.", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors."));
+
+// Doc string attached to the ThresholdedRelu schema below.
+constexpr const char* ThresholdedRelu_ver1_doc = R"DOC(
+ThresholdedRelu takes one input data (Tensor<T>) and produces one output data
+(Tensor<T>) where the rectified linear function, y = x for x > alpha, y = 0 otherwise,
+is applied to the tensor elementwise. )DOC";
+
+// ThresholdedRelu, version 1: one input X, one output Y, float attribute `alpha`
+// (default 1.0). Shape and type of Y are propagated from the first input.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    ThresholdedRelu, 1,
+    OpSchema()
+        .Input(0, "X", "Input tensor", "T")
+        .Output(0, "Y", "Output tensor", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .Attr("alpha", "Threshold value", AttributeProto::FLOAT, 1.0f)
+        .SetDoc(ThresholdedRelu_ver1_doc)
+        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// Doc string attached to the DynamicSlice schema below.
+constexpr const char* DynamicSlice_ver1_doc = R"DOC(
+Produces a slice of the input tensor along multiple axes. Similar to numpy:
+https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
+Slices uses `axes`, `starts` and `ends` inputs to specify the start and end
+dimension for each axis in the list of axes, it uses this information to
+slice the input `data` tensor. If a negative value is passed for any of the
+start or end indices, it represent number of elements before the end of that
+dimension. If the value passed to start or end is larger than the `n` (the
+number of elements in this dimension), it represents `n`. For slicing to the
+end of a dimension with unknown size, it is recommended to pass in `INT_MAX`.
+If `axes` are omitted, they are set to `[0, ..., ndim-1]`.
+Example 1:
+  data = [
+      [1, 2, 3, 4],
+      [5, 6, 7, 8],
+  ]
+  axes = [0, 1]
+  starts = [1, 0]
+  ends = [2, 3]
+  result = [
+      [5, 6, 7],
+  ]
+Example 2:
+  data = [
+      [1, 2, 3, 4],
+      [5, 6, 7, 8],
+  ]
+  starts = [0, 1]
+  ends = [-1, 1000]
+  result = [
+      [2, 3, 4],
+  ]
+)DOC";
+
+// DynamicSlice, version 1: slices `data` using the `starts`, `ends` and optional `axes`
+// index tensors (int32 or int64). No type/shape inference function is registered.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    DynamicSlice, 1,
+    OpSchema()
+        .TypeConstraint("T", OpSchema::all_tensor_types(), "Constrain input and output types to all tensor types.")
+        .TypeConstraint("Tind", {"tensor(int32)", "tensor(int64)"}, "Constrain indices to integer types")
+        .Input(0, "data", "Tensor of data to extract slices from.", "T")
+        .Input(1, "starts", "1-D tensor of starting indices of corresponding axis in `axes`", "Tind")
+        .Input(2, "ends", "1-D tensor of ending indices (exclusive) of corresponding axis in axes", "Tind")
+        .Input(3, "axes", "1-D tensor of axes that `starts` and `ends` apply to.", "Tind", OpSchema::Optional)
+        .Output(0, "output", "Sliced data tensor.", "T")
+        .SetDoc(DynamicSlice_ver1_doc));
+
+// GivenTensorFill, version 1 (no doc string is attached to this schema).
+// Inference: the output element type is copied from input 0. The output shape comes from
+// the `shape` attribute when present; otherwise, unless `input_as_shape` is non-zero, it is
+// the shape of input 0 with the `extra_shape` dims appended.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    GivenTensorFill, 1,
+    OpSchema()
+        .Attr("values", "", AttributeProto::FLOATS, OPTIONAL_VALUE)
+        .Attr("shape", "", AttributeProto::INTS, OPTIONAL_VALUE)
+        .Attr("input_as_shape", "", AttributeProto::INT, OPTIONAL_VALUE)
+        .Attr("extra_shape", "", AttributeProto::INTS, OPTIONAL_VALUE)
+        .Input(0, "shape", "The shape of filled tensor", "T", OpSchema::Optional)
+        .Output(0, "X", "The filled tensor", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+          ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+          // An explicit `shape` attribute decides the output shape on its own.
+          if (ctx.getAttribute("shape") != nullptr) {
+            propagateShapeFromAttributeToOutput(ctx, "shape", 0);
+            return;
+          }
+
+          // The type constraints above do not allow for input_as_shape
+          // and may need to be fixed.
+          const bool shape_is_dynamic = getAttribute(ctx, "input_as_shape", 0) != 0;
+          if (shape_is_dynamic) {
+            return;
+          }
+
+          std::vector<int64_t> trailing_dims;
+          getRepeatedAttribute(ctx, "extra_shape", trailing_dims);
+          if (!hasInputShape(ctx, 0)) {
+            return;
+          }
+
+          // Work on a copy of the input shape so the extra dims can be appended to it.
+          ONNX_NAMESPACE::TensorShapeProto out_shape = ctx.getInputType(0)->tensor_type().shape();
+          for (auto dim_value : trailing_dims) {
+            if (dim_value < 0) {
+              fail_shape_inference("Negative values are not allowed in a shape specification");
+            }
+            out_shape.add_dim()->set_dim_value(dim_value);
+          }
+          updateOutputShape(ctx, 0, out_shape);
+        }));
+
+// Doc string attached to the Scale schemas registered in this file.
+constexpr const char* Scale_ver1_doc = R"DOC(
+Scale takes one input data (Tensor<float>) and produces one output data
+(Tensor<float>) whose value is the input data tensor scaled element-wise.
+)DOC";
+
+// Scale, version 1: one input, one output, float attribute `scale` (default 1.0).
+// Shape and type of the output are propagated from the first input.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    Scale, 1,
+    OpSchema()
+        .SetDoc(Scale_ver1_doc)
+        .Attr("scale", "The scale to apply.", AttributeProto::FLOAT, 1.0f)
+        .Input(0, "input", "Input data to be scaled", "T")
+        .Output(0, "output", "Output data after scaling", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// Doc string attached to the GRUUnit schemas registered in this file.
+constexpr const char* GRUUnit_ver1_doc = R"DOC(
+GRUUnit computes the activations of a standard GRU,
+in a sequence-length aware fashion.
+Concretely, given the (fused) inputs X (TxNxD), the previous hidden
+state (NxD), and the sequence lengths (N), computes the GRU
+activations, avoiding computation if the input is invalid (as in, the
+value at X[t][n] >= seqLengths[n]).
+)DOC";
+
+// GRUUnit, version 1: four inputs (hidden_prev, gates, seq_lengths, t), one output (hidden)
+// and an optional integer attribute `drop_states`. All inputs and the output share the type
+// constraint T. No type/shape inference function is registered.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    GRUUnit, 1,
+    OpSchema()
+        .SetDoc(GRUUnit_ver1_doc)
+        .Attr("drop_states",
+              "Bool to determine if hidden state is zeroes or passed "
+              "along for timesteps past the given sequence_length.",
+              AttributeProto::INT, OPTIONAL_VALUE)
+        .Input(0, "hidden_prev", "The previous GRU hidden state.", "T")
+        .Input(1, "gates",
+               "Unactivated gate outputs from forget, update, "
+               "and output gates, pre-activation.",
+               "T")
+        .Input(2, "seq_lengths",
+               "Array of sequence lengths.  "
+               "len(seq_lengths) should equal batch size N.",
+               "T")
+        .Input(3, "t", "The timestep for this operation.", "T")
+        .Output(0, "hidden", "The new GRU hidden state calculated by this op.", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors."));
+
+// GivenTensorFill, version 10: same declaration as version 1, but marked deprecated.
+// Inference: the output element type is copied from input 0. The output shape comes from
+// the `shape` attribute when present; otherwise, unless `input_as_shape` is non-zero, it is
+// the shape of input 0 with the `extra_shape` dims appended.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    GivenTensorFill, 10,
+    OpSchema()
+        .Deprecate()
+        .Attr("values", "", AttributeProto::FLOATS, OPTIONAL_VALUE)
+        .Attr("shape", "", AttributeProto::INTS, OPTIONAL_VALUE)
+        .Attr("input_as_shape", "", AttributeProto::INT, OPTIONAL_VALUE)
+        .Attr("extra_shape", "", AttributeProto::INTS, OPTIONAL_VALUE)
+        .Input(0, "shape", "The shape of filled tensor", "T", OpSchema::Optional)
+        .Output(0, "X", "The filled tensor", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+          ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+          // An explicit `shape` attribute decides the output shape on its own.
+          if (ctx.getAttribute("shape") != nullptr) {
+            propagateShapeFromAttributeToOutput(ctx, "shape", 0);
+            return;
+          }
+
+          // The type constraints above do not allow for input_as_shape
+          // and may need to be fixed.
+          const bool shape_is_dynamic = getAttribute(ctx, "input_as_shape", 0) != 0;
+          if (shape_is_dynamic) {
+            return;
+          }
+
+          std::vector<int64_t> trailing_dims;
+          getRepeatedAttribute(ctx, "extra_shape", trailing_dims);
+          if (!hasInputShape(ctx, 0)) {
+            return;
+          }
+
+          // Work on a copy of the input shape so the extra dims can be appended to it.
+          ONNX_NAMESPACE::TensorShapeProto out_shape = ctx.getInputType(0)->tensor_type().shape();
+          for (auto dim_value : trailing_dims) {
+            if (dim_value < 0) {
+              fail_shape_inference("Negative values are not allowed in a shape specification");
+            }
+            out_shape.add_dim()->set_dim_value(dim_value);
+          }
+          updateOutputShape(ctx, 0, out_shape);
+        }));
+
+// Scale, version 10: same declaration as version 1, but marked deprecated.
+// Shape and type of the output are propagated from the first input.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    Scale, 10,
+    OpSchema()
+        .Deprecate()
+        .SetDoc(Scale_ver1_doc)
+        .Attr("scale", "The scale to apply.", AttributeProto::FLOAT, 1.0f)
+        .Input(0, "input", "Input data to be scaled", "T")
+        .Output(0, "output", "Output data after scaling", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// GRUUnit, version 10: same declaration as version 1, but marked deprecated.
+// No type/shape inference function is registered.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    GRUUnit, 10,
+    OpSchema()
+        .Deprecate()
+        .Input(0, "hidden_prev", "The previous GRU hidden state.", "T")
+        .Input(1, "gates",
+               "Unactivated gate outputs from forget, update, and output gates, "
+               "pre-activation.",
+               "T")
+        .Input(2, "seq_lengths",
+               "Array of sequence lengths.  len(seq_lengths) should equal "
+               "batch size N.",
+               "T")
+        .Input(3, "t", "The timestep for this operation.", "T")
+        .Output(0, "hidden", "The new GRU hidden state calculated by this op.", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .Attr("drop_states",
+              "Bool to determine if hidden state is zeroes or passed along for "
+              "timesteps past the given sequence_length.",
+              AttributeProto::INT, OPTIONAL_VALUE)
+        .SetDoc(GRUUnit_ver1_doc));
+
+// Schema for MeanVarianceNormalization, version 1 (not flagged as deprecated).
+// Two integer attributes control the computation: "across_channels" (default 0) and
+// "normalize_variance" (default 1). Type and shape inference is delegated to
+// ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(MeanVarianceNormalization, 1,
+                                 OpSchema()
+                                     .SetDoc(R"DOC(Perform mean variance normalization.)DOC")
+                                     .Input(0, "input", "Input tensor of shape [N,C,H,W]", "T")
+                                     .Output(0, "output", "Result, has same shape and type as input", "T")
+                                     .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                                                     "Constrain input and output types to float tensors.")
+                                     .Attr("across_channels",
+                                           "If 1, mean and variance are computed across channels. Default is 0.",
+                                           AttributeProto::INT, static_cast<int64_t>(0))
+                                     .Attr("normalize_variance", "If 0, normalize the mean only.  Default is 1.",
+                                           AttributeProto::INT, static_cast<int64_t>(1))
+                                     .TypeAndShapeInferenceFunction(
+                                         ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// Schema for ScaledTanh, version 1 (not flagged as deprecated; no SetDoc text is attached).
+// Both "alpha" and "beta" are optional float attributes. Type and shape inference is delegated to
+// ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    ScaledTanh, 1,
+    OpSchema()
+        .Input(0, "input", "Input tensor", "T")
+        .Output(0, "output", "The scaled hyperbolic tangent values of the input tensor computed element-wise", "T")
+        .Attr("alpha", "Scaling value", AttributeProto::FLOAT, OPTIONAL_VALUE)
+        .Attr("beta", "Scaling value", AttributeProto::FLOAT, OPTIONAL_VALUE)
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// Schema for the deprecated Affine operator, version 10.
+// Float attributes: "alpha" (default 1.0) and "beta" (default 0.0). Type and shape inference is
+// delegated to ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(Affine, 10,
+                                 OpSchema()
+                                     .Deprecate()
+                                     .SetDoc(Affine_ver1_doc)
+                                     .Input(0, "X", "1D input tensor", "T")
+                                     .Output(0, "Y", "1D output tensor", "T")
+                                     .Attr("alpha", "Value of alpha", AttributeProto::FLOAT, 1.0f)
+                                     .Attr("beta", "Value of beta", AttributeProto::FLOAT, 0.0f)
+                                     .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                                                     "Constrain input and output types to float tensors.")
+                                     .TypeAndShapeInferenceFunction(
+                                         ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// Schema for the deprecated ParametricSoftplus operator, version 10.
+// "alpha" and "beta" are optional float attributes. Type and shape inference is delegated to
+// ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    ParametricSoftplus, 10,
+    OpSchema()
+        .Deprecate()
+        .SetDoc(ParametricSoftplus_ver1_doc)
+        .Attr("alpha", "Value of alpha", AttributeProto::FLOAT, OPTIONAL_VALUE)
+        .Attr("beta", "Value of beta", AttributeProto::FLOAT, OPTIONAL_VALUE)
+        .Input(0, "X", "1D input tensor", "T")
+        // Y is the output; its description previously read "1D input tensor" (copy-paste from X).
+        .Output(0, "Y", "1D output tensor", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// Schema for the deprecated ImageScaler operator, version 10.
+// Attributes: optional float list "bias" and float "scale" (default 1.0). Type and shape inference is
+// delegated to ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(ImageScaler, 10,
+                                 OpSchema()
+                                     .Deprecate()
+                                     .SetDoc(ImageScaler_ver1_doc)
+                                     .Input(0, "input", "Input tensor of shape [N,C,H,W]", "T")
+                                     .Output(0, "output", "Result, has same shape and type as input", "T")
+                                     .Attr("scale", "The scale to apply.", AttributeProto::FLOAT, 1.0f)
+                                     .Attr("bias", "Bias applied to each channel, same size as C.",
+                                           AttributeProto::FLOATS, OPTIONAL_VALUE)
+                                     .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                                                     "Constrain input and output types to float tensors.")
+                                     .TypeAndShapeInferenceFunction(
+                                         ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// Schema for the deprecated Crop operator, version 10.
+//
+// Shape inference, when the input shape is available:
+//   - the input must be 4-D (described as [N,C,H,W]);
+//   - 'border' must hold exactly 4 values (left, top, right, bottom); 'scale', when present, must hold
+//     exactly 2 values (height, width);
+//   - N and C are copied from the input; H and W are computed from 'border' (and 'scale' if given) when
+//     both input dims are concrete, otherwise two unknown dims are emitted.
+// When the input shape is not available, only the output rank (4) is inferred.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    Crop, 10,
+    OpSchema()
+        .Deprecate()
+        .SetDoc(Crop_ver1_doc)
+        .Attr("border", "A 1-D values of (leftBorder, topBorder, rightBorder, bottomBorder).", AttributeProto::INTS)
+        .Attr("scale", "A 1-D values of (height, width).", AttributeProto::INTS, OPTIONAL_VALUE)
+        .Input(0, "input", "Input tensor of shape [N,C,H,W]", "T")
+        .Output(0, "output", "Result, has same type as input, with H and W dimensions reduced.", "T")
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+          // Type inference
+          ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
+
+          // Shape inference
+          auto* output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
+
+          if (ONNX_NAMESPACE::hasNInputShapes(ctx, 1)) {
+            const auto& input_shape = ctx.getInputType(0)->tensor_type().shape();
+            const auto input_rank = input_shape.dim_size();
+            if (input_rank != 4) fail_shape_inference("Input's shape must be 4-D");
+
+            // parse necessary attributes for further processing
+            std::vector<int64_t> border;
+            bool border_present = getRepeatedAttribute(ctx, "border", border);
+            if (!border_present || border.size() != 4)
+              fail_shape_inference(
+                  "'Border' attribute must be present and must contain exactly 4 values - "
+                  "(left_border, top_border, right_border, bottom_border)");
+
+            std::vector<int64_t> scale;
+            bool scale_present = getRepeatedAttribute(ctx, "scale", scale);
+            if (scale_present && scale.size() != 2)
+              fail_shape_inference("'Scale' must contain exactly 2 values - (height, width)");
+
+            // actual shape inference processing
+            // [N, C] can be copied over from the input as is.
+            // They are appended with add_dim(): mutable_dim(i) is only valid for an index that already
+            // exists, and this function has not added any dims to the output shape yet. H and W below
+            // are appended the same way.
+            *output_shape->add_dim() = input_shape.dim(static_cast<int>(0));
+            *output_shape->add_dim() = input_shape.dim(static_cast<int>(1));
+
+            // process 'H' and 'W'
+            if (!utils::HasDimValue(input_shape.dim(static_cast<int>(2))) ||
+                !utils::HasDimValue(input_shape.dim(static_cast<int>(3)))) {
+              // either height and width input has symbolic dims, so can't proceed further
+              // add two dims as placeholders for output_H and output_W and return
+              output_shape->add_dim();
+              output_shape->add_dim();
+              return;
+            }
+
+            int64_t H = input_shape.dim(static_cast<int>(2)).dim_value();
+            int64_t W = input_shape.dim(static_cast<int>(3)).dim_value();
+
+            int64_t left_border = border[0], top_border = border[1], right_border = border[2],
+                    bottom_border = border[3];
+
+            if (H < top_border + bottom_border)
+              fail_shape_inference("Input's height (", H,
+                                   ") needs to be greater than or equal to "
+                                   "the top_border (",
+                                   top_border, ") + bottom_border (", bottom_border, ")");
+
+            if (W < left_border + right_border)
+              fail_shape_inference("Input's width (", W,
+                                   ") needs to be greater than or equal to "
+                                   "the left_border (",
+                                   left_border, ") + right_border (", right_border, ")");
+
+            // Default crop window: everything between the borders.
+            int64_t bottom_limit = H - bottom_border;
+            int64_t right_limit = W - right_border;
+
+            // scale = (height, width)
+            // When given, it replaces the bottom/right limits with top/left border + requested size.
+            if (!scale.empty()) {
+              bottom_limit = top_border + scale[0];
+              right_limit = left_border + scale[1];
+
+              if (H < bottom_limit)
+                fail_shape_inference("Input's height (", H, ") needs to be greater than or equal to the top_border (",
+                                     top_border, ") + scale[0] (", scale[0], ")");
+
+              if (W < right_limit)
+                fail_shape_inference("Input's width (", W, ") needs to be greater than or equal to the left_border (",
+                                     left_border, ") + scale[1] (", scale[1], ")");
+            }
+
+            auto* h_output_dim = output_shape->add_dim();
+            h_output_dim->set_dim_value(bottom_limit - top_border);
+
+            auto* w_output_dim = output_shape->add_dim();
+            w_output_dim->set_dim_value(right_limit - left_border);
+          } else {
+            // Rank Inference at the very least
+            // (We know that the output is going to be 4-D)
+            for (int i = 0; i < 4; ++i) {
+              output_shape->add_dim();
+            }
+          }
+        }));
+
+// Schema for the deprecated DynamicSlice operator, version 10.
+// "data" and "output" share constraint T (all tensor types); "starts", "ends" and the optional "axes"
+// share constraint Tind (int32/int64). No type/shape inference function is attached.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    DynamicSlice, 10,
+    OpSchema()
+        .Deprecate()
+        .SetDoc(DynamicSlice_ver1_doc)
+        .TypeConstraint("T", OpSchema::all_tensor_types(), "Constrain input and output types to all tensor types.")
+        .TypeConstraint("Tind", {"tensor(int32)", "tensor(int64)"}, "Constrain indices to integer types")
+        .Input(0, "data", "Tensor of data to extract slices from.", "T")
+        .Input(1, "starts", "1-D tensor of starting indices of corresponding axis in `axes`", "Tind")
+        .Input(2, "ends", "1-D tensor of ending indices (exclusive) of corresponding axis in axes", "Tind")
+        .Input(3, "axes", "1-D tensor of axes that `starts` and `ends` apply to.", "Tind", OpSchema::Optional)
+        .Output(0, "output", "Sliced data tensor.", "T"));
+
+// Schema for ScaledTanh, version 10, flagged as deprecated (no SetDoc text is attached).
+// Both "alpha" and "beta" are optional float attributes. Type and shape inference is delegated to
+// ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput.
+ONNX_CONTRIB_OPERATOR_SET_SCHEMA(
+    ScaledTanh, 10,
+    OpSchema()
+        .Deprecate()
+        .Input(0, "input", "Input tensor", "T")
+        .Output(0, "output", "The scaled hyperbolic tangent values of the input tensor computed element-wise", "T")
+        .Attr("alpha", "Scaling value", AttributeProto::FLOAT, OPTIONAL_VALUE)
+        .Attr("beta", "Scaling value", AttributeProto::FLOAT, OPTIONAL_VALUE)
+        .TypeConstraint("T", {"tensor(float16)", "tensor(float)", "tensor(double)"},
+                        "Constrain input and output types to float tensors.")
+        .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));
+
+// End of the ONNX experimental ops (Affine, Crop, ParametricSoftplus, ImageScaler, ThresholdedRelu, DynamicSlice,
+// ScaledTanh, MVN) whose old versions are kept here for version-history maintenance.
+}  // namespace contrib
+}  // namespace onnxruntime
--- a/onnxruntime/core/graph/contrib_ops/onnx_deprecated_opset.h
+++ b/onnxruntime/core/graph/contrib_ops/onnx_deprecated_opset.h
@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "onnx/defs/schema.h"
+#include "core/graph/contrib_ops/ms_schema.h"
+
+// This file contains deprecated ONNX operators that have been removed from ONNX spec, but we still need to keep them
+// to maintain backward compatibility. Strictly speaking, this file doesn't define an opset. It only contains a group
+// of operators.
+
+namespace onnxruntime {
+namespace contrib {
+// Forward declarations of the schema classes, one per (version, operator name) pair.
+// Each declaration here has a matching GetOpSchema<> call in OpSet_ONNX_Deprecated::ForEachSchema below;
+// the two lists are kept in the same order.
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, Affine);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, ParametricSoftplus);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, ImageScaler);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, Crop);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, ThresholdedRelu);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, DynamicSlice);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, GivenTensorFill);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, Scale);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, GRUUnit);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, GivenTensorFill);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, Scale);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, GRUUnit);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, MeanVarianceNormalization);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, ScaledTanh);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, Affine);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, ParametricSoftplus);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, ImageScaler);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, Crop);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, DynamicSlice);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, ScaledTanh);
+
+// Groups the deprecated ONNX operator schemas declared above so they can be enumerated in one call.
+// environment.cc passes this type to RegisterOpSetSchema<>.
+class OpSet_ONNX_Deprecated {
+ public:
+  // Calls `fn` once for every schema in this group, in the order listed below, handing it the
+  // OpSchema returned by the corresponding GetOpSchema<> instantiation.
+  // Adding an operator to this group requires both a forward declaration above and a call here.
+  static void ForEachSchema(std::function<void(ONNX_NAMESPACE::OpSchema&&)> fn) {
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, Affine)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, ParametricSoftplus)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, ImageScaler)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, Crop)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, ThresholdedRelu)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, DynamicSlice)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, GivenTensorFill)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, Scale)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, GRUUnit)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, GivenTensorFill)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, Scale)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, GRUUnit)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, MeanVarianceNormalization)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 1, ScaledTanh)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, Affine)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, ParametricSoftplus)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, ImageScaler)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, Crop)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, DynamicSlice)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Onnx, 10, ScaledTanh)>());
+  }
+};
+}  // namespace contrib
+}  // namespace onnxruntime
--- a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
@ -5,6 +5,8 @@
 #include "core/graph/constants.h"
 #include "core/graph/contrib_ops/contrib_defs.h"

+
+
 namespace ONNX_NAMESPACE {
 void RNNShapeInference(InferenceContext& ctx);

@ -22,11 +24,13 @@ void matmulShapeInference(

 namespace onnxruntime {
 namespace contrib {
-
 using ONNX_NAMESPACE::AttributeProto;
 using ONNX_NAMESPACE::InferenceContext;
 using ONNX_NAMESPACE::OpSchema;
 using ONNX_NAMESPACE::OPTIONAL_VALUE;
+#ifndef NDEBUG
+using ONNX_NAMESPACE::DbgOperatorSetTracker;
+#endif

 void ValidateTypeAndShapeForScaleAndZP(ONNX_NAMESPACE::InferenceContext& ctx, int index, ::google::protobuf::int32 expectedType, bool isScalar, int expectedTensorSize) {
  if (ctx.getNumInputs() > static_cast<size_t>(index)) {
@ -136,16 +140,13 @@ Performs element-wise binary {name} on 8 bit data types (with Numpy-style broadc
  };
 }

-void RegisterQuantizationSchemas() {
  static const char* QuantizeLinear_ver1_doc = R"DOC(
 The linear quantization operator. It consumes a full precision data, a scale, a zero point to compute the low precision / quantized tensor.
 The quantization formula is y = saturate ((x / y_scale) + y_zero_point).For saturation, it saturates to [0, 255] if it's uint8, or [-128, 127] if it's int8.
 For (x / y_scale), it's rounding to nearest ties to even. Refer to https://en.wikipedia.org/wiki/Rounding for details.
 Scale and zero point must have same shape. They must be either scalar (per tensor) or 1-D tensor (per 'axis').)DOC";

-  ONNX_CONTRIB_OPERATOR_SCHEMA(QuantizeLinear)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(QuantizeLinear, 1, OpSchema()
      .Attr(
          "axis",
          "The axis along which same quantization parameters are applied. It's optional."
@ -193,16 +194,14 @@ Scale and zero point must have same shape. They must be either scalar (per tenso

        auto& input_shape = getInputShape(ctx, 0);
        updateOutputShape(ctx, 0, input_shape);
-      });
+      }));

  static const char* DequantizeLinear_ver1_doc = R"DOC(
 The linear dequantization operator. It consumes a quantized data, a scale, a zero point and computes the full precision data.
 The dequantization formula is y = (x - x_zero_point) * x_scale.
 Scale and zero point must have same shape. They must be either scalar (per tensor) or 1-D tensor (per 'axis').)DOC";

-  ONNX_CONTRIB_OPERATOR_SCHEMA(DequantizeLinear)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(DequantizeLinear, 1, OpSchema()
      .Attr("axis",
            "The axis along which same quantization parameters are applied. It's optional."
            "If it's not specified, it means per-tensor quantization and input 'x_scale' and 'x_zero_point' must be scalars."
@ -250,11 +249,9 @@ Scale and zero point must have same shape. They must be either scalar (per tenso

        auto& input_shape = getInputShape(ctx, 0);
        updateOutputShape(ctx, 0, input_shape);
-      });
+      }));

-  ONNX_CONTRIB_OPERATOR_SCHEMA(ReduceSumInteger)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(ReduceSumInteger, 1, OpSchema()
      .SetDoc(R"DOC(
 Computes the sum of the low-precision input tensor's element along the provided axes.
 The resulting tensor has the same rank as the input if keepdims equal 1. If keepdims equal 0,
@ -274,12 +271,9 @@ with the exception that numpy default keepdims to False instead of True.)DOC")
          AttributeProto::INTS)
      .Attr(
          "keepdims",
-          "Keep the reduced dimension or not, default 1 mean keep reduced dimension.",
-          AttributeProto::INT);
+          "Keep the reduced dimension or not, default 1 mean keep reduced dimension.", AttributeProto::INT));

-  ONNX_CONTRIB_OPERATOR_SCHEMA(MulInteger)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(MulInteger, 1, OpSchema()
      .SetDoc(R"DOC(Performs element-wise binary quantized multiplication (with Numpy-style broadcasting support).
 "This operator supports **multidirectional (i.e., Numpy-style) broadcasting**"
 The output of this op is the int32 accumulated result of the mul operation
@ -328,11 +322,9 @@ C (int32) = (A - A_zero_point) * (B - B_zero_point)
              ctx.getInputType(2)->tensor_type().shape(),
              *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape());
        }
-      });
+      }));

-  ONNX_CONTRIB_OPERATOR_SCHEMA(DynamicQuantizeMatMul)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(DynamicQuantizeMatMul, 1, OpSchema()
      .Input(0, "A", "N-dimensional matrix A", "T1")
      .Input(1, "B", "N-dimensional matrix B", "T2")
      .Input(
@ -367,11 +359,9 @@ C (int32) = (A - A_zero_point) * (B - B_zero_point)
      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
        propagateElemTypeFromInputToOutput(ctx, 0, 0);
        ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1);
-      });
+      }));

-  ONNX_CONTRIB_OPERATOR_SCHEMA(MatMulIntegerToFloat)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(MatMulIntegerToFloat, 1, OpSchema()
      .Input(0, "A", "N-dimensional matrix A", "T1")
      .Input(1, "B", "N-dimensional matrix B", "T2")
      .Input(
@ -426,23 +416,17 @@ C (int32) = (A - A_zero_point) * (B - B_zero_point)
      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
        propagateElemTypeFromInputToOutput(ctx, 2, 0);
        ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1);
-      });
+      }));

-  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearAdd)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(QLinearAdd, 1, OpSchema()
      .FillUsing(QLinearMathDocGenerator("addition",
-                                         "C = (A_scale * (A - A_zero_point) + B_scale * (B - B_zero_point))/C_scale + C_zero_point"));
+                                         "C = (A_scale * (A - A_zero_point) + B_scale * (B - B_zero_point))/C_scale + C_zero_point")));

-  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearMul)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(QLinearMul, 1, OpSchema()
      .FillUsing(QLinearMathDocGenerator("multiplication",
-                                         "C = ((A - A_zero_point) * (B - B_zero_point)) * (A_scale * B_scale)/C_scale + C_zero_point"));
+                                         "C = ((A - A_zero_point) * (B - B_zero_point)) * (A_scale * B_scale)/C_scale + C_zero_point")));

-  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearReduceMean)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(QLinearReduceMean, 1, OpSchema()
      .SetDoc(R"DOC(
 Computes the mean of the low-precision input tensor's element along the provided axes.
 The resulting tensor has the same rank as the input if keepdims equal 1. If keepdims equal 0,
@ -544,7 +528,7 @@ This helps to improve accuracy as after ReduceMean operation the range of the ou
            }
          }
        }
-      });
+      }));

  const char* QLinearLeakyReluDoc_ver1 = R"DOC(
 QLinearLeakyRelu takes quantized input data (Tensor), an argument alpha, and quantize parameter for output,
@ -552,9 +536,7 @@ and produces one output data (Tensor<T>) where the function `f(x) = quantize(alp
 `f(x) = quantize(dequantize(x)) for dequantize(x) >= 0`, is applied to the data tensor elementwise.
 )DOC";

-  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearLeakyRelu)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(QLinearLeakyRelu, 1, OpSchema()
      .SetDoc(QLinearLeakyReluDoc_ver1)
      .Attr("alpha", "Coefficient of leakage.", AttributeProto::FLOAT, 0.01f)
      .Input(0, "X", "Input tensor", "T")
@ -575,16 +557,14 @@ and produces one output data (Tensor<T>) where the function `f(x) = quantize(alp
          "T",
          {"tensor(uint8)", "tensor(int8)"},
          "Constrain input and output types to 8 bit tensors.")
-      .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput);
+      .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));

  const char* QLinearSigmoidDoc_ver1 = R"DOC(
 QLinearSigmoid takes quantized input data (Tensor), and quantize parameter for output, and produces one output data 
 (Tensor<T>) where the function `f(x) = quantize(Sigmoid(dequantize(x)))`, is applied to the data tensor elementwise.
 Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` )DOC";

-  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearSigmoid)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(QLinearSigmoid, 1, OpSchema()
      .SetDoc(QLinearSigmoidDoc_ver1)
      .Input(0, "X", "Input tensor", "T")
      .Input(1, "X_scale",
@ -604,11 +584,9 @@ Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` )DOC";
          "T",
          {"tensor(uint8)", "tensor(int8)"},
          "Constrain input and output types to 8 bit tensors.")
-      .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput);
+      .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput));

-  ONNX_CONTRIB_OPERATOR_SCHEMA(DynamicQuantizeLSTM)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(DynamicQuantizeLSTM, 1, OpSchema()
      .Attr(
          "direction",
          "Specify if the RNN is forward, reverse, or bidirectional. "
@ -781,11 +759,9 @@ Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` )DOC";
          "T2",
          {"tensor(uint8)", "tensor(int8)"},
          "Constrain weights types to 8 bit tensors.")
-      .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::RNNShapeInference);
+      .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::RNNShapeInference));

-  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearConcat)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(QLinearConcat, 1, OpSchema()
      .Attr("axis", "Which axis to concat on", AttributeProto::INT)
      .SetDoc(
          "Concatenate a list of tensors into a single tensor."
@ -861,11 +837,9 @@ Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` )DOC";
        if (all_lengths_known) {
          output_shape->mutable_dim(axis)->set_dim_value(total_length);
        }
-      });
+      }));

-  ONNX_CONTRIB_OPERATOR_SCHEMA(QGemm)
-      .SetDomain(kMSDomain)
-      .SinceVersion(1)
+  ONNX_MS_OPERATOR_SET_SCHEMA(QGemm, 1, OpSchema()
      .SetDoc("Quantized Gemm")
      .Input(0,
             "A",
@ -985,8 +959,6 @@ Wwhere the function `Sigmoid(x) = 1 / (1 + exp(-x))` )DOC";
              {first_input_shape.dim(transA ? 1 : 0),
               second_input_shape.dim(transB ? 0 : 1)});
        }
-      });
-}
-
+      }));
 }  // namespace contrib
 }  // namespace onnxruntime
--- a/onnxruntime/core/graph/contrib_ops/quantization_defs.h
+++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.h
@ -9,7 +9,6 @@
 #endif
 #include "onnx/onnx_pb.h"
 #include "onnx/onnx-operators_pb.h"
-#include "core/framework/tensorprotoutils.h"

 namespace onnxruntime {
 namespace contrib {
--- a/onnxruntime/core/session/environment.cc
+++ b/onnxruntime/core/session/environment.cc
@ -10,6 +10,8 @@
 #if !defined(ORT_MINIMAL_BUILD)
 #include "onnx/defs/operator_sets.h"
 #include "onnx/defs/operator_sets_ml.h"
+#include "core/graph/contrib_ops/ms_opset.h"
+#include "core/graph/contrib_ops/onnx_deprecated_opset.h"
 #if defined(ENABLE_TRAINING) || defined(ENABLE_TRAINING_OPS)
 #include "onnx/defs/operator_sets_training.h"
 #endif
@ -24,6 +26,7 @@
 #include "core/platform/env.h"
 #include "core/util/thread_utils.h"

+
 #ifdef ONNXRUNTIME_ENABLE_INSTRUMENT
 #include "core/platform/tracing.h"
 #endif
@ -225,6 +228,10 @@ Status Environment::Initialize(std::unique_ptr<logging::LoggingManager> logging_
 // Register contributed schemas.
 // The corresponding kernels are registered inside the appropriate execution provider.
 #ifndef DISABLE_CONTRIB_OPS
+#ifndef ORT_MINIMAL_BUILD
+      RegisterOpSetSchema<contrib::OpSet_Microsoft_ver1>();
+      RegisterOpSetSchema<contrib::OpSet_ONNX_Deprecated>();
+#endif
      contrib::RegisterContribSchemas();
 #endif
 #ifdef USE_DML