diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
new file mode 100644
index 0000000000..2cad94aae0
--- /dev/null
+++ b/docs/OperatorKernels.md
@@ -0,0 +1,470 @@
+## Supported Operator Data Types
+*This file is automatically generated from the
+            [kernel definition files](/onnxruntime/core/providers/cpu/cpu_execution_provider.cc) via [this script](/tools/python/gen_opkernel_doc.py).
+            Do not modify it directly; edit the operator definitions instead.*
+
+
+
+## Operators implemented by CPUExecutionProvider
+
+| Op Name | Parameters | OpSet Version | Types Supported |
+|---------|------------|---------------|-----------------|
+**Operator Domain:** *ai.onnx.ml*
+|Abs|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(int32), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(int64), tensor(double)|
+|Acos|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float)|
+|Acosh|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
+|Add|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Affine|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|And|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
+| | ||**T1** = tensor(bool)|
+|ArgMax|(*in* data:**T**, *out* reduced:**tensor(int64)**)|1+|**T** = tensor(int32), tensor(float)|
+|ArgMin|(*in* data:**T**, *out* reduced:**tensor(int64)**)|1+|**T** = tensor(int32), tensor(float)|
+|ArrayFeatureExtractor|(*in* X:**T**, *in* Y:**tensor(int64)**, *out* Z:**T**)|1+|**T** = tensor(string), tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Asin|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float)|
+|Asinh|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
+|Atan|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float)|
+|Atanh|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
+|AveragePool|(*in* X:**T**, *out* Y:**T**)|10+|**T** = tensor(float)|
+| | |[7, 9]|**T** = tensor(float)|
+|BatchNormalization|(*in* X:**T**, *in* scale:**T**, *in* B:**T**, *in* mean:**T**, *in* var:**T**, *out* Y:**T**, *out* mean:**T**, *out* var:**T**, *out* saved_mean:**T**, *out* saved_var:**T**)|[7, 9]|**B** = tensor(float)|
+| | ||**X** = tensor(float)|
+| | ||**mean** = tensor(float)|
+| | ||**scale** = tensor(float)|
+| | ||**var** = tensor(float)|
+|Binarizer|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|Cast|(*in* input:**T1**, *out* output:**T2**)|9+|**T1** = tensor(string)|
+| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | |[6, 9]|**T1** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|CastMap|(*in* X:**T1**, *out* Y:**T2**)|1+|**T1** = unknown|
+| | ||**T2** = tensor(string), tensor(float), tensor(int64)|
+|CategoryMapper|(*in* X:**T1**, *out* Y:**T2**)|1+|**T1** = tensor(string), tensor(int64)|
+| | ||**T2** = tensor(string), tensor(int64)|
+|Ceil|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
+|Clip|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float)|
+|Compress|(*in* input:**T**, *in* condition:**T1**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**T1** = tensor(bool)|
+|Concat|(*in* inputs:**T**, *out* concat_result:**T**)|4+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|ConstantOfShape|(*in* input:**T1**, *out* output:**T2**)|9+|**T1** = tensor(int64)|
+| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Conv|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|ConvInteger|(*in* x:**T1**, *in* w:**T2**, *in* x_zero_point:**T1**, *in* w_zero_point:**T2**, *out* y:**T3**)|10+|**T1** = tensor(uint8)|
+| | ||**T2** = tensor(uint8)|
+| | ||**T3** = tensor(int32)|
+|ConvTranspose|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|Cos|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float)|
+|Cosh|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
+|Crop|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|DepthToSpace|(*in* input:**T**, *out* output:**T**)|[1, 4]|**T** = tensor(float)|
+|DequantizeLinear|(*in* x:**T**, *in* x_scale:**tensor(float)**, *in* x_zero_point:**T**, *out* y:**tensor(float)**)|10+|**x** = tensor(uint8), unknown|
+| | ||**x_scale** = tensor(float)|
+| | ||**x_zero_point** = tensor(uint8), unknown|
+| | ||**y** = tensor(float)|
+|DictVectorizer|(*in* X:**T1**, *out* Y:**T2**)|1+|**T1** = unknown|
+| | ||**T2** = tensor(string), tensor(float), tensor(int64), tensor(double)|
+|Div|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Dropout|(*in* data:**T**, *out* output:**T**, *out* mask:**T**) or (*in* data:**T**, *out* output:**T**, *out* mask:**T1**)|10+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**T1** = tensor(bool)|
+| | |[7, 9]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**T1** = tensor(bool)|
+|DynamicSlice|(*in* data:**T**, *in* starts:**Tind**, *in* ends:**Tind**, *in* axes:**Tind**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**Tind** = tensor(int32), tensor(int64)|
+|Elu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
+|Equal|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|11+|**T** = tensor(float)|
+| | ||**T1** = tensor(bool)|
+| | |7+|**T** = tensor(int32), tensor(bool), tensor(int64)|
+| | ||**T1** = tensor(bool)|
+|Erf|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
+|Exp|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float), tensor(double)|
+|Expand|(*in* input:**T**, *in* shape:**tensor(int64)**, *out* output:**T**)|8+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|EyeLike|(*in* input:**T1**, *out* output:**T2**)|9+|**T1** = tensor(uint64), tensor(int32), tensor(float), tensor(int64), tensor(double)|
+| | ||**T2** = tensor(uint64), tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|FeatureVectorizer|(*in* X:**T1**, *out* Y:**tensor(float)**)|1+|**T1** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Flatten|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | |[1, 8]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Floor|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
+|GRU|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *out* Y:**T**, *out* Y_h:**T**)|7+|**T** = tensor(float), tensor(double)|
+| | ||**T1** = tensor(int32)|
+|Gather|(*in* data:**T**, *in* indices:**Tind**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**Tind** = tensor(int32), tensor(int64)|
+|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|[7, 9]|**T** = tensor(float)|
+|GlobalAveragePool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|GlobalLpPool|(*in* X:**T**, *out* Y:**T**)|2+|**T** = tensor(float)|
+|GlobalMaxPool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|Greater|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|9+|**T** = tensor(int32), tensor(int64)|
+| | ||**T1** = tensor(bool)|
+| | |[7, 9]|**T** = tensor(float)|
+| | ||**T1** = tensor(bool)|
+|HardSigmoid|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
+|Hardmax|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|Identity|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|If|(*in* cond:**B**, *out* outputs:**V**)|1+|**B** = tensor(bool)|
+| | ||**V** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|ImageScaler|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|Imputer|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(int64)|
+|InstanceNormalization|(*in* input:**T**, *in* scale:**T**, *in* B:**T**, *out* output:**T**)|6+|**T** = tensor(float)|
+|IsInf|(*in* X:**T1**, *out* Y:**T2**)|10+|**T1** = tensor(float), tensor(double)|
+| | ||**T2** = tensor(bool)|
+|IsNaN|(*in* X:**T1**, *out* Y:**T2**)|9+|**T1** = tensor(float), tensor(MLFloat16)|
+| | ||**T2** = tensor(bool)|
+|LRN|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|LSTM|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *in* initial_c:**T**, *in* P:**T**, *out* Y:**T**, *out* Y_h:**T**, *out* Y_c:**T**)|7+|**T** = tensor(float), tensor(double)|
+| | ||**T1** = tensor(int32)|
+|LabelEncoder|(*in* X:**T1**, *out* Y:**T2**)|2+|**T1** = tensor(string), tensor(float), tensor(int64)|
+| | ||**T2** = tensor(string), tensor(float), tensor(int64)|
+| | |[1, 1]|**T1** = tensor(string), tensor(int64)|
+| | ||**T2** = tensor(string), tensor(int64)|
+|LeakyRelu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
+|Less|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|9+|**T** = tensor(int32), tensor(int64)|
+| | ||**T1** = tensor(bool)|
+| | |[7, 9]|**T** = tensor(float)|
+| | ||**T1** = tensor(bool)|
+|LinearClassifier|(*in* X:**T1**, *out* Y:**T2**, *out* Z:**tensor(float)**)|1+|**T1** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+| | ||**T2** = tensor(string), tensor(int64)|
+|LinearRegressor|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(float)|
+|Log|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float)|
+|LogSoftmax|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|Loop|(*in* M:**I**, *in* cond:**B**, *in* v_initial:**V**, *out* v_final_and_scan_outputs:**V**)|1+|**B** = tensor(bool)|
+| | ||**I** = tensor(int64)|
+| | ||**V** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|LpNormalization|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|LpPool|(*in* X:**T**, *out* Y:**T**)|2+|**T** = tensor(float)|
+|MatMul|(*in* A:**T**, *in* B:**T**, *out* Y:**T**)|[1, 9]|**T** = tensor(float), tensor(double)|
+| | |[9, 9]|**T** = tensor(uint64), tensor(int32), tensor(int64), tensor(uint32)|
+|MatMulInteger|(*in* A:**T1**, *in* B:**T2**, *in* a_zero_point:**T1**, *in* b_zero_point:**T2**, *out* Y:**T3**)|10+|**T1** = tensor(uint8)|
+| | ||**T2** = tensor(uint8)|
+| | ||**T3** = tensor(int32)|
+|Max|(*in* data_0:**T**, *out* max:**T**)|8+|**T** = tensor(float), tensor(double)|
+| | |[6, 7]|**T** = tensor(float)|
+|MaxPool|(*in* X:**T**, *out* Y:**T**) or (*in* X:**T**, *out* Y:**T**, *out* Indices:**I**)|10+|**I** = tensor(int64)|
+| | ||**T** = tensor(float)|
+| | |[1, 7]|**T** = tensor(float)|
+| | |[8, 9]|**I** = tensor(int64)|
+| | ||**T** = tensor(float)|
+|MaxRoiPool|(*in* X:**T**, *in* rois:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|MaxUnpool|(*in* X:**T1**, *in* I:**T2**, *in* output_shape:**T2**, *out* output:**T1**)|9+|**T1** = tensor(float)|
+| | ||**T2** = tensor(int64)|
+|Mean|(*in* data_0:**T**, *out* mean:**T**)|8+|**T** = tensor(float)|
+| | |[6, 7]|**T** = tensor(float)|
+|MeanVarianceNormalization|(*in* X:**T**, *out* Y:**T**) or (*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
+| | |[1, 8]|**T** = tensor(float)|
+|Min|(*in* data_0:**T**, *out* min:**T**)|8+|**T** = tensor(float)|
+| | |[6, 7]|**T** = tensor(float)|
+|Mod|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|10+|**T** = tensor(int32), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Mul|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Multinomial|(*in* input:**T1**, *out* output:**T2**)|7+|**T1** = tensor(float)|
+| | ||**T2** = tensor(int32), tensor(int64)|
+|Neg|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(int32), tensor(float), unknown|
+|NonZero|(*in* X:**T**, *out* Y:**tensor(int64)**)|9+|**T** = tensor(int32), tensor(float), tensor(bool), tensor(int64)|
+|Normalizer|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Not|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(bool)|
+| | ||**T1** = tensor(bool)|
+|OneHot|(*in* indices:**T1**, *in* depth:**T2**, *in* values:**T3**, *out* output:**T3**)|9+|**T1** = tensor(int32), tensor(float), tensor(int64)|
+| | ||**T2** = tensor(int32), tensor(float), tensor(int64)|
+| | ||**T3** = tensor(string), tensor(int32), tensor(float), tensor(int64)|
+|OneHotEncoder|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(string), tensor(float), tensor(int64), tensor(double)|
+|Or|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
+| | ||**T1** = tensor(bool)|
+|PRelu|(*in* X:**T**, *in* slope:**T**, *out* Y:**T**)|[7, 9]|**T** = tensor(float)|
+|Pad|(*in* data:**T**, *out* output:**T**)|2+|**T** = tensor(float)|
+|ParametricSoftplus|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|Pow|(*in* X:**T**, *in* Y:**T**, *out* Z:**T**)|7+|**T** = tensor(float), tensor(double)|
+|QLinearConv|(*in* x:**T1**, *in* x_scale:**tensor(float)**, *in* x_zero_point:**T1**, *in* w:**T2**, *in* w_scale:**tensor(float)**, *in* w_zero_point:**T2**, *in* y_scale:**tensor(float)**, *in* y_zero_point:**T3**, *in* B:**T4**, *out* y:**T3**)|10+|**T1** = tensor(uint8)|
+| | ||**T2** = tensor(uint8)|
+| | ||**T3** = tensor(uint8)|
+| | ||**T4** = tensor(int32)|
+|QLinearMatMul|(*in* a:**T1**, *in* a_scale:**tensor(float)**, *in* a_zero_point:**T1**, *in* b:**T2**, *in* b_scale:**tensor(float)**, *in* b_zero_point:**T2**, *in* y_scale:**tensor(float)**, *in* y_zero_point:**T3**, *out* y:**T3**)|10+|**T1** = tensor(uint8)|
+| | ||**T2** = tensor(uint8)|
+| | ||**T3** = tensor(uint8)|
+|QuantizeLinear|(*in* x:**T1**, *in* y_scale:**tensor(float)**, *in* y_zero_point:**T2**, *out* y:**T2**)|10+|**x** = tensor(float)|
+| | ||**y** = tensor(uint8), unknown|
+| | ||**y_zero_point** = tensor(uint8), unknown|
+|RNN|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *out* Y:**T**, *out* Y_h:**T**)|7+|**T** = tensor(float)|
+| | ||**T1** = tensor(int32)|
+|RandomNormal|(*out* output:**T**)|1+|**T** = tensor(float), tensor(double)|
+|RandomNormalLike|(*in* input:**T1**, *out* output:**T2**)|1+|**T1** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**T2** = tensor(float), tensor(double)|
+|RandomUniform|(*out* output:**T**)|1+|**T** = tensor(float), tensor(double)|
+|RandomUniformLike|(*in* input:**T1**, *out* output:**T2**)|1+|**T1** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**T2** = tensor(float), tensor(double)|
+|Reciprocal|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
+|ReduceL1|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
+|ReduceL2|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
+|ReduceLogSum|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
+|ReduceLogSumExp|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
+|ReduceMax|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
+|ReduceMean|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
+|ReduceMin|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
+|ReduceProd|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float)|
+|ReduceSum|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float), tensor(double)|
+|ReduceSumSquare|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(int32), tensor(float), tensor(double)|
+|Relu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
+|Reshape|(*in* data:**T**, *in* shape:**tensor(int64)**, *out* reshaped:**T**) or (*in* data:**T**, *out* reshaped:**T**)|5+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**shape** = tensor(int64)|
+|Reshape_1||[1, 4]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Resize|(*in* X:**T**, *in* scales:**tensor(float)**, *out* Y:**T**)|10+|**T** = tensor(int32), tensor(float), tensor(uint8)|
+|ReverseSequence|(*in* input:**T**, *in* sequence_lens:**tensor(int64)**, *out* Y:**T**)|10+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|RoiAlign|(*in* X:**T1**, *in* rois:**T1**, *in* batch_indices:**T2**, *out* Y:**T1**)|10+|**T** = tensor(float), tensor(double)|
+| | ||**T2** = tensor(int64)|
+|SVMClassifier|(*in* X:**T1**, *out* Y:**T2**, *out* Z:**tensor(float)**)|1+|**T1** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+| | ||**T2** = tensor(string), tensor(int64)|
+|SVMRegressor|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(float)|
+|Scale|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|ScaledTanh|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|Scaler|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Scan|(*in* sequence_lens:**I**, *in* initial_state_and_scan_inputs:**V**, *out* final_state_and_scan_outputs:**V**) or (*in* initial_state_and_scan_inputs:**V**, *out* final_state_and_scan_outputs:**V**)|9+|**I** = tensor(int64)|
+| | ||**V** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | |[8, 8]|**I** = tensor(int64)|
+| | ||**V** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Scatter|(*in* data:**T**, *in* indices:**Tind**, *in* updates:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**Tind** = tensor(int32), tensor(int64)|
+|Selu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
+|Shape|(*in* data:**T**, *out* shape:**T1**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**T1** = tensor(int64)|
+|Shrink|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Sigmoid|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
+|Sign|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Sin|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float), tensor(double)|
+|Sinh|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float)|
+|Size|(*in* data:**T**, *out* size:**T1**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(int64), tensor(double)|
+| | ||**T1** = tensor(int64)|
+|Slice|(*in* data:**T**, *out* output:**T**) or (*in* data:**T**, *in* starts:**Tind**, *in* ends:**Tind**, *in* axes:**Tind**, *in* steps:**Tind**, *out* output:**T**)|10+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**Tind** = tensor(int32), tensor(int64)|
+| | |[1, 9]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Softmax|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|Softplus|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|Softsign|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|SpaceToDepth|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|Split|(*in* input:**T**, *out* outputs:**T**) or (*in* input:**T**, *in* split:**T**, *out* outputs...:**T**)|2+|**T** = tensor(string), tensor(int32), tensor(float)|
+|Sqrt|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(double)|
+|Squeeze|(*in* data:**T**, *out* squeezed:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|StringNormalizer|(*in* X:**tensor(string)**, *out* Y:**tensor(string)**)|10+|**T** = tensor(string)|
+|Sub|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+|Sum|(*in* data_0:**T**, *out* sum:**T**)|8+|**T** = tensor(float)|
+| | |[6, 7]|**T** = tensor(float)|
+|Tan|(*in* input:**T**, *out* output:**T**)|7+|**T** = tensor(float)|
+|Tanh|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float)|
+|TfIdfVectorizer|(*in* X:**T**, *out* Y:**T1**)|9+|**T** = tensor(string), tensor(int32), tensor(int64)|
+| | ||**T1** = tensor(float)|
+|ThresholdedRelu|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+| | |10+|**T** = tensor(float)|
+|Tile|(*in* input:**T**, *in* tiles:**T**, *in* axis:**T**, *out* output:**T**) or (*in* input:**T**, *in* repeats:**T1**, *out* output:**T**)|6+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(int64), tensor(double)|
+| | ||**T1** = tensor(int64)|
+|TopK|(*in* X:**T**, *in* K:**tensor(int64)**, *out* Values:**T**, *out* Indices:**I**) or (*in* X:**T**, *out* Values:**T**, *out* Indices:**I**)|10+|**I** = tensor(int64)|
+| | ||**T** = tensor(float)|
+| | |[1, 9]|**I** = tensor(int64)|
+| | ||**T** = tensor(float)|
+|Transpose|(*in* data:**T**, *out* transposed:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|TreeEnsembleClassifier|(*in* X:**T1**, *out* Y:**T2**, *out* Z:**tensor(float)**)|1+|**T1** = tensor(int32), tensor(float), tensor(int64), tensor(double)|
+| | ||**T2** = tensor(string), tensor(int64)|
+|TreeEnsembleRegressor|(*in* X:**T**, *out* Y:**tensor(float)**)|1+|**T** = tensor(float)|
+|Unsqueeze|(*in* data:**T**, *out* expanded:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Upsample|(*in* X:**T**, *out* Y:**T**) or (*in* X:**T**, *in* scales:**tensor(float)**, *out* Y:**T**)|[7, 9]|**T** = tensor(int32), tensor(float), tensor(uint8)|
+|Where|(*in* condition:**B**, *in* X:**T**, *in* Y:**T**, *out* output:**T**)|9+|**T** = tensor(string), tensor(int32), tensor(float)|
+|Xor|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
+| | ||**T1** = tensor(bool)|
+|ZipMap|(*in* X:**tensor(float)**, *out* Z:**T**)|1+|**T** = unknown|
+| |
+| |
+**Operator Domain:** *com.microsoft*
+|AttnLSTM|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *in* initial_c:**T**, *in* P:**T**, *in* QW:**T**, *in* MW:**T**, *in* V:**T**, *in* M:**T**, *in* memory_seq_lens:**T1**, *in* AW:**T**, *out* Y:**T**, *out* Y_h:**T**, *out* Y_c:**T**)|1+|**T** = tensor(float), tensor(double)|
+| | ||**T1** = tensor(int32)|
+|ConvTransposeWithDynamicPads|(*in* X:**T**, *in* W:**T**, *in* Pads:**tensor(int64)**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|CropAndResize|(*in* X:**T1**, *in* rois:**T1**, *in* batch_indices:**T2**, *in* crop_size:**T2**, *out* Y:**T1**)|1+|**T** = tensor(float)|
+| | ||**T2** = tensor(int32)|
+|ExpandDims|(*in* X:**T**, *in* axis:**tensor(int32)**, *out* Y:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**axis** = tensor(int32)|
+|FusedConv|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|FusedGemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|GatherND|(*in* data:**T**, *in* indices:**Tind**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(string), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**Tind** = tensor(int32), tensor(int64)|
+|MaxpoolWithMask|(*in* X:**T**, *in* M:**tensor(int32)**, *out* Y:**T**)|1+|**X** = tensor(float)|
+|MurmurHash3|(*in* X:**T1**, *out* Y:**T2**)|1+|**T1** = tensor(string), tensor(int32), tensor(uint32)|
+| | ||**T2** = tensor(int32), tensor(uint32)|
+|Pad|(*in* data:**T**, *in* pads:**tensor(int64)**, *in* value:**T**, *out* output:**T**)|1+|**T** = tensor(float)|
+|Range|(*in* start:**T**, *in* limit:**T**, *in* delta:**T**, *out* Y:**T**)|1+|**T** = tensor(int32), tensor(float), tensor(int64), tensor(int16), tensor(double)|
+|SampleOp|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|Tokenizer|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(string)|
+|Unique|(*in* x:**T**, *out* y:**T**, *out* idx:**tensor(int64)**, *out* counts:**tensor(int64)**)|1+|**T** = tensor(float)|
+|WordConvEmbedding|(*in* Sequence:**T**, *in* W:**T1**, *in* B:**T1**, *in* C:**T1**, *out* Y:**T1**)|1+|**T** = tensor(int32)|
+| | ||**T1** = tensor(float)|
+| |
+| |
+**Operator Domain:** *com.microsoft.nchwc*
+|AveragePool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|Conv|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *in* Sum:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|GlobalAveragePool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|GlobalMaxPool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|MaxPool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|ReorderInput|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|ReorderOutput|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+| |
+| |
+
+
+## Operators implemented by CUDAExecutionProvider
+
+| Op Name | Parameters | OpSet Version | Types Supported |
+|---------|------------|---------------|-----------------|
+**Operator Domain:** *ai.onnx.ml*
+|Abs|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(int32), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Add|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Affine|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|And|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
+| | ||**T1** = tensor(bool)|
+|ArgMax|(*in* data:**T**, *out* reduced:**tensor(int64)**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ArgMin|(*in* data:**T**, *out* reduced:**tensor(int64)**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|AveragePool|(*in* X:**T**, *out* Y:**T**)|10+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | |[7, 9]|**I** = tensor(int64)|
+| | ||**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|BatchNormalization|(*in* X:**T**, *in* scale:**T**, *in* B:**T**, *in* mean:**T**, *in* var:**T**, *out* Y:**T**, *out* mean:**T**, *out* var:**T**, *out* saved_mean:**T**, *out* saved_var:**T**)|9+|**B** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**X** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**mean** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**scale** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**var** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | |[7, 8]|**B** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**X** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**mean** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**scale** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**var** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Cast|(*in* input:**T1**, *out* output:**T2**)|9+|**T1** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | |[6, 8]|**T1** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Ceil|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Compress|(*in* input:**T**, *in* condition:**T1**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**T1** = tensor(bool)|
+|Concat|(*in* inputs:**T**, *out* concat_result:**T**)|4+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|ConstantOfShape|(*in* input:**T1**, *out* output:**T2**)|9+|**T1** = tensor(int64)|
+| | ||**T2** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Conv|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ConvTranspose|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Crop|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Div|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Dropout|(*in* data:**T**, *out* output:**T**, *out* mask:**T**) or (*in* data:**T**, *out* output:**T**, *out* mask:**T1**)|10+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**T1** = tensor(bool)|
+| | |[7, 9]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|DynamicSlice|(*in* data:**T**, *in* starts:**Tind**, *in* ends:**Tind**, *in* axes:**Tind**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**Tind** = tensor(int32), tensor(int64)|
+|Elu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Equal|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(int32), tensor(bool), tensor(int64)|
+|Erf|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Exp|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Expand|(*in* input:**T**, *in* shape:**tensor(int64)**, *out* output:**T**)|8+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Flatten|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | |[1, 8]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Floor|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|GRU|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *out* Y:**T**, *out* Y_h:**T**)|7+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**T1** = tensor(int32)|
+|Gather|(*in* data:**T**, *in* indices:**Tind**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**Tind** = tensor(int32), tensor(int64)|
+|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|9+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | |[7, 8]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|GlobalAveragePool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|GlobalMaxPool|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Greater|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|9+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**T1** = tensor(bool)|
+| | |[7, 8]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|HardSigmoid|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Identity|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|ImageScaler|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|InstanceNormalization|(*in* input:**T**, *in* scale:**T**, *in* B:**T**, *out* output:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|LRN|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|LSTM|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *in* initial_c:**T**, *in* P:**T**, *out* Y:**T**, *out* Y_h:**T**, *out* Y_c:**T**)|7+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**T1** = tensor(int32)|
+|LeakyRelu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Log|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|MatMul|(*in* A:**T**, *in* B:**T**, *out* Y:**T**)|9+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | |[1, 8]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Max|(*in* data_0:**T**, *out* max:**T**)|8+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | |[6, 7]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|MaxPool|(*in* X:**T**, *out* Y:**T**) or (*in* X:**T**, *out* Y:**T**, *out* Indices:**I**)|10+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | |[1, 7]|**I** = tensor(int64)|
+| | ||**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | |[8, 9]|**I** = tensor(int64)|
+| | ||**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|MemcpyFromHost|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|MemcpyToHost|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Min|(*in* data_0:**T**, *out* min:**T**)|8+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | |[6, 7]|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Mul|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Neg|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(int32), tensor(int16), unknown, tensor(float), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Or|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
+| | ||**T1** = tensor(bool)|
+|PRelu|(*in* X:**T**, *in* slope:**T**, *out* Y:**T**)|7+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Pad|(*in* data:**T**, *out* output:**T**)|2+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ParametricSoftplus|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Pow|(*in* X:**T**, *in* Y:**T**, *out* Z:**T**)|7+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|RNN|(*in* X:**T**, *in* W:**T**, *in* R:**T**, *in* B:**T**, *in* sequence_lens:**T1**, *in* initial_h:**T**, *out* Y:**T**, *out* Y_h:**T**)|7+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**T1** = tensor(int32)|
+|Reciprocal|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ReduceL1|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ReduceL2|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ReduceLogSum|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ReduceLogSumExp|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ReduceMax|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ReduceMean|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ReduceMin|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ReduceProd|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ReduceSum|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ReduceSumSquare|(*in* data:**T**, *out* reduced:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Relu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Reshape|(*in* data:**T**, *in* shape:**tensor(int64)**, *out* reshaped:**T**) or (*in* data:**T**, *out* reshaped:**T**)|5+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**shape** = tensor(int64)|
+|Reshape_1||[1, 4]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Resize|(*in* X:**T**, *in* scales:**tensor(float)**, *out* Y:**T**)|10+|**T** = tensor(int32), tensor(float), tensor(MLFloat16), tensor(uint8), tensor(double)|
+|ScaledTanh|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Selu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Shape|(*in* data:**T**, *out* shape:**T1**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**T1** = tensor(int64)|
+|Shrink|(*in* input:**T**, *out* output:**T**)|9+|**T** = tensor(int32), tensor(int16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Sigmoid|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Slice|(*in* data:**T**, *out* output:**T**) or (*in* data:**T**, *in* starts:**Tind**, *in* ends:**Tind**, *in* axes:**Tind**, *in* steps:**Tind**, *out* output:**T**)|10+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**Tind** = tensor(int32), tensor(int64)|
+| | |[1, 9]|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | ||**Tind** = tensor(int32), tensor(int64)|
+|Softmax|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Softplus|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Softsign|(*in* input:**T**, *out* output:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Split|(*in* input:**T**, *out* outputs:**T**) or (*in* input:**T**, *in* split:**T**, *out* outputs...:**T**)|2+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Sqrt|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Squeeze|(*in* data:**T**, *out* squeezed:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Sub|(*in* A:**T**, *in* B:**T**, *out* C:**T**)|7+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Sum|(*in* data_0:**T**, *out* sum:**T**)|8+|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+| | |[6, 7]|**T** = tensor(int32), tensor(uint32), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Tanh|(*in* input:**T**, *out* output:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|ThresholdedRelu|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | |10+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Tile|(*in* input:**T**, *in* tiles:**T**, *in* axis:**T**, *out* output:**T**) or (*in* input:**T**, *in* repeats:**T1**, *out* output:**T**)|6+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+| | ||**T1** = tensor(int64)|
+|Transpose|(*in* data:**T**, *out* transposed:**T**)|1+|**T** = tensor(float), tensor(MLFloat16), tensor(double)|
+|Unsqueeze|(*in* data:**T**, *out* expanded:**T**)|1+|**T** = tensor(int32), tensor(bool), tensor(int16), tensor(bfloat16), tensor(uint8), unknown, tensor(uint32), tensor(uint16), tensor(float), tensor(uint64), tensor(MLFloat16), tensor(int64), tensor(double)|
+|Upsample|(*in* X:**T**, *out* Y:**T**) or (*in* X:**T**, *in* scales:**tensor(float)**, *out* Y:**T**)|[7, 9]|**T** = tensor(int32), tensor(float), tensor(MLFloat16), tensor(uint8), tensor(double)|
+|Xor|(*in* A:**T**, *in* B:**T**, *out* C:**T1**)|7+|**T** = tensor(bool)|
+| | ||**T1** = tensor(bool)|
+| |
+| |
+**Operator Domain:** *com.microsoft*
+|ConvTransposeWithDynamicPads|(*in* X:**T**, *in* W:**T**, *in* Pads:**tensor(int64)**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+| |
+| |
+
+
+## Operators implemented by MKLDNNExecutionProvider
+
+| Op Name | Parameters | OpSet Version | Types Supported |
+|---------|------------|---------------|-----------------|
+**Operator Domain:** *ai.onnx.ml*
+|AveragePool|(*in* X:**T**, *out* Y:**T**)|[7, 8]|**T** = tensor(float)|
+|BatchNormalization|(*in* X:**T**, *in* scale:**T**, *in* B:**T**, *in* mean:**T**, *in* var:**T**, *out* Y:**T**, *out* mean:**T**, *out* var:**T**, *out* saved_mean:**T**, *out* saved_var:**T**)|7+|**T** = tensor(float)|
+|Conv|(*in* X:**T**, *in* W:**T**, *in* B:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|Gemm|(*in* A:**T**, *in* B:**T**, *in* C:**T**, *out* Y:**T**)|7+|**T** = tensor(float)|
+|GlobalAveragePool|(*in* X:**T**, *out* Y:**T**)|[1, 8]|**T** = tensor(float)|
+|GlobalMaxPool|(*in* X:**T**, *out* Y:**T**)|[1, 8]|**T** = tensor(float)|
+|LRN|(*in* X:**T**, *out* Y:**T**)|1+|**T** = tensor(float)|
+|MaxPool|(*in* X:**T**, *out* Y:**T**) or (*in* X:**T**, *out* Y:**T**, *out* Indices:**I**)|[1, 7]|**T** = tensor(float)|
+| | |[8, 8]|**T** = tensor(float)|
+|Relu|(*in* X:**T**, *out* Y:**T**)|6+|**T** = tensor(float)|
+|Sum|(*in* data_0:**T**, *out* sum:**T**)|6+|**T** = tensor(float)|
+| |
+| |
diff --git a/include/onnxruntime/core/framework/kernel_def_builder.h b/include/onnxruntime/core/framework/kernel_def_builder.h
index 3c093f4540..5f78334836 100644
--- a/include/onnxruntime/core/framework/kernel_def_builder.h
+++ b/include/onnxruntime/core/framework/kernel_def_builder.h
@@ -42,6 +42,12 @@ class KernelDef {
     *end = op_since_version_end_;
   }
 
+#ifdef onnxruntime_PYBIND_EXPORT_OPSCHEMA
+  // Returns the kernel's (start, end) since-version values as one pair.
+  // Only compiled when onnxruntime_PYBIND_EXPORT_OPSCHEMA is defined.
+  std::pair<int, int> SinceVersion() const { return {op_since_version_start_, op_since_version_end_}; }
+#endif
+
   onnxruntime::ProviderType Provider() const {
     return provider_type_;
   }
diff --git a/include/onnxruntime/core/framework/kernel_registry.h b/include/onnxruntime/core/framework/kernel_registry.h
index 3a0d35e298..95d9b1d415 100644
--- a/include/onnxruntime/core/framework/kernel_registry.h
+++ b/include/onnxruntime/core/framework/kernel_registry.h
@@ -39,6 +39,14 @@ class KernelRegistry {
 
   bool IsEmpty() const { return kernel_creator_fn_map_.empty(); }
 
+#ifdef onnxruntime_PYBIND_EXPORT_OPSCHEMA
+  // Read-only view of the kernel creator map. The opkernel doc generator
+  // walks it to list every operator kernel registered for a provider.
+  const KernelCreateMap& GetKernelCreateMap() const {
+    return kernel_creator_fn_map_;
+  }
+#endif
+
  private:
   // Check whether the types of inputs/outputs of the given node match the extra
   // type-constraints of the given kernel. This serves two purposes: first, to
diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
index 33d330f7cf..7c13c98a74 100644
--- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
+++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
@@ -18,6 +18,15 @@ namespace onnxruntime {
       KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<TYPE>()), \
       KERNEL_CLASS<TYPE>);
 
+#define REG_ELEMENTWISE_LOGICALOP_TYPED_KERNEL(OP_TYPE, VERSION, TYPE, KERNEL_CLASS) \
+  ONNX_CPU_OPERATOR_TYPED_KERNEL(                                                    \
+      OP_TYPE, VERSION, TYPE,                                                        \
+      /* T: operand element type; T1: boolean result */                              \
+      KernelDefBuilder()                                                             \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<TYPE>())                  \
+          .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),                \
+      KERNEL_CLASS<TYPE>);
+
 #define REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(OP_TYPE, VERSION_FROM, VERSION_TO, TYPE, KERNEL_CLASS) \
   ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL(                                                           \
       OP_TYPE,                                                                                        \
@@ -26,6 +35,15 @@ namespace onnxruntime {
       KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<TYPE>()),                    \
       KERNEL_CLASS<TYPE>);
 
+#define REG_ELEMENTWISE_LOGICALOP_VERSIONED_TYPED_KERNEL(OP_TYPE, VERSION_FROM, VERSION_TO, TYPE, KERNEL_CLASS) \
+  ONNX_CPU_OPERATOR_VERSIONED_TYPED_KERNEL(                                                                     \
+      OP_TYPE, VERSION_FROM, VERSION_TO, TYPE,                                                                  \
+      /* T: operand element type; T1: boolean result */                                                         \
+      KernelDefBuilder()                                                                                        \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<TYPE>())                                             \
+          .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),                                           \
+      KERNEL_CLASS<TYPE>);
+
 REG_ELEMENTWISE_TYPED_KERNEL(Add, 7, float, Add);
 REG_ELEMENTWISE_TYPED_KERNEL(Add, 7, double, Add);
 REG_ELEMENTWISE_TYPED_KERNEL(Add, 7, int32_t, Add);
@@ -88,46 +106,55 @@ REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(Max, 6, 7, float, Max_6);
 REG_ELEMENTWISE_TYPED_KERNEL(Max, 8, float, Max_8);
 REG_ELEMENTWISE_TYPED_KERNEL(Max, 8, double, Max_8);
 
-REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(Less, 7, 9, float, Less);
-REG_ELEMENTWISE_TYPED_KERNEL(Less, 9, int32_t, Less);
-REG_ELEMENTWISE_TYPED_KERNEL(Less, 9, int64_t, Less);
+REG_ELEMENTWISE_LOGICALOP_VERSIONED_TYPED_KERNEL(Less, 7, 9, float, Less);
+REG_ELEMENTWISE_LOGICALOP_TYPED_KERNEL(Less, 9, int32_t, Less);
+REG_ELEMENTWISE_LOGICALOP_TYPED_KERNEL(Less, 9, int64_t, Less);
 
-REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(Greater, 7, 9, float, Greater)
-REG_ELEMENTWISE_TYPED_KERNEL(Greater, 9, int32_t, Greater);
-REG_ELEMENTWISE_TYPED_KERNEL(Greater, 9, int64_t, Greater);
+REG_ELEMENTWISE_LOGICALOP_VERSIONED_TYPED_KERNEL(Greater, 7, 9, float, Greater)
+REG_ELEMENTWISE_LOGICALOP_TYPED_KERNEL(Greater, 9, int32_t, Greater);
+REG_ELEMENTWISE_LOGICALOP_TYPED_KERNEL(Greater, 9, int64_t, Greater);
 
-REG_ELEMENTWISE_TYPED_KERNEL(Equal, 7, bool, Equal);
-REG_ELEMENTWISE_TYPED_KERNEL(Equal, 7, int32_t, Equal);
-REG_ELEMENTWISE_TYPED_KERNEL(Equal, 7, int64_t, Equal);
-REG_ELEMENTWISE_TYPED_KERNEL(Equal, 11, float, Equal);
+REG_ELEMENTWISE_LOGICALOP_TYPED_KERNEL(Equal, 7, bool, Equal);
+REG_ELEMENTWISE_LOGICALOP_TYPED_KERNEL(Equal, 7, int32_t, Equal);
+REG_ELEMENTWISE_LOGICALOP_TYPED_KERNEL(Equal, 7, int64_t, Equal);
+REG_ELEMENTWISE_LOGICALOP_TYPED_KERNEL(Equal, 11, float, Equal);
 
 REG_ELEMENTWISE_VERSIONED_TYPED_KERNEL(Mean, 6, 7, float, Mean_6);
 REG_ELEMENTWISE_TYPED_KERNEL(Mean, 8, float, Mean_8);
 
 REG_ELEMENTWISE_TYPED_KERNEL(Erf, 9, float, Erf);
 
+// Not/And/Or/Xor are registered with explicit ONNX_CPU_OPERATOR_KERNEL calls below
+// instead of REG_ELEMENTWISE_LOGICALOP_TYPED_KERNEL: that macro expands to
+// KERNEL_CLASS<TYPE>, whereas these kernels are passed as plain class names
+// (presumably they are not class templates - confirm against element_wise_ops.h).
+
 ONNX_CPU_OPERATOR_KERNEL(
     Not,
     1,
-    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<bool>()),
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<bool>())
+                      .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
     Not);
 
 ONNX_CPU_OPERATOR_KERNEL(
     And,
     7,
-    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<bool>()),
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<bool>())
+                      .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
     And);
 
 ONNX_CPU_OPERATOR_KERNEL(
     Or,
     7,
-    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<bool>()),
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<bool>())
+                      .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
     Or);
 
 ONNX_CPU_OPERATOR_KERNEL(
     Xor,
     7,
-    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<bool>()),
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<bool>())
+                      .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
     Xor);
 
 template <typename T>
diff --git a/onnxruntime/core/providers/cpu/nn/Unpool.cc b/onnxruntime/core/providers/cpu/nn/Unpool.cc
index 3b1c16f354..853bd05cdd 100644
--- a/onnxruntime/core/providers/cpu/nn/Unpool.cc
+++ b/onnxruntime/core/providers/cpu/nn/Unpool.cc
@@ -18,9 +18,9 @@ ONNX_CPU_OPERATOR_KERNEL(
     MaxUnpool,
     9,
     KernelDefBuilder()
-        .TypeConstraint("T", DataTypeImpl::GetTensorType<float>())
-        .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>())
-        .TypeConstraint("Y", DataTypeImpl::GetTensorType<float>()),
+        .TypeConstraint("T1", DataTypeImpl::GetTensorType<float>())
+        .TypeConstraint("T2", DataTypeImpl::GetTensorType<int64_t>()),
+        // "T1"/"T2" mirror the type-constraint names in the ONNX MaxUnpool schema (formerly "T"/"I"/"Y" here).
     MaxUnpool);
 
 Status MaxUnpool::Compute(OpKernelContext* context) const {
diff --git a/onnxruntime/core/providers/cpu/nn/qlinearconv.cc b/onnxruntime/core/providers/cpu/nn/qlinearconv.cc
index d6dce056ea..78a5367932 100644
--- a/onnxruntime/core/providers/cpu/nn/qlinearconv.cc
+++ b/onnxruntime/core/providers/cpu/nn/qlinearconv.cc
@@ -15,7 +15,8 @@ ONNX_OPERATOR_KERNEL_EX(
     KernelDefBuilder()
         .TypeConstraint("T1", DataTypeImpl::GetTensorType<uint8_t>())
         .TypeConstraint("T2", DataTypeImpl::GetTensorType<uint8_t>())
-        .TypeConstraint("T3", DataTypeImpl::GetTensorType<uint8_t>()),
+        .TypeConstraint("T3", DataTypeImpl::GetTensorType<uint8_t>())
+        .TypeConstraint("T4", DataTypeImpl::GetTensorType<int32_t>()),
     QLinearConv);
 
 Status QLinearConv::Compute(OpKernelContext* context) const {
diff --git a/onnxruntime/core/providers/cpu/tensor/compress.cc b/onnxruntime/core/providers/cpu/tensor/compress.cc
index e732121adb..b3f82bf9fd 100644
--- a/onnxruntime/core/providers/cpu/tensor/compress.cc
+++ b/onnxruntime/core/providers/cpu/tensor/compress.cc
@@ -9,7 +9,8 @@ namespace onnxruntime {
 ONNX_CPU_OPERATOR_KERNEL(
     Compress,
     9,
-    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorTypes()),
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorTypes())
+                      .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
     Compress);
 
 Status Compress::Compute(OpKernelContext* ctx) const {
diff --git a/onnxruntime/core/providers/cpu/tensor/identity_op.cc b/onnxruntime/core/providers/cpu/tensor/identity_op.cc
index b7fe35c73f..f431d9de70 100644
--- a/onnxruntime/core/providers/cpu/tensor/identity_op.cc
+++ b/onnxruntime/core/providers/cpu/tensor/identity_op.cc
@@ -10,7 +10,8 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL(
     7, 9,
     KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType<MLFloat16>(), 
                                             DataTypeImpl::GetTensorType<float>(), 
-                                            DataTypeImpl::GetTensorType<double>()}),
+                                            DataTypeImpl::GetTensorType<double>()})
+                      .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
     IdentityOp<true>);
 
 ONNX_CPU_OPERATOR_KERNEL(
diff --git a/onnxruntime/core/providers/cpu/tensor/size.cc b/onnxruntime/core/providers/cpu/tensor/size.cc
index 675c14b8cf..75bdd5bec2 100644
--- a/onnxruntime/core/providers/cpu/tensor/size.cc
+++ b/onnxruntime/core/providers/cpu/tensor/size.cc
@@ -41,7 +41,8 @@ ONNX_CPU_OPERATOR_KERNEL(
                                                                DataTypeImpl::GetTensorType<uint32_t>(),
                                                                DataTypeImpl::GetTensorType<uint64_t>(),
                                                                DataTypeImpl::GetTensorType<std::string>(),
-                                                               DataTypeImpl::GetTensorType<bool>()})),
+                                                               DataTypeImpl::GetTensorType<bool>()}))
+                      .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
     Size);
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/tensor/tile.cc b/onnxruntime/core/providers/cpu/tensor/tile.cc
index 984f490ade..1b0ab391fb 100644
--- a/onnxruntime/core/providers/cpu/tensor/tile.cc
+++ b/onnxruntime/core/providers/cpu/tensor/tile.cc
@@ -34,7 +34,8 @@ ONNX_CPU_OPERATOR_KERNEL(
                                             DataTypeImpl::GetTensorType<uint16_t>(),
                                             DataTypeImpl::GetTensorType<uint32_t>(),
                                             DataTypeImpl::GetTensorType<uint64_t>(),
-                                            DataTypeImpl::GetTensorType<bool>()}),
+                                            DataTypeImpl::GetTensorType<bool>()})
+                      .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
     Tile);
 
 Status TileCoreForFixedSizeTypes(const Tensor& input_tensor, Tensor& output_tensor, const int64_t* repeats, TensorAxisCounters& input_counters, const TensorPitches& output_pitches, size_t element_size) {
diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc
index 16f6246b3d..6f679c8a6c 100644
--- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc
+++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc
@@ -92,6 +92,17 @@ Status BinaryElementwise<ShouldBroadcast>::Prepare(OpKernelContext* context, int
       KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
       x<T>);
 
+#define BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(x, ver, T) \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                      \
+      x, kOnnxDomain,                                                 \
+      ver, T,                                                         \
+      kCudaExecutionProvider,                                         \
+      /* T: operand element type; T1: boolean result */               \
+      KernelDefBuilder()                                              \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>())      \
+          .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()), \
+      x<T>);
+
 #define BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(x, startver, endver, T) \
   ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                         \
       x,                                                                           \
@@ -127,6 +138,11 @@ Status BinaryElementwise<ShouldBroadcast>::Prepare(OpKernelContext* context, int
   BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, T) \
   BINARY_ELEMENTWISE_COMPUTE(name, T)
 
+#define BINARY_LOGICALOP_TYPED(name, ver, T) /* BINARY_OP_TYPED plus the bool "T1" constraint */ \
+  BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, T) \
+  BINARY_ELEMENTWISE_COMPUTE(name, T)
+
+
 // since different ops has different types, we cannot use BINARY_OPS() directly
 // the postfix of means the types supported by the op:
 // B: uint8_t
@@ -155,10 +171,15 @@ Status BinaryElementwise<ShouldBroadcast>::Prepare(OpKernelContext* context, int
   BINARY_OP_HFD(name, ver)
 
 #define BINARY_OP_REGISTER_OIL(name, ver)                        \
-  BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, bool)  \
+  BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, bool)      \
   BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, int32_t)   \
   BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, int64_t)
 
+#define BINARY_LOGICALOP_REGISTER_OIL(name, ver) /* BINARY_OP_REGISTER_OIL plus the bool "T1" constraint */ \
+  BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, bool)     \
+  BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, int32_t)  \
+  BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, int64_t)
+
 #define BINARY_OP_REGISTER_HFD(name, ver)                        \
   BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, MLFloat16) \
   BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, float)     \
@@ -171,6 +192,15 @@ Status BinaryElementwise<ShouldBroadcast>::Prepare(OpKernelContext* context, int
   BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, int64_t)  \
   BINARY_OP_REGISTER_HFD(name, ver)
 
+#define BINARY_LOGICALOP_REGISTER_UZILHFD(name, ver) /* registers the 7 types listed below */ \
+  BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, uint32_t)   \
+  BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, uint64_t)   \
+  BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, int32_t)    \
+  BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, int64_t)    \
+  BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, MLFloat16)  \
+  BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, float)      \
+  BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, double)
+
 #define BINARY_OP_REGISTER_VERSIONED_HFD(name, startver, endver)                        \
   BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(name, startver, endver, MLFloat16) \
   BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(name, startver, endver, float)     \
@@ -188,9 +218,9 @@ BINARY_OP_UZILHFD(Sub, 7)
 BINARY_OP_UZILHFD(Mul, 7)
 BINARY_OP_UZILHFD(Div, 7)
 BINARY_OP_HFD(Pow, 7)
-BINARY_OP_TYPED(And, 7, bool)
-BINARY_OP_TYPED(Or, 7, bool)
-BINARY_OP_TYPED(Xor, 7, bool)
+BINARY_LOGICALOP_TYPED(And, 7, bool)
+BINARY_LOGICALOP_TYPED(Or, 7, bool)
+BINARY_LOGICALOP_TYPED(Xor, 7, bool)
 BINARY_OP_HFD(PRelu, 7)
 
 template <typename T>
@@ -440,7 +470,7 @@ Status Equal<T>::ComputeInternal(OpKernelContext* context) const {
 
 BINARY_OP_REGISTER_UZILHFD(Sum, 8)
 BINARY_OP_REGISTER_VERSIONED_UZILHFD(Sum, 6, 7)
-BINARY_OP_REGISTER_UZILHFD(Greater, 9)
+BINARY_LOGICALOP_REGISTER_UZILHFD(Greater, 9)
 BINARY_OP_REGISTER_OIL(Equal, 7)
 BINARY_OP_REGISTER_VERSIONED_HFD(Greater, 7, 8)
 BINARY_OP_REGISTER_HFD(Max, 8)
diff --git a/onnxruntime/core/providers/cuda/tensor/compress.cc b/onnxruntime/core/providers/cuda/tensor/compress.cc
index 4e33a42184..9e23ad6a5f 100644
--- a/onnxruntime/core/providers/cuda/tensor/compress.cc
+++ b/onnxruntime/core/providers/cuda/tensor/compress.cc
@@ -13,7 +13,8 @@ ONNX_OPERATOR_KERNEL_EX(
     kOnnxDomain,
     9,
     kCudaExecutionProvider,
-    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
+                      .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
     Compress);
 
 Status Compress::ComputeInternal(OpKernelContext* ctx) const {
diff --git a/onnxruntime/core/providers/cuda/tensor/tile.cc b/onnxruntime/core/providers/cuda/tensor/tile.cc
index 390d9139de..854c784c8a 100644
--- a/onnxruntime/core/providers/cuda/tensor/tile.cc
+++ b/onnxruntime/core/providers/cuda/tensor/tile.cc
@@ -17,7 +17,8 @@ namespace cuda {
       kCudaExecutionProvider,                                     \
       KernelDefBuilder()                                          \
           .InputMemoryType<OrtMemTypeCPUInput>(1)                 \
-          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>())  \
+          .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()), \
       Tile<T>);
 
 template <typename T>
diff --git a/onnxruntime/core/providers/mkldnn/mkldnn_provider_factory.cc b/onnxruntime/core/providers/mkldnn/mkldnn_provider_factory.cc
index c94060d5b2..2cc7e112a2 100644
--- a/onnxruntime/core/providers/mkldnn/mkldnn_provider_factory.cc
+++ b/onnxruntime/core/providers/mkldnn/mkldnn_provider_factory.cc
@@ -27,6 +27,7 @@ std::unique_ptr<IExecutionProvider> MkldnnProviderFactory::CreateProvider() {
 
 std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory_Mkldnn(int device_id) {
   return std::make_shared<onnxruntime::MkldnnProviderFactory>(device_id);
+  // TODO: This is apparently a bug. The constructor parameter is the create-arena flag, not the device id.
 }
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index b22b274518..88d2620c13 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -290,11 +290,84 @@ void addGlobalMethods(py::module& m) {
         return ONNX_NAMESPACE::OpSchemaRegistry::get_all_schemas_with_history();
       },
       "Return a vector of OpSchema all registed operators");
+  m.def(
+      "get_all_opkernel_def", []() -> const std::vector<onnxruntime::KernelDef> {
+        std::vector<onnxruntime::KernelDef> result;
+
+        // default logger is needed to create the MklDNNExecutionProvider
+        std::string default_logger_id{"DefaultLogger"};
+        std::unique_ptr<onnxruntime::logging::LoggingManager> default_logging_manager = 
+                  std::make_unique<LoggingManager>(
+                          std::unique_ptr<onnxruntime::logging::ISink>{ new onnxruntime::logging::CLogSink {}}, 
+                          onnxruntime::logging::Severity::kWARNING, 
+                          false,
+                          onnxruntime::logging::LoggingManager::InstanceType::Default, 
+                          &default_logger_id, 
+                          /*default_max_vlog_level*/ -1);
+     
+        std::vector<std::shared_ptr<onnxruntime::IExecutionProviderFactory>> factories = {
+          onnxruntime::CreateExecutionProviderFactory_CPU(0),
+#ifdef USE_CUDA
+          onnxruntime::CreateExecutionProviderFactory_CUDA(0),
 #endif
+#ifdef USE_MKLDNN
+          onnxruntime::CreateExecutionProviderFactory_Mkldnn(1),
+#endif
+#ifdef USE_NGRAPH
+          onnxruntime::CreateExecutionProviderFactory_NGraph("CPU"),
+#endif
+#ifdef USE_OPENVINO
+          onnxruntime::CreateExecutionProviderFactory_OpenVINO("CPU"),
+#endif    
+#ifdef  USE_TENSORRT    
+          onnxruntime::CreateExecutionProviderFactory_Tensorrt()
+#endif          
+        };
+
+      for (const auto& f: factories){
+        for (const auto& m: f->CreateProvider()
+                       ->GetKernelRegistry()
+                       ->GetKernelCreateMap()){
+          result.emplace_back(*(m.second.kernel_def)); 
+        }
+      }
+
+      return result;
+    },
+    "Return a vector of KernelDef for all registered OpKernels"
+  );
+#endif //onnxruntime_PYBIND_EXPORT_OPSCHEMA
 }
 
 #ifdef onnxruntime_PYBIND_EXPORT_OPSCHEMA
 
+// Registers the "opkernel" submodule, exposing KernelDef to Python with read-only properties.
+void addOpKernelSubmodule(py::module& m) {
+  using onnxruntime::KernelDef;
+  using TypeNamesByConstraint = std::unordered_map<std::string, std::vector<std::string>>;
+  auto submodule = m.def_submodule("opkernel");
+  submodule.doc() = "OpKernel submodule";
+  py::class_<KernelDef>(submodule, "KernelDef")
+      .def_property_readonly("op_name", &KernelDef::OpName)
+      .def_property_readonly("domain", &KernelDef::Domain)
+      .def_property_readonly("provider", &KernelDef::Provider)
+      .def_property_readonly("version_range", [](const KernelDef& kernel) -> std::pair<int, int> {
+        return kernel.onnxruntime::KernelDef::SinceVersion();
+      })
+      .def_property_readonly("type_constraints", [](const KernelDef& kernel) -> TypeNamesByConstraint {
+        // Convert every data type of each constraint to its string form.
+        TypeNamesByConstraint names_by_constraint;
+        for (const auto& constraint : kernel.TypeConstraints()) {
+          auto& names = names_by_constraint[constraint.first];
+          names.clear();
+          for (const auto& data_type : constraint.second) {
+            names.emplace_back(onnxruntime::DataTypeImpl::ToString(data_type));
+          }
+        }
+        return names_by_constraint;
+      });
+}
+
 void addOpSchemaSubmodule(py::module& m) {
   auto schemadef = m.def_submodule("schemadef");
   schemadef.doc() = "Schema submodule";
@@ -641,6 +714,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
       });
 }
 
+
 PYBIND11_MODULE(onnxruntime_pybind11_state, m) {
   m.doc() = "pybind11 stateful interface to ONNX runtime";
 
@@ -670,6 +744,7 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) {
 
 #ifdef onnxruntime_PYBIND_EXPORT_OPSCHEMA
   addOpSchemaSubmodule(m);
+  addOpKernelSubmodule(m);
 #endif
 }
 
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index bbaa891ef2..c8d555fb96 100755
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -778,23 +778,43 @@ def build_protoc_for_host(cmake_path, source_dir, build_dir, args):
 
 def generate_documentation(source_dir, build_dir, configs):
     operator_doc_path = os.path.join(source_dir, 'docs', 'ContribOperators.md')
+    opkernel_doc_path = os.path.join(source_dir, 'docs', 'OperatorKernels.md')
     for config in configs:
         #copy the gen_doc.py
         shutil.copy(os.path.join(source_dir,'tools','python','gen_doc.py'),
                     os.path.join(build_dir,config, config))
+        shutil.copy(os.path.join(source_dir,'tools','python','gen_opkernel_doc.py'),
+                    os.path.join(build_dir,config, config))
+
         run_subprocess([
                         sys.executable,
                         'gen_doc.py',
                         '--output_path', operator_doc_path
                     ],
                     cwd = os.path.join(build_dir,config, config))
+
+        run_subprocess([
+                        sys.executable,
+                        'gen_opkernel_doc.py',
+                        '--output_path', opkernel_doc_path
+                    ],
+                    cwd = os.path.join(build_dir,config, config))
+
+    docdiff = ''
+    try:
+        docdiff = subprocess.check_output(['git', 'diff', opkernel_doc_path])
+    except subprocess.CalledProcessError:
+        print('git diff returned non-zero error code')
+    if len(docdiff) > 0:
+        # Warn instead of raising: the generated file depends on which execution providers this build configuration includes.
+        log.warning('The updated opkernel document file '+str(opkernel_doc_path)+' is different from the checked in version. Consider regenrating the file with CPU, MKLDNN and CUDA providers enabled.')
+        log.debug('diff:\n'+str(docdiff))
+
     docdiff = ''
     try:
         docdiff = subprocess.check_output(['git', 'diff', operator_doc_path])
     except subprocess.CalledProcessError:
         print('git diff returned non-zero error code')
-
-
     if len(docdiff) > 0:
         raise BuildError('The updated operator document file '+str(operator_doc_path)+' must be checked in.\n diff:\n'+str(docdiff))
 
diff --git a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml
index bdaa8f0cf5..d844c6c586 100644
--- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml
@@ -4,7 +4,7 @@ jobs:
     AgentPool : 'Win-CPU'
     DoDebugBuild: 'true'
     DoCompliance: 'false'
-    BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_tvm --enable_pybind --use_mkldnn --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests --gen_doc'
+    BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe --use_tvm --enable_pybind --use_mkldnn --use_openmp --build_shared_lib --build_csharp --enable_onnx_tests'
     JobName: 'Windows_CI_Dev'
     DoNugetPack:  'false'
     NuPackScript : ''
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
index 96a8f6c797..fdbd7f9105 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
@@ -4,7 +4,7 @@ jobs:
     AgentPool : 'Win-GPU-CUDA10'
     DoDebugBuild: 'true'
     DoCompliance: 'false'
-    BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe  --enable_pybind --use_openmp --use_mkldnn --use_mkldnn --build_shared_lib  --build_csharp --enable_onnx_tests --use_cuda --cuda_version=10.0 --cuda_home="C:\local\cuda_10.0.130_win10_trt515dll" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --msvc_toolset=14.11'
+    BuildCommand: '$(Build.SourcesDirectory)\tools\ci_build\build.py --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --cmake_path $(Build.BinariesDirectory)\cmake\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake\bin\ctest.exe  --enable_pybind --use_openmp --use_mkldnn --use_mkldnn --build_shared_lib  --build_csharp --enable_onnx_tests --use_cuda --cuda_version=10.0 --cuda_home="C:\local\cuda_10.0.130_win10_trt515dll" --cudnn_home="C:\local\cudnn-10.0-windows10-x64-v7.3.1.20\cuda" --msvc_toolset=14.11 --gen_doc'
     JobName: 'Windows_CI_GPU_Dev'
     DoNugetPack:  'false'
     NuPackScript : ''
diff --git a/tools/python/gen_opkernel_doc.py b/tools/python/gen_opkernel_doc.py
new file mode 100644
index 0000000000..8fd004a2ee
--- /dev/null
+++ b/tools/python/gen_opkernel_doc.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+
+from collections import defaultdict
+import io
+import os
+import sys
+import argparse
+
+
+import onnxruntime as rt
+import onnxruntime.capi.onnxruntime_pybind11_state as rtpy 
+from onnxruntime.capi.onnxruntime_pybind11_state import opkernel
+from onnxruntime.capi.onnxruntime_pybind11_state import schemadef 
+from onnxruntime.capi.onnxruntime_pybind11_state.opkernel import KernelDef 
+from onnxruntime.capi.onnxruntime_pybind11_state.schemadef import OpSchema 
+
+
+def format_version_range(v):
+    """Format a (start, end) opset pair as 'N+' (end >= 2**31 - 1) or '[N, M]'."""
+    start, end = v[0], v[1]
+    open_ended = end >= 2147483647  # 2**31 - 1 is treated as "no upper bound"
+    return '{}+'.format(start) if open_ended else '[{}, {}]'.format(start, end)
+
+def format_type_constraints(tc):
+    """Join type names into one comma-separated string.
+
+    Args:
+        tc: iterable of type-name strings, e.g. ['tensor(float)'].
+
+    Returns:
+        The names in iteration order separated by ', '; '' when tc is empty.
+    """
+    # str.join replaces the manual first-item flag; the old unused counter
+    # is dropped.
+    return ', '.join(tc)
+
+def format_param_strings(params):
+    """Join alternative parameter signatures with ' or '.
+
+    Args:
+        params: iterable of signature strings, or None/empty.
+
+    Returns:
+        The signatures joined in sorted order, so that an unordered input
+        such as a set gives the same text on every run; '' if params is falsy.
+    """
+    return ' or '.join(sorted(params)) if params else ''
+    
+def main(args):  # type: (Type[Args]) -> None
+    """Write the operator-kernel support tables to ``args.output`` as markdown.
+
+    One table is written per execution provider. Each registered kernel is
+    listed with its schema signature(s), its opset version range and the
+    types accepted by each of its type constraints.
+
+    Type names and signature alternatives are collected in sets, so they are
+    sorted before being written; otherwise their order (and therefore the
+    generated file) could differ from one run to the next.
+
+    Args:
+        args: object whose ``output`` attribute is the destination file path;
+            the file is opened for writing and overwritten.
+    """
+    with io.open(args.output, 'w', newline='', encoding="utf-8") as fout:
+        fout.write('## Supported Operators Data Types\n')
+        fout.write(
+            "*This file is automatically generated from the\n"
+            "            [def files](/onnxruntime/core/providers/cpu/cpu_execution_provider.cc) via [this script](/tools/python/gen_opkernel_doc.py).\n"
+            "            Do not modify directly and instead edit operator definitions.*\n")
+
+        # Map '<domain>.<op name>' to the set of distinct signature strings
+        # seen for that operator.
+        paramdict = {}
+        for schema in rtpy.get_all_operator_schema():
+            # NOTE(review): '' is ONNX's default operator domain, which ONNX
+            # names 'ai.onnx'. It is labelled 'ai.onnx.ml' here, so default
+            # ops share a section with real ai.onnx.ml ops -- confirm intent.
+            domain = 'ai.onnx.ml' if schema.domain == '' else schema.domain
+            pieces = ['*in* {}:**{}**'.format(inp.name, inp.typeStr)
+                      for inp in (schema.inputs or [])]
+            pieces += ['*out* {}:**{}**'.format(outp.name, outp.typeStr)
+                       for outp in (schema.outputs or [])]
+            paramstr = '(' + ', '.join(pieces) + ')'
+            paramdict.setdefault(domain + '.' + schema.name, set()).add(paramstr)
+
+        # provider -> domain -> op name -> list of kernel definitions
+        index = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+        for op in rtpy.get_all_opkernel_def():
+            domain = 'ai.onnx.ml' if op.domain == '' else op.domain
+            index[op.provider][domain][op.op_name].append(op)
+
+        # One table per provider; providers, domains and op names are each
+        # written in sorted order.
+        fout.write('\n')
+        for provider, domainmap in sorted(index.items()):
+            fout.write('\n\n## Operators implemented by '+provider+'\n\n')
+            fout.write('| Op Name | Parameters | OpSet Version | Types Supported |\n')
+            fout.write('|---------|------------|---------------|-----------------|\n')
+            for domain, namemap in sorted(domainmap.items()):
+                fout.write('**Operator Domain:** *'+domain+'*\n')
+                for name, ops in sorted(namemap.items()):
+                    # version range text -> constraint name -> set of type names
+                    version_type_index = defaultdict(lambda: defaultdict(set))
+                    for op in ops:
+                        formatted_version_range = format_version_range(op.version_range)
+                        for tname, tclist in op.type_constraints.items():
+                            for c in tclist:
+                                version_type_index[formatted_version_range][tname].add(c)
+
+                    # The op name/signature cells appear only on the first row
+                    # of an op, the version cell only on the first row of each
+                    # version range; later rows leave those cells blank.
+                    namefirsttime = True
+                    for version, typemap in sorted(version_type_index.items()):
+                        versionfirsttime = True
+                        for tname, tcset in sorted(typemap.items()):
+                            if namefirsttime:
+                                namefirsttime = False
+                                params = paramdict.get(domain + '.' + name)
+                                if params:
+                                    # a set: sort for a stable order
+                                    params = sorted(params)
+                                fout.write('|' + name + '|' + format_param_strings(params) + '|')
+                            else:
+                                fout.write('| | |')
+                            if versionfirsttime:
+                                versionfirsttime = False
+                                fout.write(version + '|')
+                            else:
+                                fout.write('|')
+                            # tcset is a set: sort so the type list is stable.
+                            fout.write('**' + tname + '** = '
+                                       + format_type_constraints(sorted(tcset)) + '|\n')
+
+                # Spacer rows after each domain section.
+                fout.write('| |\n| |\n')
+        
+
+if __name__ == '__main__':
+    # By default the markdown file is written next to this script.
+    script_dir = os.path.dirname(os.path.realpath(__file__))
+    cli = argparse.ArgumentParser(description='ONNX Runtime Operator Kernel Documentation Generator')
+    cli.add_argument('--output_path', help='output markdown file path', default=os.path.join(script_dir, 'OperatorKernels.md'))
+    parsed = cli.parse_args()
+
+    class Args(object):
+        """Minimal holder exposing the ``output`` attribute that main() reads."""
+        output = parsed.output_path
+    main(Args)