diff --git a/onnxruntime/python/tools/microbench/attention.py b/onnxruntime/python/tools/microbench/attention.py
index dc8291309f..285b42b7cb 100644
--- a/onnxruntime/python/tools/microbench/attention.py
+++ b/onnxruntime/python/tools/microbench/attention.py
@@ -23,6 +23,7 @@ class BenchmarkAttention(BenchmarkOp):
     def __init__(self, args):
         BenchmarkOp.__init__(self, args)
 
+    @classmethod
     def create_inputs_outputs(cls, op_param):
         np.random.seed(0)
         input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(
@@ -50,6 +51,7 @@ class BenchmarkAttention(BenchmarkOp):
         op_param = OpParam(1, 384, 768, 768 * 3, data_type)
         self.add_case(op_param, model)
 
+    @classmethod
     def case_profile(cls, op_param, time):
         profile = f"(batch_size seq_len length) = ({op_param.batch_size} {op_param.seq_len} {op_param.length}), {time:7.4f} ms"
         return profile
diff --git a/onnxruntime/python/tools/microbench/cast.py b/onnxruntime/python/tools/microbench/cast.py
index 86219a99ac..968338b080 100644
--- a/onnxruntime/python/tools/microbench/cast.py
+++ b/onnxruntime/python/tools/microbench/cast.py
@@ -30,6 +30,7 @@ class BenchmarkCast(BenchmarkOp):
     def __init__(self, args):
         BenchmarkOp.__init__(self, args)
 
+    @classmethod
     def create_inputs_outputs(cls, op_param):
         np.random.seed(0)
         input_data = np.random.rand(op_param.x, op_param.y, op_param.m, op_param.n).astype(op_param.input_data_type)
@@ -89,6 +90,7 @@ class BenchmarkCast(BenchmarkOp):
         model_param = ModelParam(32, 1024)
         self.add_model_cases(model_param, model, input_data_type, output_data_type)
 
+    @classmethod
     def case_profile(cls, op_param, time):
         profile = f"(x y m n input_data_type) = ({op_param.x} {op_param.y} {op_param.m} {op_param.n} {op_param.input_data_type}), {time:7.4f} ms"
         return profile
diff --git a/onnxruntime/python/tools/microbench/conv.py b/onnxruntime/python/tools/microbench/conv.py
new file mode 100644
index 0000000000..d3a4c6867f
--- /dev/null
+++ b/onnxruntime/python/tools/microbench/conv.py
@@ -0,0 +1,85 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import argparse
+from dataclasses import dataclass
+
+import numpy as np
+from benchmark import BenchmarkOp, add_arguments
+
+
+@dataclass
+class OpParam:
+    """Dimensions and element type of one Conv benchmark case."""
+
+    n: int  # leading dimension of the input and output tensors
+    cout: int  # leading dimension of the weight; also the bias length
+    cin: int  # second dimension of both the input and the weight
+    h: int  # third dimension of the input and output tensors
+    w: int  # fourth dimension of the input and output tensors
+
+    data_type: type  # numpy type every generated array is cast to
+
+
+class BenchmarkConv(BenchmarkOp):
+    """Benchmark for the Conv models stored under ``models/``."""
+
+    def __init__(self, args):
+        super().__init__(args)
+
+    @classmethod
+    def create_inputs_outputs(cls, op_param):
+        """Build the random tensors for one case.
+
+        The generator is re-seeded with 0 on every call, so the same
+        ``op_param`` always yields the same arrays.
+
+        Returns:
+            ``(inputs, outputs)``: ``inputs`` maps "input", "weight" and "bias"
+            to arrays; ``outputs`` maps "conv" to an array of the output shape.
+        """
+        np.random.seed(0)
+        dtype = op_param.data_type
+
+        def rand(*shape):
+            return np.random.rand(*shape).astype(dtype)
+
+        # The dict literals evaluate in order (input, weight, bias, output),
+        # which keeps the sequence of draws from the seeded generator fixed.
+        inputs = {
+            "input": rand(op_param.n, op_param.cin, op_param.h, op_param.w),
+            "weight": rand(op_param.cout, op_param.cin, 3, 3),
+            "bias": rand(op_param.cout),
+        }
+        outputs = {"conv": rand(op_param.n, op_param.cout, op_param.h, op_param.w)}
+        return inputs, outputs
+
+    def create_cases(self):
+        """Register the case to run, choosing model and dtype from ``self.args.precision``."""
+        # attributes of model : kernel_shape(3,3), group(1), pads(1,1), strides(1,1), dilations(1,1)
+        if self.args.precision == "fp16":
+            model, data_type = "models/conv_fp16.onnx", np.float16
+        else:
+            model, data_type = "models/conv_fp32.onnx", np.float32
+
+        # change here to test your data shape
+        op_param = OpParam(2, 320, 320, 64, 64, data_type)
+        self.add_case(op_param, model)
+
+    @classmethod
+    def case_profile(cls, op_param, time):
+        """Return the report line for one case: its dimensions and ``time * 1000`` labelled ``us``."""
+        return f"( n cout cin h w ) = ( {op_param.n} {op_param.cout} {op_param.cin} {op_param.h} {op_param.w} ), {time * 1000:7.4f} us"
+
+
+def main():
+    """Parse the command-line arguments and run the Conv benchmark."""
+    parser = argparse.ArgumentParser()
+    add_arguments(parser)
+    BenchmarkConv(parser.parse_args()).benchmark()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/onnxruntime/python/tools/microbench/fast_gelu.py b/onnxruntime/python/tools/microbench/fast_gelu.py
index 82f86020a6..5c6cb764f2 100644
--- a/onnxruntime/python/tools/microbench/fast_gelu.py
+++ b/onnxruntime/python/tools/microbench/fast_gelu.py
@@ -30,6 +30,7 @@ class BenchmarkFastGelu(BenchmarkOp):
     def __init__(self, args):
         BenchmarkOp.__init__(self, args)
 
+    @classmethod
     def create_inputs_outputs(cls, op_param):
         np.random.seed(0)
         a = np.random.rand(op_param.dim1, op_param.dim2, op_param.dim3).astype(op_param.data_type)
@@ -52,6 +53,7 @@ class BenchmarkFastGelu(BenchmarkOp):
         )
         self.add_case(op_param, model)
 
+    @classmethod
     def case_profile(cls, op_param, time):
         profile = f"(dim1 dim2 dim3) = ({op_param.dim1} {op_param.dim2} {op_param.dim3}), {time:7.4f} ms"
         return profile
diff --git a/onnxruntime/python/tools/microbench/matmul.py b/onnxruntime/python/tools/microbench/matmul.py
index cdac59cbbf..2e40b55c91 100644
--- a/onnxruntime/python/tools/microbench/matmul.py
+++ b/onnxruntime/python/tools/microbench/matmul.py
@@ -34,6 +34,7 @@ class BenchmarkMatMul(BenchmarkOp):
     def __init__(self, args):
         BenchmarkOp.__init__(self, args)
 
+    @classmethod
     def create_inputs_outputs(cls, op_param):
         np.random.seed(0)
         a = np.random.rand(op_param.b1, op_param.b2, op_param.m, op_param.k).astype(op_param.data_type)
@@ -85,6 +86,7 @@ class BenchmarkMatMul(BenchmarkOp):
         model_param = ModelParam(1, 384, 768, 768 * 4, 12, data_type)
         self.add_model_cases(model_param, model)
 
+    @classmethod
     def case_profile(cls, op_param, time):
         tflops = op_param.b1 * op_param.b2 * op_param.m * op_param.k * op_param.n * 2 / time / 1000000000
         profile = f"(b1 b2 m k n) = ({op_param.b1} {op_param.b2} {op_param.m} {op_param.k} {op_param.n}), {time:7.4f} ms, {tflops:4.2f} tflops"
diff --git a/onnxruntime/python/tools/microbench/models/conv_fp16.onnx b/onnxruntime/python/tools/microbench/models/conv_fp16.onnx
new file mode 100644
index 0000000000..ab72aabf7c
Binary files /dev/null and b/onnxruntime/python/tools/microbench/models/conv_fp16.onnx differ
diff --git a/onnxruntime/python/tools/microbench/models/conv_fp32.onnx b/onnxruntime/python/tools/microbench/models/conv_fp32.onnx
new file mode 100644
index 0000000000..3f9692c433
Binary files /dev/null and b/onnxruntime/python/tools/microbench/models/conv_fp32.onnx differ
diff --git a/onnxruntime/python/tools/microbench/models/nhwcConv_fp16.onnx b/onnxruntime/python/tools/microbench/models/nhwcConv_fp16.onnx
new file mode 100644
index 0000000000..d5e57160aa
Binary files /dev/null and b/onnxruntime/python/tools/microbench/models/nhwcConv_fp16.onnx differ
diff --git a/onnxruntime/python/tools/microbench/models/nhwcConv_fp32.onnx b/onnxruntime/python/tools/microbench/models/nhwcConv_fp32.onnx
new file mode 100644
index 0000000000..e8a5dda2c1
Binary files /dev/null and b/onnxruntime/python/tools/microbench/models/nhwcConv_fp32.onnx differ
diff --git a/onnxruntime/python/tools/microbench/nhwcConv.py b/onnxruntime/python/tools/microbench/nhwcConv.py
new file mode 100644
index 0000000000..502a292614
--- /dev/null
+++ b/onnxruntime/python/tools/microbench/nhwcConv.py
@@ -0,0 +1,85 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+import argparse
+from dataclasses import dataclass
+
+import numpy as np
+from benchmark import BenchmarkOp, add_arguments
+
+
+@dataclass
+class OpParam:
+    """Dimensions and element type of one NhwcConv benchmark case."""
+
+    n: int  # leading dimension of the input and output tensors
+    cout: int  # leading dimension of the weight; bias length; last output dimension
+    cin: int  # last dimension of both the input and the weight
+    h: int  # second dimension of the input and output tensors
+    w: int  # third dimension of the input and output tensors
+
+    data_type: type  # numpy type every generated array is cast to
+
+
+class BenchmarkNhwcConv(BenchmarkOp):
+    """Benchmark for the nhwcConv models stored under ``models/``."""
+
+    def __init__(self, args):
+        super().__init__(args)
+
+    @classmethod
+    def create_inputs_outputs(cls, op_param):
+        """Build the random tensors for one case.
+
+        The generator is re-seeded with 0 on every call, so the same
+        ``op_param`` always yields the same arrays.
+
+        Returns:
+            ``(inputs, outputs)``: ``inputs`` maps "input", "weight" and "bias"
+            to arrays; ``outputs`` maps "conv" to an array of the output shape.
+        """
+        np.random.seed(0)
+        dtype = op_param.data_type
+
+        def rand(*shape):
+            return np.random.rand(*shape).astype(dtype)
+
+        # The dict literals evaluate in order (input, weight, bias, output),
+        # which keeps the sequence of draws from the seeded generator fixed.
+        inputs = {
+            "input": rand(op_param.n, op_param.h, op_param.w, op_param.cin),
+            "weight": rand(op_param.cout, 3, 3, op_param.cin),
+            "bias": rand(op_param.cout),
+        }
+        outputs = {"conv": rand(op_param.n, op_param.h, op_param.w, op_param.cout)}
+        return inputs, outputs
+
+    def create_cases(self):
+        """Register the case to run, choosing model and dtype from ``self.args.precision``."""
+        # attributes of model : kernel_shape(3,3), group(1), pads(1,1), strides(1,1), dilations(1,1)
+        if self.args.precision == "fp16":
+            model, data_type = "models/nhwcConv_fp16.onnx", np.float16
+        else:
+            model, data_type = "models/nhwcConv_fp32.onnx", np.float32
+
+        # change here to test your data shape
+        op_param = OpParam(2, 320, 320, 64, 64, data_type)
+        self.add_case(op_param, model)
+
+    @classmethod
+    def case_profile(cls, op_param, time):
+        """Return the report line for one case: its dimensions and ``time * 1000`` labelled ``us``."""
+        return f"( n cout cin h w ) = ( {op_param.n} {op_param.cout} {op_param.cin} {op_param.h} {op_param.w} ), {time * 1000:7.4f} us"
+
+
+def main():
+    """Parse the command-line arguments and run the NhwcConv benchmark."""
+    parser = argparse.ArgumentParser()
+    add_arguments(parser)
+    BenchmarkNhwcConv(parser.parse_args()).benchmark()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/onnxruntime/python/tools/microbench/skip_layer_norm.py b/onnxruntime/python/tools/microbench/skip_layer_norm.py
index dbfda7ef30..1509f4ce92 100644
--- a/onnxruntime/python/tools/microbench/skip_layer_norm.py
+++ b/onnxruntime/python/tools/microbench/skip_layer_norm.py
@@ -22,6 +22,7 @@ class BenchmarkSkipLayerNorm(BenchmarkOp):
     def __init__(self, args):
         BenchmarkOp.__init__(self, args)
 
+    @classmethod
     def create_inputs_outputs(cls, op_param):
         np.random.seed(0)
         input_data = np.random.rand(op_param.batch_size, op_param.seq_len, op_param.hidden_size).astype(
@@ -55,6 +56,7 @@ class BenchmarkSkipLayerNorm(BenchmarkOp):
         op_param = OpParam(1, 384, 1024, data_type)
         self.add_case(op_param, model)
 
+    @classmethod
     def case_profile(cls, op_param, time):
         profile = f"(batch seq_len hidden_size) = ({op_param.batch_size} {op_param.seq_len} {op_param.hidden_size}), {time:7.4f} ms"
         return profile