From 7c5bfbaaab083aa270a4ca6c54dc88f2ff2b6d1a Mon Sep 17 00:00:00 2001
From: Chi Lo <54722500+chilo-ms@users.noreply.github.com>
Date: Fri, 29 Jan 2021 19:28:09 -0800
Subject: [PATCH] Lochi/refactor yolov3 quantization (#6290)

* Refactor the code and move data reader, preprocessing, evaluation to
E2E_example_model

* Refactor the code.

Move data reader, preprocessing, evaluation to model specific example
under E2E_example_model

* refactor code

* Move yolov3 example to specific folder and add additional pre/post
processing
---
 .../trt/yolov3/data_reader.py                 |  11 +-
 .../trt/yolov3/e2e_user_yolov3_example.py     | 122 +++++---
 .../object_detection/trt/yolov3/evaluate.py   |  70 ++++-
 .../trt/yolov3/postprocessing.py              | 296 ++++++++++++++++++
 .../trt/yolov3/preprocessing.py               |  62 +++-
 .../python/tools/quantization/calibrate.py    |   9 +-
 6 files changed, 525 insertions(+), 45 deletions(-)
 create mode 100644 onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/postprocessing.py

diff --git a/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/data_reader.py b/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/data_reader.py
index 2adc698f36..dfd2eececc 100644
--- a/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/data_reader.py
+++ b/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/data_reader.py
@@ -1,5 +1,5 @@
 from onnxruntime.quantization import CalibrationDataReader
-from preprocessing import yolov3_preprocess_func, yolov3_vision_preprocess_func
+from preprocessing import yolov3_preprocess_func, yolov3_variant_preprocess_func
 import onnxruntime
 from argparse import Namespace
 import os
@@ -165,7 +165,7 @@ class YoloV3DataReader(ObejctDetectionDataReader):
         return batches
 
 
-class YoloV3VisionDataReader(YoloV3DataReader):
+class YoloV3VariantDataReader(YoloV3DataReader):
     def __init__(self,
                  calibration_image_folder,
                  width=608,
@@ -179,14 +179,17 @@ class YoloV3VisionDataReader(YoloV3DataReader):
                  annotations='./annotations/instances_val2017.json'):
         YoloV3DataReader.__init__(self, calibration_image_folder, width, height, start_index, end_index, stride,
                                   batch_size, model_path, is_evaluation, annotations)
-        self.input_name = 'images'
+        self.input_name = '000_net'
+        # self.input_name = 'images'
 
     def load_serial(self):
         width = self.width
         height = self.height
         input_name = self.input_name
-        nchw_data_list, filename_list, image_size_list = yolov3_vision_preprocess_func(
+        nchw_data_list, filename_list, image_size_list = yolov3_variant_preprocess_func(
             self.image_folder, height, width, self.start_index, self.stride)
+        # nchw_data_list, filename_list, image_size_list = yolov3_variant_2_preprocess_func(
+            # self.image_folder, height, width, self.start_index, self.stride)
 
         data = []
         if self.is_evaluation:
diff --git a/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/e2e_user_yolov3_example.py b/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/e2e_user_yolov3_example.py
index f968c5bfa1..0e8de6ca4a 100644
--- a/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/e2e_user_yolov3_example.py
+++ b/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/e2e_user_yolov3_example.py
@@ -1,29 +1,7 @@
 import os
 from onnxruntime.quantization import get_calibrator, write_calibration_table, generate_calibration_table
-from data_reader import YoloV3DataReader, YoloV3VisionDataReader
-from evaluate import YoloV3Evaluator, YoloV3VisionEvaluator
-from dataset_utils import *
-
-
-def get_prediction_evaluation(model_path, validation_dataset, providers):
-    data_reader = YoloV3DataReader(validation_dataset,
-                                   stride=1000,
-                                   batch_size=1,
-                                   model_path=model_path,
-                                   is_evaluation=True)
-    evaluator = YoloV3Evaluator(model_path, data_reader, providers=providers)
-
-    # data_reader = YoloV3VisionDataReader(validation_dataset, width=608, height=384, stride=1000, batch_size=1, model_path=model_path, is_evaluation=True)
-    # evaluator = YoloV3VisionEvaluator(model_path, data_reader, width=608, height=384, providers=providers)
-
-    evaluator.predict()
-    result = evaluator.get_result()
-
-    annotations = './annotations/instances_val2017.json'
-    # annotations = './annotations/instances_val2017_person.json'
-    print(result)
-    evaluator.evaluate(result, annotations)
-
+from data_reader import YoloV3DataReader, YoloV3VariantDataReader
+from evaluate import YoloV3Evaluator, YoloV3VariantEvaluator
 
 def get_calibration_table(model_path, augmented_model_path, calibration_dataset):
 
@@ -62,27 +40,101 @@ def get_calibration_table(model_path, augmented_model_path, calibration_dataset)
     '''
 
     # data_reader = YoloV3DataReader(calibration_dataset, stride=1000, batch_size=20, model_path=augmented_model_path)
-    # data_reader = YoloV3VisionDataReader(calibration_dataset, width=512, height=288, stride=1000, batch_size=20, model_path=augmented_model_path)
-    # data_reader = YoloV3VisionDataReader(calibration_dataset, width=608, height=384, stride=1000, batch_size=20, model_path=augmented_model_path)
     # calibrator.set_data_reader(data_reader)
     # generate_calibration_table(calibrator, model_path, augmented_model_path, True, data_reader)
 
     write_calibration_table(calibrator.get_calibration_cache())
     print('calibration table generated and saved.')
 
+def get_prediction_evaluation(model_path, validation_dataset, providers):
+    data_reader = YoloV3DataReader(validation_dataset,
+                                   stride=1000,
+                                   batch_size=1,
+                                   model_path=model_path,
+                                   is_evaluation=True)
+    evaluator = YoloV3Evaluator(model_path, data_reader, providers=providers)
+
+    evaluator.predict()
+    result = evaluator.get_result()
+
+    annotations = './annotations/instances_val2017.json'
+    print(result)
+    evaluator.evaluate(result, annotations)
+
+def get_calibration_table_yolov3_variant(model_path, augmented_model_path, calibration_dataset):
+
+    calibrator = get_calibrator(model_path, None, augmented_model_path=augmented_model_path)
+
+    # A DataReader can process the dataset in batches or serially, depending on its implementation.
+    # The following examples show two different ways to generate the calibration table.
+    '''
+    1. Use serial processing
+    
+    We can use only one DataReader to do serial processing; however,
+    some machines don't have sufficient memory to hold all dataset images and all intermediate output.
+    So we let multiple DataReaders handle different strides of the dataset one by one.
+    DataReader will use serial processing when batch_size is 1.
+    '''
+
+    total_data_size = len(os.listdir(calibration_dataset))
+    start_index = 0
+    stride = 25 
+    for i in range(0, total_data_size, stride):
+        data_reader = YoloV3VariantDataReader(calibration_dataset,
+                                       width=608,
+                                       height=608,
+                                       start_index=start_index,
+                                       end_index=start_index + stride,
+                                       stride=stride,
+                                       batch_size=1,
+                                       model_path=augmented_model_path)
+        calibrator.set_data_reader(data_reader)
+        generate_calibration_table(calibrator, model_path, augmented_model_path, False, data_reader)
+        start_index += stride
+    '''
+    2. Use batch processing (much faster)
+    
+    Batch processing requires less memory for intermediate output, so we let a single DataReader handle the dataset in batches.
+    However, if we run into OOM, we can use multiple DataReaders to do the job, just like serial processing does.
+    DataReader will use batch processing when batch_size > 1.
+    '''
+
+    # data_reader = YoloV3VariantDataReader(calibration_dataset, width=608, height=608, stride=1000, batch_size=20, model_path=augmented_model_path)
+    # calibrator.set_data_reader(data_reader)
+    # generate_calibration_table(calibrator, model_path, augmented_model_path, True, data_reader)
+
+    write_calibration_table(calibrator.get_calibration_cache())
+    print('calibration table generated and saved.')
+
+def get_prediction_evaluation_yolov3_variant(model_path, validation_dataset, providers):
+    data_reader = YoloV3VariantDataReader(validation_dataset, width=608, height=608, stride=1000, batch_size=1, model_path=model_path, is_evaluation=True)
+    evaluator = YoloV3VariantEvaluator(model_path, data_reader, width=608, height=608, providers=providers)
+
+    evaluator.predict()
+    result = evaluator.get_result()
+
+    annotations = './annotations/instances_val2017.json'
+    print(result)
+    evaluator.evaluate(result, annotations)
 
 if __name__ == '__main__':
 
-    model_path = 'yolov3_new.onnx'
-    # model_path = 'yolov3_288x512_batch_nms.onnx'
-    # model_path = 'yolov3_384x608_batch_nms.onnx'
-
+    yolov3 = 'model zoo'
     augmented_model_path = 'augmented_model.onnx'
-
     calibration_dataset = './test2017'
-
     validation_dataset = './val2017'
-    # validation_dataset = './val2017person'
 
-    get_calibration_table(model_path, augmented_model_path, calibration_dataset)
-    get_prediction_evaluation(model_path, validation_dataset, ["TensorrtExecutionProvider"])
+    if yolov3 == 'model zoo':
+        # ONNX Model Zoo yolov3
+        model_path = 'yolov3.onnx'
+        get_calibration_table(model_path, augmented_model_path, calibration_dataset)
+        get_prediction_evaluation(model_path, validation_dataset, ["TensorrtExecutionProvider"])
+    else:
+        # Yolov3 variants from here
+        # https://github.com/jkjung-avt/tensorrt_demos.git
+        model_path = 'yolov3-608.onnx'
+        get_calibration_table_yolov3_variant(model_path, augmented_model_path, calibration_dataset)
+        get_prediction_evaluation_yolov3_variant(model_path, validation_dataset, ["TensorrtExecutionProvider"])
+
+
+
diff --git a/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/evaluate.py b/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/evaluate.py
index 05d5c654f4..87074d0410 100644
--- a/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/evaluate.py
+++ b/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/evaluate.py
@@ -184,8 +184,76 @@ class YoloV3Evaluator:
         cocoEval.accumulate()
         cocoEval.summarize()
 
+class YoloV3VariantEvaluator(YoloV3Evaluator): 
+    def __init__(self, model_path,
+                       data_reader: CalibrationDataReader,
+                       width=608,
+                       height=384,
+                       providers=["CUDAExecutionProvider"],
+                       ground_truth_object_class_file="./coco-object-categories-2017.json",
+                       onnx_object_class_file="./onnx_coco_classes.txt"):
 
-class YoloV3VisionEvaluator(YoloV3Evaluator):
+        YoloV3Evaluator.__init__(self, model_path, data_reader,width, height, providers, ground_truth_object_class_file, onnx_object_class_file)
+
+    def predict(self):
+        from postprocessing import PostprocessYOLOWrapper 
+        session = onnxruntime.InferenceSession(self.model_path, providers=self.providers)
+        outputs = []
+
+        image_id_list = []
+        image_id_batch = []
+        image_size_list = []
+        image_size_batch = []
+
+        postprocess_yolo = PostprocessYOLOWrapper('yolov3', (608, 608))
+
+        while True:
+            inputs = self.data_reader.get_next()
+            if not inputs:
+                break
+            image_size_list = inputs["image_size"]
+            image_id_list = inputs["image_id"]
+            del inputs["image_size"]
+            del inputs["image_id"]
+
+            # if image_id is a bare int (batch size of 1), wrap both values in lists
+            if type(image_id_list) == int:
+                image_size_list = [image_size_list]
+                image_id_list = [image_id_list]
+
+
+            image_size_batch.append(image_size_list)
+            image_id_batch.append(image_id_list)
+            outputs.append(session.run(None, inputs))
+
+        for i in range(len(outputs)):
+            output = outputs[i]
+            
+            for batch_i in range(self.data_reader.get_batch_size()):
+
+                if batch_i > len(image_size_batch[i])-1 or batch_i > len(image_id_batch[i])-1:
+                    continue
+
+                image_height = image_size_batch[i][batch_i][0]
+                image_width= image_size_batch[i][batch_i][1]
+                image_id = image_id_batch[i][batch_i]
+
+                boxes, classes, scores = postprocess_yolo.postprocessor.process(
+                output, (image_width, image_height), 0.01)
+
+                for j in range(len(boxes)):
+                    box = boxes[j]
+                    class_name = self.onnx_class_list[int(classes[j])]
+                    if class_name in self.identical_class_map:
+                        class_name = self.identical_class_map[class_name]
+                    id = self.class_to_id[class_name]
+                    x = float(box[0])
+                    y = float(box[1])
+                    w = float(box[2] - box[0] + 1)
+                    h = float(box[3] - box[1] + 1)
+                    self.prediction_result_list.append({"image_id":int(image_id), "category_id":int(id), "bbox":[x,y,w,h], "score":scores[j]})
+
+class YoloV3Variant2Evaluator(YoloV3Evaluator):
     def __init__(self,
                  model_path,
                  data_reader: CalibrationDataReader,
diff --git a/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/postprocessing.py b/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/postprocessing.py
new file mode 100644
index 0000000000..7b4a84380d
--- /dev/null
+++ b/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/postprocessing.py
@@ -0,0 +1,296 @@
+import numpy as np
+class PostprocessYOLO(object):
+    """Class for post-processing the three output tensors from YOLO."""
+
+    def __init__(self,
+                 yolo_masks,
+                 yolo_anchors,
+                 nms_threshold,
+                 yolo_input_resolution,
+                 category_num=80):
+        """Initialize with all values that will be kept when processing
+        several frames.  Assuming 3 outputs of the network in the case
+        of (large) YOLO, or 2 for the Tiny YOLO.
+
+        Keyword arguments:
+        yolo_masks -- a list of 3 (or 2) three-dimensional tuples for the YOLO masks
+        yolo_anchors -- a list of 9 (or 6) two-dimensional tuples for the YOLO anchors
+        nms_threshold -- threshold for non-max suppression algorithm,
+        float value between 0 and 1
+        yolo_input_resolution -- tuple (H, W) for the target network; it is
+        stored swapped as self.input_wh = (W, H)
+        category_num -- number of output categories/classes
+        """
+        self.masks = yolo_masks
+        self.anchors = yolo_anchors
+        self.nms_threshold = nms_threshold
+        self.input_wh = (yolo_input_resolution[1], yolo_input_resolution[0])
+        self.category_num = category_num
+
+    def process(self, outputs, resolution_raw, conf_th):
+        """Take the YOLO outputs generated from a TensorRT forward pass, post-process them
+        and return a list of bounding boxes for detected object together with their category
+        and their confidences in separate lists.
+
+        Keyword arguments:
+        outputs -- outputs from a TensorRT engine in NCHW format
+        resolution_raw -- the original spatial resolution from the input PIL image in WH order
+        conf_th -- confidence threshold, e.g. 0.3
+        """
+        outputs_reshaped = list()
+        for output in outputs:
+            outputs_reshaped.append(self._reshape_output(output))
+
+        boxes_xywh, categories, confidences = self._process_yolo_output(
+            outputs_reshaped, resolution_raw, conf_th)
+
+        if len(boxes_xywh) > 0:
+            # convert (x, y, width, height) to (x1, y1, x2, y2)
+            img_w, img_h = resolution_raw
+            xx = boxes_xywh[:, 0].reshape(-1, 1)
+            yy = boxes_xywh[:, 1].reshape(-1, 1)
+            ww = boxes_xywh[:, 2].reshape(-1, 1)
+            hh = boxes_xywh[:, 3].reshape(-1, 1)
+            boxes = np.concatenate([xx, yy, xx+ww, yy+hh], axis=1) + 0.5
+            boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0., float(img_w-1))
+            boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0., float(img_h-1))
+            boxes = boxes.astype(np.int)
+        else:
+            boxes = np.zeros((0, 4), dtype=np.int)  # empty
+
+        return boxes, categories, confidences
+
+    def _reshape_output(self, output):
+        """Reshape a TensorRT output from NCHW to NHWC format (with expected C=255),
+        and then return it in (height,width,3,85) dimensionality after further reshaping.
+
+        Keyword argument:
+        output -- an output from a TensorRT engine after inference
+        """
+        output = np.transpose(output, [0, 2, 3, 1])
+        _, height, width, _ = output.shape
+        dim1, dim2 = height, width
+        dim3 = 3
+        # Per anchor: 4 box values + 1 box confidence + category_num class scores:
+        dim4 = (4 + 1 + self.category_num)
+        return np.reshape(output, (dim1, dim2, dim3, dim4))
+
+    def _process_yolo_output(self, outputs_reshaped, resolution_raw, conf_th):
+        """Take in a list of three reshaped YOLO outputs in (height,width,3,85) shape and
+        return a list of bounding boxes for detected objects together with their categories and their
+        confidences in separate lists.
+
+        Keyword arguments:
+        outputs_reshaped -- list of three reshaped YOLO outputs as NumPy arrays
+        with shape (height,width,3,85)
+        resolution_raw -- the original spatial resolution from the input PIL image in WH order
+        conf_th -- confidence threshold
+        """
+
+        # E.g. in YOLOv3-608, there are three output tensors, which we associate with their
+        # respective masks. Then we iterate through all output-mask pairs and generate candidates
+        # for bounding boxes, their corresponding category predictions and their confidences:
+        boxes, categories, confidences = list(), list(), list()
+        for output, mask in zip(outputs_reshaped, self.masks):
+            box, category, confidence = self._process_feats(output, mask)
+            box, category, confidence = self._filter_boxes(box, category, confidence, conf_th)
+            boxes.append(box)
+            categories.append(category)
+            confidences.append(confidence)
+
+        boxes = np.concatenate(boxes)
+        categories = np.concatenate(categories)
+        confidences = np.concatenate(confidences)
+
+        # Scale boxes back to original image shape:
+        width, height = resolution_raw
+        image_dims = [width, height, width, height]
+        boxes = boxes * image_dims
+
+        # Using the candidates from the previous (loop) step, we apply the non-max suppression
+        # algorithm that clusters adjacent bounding boxes to a single bounding box:
+        nms_boxes, nms_categories, nscores = list(), list(), list()
+        for category in set(categories):
+            idxs = np.where(categories == category)
+            box = boxes[idxs]
+            category = categories[idxs]
+            confidence = confidences[idxs]
+
+            keep = self._nms_boxes(box, confidence)
+
+            nms_boxes.append(box[keep])
+            nms_categories.append(category[keep])
+            nscores.append(confidence[keep])
+
+        if not nms_categories and not nscores:
+            return (np.empty((0, 4), dtype=np.float32),
+                    np.empty((0, 1), dtype=np.float32),
+                    np.empty((0, 1), dtype=np.float32))
+
+        boxes = np.concatenate(nms_boxes)
+        categories = np.concatenate(nms_categories)
+        confidences = np.concatenate(nscores)
+
+        return boxes, categories, confidences
+
+    def _process_feats(self, output_reshaped, mask):
+        """Take in a reshaped YOLO output in height,width,3,85 format together with its
+        corresponding YOLO mask and return the detected bounding boxes, the confidence,
+        and the class probability in each cell/pixel.
+
+        Keyword arguments:
+        output_reshaped -- reshaped YOLO output as NumPy arrays with shape (height,width,3,85)
+        mask -- tuple of indices into self.anchors selecting the anchors for this output
+        """
+
+        def sigmoid_v(array):
+            return np.reciprocal(np.exp(-array) + 1.0)
+
+        def exponential_v(array):
+            return np.exp(array)
+
+        grid_h, grid_w, _, _ = output_reshaped.shape
+
+        anchors = [self.anchors[i] for i in mask]
+
+        # Reshape anchors to (1, 1, num_anchors, 2) so they broadcast over the grid:
+        anchors_tensor = np.reshape(anchors, [1, 1, len(anchors), 2])
+        box_xy = sigmoid_v(output_reshaped[..., 0:2])
+        box_wh = exponential_v(output_reshaped[..., 2:4]) * anchors_tensor
+        box_confidence = sigmoid_v(output_reshaped[..., 4:5])
+        box_class_probs = sigmoid_v(output_reshaped[..., 5:])
+
+        col = np.tile(np.arange(0, grid_w), grid_h).reshape(-1, grid_w)
+        row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_w)
+
+        col = col.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2)
+        row = row.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2)
+        grid = np.concatenate((col, row), axis=-1)
+
+        box_xy += grid
+        box_xy /= (grid_w, grid_h)
+        box_wh /= self.input_wh
+        box_xy -= (box_wh / 2.)
+        boxes = np.concatenate((box_xy, box_wh), axis=-1)
+
+        # boxes: (x, y, w, h) with (x, y) shifted from the box center to its top-left corner,
+        # box_confidence: confidence level, box_class_probs: class confidence
+        return boxes, box_confidence, box_class_probs
+
+    def _filter_boxes(self, boxes, box_confidences, box_class_probs, conf_th):
+        """Take in the unfiltered bounding box descriptors and discard each cell
+        whose score is lower than the confidence threshold passed in as conf_th.
+
+        Keyword arguments:
+        boxes -- bounding box coordinates with shape (height,width,3,4); 4 for
+        x,y,width,height coordinates of the boxes
+        box_confidences -- bounding box confidences with shape (height,width,3,1); 1 for a
+        confidence scalar per element
+        box_class_probs -- class probabilities with shape (height,width,3,CATEGORY_NUM)
+        conf_th -- confidence threshold
+        """
+        box_scores = box_confidences * box_class_probs
+        box_classes = np.argmax(box_scores, axis=-1)
+        box_class_scores = np.max(box_scores, axis=-1)
+        pos = np.where(box_class_scores >= conf_th)
+
+        boxes = boxes[pos]
+        classes = box_classes[pos]
+        scores = box_class_scores[pos]
+
+        return boxes, classes, scores
+
+    def _nms_boxes(self, boxes, box_confidences):
+        """Apply the Non-Maximum Suppression (NMS) algorithm on the bounding boxes with their
+        confidence scores and return an array with the indexes of the bounding boxes we want to
+        keep (and display later).
+
+        Keyword arguments:
+        boxes -- a NumPy array containing N bounding-box coordinates that survived filtering,
+        with shape (N,4); 4 for x,y,width,height coordinates of the boxes
+        box_confidences -- a Numpy array containing the corresponding confidences with shape N
+        """
+        x_coord = boxes[:, 0]
+        y_coord = boxes[:, 1]
+        width = boxes[:, 2]
+        height = boxes[:, 3]
+
+        areas = width * height
+        ordered = box_confidences.argsort()[::-1]
+
+        keep = list()
+        while ordered.size > 0:
+            # Index of the current element:
+            i = ordered[0]
+            keep.append(i)
+            xx1 = np.maximum(x_coord[i], x_coord[ordered[1:]])
+            yy1 = np.maximum(y_coord[i], y_coord[ordered[1:]])
+            xx2 = np.minimum(x_coord[i] + width[i], x_coord[ordered[1:]] + width[ordered[1:]])
+            yy2 = np.minimum(y_coord[i] + height[i], y_coord[ordered[1:]] + height[ordered[1:]])
+
+            width1 = np.maximum(0.0, xx2 - xx1 + 1)
+            height1 = np.maximum(0.0, yy2 - yy1 + 1)
+            intersection = width1 * height1
+            union = (areas[i] + areas[ordered[1:]] - intersection)
+
+            # Compute the Intersection over Union (IoU) score:
+            iou = intersection / union
+
+            # The goal of the NMS algorithm is to reduce the number of adjacent bounding-box
+            # candidates to a minimum. In this step, we keep only those elements whose overlap
+            # with the current bounding box is lower than the threshold:
+            indexes = np.where(iou <= self.nms_threshold)[0]
+            ordered = ordered[indexes + 1]
+
+        keep = np.array(keep)
+        return keep
+
+class PostprocessYOLOWrapper(object):
+    """This class encapsulates things needed to run yolo."""
+    """Reference from here https://github.com/jkjung-avt/tensorrt_demos/blob/3fb15c908b155d5edc1bf098c6b8c31886cd8e8d/utils/yolo.py"""
+
+    def _init_yolov3_postprocessor(self):
+        h, w = self.input_shape
+        filters = (self.category_num + 5) * 3
+        if 'tiny' in self.model:
+            self.output_shapes = [(1, filters, h // 32, w // 32),
+                                  (1, filters, h // 16, w // 16)]
+        else:
+            self.output_shapes = [(1, filters, h // 32, w // 32),
+                                  (1, filters, h // 16, w // 16),
+                                  (1, filters, h //  8, w //  8)]
+        if 'tiny' in self.model:
+            postprocessor_args = {
+                # A list of 2 three-dimensional tuples for the Tiny YOLO masks
+                'yolo_masks': [(3, 4, 5), (0, 1, 2)],
+                # A list of 6 two-dimensional tuples for the Tiny YOLO anchors
+                'yolo_anchors': [(10, 14), (23, 27), (37, 58),
+                                 (81, 82), (135, 169), (344, 319)],
+                # Threshold for non-max suppression algorithm, float
+                # value between 0 and 1
+                'nms_threshold': 0.5,
+                'yolo_input_resolution': self.input_shape,
+                'category_num': self.category_num
+            }
+        else:
+            postprocessor_args = {
+                # A list of 3 three-dimensional tuples for the YOLO masks
+                'yolo_masks': [(6, 7, 8), (3, 4, 5), (0, 1, 2)],
+                # A list of 9 two-dimensional tuples for the YOLO anchors
+                'yolo_anchors': [(10, 13), (16, 30), (33, 23),
+                                 (30, 61), (62, 45), (59, 119),
+                                 (116, 90), (156, 198), (373, 326)],
+                # Threshold for non-max suppression algorithm, float
+                # value between 0 and 1
+                'nms_threshold': 0.5,
+                'yolo_input_resolution': self.input_shape,
+                'category_num': self.category_num
+            }
+        self.postprocessor = PostprocessYOLO(**postprocessor_args)
+
+    def __init__(self, model, input_shape, category_num=80):
+        self.model = model
+        self.input_shape = input_shape
+        self.category_num = category_num
+        self.postprocessor = None
+        self._init_yolov3_postprocessor()
diff --git a/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/preprocessing.py b/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/preprocessing.py
index 1dad1e13e5..95c409d924 100644
--- a/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/preprocessing.py
+++ b/onnxruntime/python/tools/quantization/E2E_example_model/object_detection/trt/yolov3/preprocessing.py
@@ -65,8 +65,68 @@ def yolov3_preprocess_func(images_folder, height, width, start_index=0, size_lim
     batch_data = np.concatenate(np.expand_dims(unconcatenated_batch_data, axis=0), axis=0)
     return batch_data, batch_filenames, image_size_list
 
+def yolov3_variant_preprocess_func(images_folder, height, width, start_index=0, size_limit=0):
+    '''
+    Loads a batch of images and preprocesses them
+    parameter images_folder: path to folder storing images
+    parameter height, width: model input height and width in pixels
+    parameter start_index: index into os.listdir(images_folder) of the first image when size_limit > 0
+    parameter size_limit: number of images to load. Default is 0 which means all images are picked.
+    return: the preprocessed image data, the list of file names, and the list of original (h, w) image sizes
+    '''
 
-def yolov3_vision_preprocess_func(images_folder, height, width, start_index=0, size_limit=0):
+    # reference from here:
+    # https://github.com/jkjung-avt/tensorrt_demos/blob/3fb15c908b155d5edc1bf098c6b8c31886cd8e8d/utils/yolo.py#L60
+    def _preprocess_yolo(img, input_shape):
+        """Preprocess an image before TRT YOLO inferencing.
+        # Args
+            img: int8 numpy array of shape (img_h, img_w, 3)
+            input_shape: a tuple of (H, W)
+        # Returns
+            preprocessed img: float32 numpy array of shape (3, H, W)
+        """
+        img = cv2.resize(img, (input_shape[1], input_shape[0]))
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        img = img.transpose((2, 0, 1)).astype(np.float32)
+        img /= 255.0
+        return img
+
+    image_names = os.listdir(images_folder)
+    if start_index >= len(image_names):
+        return np.asanyarray([]), np.asanyarray([]), np.asanyarray([])
+    elif size_limit > 0 and len(image_names) >= size_limit:
+        end_index = start_index + size_limit
+        if end_index > len(image_names):
+            end_index = len(image_names)
+
+        batch_filenames = [image_names[i] for i in range(start_index, end_index)]
+    else:
+        batch_filenames = image_names
+
+    unconcatenated_batch_data = []
+    image_size_list = []
+
+    print(batch_filenames)
+    print("size: %s" % str(len(batch_filenames)))
+
+    for image_name in batch_filenames:
+        image_filepath = images_folder + '/' + image_name
+        model_image_size = (height, width)
+
+        img = cv2.imread(image_filepath)
+        image_data = _preprocess_yolo(img, tuple(model_image_size)) 
+        image_data = np.ascontiguousarray(image_data)
+        image_data = np.expand_dims(image_data, 0)
+        unconcatenated_batch_data.append(image_data)
+        _height, _width, _ = img.shape
+        image_size_list.append(img.shape[0:2])  # img.shape is h, w, c
+
+    batch_data = np.concatenate(np.expand_dims(unconcatenated_batch_data, axis=0), axis=0)
+    return batch_data, batch_filenames, image_size_list
+
+
+# This is for a specially tuned yolov3 model
+def yolov3_variant_2_preprocess_func(images_folder, height, width, start_index=0, size_limit=0):
     def letterbox(img, new_shape=(416, 416), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True):
         # Resize image to a 32-pixel-multiple rectangle https://github.com/ultralytics/yolov3/issues/232
         shape = img.shape[:2]  # current shape [height, width]
diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py
index bdd4d44063..fa6372d877 100644
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@@ -77,10 +77,12 @@ class ONNXCalibrater:
         value_infos = {vi.name: vi for vi in model.graph.value_info}
         value_infos.update({ot.name: ot for ot in model.graph.output})
         value_infos.update({it.name: it for it in model.graph.input})
+        initializer = set(init.name for init in model.graph.initializer)
 
         added_nodes = []
         added_outputs = []
         tensors_to_calibrate = set()
+        tensor_type_to_calibrate = set([TensorProto.FLOAT, TensorProto.FLOAT16])
 
         for node in model.graph.node:
             should_be_calibrate = ((node.op_type in self.calibrate_op_types) and
@@ -89,8 +91,7 @@ class ONNXCalibrater:
                 for tensor_name in itertools.chain(node.input, node.output):
                     if tensor_name in value_infos.keys():
                         vi = value_infos[tensor_name]
-                        if vi.type.HasField('tensor_type') and vi.type.tensor_type.elem_type == TensorProto.FLOAT and (
-                                tensor_name not in model.graph.initializer):
+                        if vi.type.HasField('tensor_type') and (vi.type.tensor_type.elem_type in tensor_type_to_calibrate) and (tensor_name not in initializer):
                             tensors_to_calibrate.add(tensor_name)
 
         # If augmenting all ops, it's possible that some nodes' input value are 0.
@@ -332,10 +333,10 @@ def calculate_calibration_data(model,
                                     augmented_model_path=augmented_model_path)
 
     if not os.path.exists(augmented_model_path):
-        augmented_model = calibrator.augment_graph(augment_all_ops=True)
+        augmented_model = calibrator.augment_graph()
         onnx.save(augmented_model, augmented_model_path)
 
-    calibrator.get_intermediate_outputs(providers=["CUDAExecutionProvider"])
+    calibrator.get_intermediate_outputs(providers=["CUDAExecutionProvider"], ort_graph_optimization_enable=False)
 
 
 def generate_calibration_table(calibrator,