From c8151b40372135da8d32f6313acb1f3b5f8a8be8 Mon Sep 17 00:00:00 2001 From: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Date: Wed, 3 Nov 2021 22:18:51 -0700 Subject: [PATCH] Add percentile method for PTQ (#9342) * Add percentile method for calibration * Update configuration --- .../python/tools/quantization/calibrate.py | 155 ++++++++++++++++-- 1 file changed, 140 insertions(+), 15 deletions(-) diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 4c147ebcfd..e82eb46d8f 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -25,7 +25,7 @@ import itertools class CalibrationMethod(Enum): MinMax = 0 Entropy = 1 - + Percentile = 2 class CalibrationDataReader(metaclass=abc.ABCMeta): @classmethod @@ -269,19 +269,31 @@ class MinMaxCalibrater(CalibraterBase): return self.calibrate_tensors_range -class EntropyCalibrater(CalibraterBase): - def __init__(self, model, op_types_to_calibrate=[], augmented_model_path='augmented_model.onnx'): +class HistogramCalibrater(CalibraterBase): + def __init__(self, + model, + op_types_to_calibrate=[], + augmented_model_path='augmented_model.onnx', + method='percentile', + num_quantized_bins=128, + percentile=99.99): ''' :param model: ONNX model to calibrate. It can be a ModelProto or a model path :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors. :param augmented_model_path: save augmented model to this path. + :param method: A string. One of ['entropy', 'percentile']. + :param num_quantized_bins: number of quantized bins. Default 128. + :param percentile: A float number between [0, 100]. Default 99.99. ''' - super(EntropyCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path) + super(HistogramCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path) self.intermediate_outputs = [] self.calibrate_tensors_range = None self.num_model_outputs = len(self.model.graph.output) self.model_original_outputs = set(output.name for output in self.model.graph.output) self.collector = None + self.method = method + self.num_quantized_bins = num_quantized_bins + self.percentile = percentile def augment_graph(self): ''' @@ -334,7 +346,9 @@ class EntropyCalibrater(CalibraterBase): clean_merged_dict = dict((i, merged_dict[i]) for i in merged_dict if i not in self.model_original_outputs) if not self.collector: - self.collector = HistogramCollector() + self.collector = HistogramCollector(method=self.method, + num_quantized_bins=self.num_quantized_bins, + percentile=self.percentile) self.collector.collect(clean_merged_dict) self.clear_collected_data() @@ -347,8 +361,44 @@ class EntropyCalibrater(CalibraterBase): if not self.collector: raise ValueError("No collector created and can't generate calibration data.") - return self.collector.get_optimal_collection_result() + return self.collector.compute_collection_result() +class EntropyCalibrater(HistogramCalibrater): + def __init__(self, + model, + op_types_to_calibrate=[], + augmented_model_path='augmented_model.onnx', + method='entropy', + num_quantized_bins=128): + ''' + :param model: ONNX model to calibrate. It can be a ModelProto or a model path + :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors. + :param augmented_model_path: save augmented model to this path. + :param method: A string. One of ['entropy', 'percentile']. + :param num_quantized_bins: number of quantized bins. Default 128. + ''' + super(EntropyCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path, + method=method, num_quantized_bins=num_quantized_bins) + +class PercentileCalibrater(HistogramCalibrater): + def __init__(self, + model, + op_types_to_calibrate=[], + augmented_model_path='augmented_model.onnx', + method='percentile', + num_quantized_bins=2048, + percentile=99.999): + ''' + :param model: ONNX model to calibrate. It can be a ModelProto or a model path + :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors. + :param augmented_model_path: save augmented model to this path. + :param method: A string. One of ['entropy', 'percentile']. + :param num_quantized_bins: number of quantized bins. Default 128. + :param percentile: A float number between [0, 100]. Default 99.99. + ''' + super(PercentileCalibrater, self).__init__(model, op_types_to_calibrate, augmented_model_path, + method=method, num_quantized_bins=num_quantized_bins, + percentile=percentile) class CalibrationDataCollector(metaclass=abc.ABCMeta): """ @@ -365,7 +415,7 @@ class CalibrationDataCollector(metaclass=abc.ABCMeta): raise NotImplementedError @abc.abstractmethod - def get_optimal_collection_result(self): + def compute_collection_result(self): """ Get the optimal result among collection data. """ @@ -373,18 +423,57 @@ class CalibrationDataCollector(metaclass=abc.ABCMeta): class HistogramCollector(CalibrationDataCollector): """ - Implementation of collecting histogram data as dict for each tensor targeting on entropy calibration. + Collecting histogram for each tensor. Percentile and Entropy method are supported. ref: https://github.com//apache/incubator-mxnet/blob/master/python/mxnet/contrib/quantization.py + ref: https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/_modules/ + pytorch_quantization/calib/histogram.html """ - def __init__(self, num_quantized_bins=128): + def __init__(self, method, num_quantized_bins, percentile): self.histogram_dict = {} + self.method = method self.num_quantized_bins= num_quantized_bins + self.percentile = percentile def get_histogram_dict(self): return self.histogram_dict def collect(self, name_to_arr): + # TODO: Currently we have different collect() for percentile and percentile method respectively. + # Need unified collect in the future. + if self.method == 'entropy': + return self.collect_for_entropy(name_to_arr) + elif self.method == 'percentile': + return self.collect_for_percentile(name_to_arr) + else: + raise ValueError('Only \'entropy\' or \'percentile\' method are supported') + + def collect_for_percentile(self, name_to_arr): + for tensor, data_arr in name_to_arr.items(): + data_arr = np.asarray(data_arr) + data_arr = data_arr.flatten() + data_arr = np.absolute(data_arr) # only consider absolute value + + if tensor not in self.histogram_dict: + # first time it uses num_quantized_bins to compute histogram. + hist, hist_edges = np.histogram(data_arr, bins=self.num_quantized_bins) + self.histogram_dict[tensor] = (hist, hist_edges) + else: + old_histogram = self.histogram_dict[tensor] + old_hist = old_histogram[0] + old_hist_edges = old_histogram[1] + temp_amax = np.max(data_arr) + if temp_amax > old_hist_edges[-1]: + # increase the number of bins + width = old_hist_edges[1] - old_hist_edges[0] + # NOTE: np.arange may create an extra bin after the one containing temp_amax + new_bin_edges = np.arange(old_hist_edges[-1] + width, temp_amax + width, width) + old_hist_edges = np.hstack((old_hist_edges, new_bin_edges)) + hist, hist_edges = np.histogram(data_arr, bins=old_hist_edges) + hist[:len(old_hist)] += old_hist + self.histogram_dict[tensor] = (hist, hist_edges) + + def collect_for_entropy(self, name_to_arr): for tensor, data_arr in name_to_arr.items(): data_arr = np.asarray(data_arr) data_arr = data_arr.flatten() @@ -402,7 +491,6 @@ class HistogramCollector(CalibrationDataCollector): old_histogram = self.histogram_dict[tensor] self.histogram_dict[tensor] = self.merge_histogram(old_histogram, data_arr, min_value, max_value, threshold) else: - # hist, hist_edges = np.histogram(data_arr, self.num_quantized_bins, range=(min_value, max_value)) hist, hist_edges = np.histogram(data_arr, self.num_quantized_bins, range=(-threshold, threshold)) self.histogram_dict[tensor] = (hist, hist_edges, min_value, max_value, threshold) @@ -415,8 +503,8 @@ class HistogramCollector(CalibrationDataCollector): return (new_hist + old_hist, old_hist_edges, min(old_min, new_min), max(old_max, new_max), old_threshold) else: if old_threshold == 0: - hist, hist_edges = np.histogram(data_arr, new_num_bins, range=(-new_threshold, new_threshold)) - hist[len(hist) // 2] += len(old_hist) + hist, hist_edges = np.histogram(data_arr, len(old_hist), range=(-new_threshold, new_threshold)) + hist += old_hist else: old_num_bins = len(old_hist) old_stride = 2 * old_threshold / old_num_bins @@ -427,19 +515,54 @@ class HistogramCollector(CalibrationDataCollector): hist[half_increased_bins:new_num_bins-half_increased_bins] += old_hist return (hist, hist_edges, min(old_min, new_min), max(old_max, new_max), new_threshold) - def get_optimal_collection_result(self): + def compute_collection_result(self): + if not self.histogram_dict or len(self.histogram_dict) == 0: + raise ValueError("Histogram has not been collected. Please run collect() first.") + + if self.method == 'entropy': + return self.compute_entropy() + elif self.method == 'percentile': + return self.compute_percentile() + else: + raise ValueError('Only \'entropy\' or \'percentile\' method are supported') + + def compute_percentile(self): + if self.percentile < 0 or self.percentile > 100: + raise ValueError("Invalid percentile. Must be in range 0 <= percentile <= 100.") + + histogram_dict = self.histogram_dict + percentile = self.percentile + + thresholds_dict = {} # per tensor thresholds + + for tensor, histogram in histogram_dict.items(): + hist = histogram[0] + hist_edges = histogram[1] + total = hist.sum() + cdf = np.cumsum(hist/total) + idx = np.searchsorted(cdf, percentile/100) + thresholds_dict[tensor] = (float(hist_edges[idx]), float(hist_edges[idx])) + + return thresholds_dict + + def compute_entropy(self): histogram_dict = self.histogram_dict num_quantized_bins = self.num_quantized_bins thresholds_dict = {} # per tensor thresholds for tensor, histogram in histogram_dict.items(): - optimal_threshold = self.get_optimal_threshold(histogram, num_quantized_bins) + optimal_threshold = self.get_entropy_threshold(histogram, num_quantized_bins) thresholds_dict[tensor] = optimal_threshold return thresholds_dict - def get_optimal_threshold(self, histogram, num_quantized_bins): + def get_entropy_threshold(self, histogram, num_quantized_bins): + """Given a dataset, find the optimal threshold for quantizing it. + The reference distribution is `q`, and the candidate distribution is `p`. + `q` is a truncated version of the original distribution. + Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf + """ from scipy.stats import entropy import copy @@ -513,5 +636,7 @@ def create_calibrator(model, return MinMaxCalibrater(model, op_types_to_calibrate, augmented_model_path) elif calibrate_method == CalibrationMethod.Entropy: return EntropyCalibrater(model, op_types_to_calibrate, augmented_model_path) + elif calibrate_method == CalibrationMethod.Percentile: + return PercentileCalibrater(model, op_types_to_calibrate, augmented_model_path) raise ValueError('Unsupported calibration method {}'.format(calibrate_method))