From 7df8776322bc66bda9bb1bff1502fcceb8596efc Mon Sep 17 00:00:00 2001 From: duanshengliu <44742794+duanshengliu@users.noreply.github.com> Date: Thu, 29 Aug 2024 05:29:17 +0800 Subject: [PATCH] Add overflow protection for quantization bias to reduce quantization precision loss (#21645) ### Description When the scale of the bias is too small, the quantized bias may exceed the range of `int32`, leading to significant loss of precision. Therefore, before converting the quantized bias to `int32`, it needs to be clipped to the range of `int32` to reduce the loss of quantization precision. ### Motivation and Context Fixes issue https://github.com/microsoft/onnxruntime/issues/21000 --- onnxruntime/python/tools/quantization/base_quantizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index d48964203c..b20af5137d 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -230,7 +230,9 @@ class BaseQuantizer: # TODO: This formula should be explained including why the scale is not estimated for the bias as well. bias_scale = input_scale * weight_scale * beta - quantized_data = (np.asarray(bias_data) / bias_scale).round().astype(np.int32) + quantized_data = (np.asarray(bias_data) / bias_scale).round() + quantized_data = np.clip(quantized_data, np.iinfo(np.int32).min, np.iinfo(np.int32).max) + quantized_data = quantized_data.astype(np.int32) # update bias initializer bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims)