From 731b50dfc4f8074185dc70f3a10236fa4fdfc0aa Mon Sep 17 00:00:00 2001
From: yuwenzho <yuwen.zhou@intel.com>
Date: Wed, 10 Jan 2024 15:13:04 -0800
Subject: [PATCH] Support INT4 weight only quantize, including RTN and GPTQ 2
 algorithms (#17390)

### Description
Support INT4 weight only quantize (WOQ) via Intel Neural Compressor,
including RTN and GPTQ 2 algorithms.

**Note:**
Please install `neural-compressor==2.3` for weight only quantize.

### Motivation and Context
As large language models (LLMs) become more prevalent, there is a
growing need for new and improved quantization methods that can meet the
computational demands of these modern architectures while maintaining
the accuracy. Compared to normal quantization like W8A8, weight only
quantization is probably a better trade-off to balance the performance
and the accuracy.
RTN is the most straightforward way to quantize weight.
GPTQ algorithm provides more accurate quantization but requires more
computational resources.

### Evaluation results
The following table shows the accuracy results of Llama-2 models
evaluated on [lambada_openai](https://huggingface.co/datasets/lambada)
task. `GPTQ W4G32Asym` in configuration column means GPTQ algorithm is
used for 4-bit weight only quantization, setting group_size=32 and
scheme=asym.
<table class="tg">
<thead>
  <tr>
    <th rowspan="2">Model name</th>
    <th rowspan="2">Configuration</th>
    <th colspan="2">Lambada_openai</th>
    <th rowspan="2">Accuracy Ratio<br>[WOQ/FP32]</th>
  </tr>
  <tr>
    <th>Accuracy</th>
    <th>Perplexity</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td rowspan="2">meta-llama/Llama-2-7b-chat-hf</td>
    <td>FP32</td>
    <td>0.7058</td>
    <td>3.2788</td>
    <td>/</td>
  </tr>
  <tr>
    <td>GPTQ<br>W4G32Asym</td>
    <td>0.7025</td>
    <td>3.4489</td>
    <td>99.53%</td>
  </tr>
  <tr>
    <td rowspan="2">meta-llama/Llama-2-7b-hf</td>
    <td>FP32</td>
    <td>0.7392</td>
    <td>3.3950</td>
    <td>/</td>
  </tr>
  <tr>
    <td>GPTQ<br>W4G32Asym</td>
    <td>0.7326</td>
    <td>3.5286</td>
    <td>99.11%</td>
  </tr>
  <tr>
    <td rowspan="2">meta-llama/Llama-2-13b-chat-hf</td>
    <td>FP32</td>
    <td>0.7312</td>
    <td>2.9163</td>
    <td>/</td>
  </tr>
  <tr>
    <td>GPTQ<br>W4G128Asym</td>
    <td>0.7289</td>
    <td>3.0061</td>
    <td>99.56%</td>
  <tr>
    <td rowspan="2">meta-llama/Llama-2-13b-hf</td>
    <td>FP32</td>
    <td>0.7677</td>
    <td>3.0438</td>
    <td>/</td>
  </tr>
  <tr>
    <td>GPTQ<br>W4G32Asym</td>
    <td>0.7607</td>
    <td>3.1562</td>
    <td>99.09%</td>
  </tr>
  <tr>
    <td rowspan="2">meta-llama/Llama-2-70b-chat-hf</td>
    <td>FP32</td>
    <td>0.7543</td>
    <td>2.6181</td>
    <td>/</td>
  </tr>
  <tr>
    <td>RTN<br>W4G32Sym</td>
    <td>0.7489</td>
    <td>2.6850</td>
    <td>99.28%</td>
  </tr>
  <tr>
    <td rowspan="2">meta-llama/Llama-2-70b-hf</td>
    <td>FP32</td>
    <td>0.7964</td>
    <td>2.6612</td>
    <td>/</td>
  </tr>
  <tr>
    <td>RTN<br>W4G32Sym</td>
    <td>0.7896</td>
    <td>2.7546</td>
    <td>99.15%</td>
  </tr>
</tbody>
</table>

---------

Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
Co-authored-by: Wang, Mengni <mengni.wang@intel.com>
---
 .../quantization/matmul_4bits_quantizer.py    | 185 ++++++++++++++++--
 .../python/tools/quantization/quantize.py     |  11 +-
 .../quantization/test_op_matmul_4bits.py      |  70 ++++++-
 3 files changed, 244 insertions(+), 22 deletions(-)
diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
index 6293bcbbf9..3e9f9a6544 100644
--- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
+++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
@@ -7,6 +7,8 @@
 from __future__ import annotations
 
 import argparse
+import copy
+import importlib
 import logging
 import os
 
@@ -14,9 +16,11 @@ import numpy as np
 import numpy.typing as npt
 import onnx
 from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto
+from packaging import version
 
 from onnxruntime.capi._pybind_state import quantize_matmul_4bits
 
+from .calibrate import CalibrationDataReader
 from .onnx_model import ONNXModel
 from .quant_utils import attribute_to_kwarg
 
@@ -24,24 +28,98 @@ logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s",
 logger = logging.getLogger(__name__)
 
 
+class WeightOnlyQuantConfig:
+    def __init__(self, algorithm):
+        """This is the Base class for Weight Only Quant Configuration.
+
+        Args:
+            algorithm:
+                weight only quantize algorithm name.
+        """
+        self.algorithm = algorithm
+
+
+class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig):
+    def __init__(
+        self,
+        ratios=None,
+    ):
+        """
+        This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration.
+        RTN is the most straightforward way to quantize weight using scale maps.
+
+        Args:
+            ratios:
+                percentile of clip. Defaults to {}.
+        """
+        if ratios is None:
+            ratios = {}
+        super().__init__(
+            algorithm="RTN",
+        )
+        self.ratios = ratios
+
+
+class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
+    def __init__(
+        self,
+        calibration_data_reader: CalibrationDataReader,
+        percdamp=0.01,
+        blocksize=128,
+        actorder=False,
+        mse=False,
+        perchannel=True,
+    ):
+        """
+        This is a class for GPTQ algorithm Weight Only Quant Configuration.
+        GPTQ algorithm provides more accurate quantization but requires more computational resources.
+
+        Args:
+            calibration_data_reader:
+                a calibration data reader. It enumerates calibration data and generates inputs for the original model.
+            percdamp:
+                percent of the average Hessian diagonal to use for dampening.
+            blocksize (int, optional):
+                channel number in one block to execute a GPTQ quantization iteration.
+            actorder (bool, optional):
+                whether rearrange Hessian matrix considering the diag's value.
+            mse (bool, optional):
+                whether get scale and zero point with mse error.
+            perchannel (bool, optional):
+                whether quantize weight per-channel.
+        """
+        super().__init__(
+            algorithm="GPTQ",
+        )
+        self.calibration_data_reader = calibration_data_reader
+        self.percdamp = percdamp
+        self.blocksize = blocksize
+        self.actorder = actorder
+        self.mse = mse
+        self.perchannel = perchannel
+
+
 class MatMul4BitsQuantizer:
     """Perform 4b quantization of constant MatMul weights"""
 
     def __init__(
         self,
-        model: ModelProto,
+        model: ModelProto | str,
         block_size: int,
         is_symmetric: bool,
         accuracy_level: int | None = None,
-        nodes_to_exclude: list[str] | None = None,
+        nodes_to_exclude=None,
+        algo_config: WeightOnlyQuantConfig = None,
     ):
         if nodes_to_exclude is None:
             nodes_to_exclude = []
-        self.model = ONNXModel(model)
+        self.model = ONNXModel(onnx.load(model)) if isinstance(model, str) else ONNXModel(model)
+        self.model_path = model if isinstance(model, str) else None
         self.block_size = block_size
         self.is_symmetric = is_symmetric
         self.accuracy_level = accuracy_level
         self.nodes_to_exclude = set(nodes_to_exclude)
+        self.algo_config = algo_config
 
     @staticmethod
     def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]:
@@ -176,20 +254,99 @@ class MatMul4BitsQuantizer:
         graph_stack.pop()
         return graph
 
+    def _generate_q4_node_config(self):
+        """Generate weight only quant configuration for nodes."""
+        q4_node_config = {}
+        template_config_q4 = {
+            "bits": 4,
+            "group_size": self.block_size,
+            "scheme": "sym" if self.is_symmetric else "asym",
+        }
+        for node in self.model.model.graph.node:
+            if node.op_type in ["MatMul"]:
+                if not all([self.model.get_initializer(i) is None for i in node.input]):
+                    q4_node_config[node.name] = template_config_q4
+        return q4_node_config
+
+    def int4_quant_algo(self):
+        """4b quantize a model with RTN or GPTQ algorithm. Please refer to
+        https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md
+        for more details on weight only quantization using Intel® Neural Compressor.
+        """
+
+        def inc_dataloader():
+            data_reader = copy.deepcopy(self.algo_config.calibration_data_reader)
+            for data in data_reader:
+                yield data, None
+
+        kwargs = {}
+        if self.accuracy_level is not None:
+            kwargs["accuracy_level"] = self.accuracy_level
+        weight_only_node_config = self._generate_q4_node_config()
+
+        algorithm = self.algo_config.algorithm
+        logger.info(f"start to quantize model with {algorithm} algorithm...")
+        if algorithm == "RTN":
+            from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize
+
+            kwargs["ratios"] = self.algo_config.ratios
+
+            self.model = rtn_quantize(
+                model=self.model_path if self.model_path is not None else self.model.model,
+                weight_config=weight_only_node_config,
+                **kwargs,
+            )
+        elif algorithm == "GPTQ":
+            from neural_compressor.adaptor.ox_utils.weight_only import gptq_quantize
+
+            kwargs["percdamp"] = self.algo_config.percdamp
+            kwargs["blocksize"] = self.algo_config.blocksize
+            kwargs["actorder"] = self.algo_config.actorder
+            kwargs["mse"] = self.algo_config.mse
+            kwargs["perchannel"] = self.algo_config.perchannel
+            kwargs["n_samples"] = -1
+            dataloader = inc_dataloader()
+
+            self.model = gptq_quantize(
+                model=self.model_path if self.model_path is not None else self.model.model,
+                weight_config=weight_only_node_config,
+                dataloader=dataloader,
+                **kwargs,
+            )
+        logger.info(f"complete quantization of model with {algorithm} algorithm.")
+
     def process(self):
-        # use a stack to keep track of sub-graphs
-        graph_stack = [self.model.graph()]
-        opset_import = self.model.opset_import()
+        if self.algo_config is None:
+            # use a stack to keep track of sub-graphs
+            graph_stack = [self.model.graph()]
+            opset_import = self.model.opset_import()
 
-        has_ms_domain = False
-        for opset in opset_import:
-            if opset.domain == "com.microsoft":
-                has_ms_domain = True
-        if not has_ms_domain:
-            opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)])
+            has_ms_domain = False
+            for opset in opset_import:
+                if opset.domain == "com.microsoft":
+                    has_ms_domain = True
+            if not has_ms_domain:
+                opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)])
 
-        self._process_subgraph(graph_stack)
-        self.model.clean_initializers()
+            self._process_subgraph(graph_stack)
+            self.model.clean_initializers()
+        else:
+            # use Intel® Neural Compressor for RTN or GPTQ weight-only quantize algorithm
+            try:
+                importlib.import_module("neural_compressor")
+            except Exception as e:
+                logging.error(f"{e}.")
+                raise RuntimeError(
+                    "neural-compressor is not correctly installed. Please check your environment."
+                ) from e
+
+            import neural_compressor
+
+            assert version.parse(neural_compressor.__version__) >= version.parse(
+                "2.3.2"
+            ), "Require neural-compressor >= 2.3.2 to support weight only quantization!"
+
+            self.int4_quant_algo()
 
 
 def parse_args():
diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index aed46563c2..1bd2ef4215 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -466,7 +466,6 @@ def quantize_static(
 
         import copy
 
-        import onnx
         from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant
 
         def inc_dataloader():
@@ -478,13 +477,11 @@ def quantize_static(
         dataloader = inc_dataloader()
         sq = ORTSmoothQuant(model_input, dataloader, reduce_range)
         del dataloader
-        model = sq.transform(
-            extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True)
-        ).model
-        nodes_to_exclude.extend([i.name for i in model.graph.node if i.name not in orig_nodes])
+        model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True))
         sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.")
-        model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix()
-        onnx.save_model(model, model_input, save_as_external_data=True)
+        model_input = Path(sq_path).joinpath("sq_model.onnx").as_posix()
+        model.save(model_input)
+        nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
         model = load_model_with_shape_infer(Path(model_input))  # use smooth quant model for calibration
 
     with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py
index 02f51cc4fa..73dae08af8 100644
--- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py
+++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py
@@ -71,13 +71,16 @@ class TestOpMatMul4Bits(unittest.TestCase):
         output_name = "output"
         initializers = []
 
-        def make_matmul(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str):
+        def make_matmul(
+            input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str, node_name: str
+        ):
             weight_data = self.fill_int4_data(weight_shape, symmetric).astype(np.float32)
             initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name))
             return onnx.helper.make_node(
                 "MatMul",
                 [input_name, weight_name],
                 [output_name],
+                node_name,
             )
 
         in_features = 52
@@ -88,6 +91,7 @@ class TestOpMatMul4Bits(unittest.TestCase):
             [in_features, out_features],
             "linear1.weight",
             output_name,
+            "MatMul_0",
         )
 
         # make graph
@@ -139,6 +143,48 @@ class TestOpMatMul4Bits(unittest.TestCase):
             else:
                 raise exception
 
+    def quant_test_with_algo(
+        self,
+        algorithm: str,
+        model_fp32_path: str,
+        data_reader: TestDataFeeds,
+        block_size: int,
+        is_symmetric: bool,
+    ):
+        model_int4_path = str(
+            Path(self._tmp_model_dir.name).joinpath(f"MatMulNBits_{block_size}_{is_symmetric}.onnx").absolute()
+        )
+
+        # Quantize fp32 model to int4 model
+        from onnxruntime.quantization import matmul_4bits_quantizer
+
+        algo_config = None
+        if algorithm == "RTN":
+            # test RTN algorithm
+            algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig()
+        elif algorithm == "GPTQ":
+            # test GPTQ algorithm
+            algo_config = matmul_4bits_quantizer.GPTQWeightOnlyQuantConfig(calibration_data_reader=data_reader)
+
+        model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path))
+        quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, block_size, is_symmetric, algo_config=algo_config)
+        quant.process()
+        quant.model.save_model_to_file(model_int4_path, False)
+
+        quant_nodes = {"MatMulNBits": 1}
+        check_op_type_count(self, model_int4_path, **quant_nodes)
+
+        data_reader.rewind()
+
+        try:
+            check_model_correctness(self, model_fp32_path, model_int4_path, data_reader.get_next())
+        except Exception as exception:
+            if "4b quantization not yet supported on this hardware platform!" in exception.args[0]:
+                # Currently we don't have int4 quantization support on all platforms, has to tolerate this exception
+                pass
+            else:
+                raise exception
+
     @unittest.skipIf(
         find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits"
     )
@@ -159,6 +205,28 @@ class TestOpMatMul4Bits(unittest.TestCase):
         data_reader = self.input_feeds(1, {"input": [100, 52]})
         self.quant_test(model_fp32_path, data_reader, 32, False)
 
+    @unittest.skipIf(
+        find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits"
+    )
+    def test_quantize_matmul_int4_using_rtn_algo(self):
+        if not find_spec("neural_compressor"):
+            self.skipTest("skip test_smooth_quant since neural_compressor is not installed")
+        model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute())
+        self.construct_model_matmul(model_fp32_path, symmetric=False)
+        data_reader = self.input_feeds(1, {"input": [100, 52]})
+        self.quant_test_with_algo("RTN", model_fp32_path, data_reader, 32, False)
+
+    @unittest.skipIf(
+        find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits"
+    )
+    def test_quantize_matmul_int4_using_gptq_algo(self):
+        if not find_spec("neural_compressor"):
+            self.skipTest("skip test_smooth_quant since neural_compressor is not installed")
+        model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute())
+        self.construct_model_matmul(model_fp32_path, symmetric=False)
+        data_reader = self.input_feeds(1, {"input": [100, 52]})
+        self.quant_test_with_algo("GPTQ", model_fp32_path, data_reader, 32, False)
+
 
 if __name__ == "__main__":
     unittest.main()

Model name	Configuration	Lambada_openai		Accuracy Ratio [WOQ/FP32]
Model name	Configuration	Accuracy	Perplexity	Accuracy Ratio [WOQ/FP32]
meta-llama/Llama-2-7b-chat-hf	FP32	0.7058	3.2788	/
meta-llama/Llama-2-7b-chat-hf	GPTQ W4G32Asym	0.7025	3.4489	99.53%
meta-llama/Llama-2-7b-hf	FP32	0.7392	3.3950	/
meta-llama/Llama-2-7b-hf	GPTQ W4G32Asym	0.7326	3.5286	99.11%
meta-llama/Llama-2-13b-chat-hf	FP32	0.7312	2.9163	/
meta-llama/Llama-2-13b-chat-hf	GPTQ W4G128Asym	0.7289	3.0061	99.56%
meta-llama/Llama-2-13b-hf	FP32	0.7677	3.0438	/
meta-llama/Llama-2-13b-hf	GPTQ W4G32Asym	0.7607	3.1562	99.09%
meta-llama/Llama-2-70b-chat-hf	FP32	0.7543	2.6181	/
meta-llama/Llama-2-70b-chat-hf	RTN W4G32Sym	0.7489	2.6850	99.28%
meta-llama/Llama-2-70b-hf	FP32	0.7964	2.6612	/
meta-llama/Llama-2-70b-hf	RTN W4G32Sym	0.7896	2.7546	99.15%