Commit 15678cc

quantization: introduce inplace parameter for convert function (#855)
1 parent 7768124 · commit 15678cc

File tree: 5 files changed (+123, -48 lines)

intel_extension_for_pytorch/ao/quantization/_quantize.py

Lines changed: 41 additions & 12 deletions
@@ -1,26 +1,28 @@
-import torch
 import copy
 from typing import Tuple, Any
+import warnings

-import intel_extension_for_pytorch._C as core
+import torch
+from torch.ao.quantization import PlaceholderObserver
 import torch.fx.experimental.optimization as optimization
-from ._quantize_utils import auto_prepare, auto_convert
-import warnings
+
+import intel_extension_for_pytorch._C as core
+from ._quantize_utils import auto_prepare, auto_convert, copy_prepared_model
 from ... import nn

 def prepare(
         model,
         configure,
         example_inputs,
-        inplace=True):
+        inplace=False):
     r"""
     Prepare an FP32 torch.nn.Module model to do calibration or to convert to quantized model.
     Args:
         model (torch.nn.Module): The FP32 model to be prepared.
         configure (torch.quantization.qconfig.QConfig): The observer settings about activation and weight.
         example_inputs (tuple or torch.Tensor): A tuple of example inputs that
-            will be passed to the function while running to init quantizaiton state.
-        inplace: (bool): It will do overide the original model.
+            will be passed to the function while running to init quantization state.
+        inplace: (bool): It will change the given model in-place if True. The default value is ``False``.
     Returns:
         torch.nn.Module
     """
@@ -43,20 +45,47 @@ def prepare(
         example_inputs = tuple(example_inputs)
     return auto_prepare(prepare_model, configure, example_inputs)

-def convert(model):
+def convert(
+        model,
+        inplace=False):
     r"""
     Convert an FP32 prepared model to a model which will automatically insert fake quant
     before a quantizable module or operator.
     Args:
         model (torch.nn.Module): The FP32 model to be convert.
+        inplace: (bool): It will change the given model in-place if True. The default value is ``False``.
     Returns:
         torch.torch.nn.Module
     """
-
     assert isinstance(model, torch.nn.Module), "Only support nn.Module convert for quantization path"
-    # Vonvert linear and weight's dtype when use autocast, which will reduce the dtype conversion.
+    assert hasattr(model, 'q_config'), "Please do prepare the model before doing convert"
+
+    if inplace:
+        convert_model = model
+    else:
+        try:
+            convert_model = copy_prepared_model(model)
+        except:
+            assert False, "The model's copy is failed, please try set inplace to True to do the convert"
+
+    # If the module's activation's qconfig is PlaceholderObserver,
+    # we can say that the module want to run dynamic quantization path.
+    if isinstance(convert_model.q_config.activation(), PlaceholderObserver):
+        qconfig_spec = {
+            torch.nn.Linear : convert_model.q_config,
+            torch.nn.LSTM : convert_model.q_config,
+            torch.nn.GRU : convert_model.q_config,
+            torch.nn.LSTMCell : convert_model.q_config,
+            torch.nn.RNNCell : convert_model.q_config,
+            torch.nn.GRUCell : convert_model.q_config,
+        }
+        return torch.quantization.quantize_dynamic(convert_model, qconfig_spec=qconfig_spec, inplace=True)
+
+    # Convert linear, conv, and Embedding's weight dtype when use autocast,
+    # which will reduce the dtype conversion.
     # TODO: check whether can be removed or not?
     if torch.is_autocast_cpu_enabled() and core.get_autocast_dtype() == torch.bfloat16:
-        model = nn.utils._model_convert.convert_module_data_type(model, torch.bfloat16)
-    convert_model = auto_convert(model)
+        convert_model = nn.utils._model_convert.convert_module_data_type(convert_model, torch.bfloat16)
+
+    convert_model = auto_convert(convert_model)
     return convert_model
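
Usage note (not part of the commit): a minimal sketch of the prepare/calibrate/convert flow with the new inplace flag. The toy model, the example input, and the use of ipex.quantization.default_static_qconfig are illustrative assumptions, not code from this change:

import torch
import torch.nn as nn
import intel_extension_for_pytorch as ipex

# Hypothetical toy model, for illustration only.
class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(128, 1)

    def forward(self, x):
        return self.linear(x)

model = TinyNet().eval()
x = torch.rand(1, 128)

# Assumed static qconfig helper; any torch QConfig accepted by prepare() should work here.
qconfig = ipex.quantization.default_static_qconfig

# inplace=False (now the default for both calls) leaves `model` untouched and works on a copy.
prepared = ipex.quantization.prepare(model, qconfig, example_inputs=x, inplace=False)
prepared(x)  # one calibration pass

# After this commit convert() also accepts inplace; False copies the prepared model first.
converted = ipex.quantization.convert(prepared, inplace=False)

with torch.no_grad():
    traced = torch.jit.trace(converted, x)
    traced = torch.jit.freeze(traced)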

intel_extension_for_pytorch/ao/quantization/_quantize_utils.py

Lines changed: 24 additions & 15 deletions
@@ -1,4 +1,5 @@
 import os
+import copy
 from typing import List, Dict, Tuple, Any, Optional
 import torch
 import torch.nn.functional as F
@@ -8,7 +9,8 @@

 from ._utils import get_torch_function_hook_type, HookType, get_module_hook_type, OpQuantizeabilityType, \
     attach_op_convert_info_to_model, save_quant_state, attach_scale_zp_values_to_model, convert_quant_state_map_to_nodes, \
-    sync_pool_and_lstm_input_output_scale_zp, module_call_to_function_call, quantized_modules_has_weights, load_qconf_summary_to_model
+    sync_pool_and_lstm_input_output_scale_zp, module_call_to_function_call, quantized_modules_has_weights, \
+    load_qconf_summary_to_model, get_fqn_valid_for_module_dict_key
 from ._quantization_state import AutoQuantizationState, AutoQuantizationStateModuleDict, init_model_quant_state
 from ._recipe import get_default_recipe
 from ._module_swap_utils import swap_child_modules
@@ -343,7 +345,26 @@ def load_qconf_summary(self, qconf_summary):
         model(*example_inputs)
     return model

-def auto_convert(module : torch.nn.Module) -> torch.nn.Module:
+def copy_prepared_model(model):
+    copied_model = copy.deepcopy(model)
+    copied_model.q_config = model.q_config
+    if isinstance(copied_model.q_config.activation(), PlaceholderObserver):
+        return copied_model
+    copied_model._fqn_to_auto_quant_state_map = copy.deepcopy(model._fqn_to_auto_quant_state_map)
+    named_modules = list(copied_model.named_modules())
+    for fqn, v in named_modules:
+        fqn_to_use_for_key = get_fqn_valid_for_module_dict_key(fqn)
+        if fqn_to_use_for_key in copied_model._fqn_to_auto_quant_state_map:
+            auto_quant_state = copied_model._fqn_to_auto_quant_state_map[fqn_to_use_for_key]
+            object.__setattr__(v, '_auto_quant_state', auto_quant_state)
+    if hasattr(model, '_qconf_summary'):
+        copied_model._qconf_summary = copy.deepcopy(model._qconf_summary)
+    copied_model.__class__ = model.__class__
+    return copied_model
+
+def auto_convert(
+        module : torch.nn.Module,
+    ) -> torch.nn.Module:
     def convert_to_dispatch_proxy(x):
         if isinstance(x, torch.Tensor):
             return x.as_subclass(QuantizationConvertTensorProxy)  # type: ignore[arg-type]
@@ -528,19 +549,7 @@ def unwrap_proxy(a):
     finally:
         torch.nn.Module.__call__ = orig_module_call
         torch.nn.Sequential.forward = orig_nn_sequential_forward  # type: ignore[assignment]
-
-    # If the module's activation's qconfig is PlaceholderObserver, we can say that the module want to run dynamic quantization path.
-    if isinstance(module.q_config.activation(), PlaceholderObserver):
-        qconfig_spec = {
-            torch.nn.Linear : module.q_config,
-            torch.nn.LSTM : module.q_config,
-            torch.nn.GRU : module.q_config,
-            torch.nn.LSTMCell : module.q_config,
-            torch.nn.RNNCell : module.q_config,
-            torch.nn.GRUCell : module.q_config,
-        }
-        return torch.quantization.quantize_dynamic(module, qconfig_spec=qconfig_spec)
-
+
     # If module doesn't have a configure_file attr, we can say that user has run save_qconf_summary method which have
     # computed the scales and zp, or use the user's setting from a given json file(load_qconf_summary), we need to compute
     # the scale and zp here.
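
Reference note (not part of the commit): the dynamic-quantization dispatch removed from auto_convert here now lives in convert() (see the _quantize.py diff above). For orientation, a standalone sketch of that style of dispatch using stock PyTorch only; the toy model and the use of torch.quantization.default_dynamic_qconfig are assumptions for illustration:

import torch
import torch.nn as nn
from torch.quantization import quantize_dynamic, default_dynamic_qconfig

# Hypothetical toy model containing the module types the dispatch table targets.
class TinyRNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(8, 8)
        self.fc = nn.Linear(8, 2)

    def forward(self, x):
        y, _ = self.lstm(x)
        return self.fc(y)

model = TinyRNN().eval()

# Map module types to a dynamic QConfig, mirroring the qconfig_spec built in convert()
# when the activation observer is a PlaceholderObserver.
qconfig_spec = {nn.Linear: default_dynamic_qconfig, nn.LSTM: default_dynamic_qconfig}
dyn_model = quantize_dynamic(model, qconfig_spec=qconfig_spec, inplace=False)

print(type(dyn_model.fc))    # dynamically quantized Linear wrapper
print(type(dyn_model.lstm))  # dynamically quantized LSTM wrapper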

tests/cpu/test_ao_jit_ipex_quantization.py

Lines changed: 50 additions & 15 deletions
@@ -247,6 +247,25 @@ def _lstm_params_list():
         self.assertGraphContainsExactly(graph, 'ipex::quantized_lstm', 1)

 class TestIpexQuantizationConvertAPI(JitLlgaTestCase):
+    def test_inplace_preapre(self):
+        class M(nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+                self.linear = nn.Linear(128,1)
+
+            def forward(self, x):
+                x = self.linear(x)
+                return x
+
+        x = torch.rand(1,128)
+        for inplace in [False, True]:
+            m = M()
+            prepared_model = ipex.quantization.prepare(m, static_qconfig[0], example_inputs=x, inplace=inplace)
+            if inplace:
+                self.assertEqual(m.linear.weight.data_ptr(), prepared_model.linear.weight.data_ptr())
+            else:
+                self.assertNotEqual(m.linear.weight.data_ptr(), prepared_model.linear.weight.data_ptr())
+
     def test_inplace_convert(self):
         class M(nn.Module):
             def __init__(self):
@@ -264,7 +283,8 @@ def forward(self, x):
             for inplace in [False, True]:
                 orgin_model_weight_dtype = m_.linear.weight.dtype
                 orgin_model_bias_dtype = m_.linear.bias.dtype
-                _, _, ori_model = self.prepareModel(m_, x, qconfig=static_qconfig[1], int8_bf16=int8_bf16, inplace=inplace)
+                _, _, ori_model = self.prepareModel(m_, x, qconfig=static_qconfig[1], int8_bf16=int8_bf16,
+                                                    prepare_inplace=True, convert_inplace=inplace)
                 if inplace and int8_bf16:
                     if m_.linear.weight.dtype == orgin_model_weight_dtype or m_.linear.bias.dtype == orgin_model_bias_dtype:
                         print("model should have changed")
@@ -291,20 +311,20 @@ def forward(self, x):
         prepared_model = ipex.quantization.prepare(m, static_qconfig[0], example_inputs=x, inplace=False)
         prepared_model(x)
         with tempfile.TemporaryDirectory() as tmp:
-                path = os.path.join(tmp, "configure.json")
-                prepared_model.save_qconf_summary(path)
-                convert_model = ipex.quantization.convert(prepared_model)
-                traced_model = torch.jit.trace(convert_model, x).eval()
-                traced_model = torch.jit.freeze(traced_model)
-                y_before = traced_model(x)
-                # load the saved qconf
-                prepared_model = ipex.quantization.prepare(m, static_qconfig[0], example_inputs=x, inplace=False)
-                prepared_model.load_qconf_summary(path)
-                convert_model = ipex.quantization.convert(prepared_model)
-                traced_model = torch.jit.trace(convert_model, x).eval()
-                traced_model = torch.jit.freeze(traced_model)
-                y_after = traced_model(x)
-                self.assertEqual(y_before, y_after)
+            path = os.path.join(tmp, "configure.json")
+            prepared_model.save_qconf_summary(path)
+            convert_model = ipex.quantization.convert(prepared_model)
+            traced_model = torch.jit.trace(convert_model, x).eval()
+            traced_model = torch.jit.freeze(traced_model)
+            y_before = traced_model(x)
+            # load the saved qconf
+            prepared_model = ipex.quantization.prepare(m, static_qconfig[0], example_inputs=x, inplace=False)
+            prepared_model.load_qconf_summary(path)
+            convert_model = ipex.quantization.convert(prepared_model)
+            traced_model = torch.jit.trace(convert_model, x).eval()
+            traced_model = torch.jit.freeze(traced_model)
+            y_after = traced_model(x)
+            self.assertEqual(y_before, y_after)

 class TestRemoveMutate(JitLlgaTestCase):
     def test_mutated_value_alive_after_inplace_op(self):
@@ -373,6 +393,21 @@ def forward(self, x):
         graph = self.checkQuantizeTrace(m, [x], atol=2e-1, qconfig=qconfig)
         FileCheck().check_not("aten:linear").check("quantized::linear_dynamic").run(graph)

+    def test_linear_dynamic_bf16(self):
+        class M(nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+                self.linear = nn.Linear(3, 3)
+
+            def forward(self, x):
+                x = self.linear(x)
+                return x
+
+        x = torch.randn(3, 3)
+        m = M().eval()
+        graph, _, _ = self.prepareModel(m, [x], qconfig=dynamic_qconfig[0], int8_bf16=True)
+        FileCheck().check_not("aten:linear").check("quantized::linear_dynamic").run(graph)
+
     def test_lstm_dynamic(self):
         class M(nn.Module):
             def __init__(self):
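
Side note (not part of the commit): the tests above use weight.data_ptr() equality as the signal for in-place behaviour, since an in-place prepare/convert hands back the same parameter storage while a copy gets fresh storage. A minimal, IPEX-free illustration of that check:

import copy
import torch.nn as nn

lin = nn.Linear(4, 2)

alias = lin                 # "in-place": the very same module object and weight storage
clone = copy.deepcopy(lin)  # "not in-place": a deep copy with its own weight storage

assert alias.weight.data_ptr() == lin.weight.data_ptr()
assert clone.weight.data_ptr() != lin.weight.data_ptr()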

tests/cpu/test_ao_jit_llga_quantization_fuser.py

Lines changed: 1 addition & 1 deletion
@@ -181,14 +181,14 @@ def forward(self, x):

         for bias in [True]: # TODO:[True, False] when supported in backend
             x = torch.randn(2, 15)
-            m = M(bias)

             patterns = [
                 ["aten::to", "aten::quantize_per_tensor"],
                 ["aten::dequantize", "aten::to", "aten::linear"],
             ]

             for qconfig in static_qconfig:
+                m = M(bias)
                 graph = self.checkQuantizeTrace(m, [x], atol=2e-1, qconfig=qconfig, int8_bf16=True)
                 self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2)
                 # single aten::to won't be rewritten by llga backend

tests/cpu/test_ao_jit_llga_utils.py

Lines changed: 7 additions & 5 deletions
@@ -102,7 +102,8 @@ def assertFused(self, graph, fused_patterns):
         for pat in fused_patterns:
             self.assertGraphContainsExactly(graph, pat, 0)

-    def checkQuantizeTrace(self, model, x, atol=1e-3, rtol=1e-2, remove_dropout=False, x_var=None, qconfig=default_static_qconfig, int8_bf16=False):
+    def checkQuantizeTrace(self, model, x, atol=1e-3, rtol=1e-2, remove_dropout=False, x_var=None,
+                           qconfig=default_static_qconfig, int8_bf16=False):
         graph, traced_model, fp32_model = self.prepareModel(model, x, remove_dropout, qconfig, int8_bf16)
         with torch.no_grad():
             y = fp32_model(*x)
@@ -119,23 +120,24 @@ def checkQuantizeTrace(self, model, x, atol=1e-3, rtol=1e-2, remove_dropout=Fals

         return graph

-    def prepareModel(self, model, x, remove_dropout=False, qconfig=default_static_qconfig, int8_bf16=False, inplace=False):
+    def prepareModel(self, model, x, remove_dropout=False, qconfig=default_static_qconfig,
+                     int8_bf16=False, prepare_inplace=True, convert_inplace=True,):
         model.eval()
         fp32_model = copy.deepcopy(model)
         with torch.no_grad(), torch._jit_internal._disable_emit_hooks():
             # fold conv bn
             if remove_dropout:
                 ipex.nn.utils._model_convert.replace_dropout_with_identity(model)
-            model = ipex.quantization.prepare(model, qconfig, x, inplace=inplace)
+            model = ipex.quantization.prepare(model, qconfig, x, inplace=prepare_inplace)
             # do calibration
             y = model(*x)
             # jit trace to insert quant/dequant
             if int8_bf16:
                 with torch.cpu.amp.autocast():
-                    convert_model = ipex.quantization.convert(model)
+                    convert_model = ipex.quantization.convert(model, inplace=convert_inplace)
                     traced_model = torch.jit.trace(convert_model, x)
             else:
-                convert_model = ipex.quantization.convert(model)
+                convert_model = ipex.quantization.convert(model, inplace=convert_inplace)
                 traced_model = torch.jit.trace(convert_model, x)
             traced_model = torch.jit.freeze(traced_model)
