Commit a89ab64

quantization: fix scales/zps not being updated after a quantized model loads a qconf_summary and re-runs calibration (#1245)
1 parent 2e3618d commit a89ab64
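
The bug fixed here shows up in the following flow, sketched minimally below based on the test added in this commit (the model class, input shape, and qconfig are illustrative stand-ins, not part of the commit):

import torch
import intel_extension_for_pytorch as ipex

# MyModel is a hypothetical eval-mode float model; any static PTQ flow applies.
model = MyModel().eval()
x_new = torch.rand(1, 3, 2, 2)

# Assumed stand-in for the test's static_qconfig[0].
qconfig = ipex.quantization.default_static_qconfig
prepared = ipex.quantization.prepare(model, qconfig, example_inputs=x_new, inplace=False)

# Load previously saved scales/zero-points, then calibrate again on new data.
prepared.load_qconf_summary("configure.json")
prepared(x_new)  # observers record fresh min/max statistics

# Before this fix, save_qconf_summary() kept the stale scales/zps from the
# loaded json instead of recomputing them from the freshly run observers.
prepared.save_qconf_summary("configure_new.json")
converted = ipex.quantization.convert(prepared)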

File tree: 3 files changed, +100 −13 lines


intel_extension_for_pytorch/quantization/_quantize_utils.py

20 additions & 7 deletions

@@ -10,7 +10,7 @@
 from ._utils import get_torch_function_hook_type, HookType, get_module_hook_type, OpQuantizeabilityType, \
     attach_op_convert_info_to_model, save_quant_state, attach_scale_zp_values_to_model, convert_quant_state_map_to_nodes, \
     sync_pool_and_lstm_input_output_scale_zp, module_call_to_function_call, quantized_modules_has_weights, \
-    load_qconf_summary_to_model, get_fqn_valid_for_module_dict_key
+    load_qconf_summary_to_model, get_fqn_valid_for_module_dict_key, check_model_obsever_has_run
 from ._quantization_state import AutoQuantizationState, AutoQuantizationStateModuleDict, init_model_quant_state
 from ._recipe import get_default_recipe
 from ._module_swap_utils import swap_child_modules

@@ -322,6 +322,13 @@ def save_qconf_summary(self, qconf_summary):
             # pooling and lstm's input and output should have same scale_zp.
             sync_pool_and_lstm_input_output_scale_zp(quant_state_map, nodes)
             get_default_recipe(nodes)
+        else:
+            if check_model_obsever_has_run(model):
+                # Re-compute the scales and zps if the user loaded a json file and re-ran the calibration step.
+                attach_scale_zp_values_to_model(model)
+            else:
+                # Do nothing if the user only loaded a json file and did not re-run the calibration step.
+                pass
         # Set the model's qconf_summary attr, which makes it easy to check whether the scales/zps have been computed.
         self._qconf_summary = qconf_summary
         save_quant_state(quant_state_map, qconf_summary)

@@ -550,8 +557,8 @@ def unwrap_proxy(a):
     torch.nn.Module.__call__ = orig_module_call
     torch.nn.Sequential.forward = orig_nn_sequential_forward  # type: ignore[assignment]

-    # If module doesn't have a configure_file attr, we can say that user has run save_qconf_summary method which have
-    # computed the scales and zp, or use the user's setting from a given json file(load_qconf_summary), we need to compute
+    # If module doesn't have a configure_file attr, the user neither ran the save_qconf_summary method (which
+    # computes the scales and zps) nor loaded settings from a given json file (load_qconf_summary), so we need to compute
     # the scale and zp here.
     if not hasattr(module, '_qconf_summary'):
         quant_state_map = module._fqn_to_auto_quant_state_map

@@ -562,10 +569,16 @@ def unwrap_proxy(a):
         sync_pool_and_lstm_input_output_scale_zp(quant_state_map, nodes)
         get_default_recipe(nodes)
     else:
-        # Clear observer if module have, this will works when the user's json setting is loaded.
-        for _, v in module._fqn_to_auto_quant_state_map.items():
-            v.tensor_id_to_observer.clear()
-            v.weight_tensor_id_to_observer.clear()
+        if check_model_obsever_has_run(module):
+            # Re-compute the scales and zps if the user loaded a json file and re-ran the calibration step.
+            attach_scale_zp_values_to_model(module)
+        else:
+            # Clear the observers if the module has any; this applies when the user's json settings
+            # were loaded and the calibration step was not re-run.
+            for _, v in module._fqn_to_auto_quant_state_map.items():
+                v.tensor_id_to_observer.clear()
+                v.weight_tensor_id_to_observer.clear()
+
     # Attach quant_info to parent each module
     attach_op_convert_info_to_model(module)
     swap_child_modules(module)
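
In short, the calibration-exit paths above now resolve scales/zps with a three-way decision. A condensed sketch of that logic (the helper names resolve_scales_zps and compute_from_observers are hypothetical stand-ins for the surrounding IPEX code paths; only check_model_obsever_has_run and attach_scale_zp_values_to_model are real names from this commit):

def resolve_scales_zps(model, qconf_summary_loaded):
    if not qconf_summary_loaded:
        # No json was loaded: derive scales/zps from the observers, as before.
        compute_from_observers(model)
    elif check_model_obsever_has_run(model):
        # A json was loaded AND calibration was re-run: the observers hold
        # fresh statistics, so recompute and overwrite the loaded scales/zps.
        attach_scale_zp_values_to_model(model)
    else:
        # A json was loaded and no new calibration ran: keep the loaded values.
        pass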

intel_extension_for_pytorch/quantization/_utils.py

41 additions & 0 deletions

@@ -191,16 +191,57 @@ def attach_scale_zp_values_to_model(
             if observer.dtype in quantized_dtype:
                 scale, zp = observer.calculate_qparams()
                 qstate.tensor_id_to_scale_zp[int(tensor_id)] = (scale, zp)
+            else:
+                assert False, "The observer's dtype can only be torch.quint8 or torch.qint8"
         for tensor_id, observer in qstate.weight_tensor_id_to_observer.items():
             if observer.dtype in quantized_dtype:
                 scale, zp = observer.calculate_qparams()
                 qstate.weight_tensor_id_to_scale_zp[tensor_id] = (scale, zp)
+            else:
+                assert False, "The observer's dtype can only be torch.quint8 or torch.qint8"
         qstate.tensor_id_to_observer.clear()
         qstate.weight_tensor_id_to_observer.clear()

     for _, child in module.named_children():
         attach_scale_zp_values_to_model(child)


+def _check_observer_has_run(observer):
+    if observer.min_val.numel() == 0 or observer.max_val.numel() == 0:
+        return False
+    if (observer.min_val.dim() == 0 or observer.max_val.dim() == 0) and \
+            observer.min_val == float("inf") and observer.max_val == float("-inf"):
+        return False
+    return True
+
+
+def check_model_obsever_has_run(
+    module: torch.nn.Module,
+) -> None:
+    """
+    Check whether the module's observers have been run, by checking whether the
+    observers' min_val and max_val still hold their init values.
+    """
+    if hasattr(module, '_auto_quant_state'):
+        qstate: AutoQuantizationState = module._auto_quant_state  # type: ignore[assignment]
+        quantized_dtype = [torch.quint8, torch.qint8]
+        for tensor_id, observer in qstate.tensor_id_to_observer.items():
+            if observer.dtype in quantized_dtype:
+                return _check_observer_has_run(observer)
+            else:
+                assert False, "The observer's dtype can only be torch.quint8 or torch.qint8"
+        for tensor_id, observer in qstate.weight_tensor_id_to_observer.items():
+            if observer.dtype in quantized_dtype:
+                return _check_observer_has_run(observer)
+            else:
+                assert False, "The observer's dtype can only be torch.quint8 or torch.qint8"
+
+    for _, child in module.named_children():
+        check_model_obsever_has_run(child)
+
+    return True
+
+
 def attach_op_convert_info_to_model(
     module: torch.nn.Module,
 ) -> None:
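
For context, the init-value checks in _check_observer_has_run line up with how PyTorch's stock observers initialize their buffers: a per-tensor MinMaxObserver starts with min_val=inf and max_val=-inf, while per-channel observers start with empty tensors. A small standalone demo (plain torch.ao.quantization observers, which is what these qstate entries typically hold):

import torch
from torch.ao.quantization.observer import MinMaxObserver, PerChannelMinMaxObserver

obs = MinMaxObserver(dtype=torch.quint8)
print(obs.min_val, obs.max_val)  # tensor(inf) tensor(-inf): never run
obs(torch.rand(4))               # record statistics, as a calibration pass would
print(obs.min_val, obs.max_val)  # finite min/max: the observer has run

pc = PerChannelMinMaxObserver(dtype=torch.qint8)
print(pc.min_val.numel())        # 0: per-channel buffers start empty ("not run")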

tests/cpu/test_ao_jit_ipex_quantization.py

39 additions & 6 deletions

@@ -307,19 +307,21 @@ def forward(self, x):
         prepared_model = ipex.quantization.prepare(m, static_qconfig[0], example_inputs=x, inplace=False)
         prepared_model(x)
         with tempfile.TemporaryDirectory() as tmp:
+            # case1: save qconf and load qconf.
             path = os.path.join(tmp, "configure.json")
             prepared_model.save_qconf_summary(path)
             convert_model = ipex.quantization.convert(prepared_model)
-            traced_model = torch.jit.trace(convert_model, x).eval()
-            traced_model = torch.jit.freeze(traced_model)
-            y_before = traced_model(x)
+            traced_model_ref = torch.jit.trace(convert_model, x).eval()
+            traced_model_ref = torch.jit.freeze(traced_model_ref)
             # load the saved qconf
             prepared_model = ipex.quantization.prepare(m, static_qconfig[0], example_inputs=x, inplace=False)
             prepared_model.load_qconf_summary(path)
             convert_model = ipex.quantization.convert(prepared_model)
             traced_model = torch.jit.trace(convert_model, x).eval()
             traced_model = torch.jit.freeze(traced_model)
-            y_after = traced_model(x)
+            for i in range(2):
+                y_before = traced_model_ref(x)
+                y_after = traced_model(x)
             self.assertEqual(y_before, y_after)
             # save and load qconf again to make sure we didn't lose something
             path2 = os.path.join(tmp, "configure_new.json")

@@ -329,14 +331,45 @@ def forward(self, x):
             convert_model = ipex.quantization.convert(prepared_model)
             traced_model = torch.jit.trace(convert_model, x).eval()
             traced_model = torch.jit.freeze(traced_model)
-            y_after = traced_model(x)
+            for i in range(2):
+                y_after = traced_model(x)
             self.assertEqual(y_before, y_after)
             # make sure the newly saved json is the same as the old one.
             with open(path, 'r') as f:
                 old_json = json.load(f)
             with open(path2, 'r') as f:
                 new_json = json.load(f)
-            self.assertTrue(old_json == new_json)
+            self.assertTrue(old_json == new_json)
+
+            # case2: load qconf and re-do calibration, make sure the scales/zps are updated.
+            x_new = torch.rand(1, 3, 2, 2) * 10
+            # do ref quantization
+            prepared_model = ipex.quantization.prepare(m, static_qconfig[0], example_inputs=x_new, inplace=False)
+            prepared_model(x_new)
+            ref_path = os.path.join(tmp, "configure_ref.json")
+            prepared_model.save_qconf_summary(ref_path)
+            convert_model = ipex.quantization.convert(prepared_model)
+            traced_model_ref = torch.jit.trace(convert_model, x_new).eval()
+            traced_model_ref = torch.jit.freeze(traced_model_ref)
+            # load qconf, and re-do calibration
+            prepared_model = ipex.quantization.prepare(m, static_qconfig[0], example_inputs=x_new, inplace=False)
+            prepared_model.load_qconf_summary(path2)
+            prepared_model(x_new)
+            new_path = os.path.join(tmp, "configure_new.json")
+            prepared_model.save_qconf_summary(new_path)
+            traced_model_new = torch.jit.trace(convert_model, x_new).eval()
+            traced_model_new = torch.jit.freeze(traced_model_new)
+            for i in range(2):
+                y_ref = traced_model_ref(x_new)
+                y_new = traced_model_new(x_new)
+            self.assertEqual(y_ref, y_new)
+            # make sure the newly saved json is the same as the ref one.
+            with open(ref_path, 'r') as f:
+                old_json = json.load(f)
+            with open(new_path, 'r') as f:
+                new_json = json.load(f)
+            self.assertTrue(old_json == new_json)


 class TestRemoveMutate(JitLlgaTestCase):
     def test_mutated_value_alive_after_inplace_op(self):
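
Taken together, case1 checks that a load-without-recalibration round trip is lossless, while case2 exercises the fixed path: after load_qconf_summary(path2) plus a calibration pass on x_new, the re-saved summary must match one produced by calibrating from scratch on the same data. The final assertion boils down to this check (file names as in the test above):

import json

# A summary saved after "load + re-calibrate" must equal one from a fresh
# calibration on the same data, i.e. the loaded scales/zps were recomputed.
with open("configure_ref.json") as f:
    ref = json.load(f)
with open("configure_new.json") as f:
    new = json.load(f)
assert ref == new, "scales/zps were not recomputed after re-calibration"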
