Commit b8848b5

fix CPU UTs (#2531)
1 parent 2fcd381 commit b8848b5

5 files changed: +102 additions, -24 deletions

intel_extension_for_pytorch/quantization/_quantization_state_utils.py

Lines changed: 12 additions & 12 deletions

@@ -24,10 +24,10 @@
     F.conv3d,
     torch.conv2d,
     torch.conv3d,
-    #F.conv_transpose2d, #TODO
-    #F.conv_transpose3d, #TODO
-    #torch.conv_transpose2d, #TODO
-    #torch.conv_transpose3d, #TODO
+    F.conv_transpose2d,
+    F.conv_transpose3d,
+    torch.conv_transpose2d,
+    torch.conv_transpose3d,
     torch.relu,
     F.relu,
     #torch.sigmoid, # TODO
@@ -50,8 +50,8 @@
 module_types_supported_by_quantization = set([
     torch.nn.Conv2d,
     torch.nn.Conv3d,
-    #torch.nn.ConvTranspose2d,
-    #torch.nn.ConvTranspose3d,
+    torch.nn.ConvTranspose2d,
+    torch.nn.ConvTranspose3d,
     torch.nn.Linear,
     torch.nn.MaxPool2d,
     torch.nn.MaxPool3d,
@@ -90,10 +90,10 @@
     str(F.conv3d),
     str(torch.conv2d),
     str(torch.conv3d),
-    #str(F.conv_transpose2d),
-    #str(F.conv_transpose3d),
-    #str(torch.conv_transpose2d),
-    #str(torch.conv_transpose3d),
+    str(F.conv_transpose2d),
+    str(F.conv_transpose3d),
+    str(torch.conv_transpose2d),
+    str(torch.conv_transpose3d),
     str(F.linear),
     str(torch._C._nn.linear),
 ]
@@ -102,8 +102,8 @@
     #str(torch.nn.Conv1d) # it will be enabled at next step.
     str(torch.nn.Conv2d),
     str(torch.nn.Conv3d),
-    #str(torch.nn.ConvTranspose2d),
-    #str(torch.nn.ConvTranspose3d),
+    str(torch.nn.ConvTranspose2d),
+    str(torch.nn.ConvTranspose3d),
     str(torch.nn.Linear),
 ]
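With the transposed-convolution entries uncommented in every supported-op list above, deconvolution layers now enter the static quantization flow instead of being skipped. A minimal sketch of exercising that path, assuming the package's public prepare/convert entry points and default_static_qconfig (none of which are part of this diff):

import torch
import intel_extension_for_pytorch as ipex
from intel_extension_for_pytorch.quantization import prepare, convert

# Toy model whose only compute op is a 2D transposed convolution.
model = torch.nn.Sequential(torch.nn.ConvTranspose2d(3, 8, kernel_size=2, stride=2)).eval()
x = torch.randn(1, 3, 16, 16)

qconfig = ipex.quantization.default_static_qconfig
prepared = prepare(model, qconfig, example_inputs=x)   # ConvTranspose2d is now observed
prepared(x)                                            # calibration pass
quantized = convert(prepared)
traced = torch.jit.trace(quantized, x)                 # typical follow-up step in the int8 flow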

intel_extension_for_pytorch/quantization/_quantize.py

Lines changed: 4 additions & 0 deletions

@@ -30,6 +30,10 @@ def prepare(
         torch.nn.Module
     """
     assert isinstance(model, torch.nn.Module), "Only support nn.Module prepare for quantization path"
+    # auto model channels_last memory format conversion
+    from ..frontend import auto_channels_last, _convert_convNd_weight_memory_format
+    if auto_channels_last:
+        _convert_convNd_weight_memory_format(model)
     try:
         prepare_model = optimization.fuse(model, inplace=inplace)
         prepare_model = linear_bn_fuse(prepare_model, inplace=inplace)
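The helper imported from ..frontend is not shown in this commit, so the snippet below is only a hypothetical illustration of what an automatic channels-last conversion of convolution weights can look like; the function name is made up for the sketch:

import torch

def _sketch_convert_convNd_weight_memory_format(model: torch.nn.Module) -> None:
    # Hypothetical stand-in: move convolution weights to the channels-last
    # layouts that oneDNN kernels generally prefer on CPU.
    for m in model.modules():
        if isinstance(m, (torch.nn.Conv2d, torch.nn.ConvTranspose2d)):
            m.weight.data = m.weight.data.to(memory_format=torch.channels_last)
        elif isinstance(m, (torch.nn.Conv3d, torch.nn.ConvTranspose3d)):
            m.weight.data = m.weight.data.to(memory_format=torch.channels_last_3d)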

intel_extension_for_pytorch/quantization/_quantize_utils.py

Lines changed: 41 additions & 2 deletions

@@ -6,6 +6,7 @@
 from torch.fx.node import map_aggregate
 from torch.ao.quantization import PlaceholderObserver
 from torch.quantization.qconfig import QConfig
+from torch.nn.utils.rnn import PackedSequence

 from ._utils import get_torch_function_hook_type, HookType, get_module_hook_type, OpQuantizeabilityType, \
     attach_op_convert_info_to_model, save_quant_state, attach_scale_zp_values_to_model, convert_quant_state_map_to_nodes, \
@@ -36,6 +37,25 @@ def _check_add_has_scalar_input(args):
            return True
    return False

+def _convert_PackedSequence_to_tuple_lstm(args):
+    if isinstance(args, tuple) and len(args) == 2:  # (PackedSequence, hx)
+        input, batch_sizes, sorted_indices, unsorted_indices = args[0]
+        args = (input, batch_sizes, sorted_indices, unsorted_indices, args[-1])
+    elif isinstance(args, tuple) and len(args) == 1:  # (PackedSequence, )
+        input, batch_sizes, sorted_indices, unsorted_indices = args[0]
+        args = (input, batch_sizes, sorted_indices, unsorted_indices)
+    else:
+        assert False, "_convert_PackedSequence_to_tuple args should be a tuple with size 2 or PackedSequence"
+    return args
+
+def _convert_tuple_to_PackedSequence_lstm(args):
+    assert isinstance(args, tuple) and len(args) >= 4 and len(args) <= 5, "_convert_tuple_to_PackedSequence input should be a tuple(5=<size >=4)"
+    if len(args) == 4:
+        return (PackedSequence(*args),)
+    else:
+        return (PackedSequence(*args[:-1]), args[-1])
+
+
 def auto_prepare(
     model : torch.nn.Module,
     configure: QConfig,
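For reference, the two helpers simply flatten a (PackedSequence, hx) argument tuple into plain tensors and rebuild it afterwards, so the quantization hooks only ever see tensors. A small round-trip check, assuming the private helpers are importable from this module:

import torch
from torch.nn.utils.rnn import pack_padded_sequence
from intel_extension_for_pytorch.quantization._quantize_utils import (
    _convert_PackedSequence_to_tuple_lstm,
    _convert_tuple_to_PackedSequence_lstm,
)

packed = pack_padded_sequence(torch.randn(5, 2, 4), lengths=[5, 3])
hx = (torch.zeros(1, 2, 8), torch.zeros(1, 2, 8))

flat = _convert_PackedSequence_to_tuple_lstm((packed, hx))
assert len(flat) == 5   # data, batch_sizes, sorted_indices, unsorted_indices, hx
restored, hx_back = _convert_tuple_to_PackedSequence_lstm(flat)
assert torch.equal(restored.data, packed.data)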
@@ -212,7 +232,9 @@ def _patched_module_call(self, *args, **kwargs):
            old_global_disable_torch_function_override = \
                global_disable_torch_function_override
            global_disable_torch_function_override = True
-
+            is_lstm_packed_input = isinstance(cur_module, torch.nn.LSTM) and isinstance(args[0], PackedSequence)
+            if is_lstm_packed_input:
+                args = _convert_PackedSequence_to_tuple_lstm(args)
            if first_call:
                # mypy ignore is used instead of assert because this
                # runs on every forward and assert has a performance cost
@@ -226,19 +248,28 @@ def _patched_module_call(self, *args, **kwargs):
                args, kwargs = parent_qstate.op_prepare_before_hook(
                    cur_module, args, kwargs)  # type: ignore[arg-type]

+            if is_lstm_packed_input:
+                args = _convert_tuple_to_PackedSequence_lstm(args)
+
            # original forward
            output = orig_module_call(self, *args, **kwargs)
            # Re-enable the overrides.
            global_disable_torch_function_override = \
                old_global_disable_torch_function_override

            # after hooks
+            if is_lstm_packed_input:
+                output = _convert_PackedSequence_to_tuple_lstm(output)
            if first_call:
                output = parent_qstate.first_call_op_prepare_after_hook(
                    cur_module, output, args, qtensor_id, OpQuantizeabilityType.QUANTIZEABLE)
            else:
                output = parent_qstate.op_prepare_after_hook(
                    cur_module, output, args, global_op_idx)
+
+            if is_lstm_packed_input:
+                output = _convert_tuple_to_PackedSequence_lstm(output)
+
            parent_qstate.mark_cur_op_complete(cur_module)
        elif hook_type is HookType.MODULE_IO_HOOKS:
            cur_qstate = cur_module._auto_quant_state
@@ -500,17 +531,25 @@ def _patched_module_call(self, *args, **kwargs):
            old_global_disable_torch_function_override = \
                global_disable_torch_function_override
            global_disable_torch_function_override = True
+            is_lstm_packed_input = isinstance(cur_module, torch.nn.LSTM) and isinstance(args[0], PackedSequence)
+            if is_lstm_packed_input:
+                args = _convert_PackedSequence_to_tuple_lstm(args)
            _, args, kwargs = qstate.op_convert_before_hook(
                cur_module, args, kwargs, cur_module)
+            if is_lstm_packed_input:
+                args = _convert_tuple_to_PackedSequence_lstm(args)
            if type(cur_module) in quantized_modules_has_weights:
                weights = qstate.op_weight_convert_before_hook(cur_module)
                output = module_call_to_function_call(self, args, weights)
            else:
                output = orig_module_call(self, *args, **kwargs)
            # after hooks
+            if is_lstm_packed_input:
+                output = _convert_PackedSequence_to_tuple_lstm(output)
            output = qstate.op_convert_after_hook(
                cur_module, output)
-
+            if is_lstm_packed_input:
+                output = _convert_tuple_to_PackedSequence_lstm(output)
            # Re-enable the override.
            global_disable_torch_function_override = \
                old_global_disable_torch_function_override

intel_extension_for_pytorch/quantization/_recipe.py

Lines changed: 37 additions & 4 deletions

@@ -16,6 +16,9 @@
 conv_gemm_ops = [str(F.conv2d), str(nn.Conv2d), str(F.conv3d), str(nn.Conv3d), str(torch.conv2d), str(torch.conv3d), \
     str(F.conv_transpose2d), str(torch.nn.ConvTranspose2d), str(F.conv_transpose3d), str(torch.nn.ConvTranspose3d),
     str(torch.conv_transpose2d), str(torch.conv_transpose2d), str(F.linear), str(nn.Linear), str(torch.matmul), str(torch.Tensor.matmul)]
+conv_ops = [str(F.conv2d), str(nn.Conv2d), str(F.conv3d), str(nn.Conv3d), str(torch.conv2d), str(torch.conv3d), \
+    str(F.conv_transpose2d), str(torch.nn.ConvTranspose2d), str(F.conv_transpose3d), str(torch.nn.ConvTranspose3d),
+    str(torch.conv_transpose2d), str(torch.conv_transpose2d)]
 rnn_ops = [str(torch.nn.LSTM)]

 # Those ops only support s8->s8 path, and also require the qscheme is per_tensor_symmetric.
@@ -60,6 +63,17 @@ def _default_recipe_init(nodes):
                    tensor_info.inf_dtype = tensor_info.orig_dtype
                    node.input_tensor_force_inf_dtype[idx] = tensor_info.inf_dtype

+        # For LSTM, if it's input is a PackedSequence, we don't support ot now.
+        # TODO: support PackedSequence input for quantization LSTM.
+        if node.type in rnn_ops and len(node.input_tensor_infos) > 2:
+            for idx, tensor_info in enumerate(node.input_tensor_infos):
+                if tensor_info is not None:
+                    tensor_info.inf_dtype = tensor_info.orig_dtype
+                    node.input_tensor_force_inf_dtype[idx] = tensor_info.inf_dtype
+            for idx, tensor_info in enumerate(node.weight_tensor_infos):
+                if tensor_info is not None:
+                    tensor_info.inf_dtype = tensor_info.orig_dtype
+
 #TODO: making fusion pattern check more general.
 def _find_fused_node_with_cur_elt_wise(node, ops):
     r"""
@@ -198,6 +212,20 @@ def _check_has_quantizable_node_before_node(node):
    # for none ipex customer op, if have a qconfig, we can say it is a quantizable op.
    return True

+def _check_has_quantizable_node_after_node(node):
+    r"""
+    This function is about check whether all quantizable nodes after the given node,
+    which is used to check whether insert fake quant before one quantizable node or not.
+    """
+    if len(node.post_nodes) > 0:
+        output = True
+        for i in range(len(node.post_nodes)):
+            if node.post_nodes[i].qconfig is None:
+                output = False
+        return output
+    else:
+        return False
+
 def _add_recipe(node):
     '''
     Case1: add has pre gemm node.
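_check_has_quantizable_node_after_node returns True only when the node has successors and every one of them carries a qconfig. A toy check with stand-in node objects (SimpleNamespace is used purely for illustration, and the private helper is assumed importable from this module):

from types import SimpleNamespace
from intel_extension_for_pytorch.quantization._recipe import _check_has_quantizable_node_after_node

quantizable = SimpleNamespace(qconfig=object())      # any non-None qconfig
non_quantizable = SimpleNamespace(qconfig=None)

all_quantizable_after = SimpleNamespace(post_nodes=[quantizable, quantizable])
mixed_after = SimpleNamespace(post_nodes=[quantizable, non_quantizable])
no_successors = SimpleNamespace(post_nodes=[])

assert _check_has_quantizable_node_after_node(all_quantizable_after) is True
assert _check_has_quantizable_node_after_node(mixed_after) is False
assert _check_has_quantizable_node_after_node(no_successors) is False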
@@ -233,6 +261,7 @@ def reset_input_inf_dtype_to_orig_dtype(node, input_idx):
            node.input_tensor_force_inf_dtype[input_idx] = node.input_tensor_infos[input_idx].inf_dtype

    conv_gemm_node = _find_fused_node_with_cur_add(node, conv_gemm_ops)
+    conv_node = _find_fused_node_with_cur_add(node, conv_ops)
    if conv_gemm_node is None:
        # If pre_nodes don't have gemm node, need to check whether have quantizable node before it,
        # if does't have quantizable node before it, we will not insert fake quant before add.
@@ -255,13 +284,17 @@ def reset_input_inf_dtype_to_orig_dtype(node, input_idx):
        if node.input_tensor_infos[0] is not None and node.input_tensor_infos[0] in conv_gemm_node.output_tensor_infos:
            node.input_tensor_infos[0].inf_dtype = node.input_tensor_infos[0].orig_dtype
            node.input_tensor_force_inf_dtype[0] = node.input_tensor_infos[0].inf_dtype
-            # set another input's dtype, if another's input is from non-quantizable op, we can remove the fake quant.
-            reset_input_inf_dtype_to_orig_dtype(node, 1)
+            # TODO: set another input's dtype for conv nodes when oneDNN is ready.
+            if conv_node is None or not _check_has_quantizable_node_after_node(node):
+                # set another input's dtype, if another's input is from non-quantizable op, we can remove the fake quant.
+                reset_input_inf_dtype_to_orig_dtype(node, 1)
        elif node.input_tensor_infos[1] is not None and node.input_tensor_infos[1] in conv_gemm_node.output_tensor_infos:
            node.input_tensor_infos[1].inf_dtype = node.input_tensor_infos[1].orig_dtype
            node.input_tensor_force_inf_dtype[1] = node.input_tensor_infos[1].inf_dtype
-            # set another input's dtype, if another's input is from non-quantizable op, we can remove the fake quant.
-            reset_input_inf_dtype_to_orig_dtype(node, 0)
+            # TODO: set another input's dtype for conv nodes when oneDNN is ready.
+            if conv_node is None or not _check_has_quantizable_node_after_node(node):
+                # set another input's dtype, if another's input is from non-quantizable op, we can remove the fake quant.
+                reset_input_inf_dtype_to_orig_dtype(node, 0)

 # get a default recipe
 def get_default_recipe(nodes):

intel_extension_for_pytorch/quantization/_utils.py

Lines changed: 8 additions & 6 deletions

@@ -403,7 +403,7 @@ def set_node_output_quantized(nodes):
    # output's infe dtype is not int8, set it and also set insert_fake_quant_after_output to True.
    """
    def _reset_post_node_input_infos(node):
-        # make sure the post node will node insert fake quant if we add fake quant by cur node' output
+        # make sure the post node will insert fake quant if we add fake quant by cur node' output
        if len(node.post_nodes) > 0:
            for post_node in node.post_nodes:
                if post_node.qconfig is not None:
@@ -434,10 +434,12 @@ def _reset_post_node_input_infos(node):
            node.insert_fake_quant_after_outputs[0] = True
            _reset_post_node_input_infos(node)
        else:
-            if node.input_tensor_force_inf_dtype[0] in [torch.qint8, torch.quint8] and not post_node_are_quantized:
-                node.output_tensor_infos[0].inf_dtype = node.input_tensor_force_inf_dtype[0]
-                node.insert_fake_quant_after_outputs[0] = True
-                _reset_post_node_input_infos(node)
+            # TODO: enable PackedSequence input for LSTM.
+            if not (node.type in [nn.LSTM] and len(node.input_tensor_infos) > 2):
+                if node.input_tensor_force_inf_dtype[0] in [torch.qint8, torch.quint8] and not post_node_are_quantized:
+                    node.output_tensor_infos[0].inf_dtype = node.input_tensor_force_inf_dtype[0]
+                    node.insert_fake_quant_after_outputs[0] = True
+                    _reset_post_node_input_infos(node)

 qscheme_dict = {
    str(torch.per_tensor_affine): torch.per_tensor_affine,
@@ -794,7 +796,7 @@ def module_call_to_function_call(module, args, weights):
        output = F.embedding_bag(args[0], weights[0], args[1], module.max_norm, \
            module.norm_type, module.scale_grad_by_freq, module.mode, module.sparse,
            args[2] if len(args) == 3 else None, module.include_last_offset, module.padding_idx)
-    elif isinstance(module, torch.nn.ConvTranspose2d) or isinstance(module, torch.nn.ConvTranspose2d):
+    elif isinstance(module, torch.nn.ConvTranspose2d) or isinstance(module, torch.nn.ConvTranspose3d):
        if module.padding_mode != 'zeros':
            raise ValueError('Only `zeros` padding mode is supported for ConvTranspose2d')
        assert isinstance(module.padding, tuple)
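The last change fixes a copy-paste slip: the old condition tested ConvTranspose2d twice, so ConvTranspose3d modules never took this branch of module_call_to_function_call. A short illustration of why the duplicated isinstance check was dead:

import torch

m = torch.nn.ConvTranspose3d(2, 2, kernel_size=1)
# Old check: both operands test the same class, so it is False for 3D modules.
old = isinstance(m, torch.nn.ConvTranspose2d) or isinstance(m, torch.nn.ConvTranspose2d)
# Fixed check from this commit: True for both 2D and 3D transposed convolutions.
new = isinstance(m, torch.nn.ConvTranspose2d) or isinstance(m, torch.nn.ConvTranspose3d)
assert old is False and new is True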
