
Commit 1f09fc2

rogerxfeng8 and Zhenhuan Chen authored
fix woq int4 oom issue by deleting old weight (#4631) (#4650)
Co-authored-by: Zhenhuan Chen <zhenhuan.chen@intel.com>
1 parent 70d776d · commit 1f09fc2


intel_extension_for_pytorch/transformers/models/xpu/optimize_transformers/modules/transformer_modules/QuantizedMlp.py

Lines changed: 6 additions & 0 deletions

@@ -213,14 +213,20 @@ def transpose_inner(self):
         self.mlp_silu_qweight = torch.stack(
             (self.fc_out_quant.qweight, self.fc_in_quant.qweight)
         ).contiguous()
+        del self.fc_out_quant.qweight
+        del self.fc_in_quant.qweight
         self.mlp_silu_scales = torch.stack(
             (self.fc_out_quant.scales, self.fc_in_quant.scales)
         ).contiguous()
+        del self.fc_out_quant.scales
+        del self.fc_in_quant.scales
         self.mlp_silu_qzeros = None
         if self.fc_out_quant.qzeros is not None:
             self.mlp_silu_qzeros = torch.stack(
                 (self.fc_out_quant.qzeros, self.fc_in_quant.qzeros)
             ).contiguous()
+            del self.fc_out_quant.qzeros
+            del self.fc_in_quant.qzeros

     def inter_mm(self, hidden_states):
         assert self.fc_in_quant.blocksize == self.fc_out_quant.blocksize
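
Why the added `del` statements fix the OOM: `torch.stack` allocates a fresh contiguous buffer and copies both inputs into it, so until the per-layer attributes are deleted, the packed int4 weights, scales, and zero points are resident twice. Below is a minimal sketch of that effect; `FakeQuantLinear` and the tensor shapes are illustrative stand-ins, not code from the repository.

```python
import torch

class FakeQuantLinear:
    """Hypothetical stand-in for the fc_in_quant / fc_out_quant modules in the diff."""
    def __init__(self, rows, cols):
        # int4 weights are commonly packed into int32 storage.
        self.qweight = torch.randint(0, 2**31 - 1, (rows, cols), dtype=torch.int32)

fc_in_quant = FakeQuantLinear(4096, 512)
fc_out_quant = FakeQuantLinear(4096, 512)

# torch.stack copies both tensors into one new buffer; it never aliases its inputs.
mlp_silu_qweight = torch.stack(
    (fc_out_quant.qweight, fc_in_quant.qweight)
).contiguous()

# Without these deletes the modules keep referencing the originals, so the
# packed weights are held twice (originals + stacked copy). Freeing them
# right after the stack is the whole fix.
del fc_out_quant.qweight
del fc_in_quant.qweight
```

The commit applies this same stack-then-delete pattern to `scales` and `qzeros` as well, keeping peak device memory at one stacked copy per tensor group instead of two full copies.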
