From edbfae5172b0c2af2aaa3183bc31a694c15ed955 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 27 Nov 2025 15:57:10 +0000 Subject: [PATCH 01/11] initial --- src/transformers/integrations/fp_quant.py | 67 ++++++++++++++++++- .../quantizers/quantizer_fp_quant.py | 6 +- 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index ccf933796165..ff72af717ca0 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -17,13 +17,78 @@ is_fp_quant_available, ) +import torch +from typing import Optional if is_fp_quant_available(): from fp_quant import FPQuantConfig as FPQuantLinearConfig from fp_quant import FPQuantDtype from transformers.utils.quantization_config import FPQuantConfig +from ..quantizers.quantizers_utils import get_module_from_name +from ..core_model_loading import ConversionOps +class FpQuantQuantize(ConversionOps): + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, missing_keys: Optional[list[str]] = None, **kwargs) -> dict[str, torch.Tensor]: + target_key, value = tuple(input_dict.items())[0] + value = value[0] if isinstance(value, list) else value + + module, _ = get_module_from_name(model, target_key) + + # TODO: check if we need this or not, commented for now + # if target_device == "cpu" and param_name.endswith("weight"): + # # Works agains hard-coded missing key dispatch to CPU + # return + # The module holds either: + # * `weight` when `store_master_weights=True` + # * `qweight` and `scales` when `store_master_weights=False` and `pseudoquantization=False` + # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True` + if target_key.endswith(".qweight"): + # Loading a real quantized checkpoint without master weights + qweight = torch.nn.Parameter( + value, + requires_grad=False, + ) + + weight_key = target_key.rsplit(".", 1)[0] + ".weight" + dqweight_key = target_key.rsplit(".", 1)[0] + ".dqweight" + + return {f"{target_key}": qweight, + } + + if target_key.endswith(".dqweight"): + # Loading a pseudo-quantized checkpoint without master weights + dqweight = torch.nn.Parameter(value) + + weight_key = target_key.rsplit(".", 1)[0] + ".weight" + dqweight_key = target_key.rsplit(".", 1)[0] + ".dqweight" + scales_key = target_key.rsplit(".", 1)[0] + ".scales" + + return { + f"{target_key}": dqweight, + f"{weight_key}": torch.nn.Parameter(torch.empty(0)), + f"{dqweight_key}": torch.nn.Parameter(torch.empty(0)), + f"{scales_key}": torch.nn.Parameter(torch.empty(0)) + } + + # Loading master weights or an unquantized checkpoint + weight = torch.nn.Parameter(value) + module.weight = weight + # Let pre-forward handle the quantization and set None where necessary + module.pre_forward() + + prefix_target_key = target_key.rsplit(".", 1)[0] + + return {target_key: weight, + f"{prefix_target_key}.act_global_scale": module.act_global_scale, + f"{prefix_target_key}.backward_hadamard_matrix": module.backward_hadamard_matrix, + f"{prefix_target_key}.forward_hadamard_matrix": module.forward_hadamard_matrix, + f"{prefix_target_key}.qweight": module.qweight, + f"{prefix_target_key}.scales": module.scales + } def adapt_fp_quant_config(config: FPQuantConfig): if config.forward_dtype == "mxfp4": @@ -51,4 +116,4 @@ def adapt_fp_quant_config(config: FPQuantConfig): pseudoquantization=config.pseudoquantization, 
transform_init=config.transform_init, modules_to_not_convert=config.modules_to_not_convert, - ) + ) \ No newline at end of file diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index b5c9f2c8179f..85c5115d7645 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -140,7 +140,7 @@ def _process_model_before_weight_loading( replace_with_fp_quant_linear( model, - fp_quant_linear_config=adapt_fp_quant_config(self.quantization_config), + fp_quant_linear_config=adapt_fp_quant_config(self.quantization_config) ) model.config.quantization_config = self.quantization_config @@ -178,3 +178,7 @@ def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, ** return True else: return False + + def get_quantize_ops(self): + from ..integrations.fp_quant import FpQuantQuantize + return FpQuantQuantize(self) \ No newline at end of file From 53a2f598b1d1a26675927547acdc5dea7c5ca870 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 27 Nov 2025 17:29:00 +0000 Subject: [PATCH 02/11] quantization fixed --- src/transformers/integrations/fp_quant.py | 25 ++++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index ff72af717ca0..e658f9688a6d 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -56,10 +56,10 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N weight_key = target_key.rsplit(".", 1)[0] + ".weight" dqweight_key = target_key.rsplit(".", 1)[0] + ".dqweight" - return {f"{target_key}": qweight, - } + return {f"{target_key}": qweight} if target_key.endswith(".dqweight"): + print(f"target_key: {target_key}") # Loading a pseudo-quantized checkpoint without master weights dqweight = torch.nn.Parameter(value) @@ -77,18 +77,23 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N # Loading master weights or an unquantized checkpoint weight = torch.nn.Parameter(value) module.weight = weight + + # print(f"module.state_dict(): {module.state_dict()}") # Let pre-forward handle the quantization and set None where necessary module.pre_forward() - + prefix_target_key = target_key.rsplit(".", 1)[0] - return {target_key: weight, - f"{prefix_target_key}.act_global_scale": module.act_global_scale, - f"{prefix_target_key}.backward_hadamard_matrix": module.backward_hadamard_matrix, - f"{prefix_target_key}.forward_hadamard_matrix": module.forward_hadamard_matrix, - f"{prefix_target_key}.qweight": module.qweight, - f"{prefix_target_key}.scales": module.scales - } + # keys are set inside the module.pre_forward() method, we don't need remove them from the missing keys list + missing_keys.discard(target_key) + missing_keys.discard(f"{prefix_target_key}.backward_hadamard_matrix") + missing_keys.discard(f"{prefix_target_key}.forward_hadamard_matrix") + missing_keys.discard(f"{prefix_target_key}.act_global_scale") + missing_keys.discard(f"{prefix_target_key}.weight_global_scale") + missing_keys.discard(f"{prefix_target_key}.qweight") + missing_keys.discard(f"{prefix_target_key}.scales") + missing_keys.discard(f"{prefix_target_key}.dqweight") + return {} def adapt_fp_quant_config(config: FPQuantConfig): if config.forward_dtype == "mxfp4": From 29febd45b5da89bcb5044585a0d31dab1328107d Mon Sep 17 00:00:00 2001 From: medmekk Date: Fri, 28 Nov 2025 09:27:43 +0000 
Subject: [PATCH 03/11] up --- src/transformers/conversion_mapping.py | 8 ++++---- src/transformers/core_model_loading.py | 7 +++---- src/transformers/quantizers/quantizer_fp_quant.py | 2 ++ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 5968bd08d406..9fb552c75a82 100644 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -142,12 +142,12 @@ def _build_checkpoint_conversion_mapping(): if hasattr(torch.nn.utils.parametrizations, "weight_norm"): mapping["legacy"] += [ WeightRenaming( - source_patterns="weight_g", - target_patterns="parametrizations.weight.original0", + source_keys=r"weight_g$", + target_keys="parametrizations.weight.original0", ), WeightRenaming( - source_patterns="weight_v", - target_patterns="parametrizations.weight.original1", + source_keys=r"weight_v$", + target_keys="parametrizations.weight.original1", ), ] else: diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 673b0caf2dd0..b65098090e86 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -905,10 +905,9 @@ def convert_and_load_state_dict_in_model( mapping.distributed_operation, hf_quantizer, ) - - # Cleanup the tensors - mapping.reset() - except SkipLayer: + except SkipLayer as e: + print(e) + raise e continue # Keep the current weight conversion mapping for later saving (in case it was coming directly from the user) diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index 85c5115d7645..74c4353c0764 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -107,6 +107,7 @@ def create_quantized_param( # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True` if param_name.endswith(".qweight"): + print(f"param_value qweight: {param_value.shape}") # Loading a real quantized checkpoint without master weights module.qweight = torch.nn.Parameter( param_value.to(target_device), @@ -117,6 +118,7 @@ def create_quantized_param( return if param_name.endswith(".dqweight"): + print(f"param_value dqweight: {param_value.shape}") # Loading a pseudo-quantized checkpoint without master weights module.dqweight = torch.nn.Parameter(param_value.to(target_device)) module.weight = None From 5824ecbe4ea93d27eefaa2074099658b64e457b0 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:44:17 +0000 Subject: [PATCH 04/11] working --- src/transformers/conversion_mapping.py | 8 +- src/transformers/integrations/fp_quant.py | 87 ++++++++++--------- .../quantizers/quantizer_fp_quant.py | 24 ++++- 3 files changed, 72 insertions(+), 47 deletions(-) diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 9fb552c75a82..e7c660cbe3f4 100644 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -142,12 +142,12 @@ def _build_checkpoint_conversion_mapping(): if hasattr(torch.nn.utils.parametrizations, "weight_norm"): mapping["legacy"] += [ WeightRenaming( - source_keys=r"weight_g$", - target_keys="parametrizations.weight.original0", + source_patterns=r"weight_g$", + target_patterns="parametrizations.weight.original0", ), WeightRenaming( - source_keys=r"weight_v$", - target_keys="parametrizations.weight.original1", + source_patterns=r"weight_v$", + 
target_patterns="parametrizations.weight.original1", ), ] else: diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index e658f9688a6d..91ae09303c4e 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -34,53 +34,16 @@ def __init__(self, hf_quantizer): def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, missing_keys: Optional[list[str]] = None, **kwargs) -> dict[str, torch.Tensor]: target_key, value = tuple(input_dict.items())[0] - value = value[0] if isinstance(value, list) else value - - module, _ = get_module_from_name(model, target_key) - - # TODO: check if we need this or not, commented for now - # if target_device == "cpu" and param_name.endswith("weight"): - # # Works agains hard-coded missing key dispatch to CPU - # return - # The module holds either: - # * `weight` when `store_master_weights=True` - # * `qweight` and `scales` when `store_master_weights=False` and `pseudoquantization=False` - # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True` - if target_key.endswith(".qweight"): - # Loading a real quantized checkpoint without master weights - qweight = torch.nn.Parameter( - value, - requires_grad=False, - ) - - weight_key = target_key.rsplit(".", 1)[0] + ".weight" - dqweight_key = target_key.rsplit(".", 1)[0] + ".dqweight" - - return {f"{target_key}": qweight} - - if target_key.endswith(".dqweight"): - print(f"target_key: {target_key}") - # Loading a pseudo-quantized checkpoint without master weights - dqweight = torch.nn.Parameter(value) - - weight_key = target_key.rsplit(".", 1)[0] + ".weight" - dqweight_key = target_key.rsplit(".", 1)[0] + ".dqweight" - scales_key = target_key.rsplit(".", 1)[0] + ".scales" - - return { - f"{target_key}": dqweight, - f"{weight_key}": torch.nn.Parameter(torch.empty(0)), - f"{dqweight_key}": torch.nn.Parameter(torch.empty(0)), - f"{scales_key}": torch.nn.Parameter(torch.empty(0)) - } - + value = value[0] # Loading master weights or an unquantized checkpoint weight = torch.nn.Parameter(value) + module, _ = get_module_from_name(model, target_key) module.weight = weight - # print(f"module.state_dict(): {module.state_dict()}") # Let pre-forward handle the quantization and set None where necessary - module.pre_forward() + # This operation will quantize the weights internally + with torch.cuda.device(value.device): + module.pre_forward() prefix_target_key = target_key.rsplit(".", 1)[0] @@ -95,6 +58,46 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N missing_keys.discard(f"{prefix_target_key}.dqweight") return {} +class FpQuantDeserialize(ConversionOps): + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, full_layer_name: str | None = None, missing_keys: Optional[list[str]] = None, **kwargs) -> dict[str, torch.Tensor]: + target_key, value = tuple(input_dict.items())[0] + value = value[0] if isinstance(value, list) else value + module, _ = get_module_from_name(model, target_key) + # The module holds either: + # * `weight` when `store_master_weights=True` + # * `qweight` and `scales` when `store_master_weights=False` and `pseudoquantization=False` + # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True` + if target_key == ".qweight": + # Loading a real quantized checkpoint without master weights + qweight = torch.nn.Parameter( + 
value, + requires_grad=False, + ) + + return { + ".qweight": qweight, + # the way the FPQuantLinear module is designed, these parameters are expected in the model + # even though they are not used so we need to set them to zeros + ".weight": torch.nn.Parameter(torch.zeros(0)), + ".qweight": torch.nn.Parameter(torch.zeros(0)), + } + + if target_key == ".dqweight": + # Loading a pseudo-quantized checkpoint without master weights + dqweight = torch.nn.Parameter(value) + + return { + ".dqweight": dqweight, + # the way the FPQuantLinear module ips designed, these parameters are expected in the model + # even though they are not used so we need to set them to zeros + ".weight": torch.nn.Parameter(torch.zeros(0)), + ".qweight": torch.nn.Parameter(torch.zeros(0)), + ".scales": torch.nn.Parameter(torch.zeros(0)) + } + def adapt_fp_quant_config(config: FPQuantConfig): if config.forward_dtype == "mxfp4": forward_dtype = FPQuantDtype.MXFP4 diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index 74c4353c0764..a4d36501f7ad 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -183,4 +183,26 @@ def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, ** def get_quantize_ops(self): from ..integrations.fp_quant import FpQuantQuantize - return FpQuantQuantize(self) \ No newline at end of file + return FpQuantQuantize(self) + + def get_weight_conversions(self): + from ..integrations.fp_quant import FpQuantDeserialize + from ..core_model_loading import WeightConverter + if self.pre_quantized: + if self.quantization_config.pseudoquantization: + return [ + WeightConverter( + source_patterns=[".dqweight"], + target_patterns=".dqweight", + operations=[FpQuantDeserialize(self)], + ), + ] + else: + return [ + WeightConverter( + source_patterns=[".qweight"], + target_patterns=".qweight", + operations=[FpQuantDeserialize(self)], + ), + ] + return [] \ No newline at end of file From f86696f4566f431a505894e26356d2fdc5aaef9a Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:45:07 +0000 Subject: [PATCH 05/11] fix --- src/transformers/integrations/fp_quant.py | 14 +++++++++----- src/transformers/quantizers/quantizer_fp_quant.py | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index 91ae09303c4e..651e9f64f3e2 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -13,20 +13,24 @@ # limitations under the License. 
"FP-Quant integration file" +from typing import Optional + +import torch + from ..utils import ( is_fp_quant_available, ) -import torch -from typing import Optional if is_fp_quant_available(): from fp_quant import FPQuantConfig as FPQuantLinearConfig from fp_quant import FPQuantDtype from transformers.utils.quantization_config import FPQuantConfig -from ..quantizers.quantizers_utils import get_module_from_name + from ..core_model_loading import ConversionOps +from ..quantizers.quantizers_utils import get_module_from_name + class FpQuantQuantize(ConversionOps): def __init__(self, hf_quantizer): @@ -82,7 +86,7 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N # the way the FPQuantLinear module is designed, these parameters are expected in the model # even though they are not used so we need to set them to zeros ".weight": torch.nn.Parameter(torch.zeros(0)), - ".qweight": torch.nn.Parameter(torch.zeros(0)), + ".dqweight": torch.nn.Parameter(torch.zeros(0)), } if target_key == ".dqweight": @@ -124,4 +128,4 @@ def adapt_fp_quant_config(config: FPQuantConfig): pseudoquantization=config.pseudoquantization, transform_init=config.transform_init, modules_to_not_convert=config.modules_to_not_convert, - ) \ No newline at end of file + ) diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index a4d36501f7ad..80390405b0a7 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -186,8 +186,8 @@ def get_quantize_ops(self): return FpQuantQuantize(self) def get_weight_conversions(self): - from ..integrations.fp_quant import FpQuantDeserialize from ..core_model_loading import WeightConverter + from ..integrations.fp_quant import FpQuantDeserialize if self.pre_quantized: if self.quantization_config.pseudoquantization: return [ @@ -205,4 +205,4 @@ def get_weight_conversions(self): operations=[FpQuantDeserialize(self)], ), ] - return [] \ No newline at end of file + return [] From 58b8c99785284429a98e34976745bd79c25ac213 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:45:16 +0000 Subject: [PATCH 06/11] style --- src/transformers/integrations/fp_quant.py | 33 ++++++++++++++----- .../quantizers/quantizer_fp_quant.py | 7 ++-- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index 651e9f64f3e2..af7821786d6c 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -36,7 +36,13 @@ class FpQuantQuantize(ConversionOps): def __init__(self, hf_quantizer): self.hf_quantizer = hf_quantizer - def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, missing_keys: Optional[list[str]] = None, **kwargs) -> dict[str, torch.Tensor]: + def convert( + self, + input_dict: torch.Tensor, + model: Optional[torch.nn.Module] = None, + missing_keys: Optional[list[str]] = None, + **kwargs, + ) -> dict[str, torch.Tensor]: target_key, value = tuple(input_dict.items())[0] value = value[0] # Loading master weights or an unquantized checkpoint @@ -62,11 +68,19 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N missing_keys.discard(f"{prefix_target_key}.dqweight") return {} + class FpQuantDeserialize(ConversionOps): def __init__(self, hf_quantizer): self.hf_quantizer = hf_quantizer - def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = 
None, full_layer_name: str | None = None, missing_keys: Optional[list[str]] = None, **kwargs) -> dict[str, torch.Tensor]: + def convert( + self, + input_dict: torch.Tensor, + model: Optional[torch.nn.Module] = None, + full_layer_name: str | None = None, + missing_keys: Optional[list[str]] = None, + **kwargs, + ) -> dict[str, torch.Tensor]: target_key, value = tuple(input_dict.items())[0] value = value[0] if isinstance(value, list) else value module, _ = get_module_from_name(model, target_key) @@ -94,13 +108,14 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N dqweight = torch.nn.Parameter(value) return { - ".dqweight": dqweight, - # the way the FPQuantLinear module ips designed, these parameters are expected in the model - # even though they are not used so we need to set them to zeros - ".weight": torch.nn.Parameter(torch.zeros(0)), - ".qweight": torch.nn.Parameter(torch.zeros(0)), - ".scales": torch.nn.Parameter(torch.zeros(0)) - } + ".dqweight": dqweight, + # the way the FPQuantLinear module ips designed, these parameters are expected in the model + # even though they are not used so we need to set them to zeros + ".weight": torch.nn.Parameter(torch.zeros(0)), + ".qweight": torch.nn.Parameter(torch.zeros(0)), + ".scales": torch.nn.Parameter(torch.zeros(0)), + } + def adapt_fp_quant_config(config: FPQuantConfig): if config.forward_dtype == "mxfp4": diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index 80390405b0a7..4f462bc83bd0 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -140,10 +140,7 @@ def _process_model_before_weight_loading( from ..integrations.fp_quant import adapt_fp_quant_config - replace_with_fp_quant_linear( - model, - fp_quant_linear_config=adapt_fp_quant_config(self.quantization_config) - ) + replace_with_fp_quant_linear(model, fp_quant_linear_config=adapt_fp_quant_config(self.quantization_config)) model.config.quantization_config = self.quantization_config def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]: @@ -183,11 +180,13 @@ def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, ** def get_quantize_ops(self): from ..integrations.fp_quant import FpQuantQuantize + return FpQuantQuantize(self) def get_weight_conversions(self): from ..core_model_loading import WeightConverter from ..integrations.fp_quant import FpQuantDeserialize + if self.pre_quantized: if self.quantization_config.pseudoquantization: return [ From 2e02c2da83d1ce732b61bdcf06d2e5a2eeec8af6 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:47:09 +0000 Subject: [PATCH 07/11] clean --- src/transformers/core_model_loading.py | 4 +--- src/transformers/quantizers/quantizer_fp_quant.py | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index b65098090e86..5ab067a58fe9 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -905,9 +905,7 @@ def convert_and_load_state_dict_in_model( mapping.distributed_operation, hf_quantizer, ) - except SkipLayer as e: - print(e) - raise e + except SkipLayer: continue # Keep the current weight conversion mapping for later saving (in case it was coming directly from the user) diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index 
4f462bc83bd0..db2ceec321c3 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -107,7 +107,6 @@ def create_quantized_param( # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True` if param_name.endswith(".qweight"): - print(f"param_value qweight: {param_value.shape}") # Loading a real quantized checkpoint without master weights module.qweight = torch.nn.Parameter( param_value.to(target_device), @@ -118,7 +117,6 @@ def create_quantized_param( return if param_name.endswith(".dqweight"): - print(f"param_value dqweight: {param_value.shape}") # Loading a pseudo-quantized checkpoint without master weights module.dqweight = torch.nn.Parameter(param_value.to(target_device)) module.weight = None From 9072c057cf041ddebfa997f2081660537c2114b2 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:48:28 +0000 Subject: [PATCH 08/11] reset --- src/transformers/core_model_loading.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 5ab067a58fe9..5f07030afbc9 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -905,6 +905,9 @@ def convert_and_load_state_dict_in_model( mapping.distributed_operation, hf_quantizer, ) + + # Cleanup the tensors + mapping.reset() except SkipLayer: continue From 97a82931d421a492f278c9136bedbb9e6ba3fe2b Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:49:04 +0000 Subject: [PATCH 09/11] style --- src/transformers/core_model_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 5f07030afbc9..673b0caf2dd0 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -905,7 +905,7 @@ def convert_and_load_state_dict_in_model( mapping.distributed_operation, hf_quantizer, ) - + # Cleanup the tensors mapping.reset() except SkipLayer: From 3b5077f02cccf3bdcb2b815208a42bf5ec609303 Mon Sep 17 00:00:00 2001 From: medmekk Date: Mon, 8 Dec 2025 09:05:49 +0000 Subject: [PATCH 10/11] rm duplicate --- src/transformers/quantizers/quantizer_fp_quant.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index 4ae1d5b8a2aa..8b21b8a16694 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -121,16 +121,6 @@ def is_trainable(self, model: Optional["PreTrainedModel"] = None): def is_serializable(self, **kwargs): return True - def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool: - from fp_quant import FPQuantLinear - - module, tensor_name = get_module_from_name(model, param_name) - if isinstance(module, FPQuantLinear) and tensor_name in ["weight", "qweight", "dqweight"]: - # Only quantize weights of FPQuantLinear modules that are not already quantized - return True - else: - return False - def get_quantize_ops(self): from ..integrations.fp_quant import FpQuantQuantize From ebd8ad92141c2ceed3163bc5f91397463ab3c73a Mon Sep 17 00:00:00 2001 From: medmekk Date: Mon, 8 Dec 2025 20:56:09 +0000 Subject: [PATCH 11/11] ci: empty commit
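
Usage sketch: the series above routes FP-Quant checkpoints through the new conversion-ops loader. FpQuantQuantize quantizes full-precision weights on load via module.pre_forward(), while FpQuantDeserialize maps pre-quantized .qweight/.dqweight tensors onto the FPQuantLinear modules created in _process_model_before_weight_loading. The snippet below is a minimal, hypothetical end-to-end call, not part of this series: the checkpoint id and device are placeholders, and it assumes the fp_quant package plus a supported GPU are available.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id: any checkpoint saved with an FPQuantConfig quantization_config
# (real-quantized, pseudo-quantized, or unquantized master weights) is the assumption here.
model_id = "org/model-fp-quant"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# from_pretrained replaces eligible nn.Linear layers with FPQuantLinear, then the ops
# returned by get_quantize_ops() / get_weight_conversions() populate qweight/scales
# (or dqweight) as the state dict is streamed through convert_and_load_state_dict_in_model.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0]))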