From edbfae5172b0c2af2aaa3183bc31a694c15ed955 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 27 Nov 2025 15:57:10 +0000 Subject: [PATCH 01/11] initial --- src/transformers/integrations/fp_quant.py | 67 ++++++++++++++++++- .../quantizers/quantizer_fp_quant.py | 6 +- 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index ccf933796165..ff72af717ca0 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -17,13 +17,78 @@ is_fp_quant_available, ) +import torch +from typing import Optional if is_fp_quant_available(): from fp_quant import FPQuantConfig as FPQuantLinearConfig from fp_quant import FPQuantDtype from transformers.utils.quantization_config import FPQuantConfig +from ..quantizers.quantizers_utils import get_module_from_name +from ..core_model_loading import ConversionOps +class FpQuantQuantize(ConversionOps): + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, missing_keys: Optional[list[str]] = None, **kwargs) -> dict[str, torch.Tensor]: + target_key, value = tuple(input_dict.items())[0] + value = value[0] if isinstance(value, list) else value + + module, _ = get_module_from_name(model, target_key) + + # TODO: check if we need this or not, commented for now + # if target_device == "cpu" and param_name.endswith("weight"): + # # Works agains hard-coded missing key dispatch to CPU + # return + # The module holds either: + # * `weight` when `store_master_weights=True` + # * `qweight` and `scales` when `store_master_weights=False` and `pseudoquantization=False` + # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True` + if target_key.endswith(".qweight"): + # Loading a real quantized checkpoint without master weights + qweight = torch.nn.Parameter( + value, + requires_grad=False, + ) + + weight_key = target_key.rsplit(".", 1)[0] + ".weight" + dqweight_key = target_key.rsplit(".", 1)[0] + ".dqweight" + + return {f"{target_key}": qweight, + } + + if target_key.endswith(".dqweight"): + # Loading a pseudo-quantized checkpoint without master weights + dqweight = torch.nn.Parameter(value) + + weight_key = target_key.rsplit(".", 1)[0] + ".weight" + dqweight_key = target_key.rsplit(".", 1)[0] + ".dqweight" + scales_key = target_key.rsplit(".", 1)[0] + ".scales" + + return { + f"{target_key}": dqweight, + f"{weight_key}": torch.nn.Parameter(torch.empty(0)), + f"{dqweight_key}": torch.nn.Parameter(torch.empty(0)), + f"{scales_key}": torch.nn.Parameter(torch.empty(0)) + } + + # Loading master weights or an unquantized checkpoint + weight = torch.nn.Parameter(value) + module.weight = weight + # Let pre-forward handle the quantization and set None where necessary + module.pre_forward() + + prefix_target_key = target_key.rsplit(".", 1)[0] + + return {target_key: weight, + f"{prefix_target_key}.act_global_scale": module.act_global_scale, + f"{prefix_target_key}.backward_hadamard_matrix": module.backward_hadamard_matrix, + f"{prefix_target_key}.forward_hadamard_matrix": module.forward_hadamard_matrix, + f"{prefix_target_key}.qweight": module.qweight, + f"{prefix_target_key}.scales": module.scales + } def adapt_fp_quant_config(config: FPQuantConfig): if config.forward_dtype == "mxfp4": @@ -51,4 +116,4 @@ def adapt_fp_quant_config(config: FPQuantConfig): pseudoquantization=config.pseudoquantization, 
transform_init=config.transform_init, modules_to_not_convert=config.modules_to_not_convert, - ) + ) \ No newline at end of file diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index b5c9f2c8179f..85c5115d7645 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -140,7 +140,7 @@ def _process_model_before_weight_loading( replace_with_fp_quant_linear( model, - fp_quant_linear_config=adapt_fp_quant_config(self.quantization_config), + fp_quant_linear_config=adapt_fp_quant_config(self.quantization_config) ) model.config.quantization_config = self.quantization_config @@ -178,3 +178,7 @@ def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, ** return True else: return False + + def get_quantize_ops(self): + from ..integrations.fp_quant import FpQuantQuantize + return FpQuantQuantize(self) \ No newline at end of file From 53a2f598b1d1a26675927547acdc5dea7c5ca870 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 27 Nov 2025 17:29:00 +0000 Subject: [PATCH 02/11] quantization fixed --- src/transformers/integrations/fp_quant.py | 25 ++++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index ff72af717ca0..e658f9688a6d 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -56,10 +56,10 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N weight_key = target_key.rsplit(".", 1)[0] + ".weight" dqweight_key = target_key.rsplit(".", 1)[0] + ".dqweight" - return {f"{target_key}": qweight, - } + return {f"{target_key}": qweight} if target_key.endswith(".dqweight"): + print(f"target_key: {target_key}") # Loading a pseudo-quantized checkpoint without master weights dqweight = torch.nn.Parameter(value) @@ -77,18 +77,23 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N # Loading master weights or an unquantized checkpoint weight = torch.nn.Parameter(value) module.weight = weight + + # print(f"module.state_dict(): {module.state_dict()}") # Let pre-forward handle the quantization and set None where necessary module.pre_forward() - + prefix_target_key = target_key.rsplit(".", 1)[0] - return {target_key: weight, - f"{prefix_target_key}.act_global_scale": module.act_global_scale, - f"{prefix_target_key}.backward_hadamard_matrix": module.backward_hadamard_matrix, - f"{prefix_target_key}.forward_hadamard_matrix": module.forward_hadamard_matrix, - f"{prefix_target_key}.qweight": module.qweight, - f"{prefix_target_key}.scales": module.scales - } + # keys are set inside the module.pre_forward() method, we don't need remove them from the missing keys list + missing_keys.discard(target_key) + missing_keys.discard(f"{prefix_target_key}.backward_hadamard_matrix") + missing_keys.discard(f"{prefix_target_key}.forward_hadamard_matrix") + missing_keys.discard(f"{prefix_target_key}.act_global_scale") + missing_keys.discard(f"{prefix_target_key}.weight_global_scale") + missing_keys.discard(f"{prefix_target_key}.qweight") + missing_keys.discard(f"{prefix_target_key}.scales") + missing_keys.discard(f"{prefix_target_key}.dqweight") + return {} def adapt_fp_quant_config(config: FPQuantConfig): if config.forward_dtype == "mxfp4": From 29febd45b5da89bcb5044585a0d31dab1328107d Mon Sep 17 00:00:00 2001 From: medmekk Date: Fri, 28 Nov 2025 09:27:43 +0000 
Subject: [PATCH 03/11] up --- src/transformers/conversion_mapping.py | 8 ++++---- src/transformers/core_model_loading.py | 7 +++---- src/transformers/quantizers/quantizer_fp_quant.py | 2 ++ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 5968bd08d406..9fb552c75a82 100644 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -142,12 +142,12 @@ def _build_checkpoint_conversion_mapping(): if hasattr(torch.nn.utils.parametrizations, "weight_norm"): mapping["legacy"] += [ WeightRenaming( - source_patterns="weight_g", - target_patterns="parametrizations.weight.original0", + source_keys=r"weight_g$", + target_keys="parametrizations.weight.original0", ), WeightRenaming( - source_patterns="weight_v", - target_patterns="parametrizations.weight.original1", + source_keys=r"weight_v$", + target_keys="parametrizations.weight.original1", ), ] else: diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 673b0caf2dd0..b65098090e86 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -905,10 +905,9 @@ def convert_and_load_state_dict_in_model( mapping.distributed_operation, hf_quantizer, ) - - # Cleanup the tensors - mapping.reset() - except SkipLayer: + except SkipLayer as e: + print(e) + raise e continue # Keep the current weight conversion mapping for later saving (in case it was coming directly from the user) diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index 85c5115d7645..74c4353c0764 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -107,6 +107,7 @@ def create_quantized_param( # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True` if param_name.endswith(".qweight"): + print(f"param_value qweight: {param_value.shape}") # Loading a real quantized checkpoint without master weights module.qweight = torch.nn.Parameter( param_value.to(target_device), @@ -117,6 +118,7 @@ def create_quantized_param( return if param_name.endswith(".dqweight"): + print(f"param_value dqweight: {param_value.shape}") # Loading a pseudo-quantized checkpoint without master weights module.dqweight = torch.nn.Parameter(param_value.to(target_device)) module.weight = None From 5824ecbe4ea93d27eefaa2074099658b64e457b0 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:44:17 +0000 Subject: [PATCH 04/11] working --- src/transformers/conversion_mapping.py | 8 +- src/transformers/integrations/fp_quant.py | 87 ++++++++++--------- .../quantizers/quantizer_fp_quant.py | 24 ++++- 3 files changed, 72 insertions(+), 47 deletions(-) diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 9fb552c75a82..e7c660cbe3f4 100644 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -142,12 +142,12 @@ def _build_checkpoint_conversion_mapping(): if hasattr(torch.nn.utils.parametrizations, "weight_norm"): mapping["legacy"] += [ WeightRenaming( - source_keys=r"weight_g$", - target_keys="parametrizations.weight.original0", + source_patterns=r"weight_g$", + target_patterns="parametrizations.weight.original0", ), WeightRenaming( - source_keys=r"weight_v$", - target_keys="parametrizations.weight.original1", + source_patterns=r"weight_v$", + 
target_patterns="parametrizations.weight.original1", ), ] else: diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index e658f9688a6d..91ae09303c4e 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -34,53 +34,16 @@ def __init__(self, hf_quantizer): def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, missing_keys: Optional[list[str]] = None, **kwargs) -> dict[str, torch.Tensor]: target_key, value = tuple(input_dict.items())[0] - value = value[0] if isinstance(value, list) else value - - module, _ = get_module_from_name(model, target_key) - - # TODO: check if we need this or not, commented for now - # if target_device == "cpu" and param_name.endswith("weight"): - # # Works agains hard-coded missing key dispatch to CPU - # return - # The module holds either: - # * `weight` when `store_master_weights=True` - # * `qweight` and `scales` when `store_master_weights=False` and `pseudoquantization=False` - # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True` - if target_key.endswith(".qweight"): - # Loading a real quantized checkpoint without master weights - qweight = torch.nn.Parameter( - value, - requires_grad=False, - ) - - weight_key = target_key.rsplit(".", 1)[0] + ".weight" - dqweight_key = target_key.rsplit(".", 1)[0] + ".dqweight" - - return {f"{target_key}": qweight} - - if target_key.endswith(".dqweight"): - print(f"target_key: {target_key}") - # Loading a pseudo-quantized checkpoint without master weights - dqweight = torch.nn.Parameter(value) - - weight_key = target_key.rsplit(".", 1)[0] + ".weight" - dqweight_key = target_key.rsplit(".", 1)[0] + ".dqweight" - scales_key = target_key.rsplit(".", 1)[0] + ".scales" - - return { - f"{target_key}": dqweight, - f"{weight_key}": torch.nn.Parameter(torch.empty(0)), - f"{dqweight_key}": torch.nn.Parameter(torch.empty(0)), - f"{scales_key}": torch.nn.Parameter(torch.empty(0)) - } - + value = value[0] # Loading master weights or an unquantized checkpoint weight = torch.nn.Parameter(value) + module, _ = get_module_from_name(model, target_key) module.weight = weight - # print(f"module.state_dict(): {module.state_dict()}") # Let pre-forward handle the quantization and set None where necessary - module.pre_forward() + # This operation will quantize the weights internally + with torch.cuda.device(value.device): + module.pre_forward() prefix_target_key = target_key.rsplit(".", 1)[0] @@ -95,6 +58,46 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N missing_keys.discard(f"{prefix_target_key}.dqweight") return {} +class FpQuantDeserialize(ConversionOps): + def __init__(self, hf_quantizer): + self.hf_quantizer = hf_quantizer + + def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, full_layer_name: str | None = None, missing_keys: Optional[list[str]] = None, **kwargs) -> dict[str, torch.Tensor]: + target_key, value = tuple(input_dict.items())[0] + value = value[0] if isinstance(value, list) else value + module, _ = get_module_from_name(model, target_key) + # The module holds either: + # * `weight` when `store_master_weights=True` + # * `qweight` and `scales` when `store_master_weights=False` and `pseudoquantization=False` + # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True` + if target_key == ".qweight": + # Loading a real quantized checkpoint without master weights + qweight = torch.nn.Parameter( + 
value, + requires_grad=False, + ) + + return { + ".qweight": qweight, + # the way the FPQuantLinear module is designed, these parameters are expected in the model + # even though they are not used so we need to set them to zeros + ".weight": torch.nn.Parameter(torch.zeros(0)), + ".qweight": torch.nn.Parameter(torch.zeros(0)), + } + + if target_key == ".dqweight": + # Loading a pseudo-quantized checkpoint without master weights + dqweight = torch.nn.Parameter(value) + + return { + ".dqweight": dqweight, + # the way the FPQuantLinear module ips designed, these parameters are expected in the model + # even though they are not used so we need to set them to zeros + ".weight": torch.nn.Parameter(torch.zeros(0)), + ".qweight": torch.nn.Parameter(torch.zeros(0)), + ".scales": torch.nn.Parameter(torch.zeros(0)) + } + def adapt_fp_quant_config(config: FPQuantConfig): if config.forward_dtype == "mxfp4": forward_dtype = FPQuantDtype.MXFP4 diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index 74c4353c0764..a4d36501f7ad 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -183,4 +183,26 @@ def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, ** def get_quantize_ops(self): from ..integrations.fp_quant import FpQuantQuantize - return FpQuantQuantize(self) \ No newline at end of file + return FpQuantQuantize(self) + + def get_weight_conversions(self): + from ..integrations.fp_quant import FpQuantDeserialize + from ..core_model_loading import WeightConverter + if self.pre_quantized: + if self.quantization_config.pseudoquantization: + return [ + WeightConverter( + source_patterns=[".dqweight"], + target_patterns=".dqweight", + operations=[FpQuantDeserialize(self)], + ), + ] + else: + return [ + WeightConverter( + source_patterns=[".qweight"], + target_patterns=".qweight", + operations=[FpQuantDeserialize(self)], + ), + ] + return [] \ No newline at end of file From f86696f4566f431a505894e26356d2fdc5aaef9a Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:45:07 +0000 Subject: [PATCH 05/11] fix --- src/transformers/integrations/fp_quant.py | 14 +++++++++----- src/transformers/quantizers/quantizer_fp_quant.py | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index 91ae09303c4e..651e9f64f3e2 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -13,20 +13,24 @@ # limitations under the License. 
"FP-Quant integration file" +from typing import Optional + +import torch + from ..utils import ( is_fp_quant_available, ) -import torch -from typing import Optional if is_fp_quant_available(): from fp_quant import FPQuantConfig as FPQuantLinearConfig from fp_quant import FPQuantDtype from transformers.utils.quantization_config import FPQuantConfig -from ..quantizers.quantizers_utils import get_module_from_name + from ..core_model_loading import ConversionOps +from ..quantizers.quantizers_utils import get_module_from_name + class FpQuantQuantize(ConversionOps): def __init__(self, hf_quantizer): @@ -82,7 +86,7 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N # the way the FPQuantLinear module is designed, these parameters are expected in the model # even though they are not used so we need to set them to zeros ".weight": torch.nn.Parameter(torch.zeros(0)), - ".qweight": torch.nn.Parameter(torch.zeros(0)), + ".dqweight": torch.nn.Parameter(torch.zeros(0)), } if target_key == ".dqweight": @@ -124,4 +128,4 @@ def adapt_fp_quant_config(config: FPQuantConfig): pseudoquantization=config.pseudoquantization, transform_init=config.transform_init, modules_to_not_convert=config.modules_to_not_convert, - ) \ No newline at end of file + ) diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index a4d36501f7ad..80390405b0a7 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -186,8 +186,8 @@ def get_quantize_ops(self): return FpQuantQuantize(self) def get_weight_conversions(self): - from ..integrations.fp_quant import FpQuantDeserialize from ..core_model_loading import WeightConverter + from ..integrations.fp_quant import FpQuantDeserialize if self.pre_quantized: if self.quantization_config.pseudoquantization: return [ @@ -205,4 +205,4 @@ def get_weight_conversions(self): operations=[FpQuantDeserialize(self)], ), ] - return [] \ No newline at end of file + return [] From 58b8c99785284429a98e34976745bd79c25ac213 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:45:16 +0000 Subject: [PATCH 06/11] style --- src/transformers/integrations/fp_quant.py | 33 ++++++++++++++----- .../quantizers/quantizer_fp_quant.py | 7 ++-- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/transformers/integrations/fp_quant.py b/src/transformers/integrations/fp_quant.py index 651e9f64f3e2..af7821786d6c 100644 --- a/src/transformers/integrations/fp_quant.py +++ b/src/transformers/integrations/fp_quant.py @@ -36,7 +36,13 @@ class FpQuantQuantize(ConversionOps): def __init__(self, hf_quantizer): self.hf_quantizer = hf_quantizer - def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = None, missing_keys: Optional[list[str]] = None, **kwargs) -> dict[str, torch.Tensor]: + def convert( + self, + input_dict: torch.Tensor, + model: Optional[torch.nn.Module] = None, + missing_keys: Optional[list[str]] = None, + **kwargs, + ) -> dict[str, torch.Tensor]: target_key, value = tuple(input_dict.items())[0] value = value[0] # Loading master weights or an unquantized checkpoint @@ -62,11 +68,19 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N missing_keys.discard(f"{prefix_target_key}.dqweight") return {} + class FpQuantDeserialize(ConversionOps): def __init__(self, hf_quantizer): self.hf_quantizer = hf_quantizer - def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = 
None, full_layer_name: str | None = None, missing_keys: Optional[list[str]] = None, **kwargs) -> dict[str, torch.Tensor]: + def convert( + self, + input_dict: torch.Tensor, + model: Optional[torch.nn.Module] = None, + full_layer_name: str | None = None, + missing_keys: Optional[list[str]] = None, + **kwargs, + ) -> dict[str, torch.Tensor]: target_key, value = tuple(input_dict.items())[0] value = value[0] if isinstance(value, list) else value module, _ = get_module_from_name(model, target_key) @@ -94,13 +108,14 @@ def convert(self, input_dict: torch.Tensor, model: Optional[torch.nn.Module] = N dqweight = torch.nn.Parameter(value) return { - ".dqweight": dqweight, - # the way the FPQuantLinear module ips designed, these parameters are expected in the model - # even though they are not used so we need to set them to zeros - ".weight": torch.nn.Parameter(torch.zeros(0)), - ".qweight": torch.nn.Parameter(torch.zeros(0)), - ".scales": torch.nn.Parameter(torch.zeros(0)) - } + ".dqweight": dqweight, + # the way the FPQuantLinear module ips designed, these parameters are expected in the model + # even though they are not used so we need to set them to zeros + ".weight": torch.nn.Parameter(torch.zeros(0)), + ".qweight": torch.nn.Parameter(torch.zeros(0)), + ".scales": torch.nn.Parameter(torch.zeros(0)), + } + def adapt_fp_quant_config(config: FPQuantConfig): if config.forward_dtype == "mxfp4": diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index 80390405b0a7..4f462bc83bd0 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -140,10 +140,7 @@ def _process_model_before_weight_loading( from ..integrations.fp_quant import adapt_fp_quant_config - replace_with_fp_quant_linear( - model, - fp_quant_linear_config=adapt_fp_quant_config(self.quantization_config) - ) + replace_with_fp_quant_linear(model, fp_quant_linear_config=adapt_fp_quant_config(self.quantization_config)) model.config.quantization_config = self.quantization_config def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]: @@ -183,11 +180,13 @@ def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, ** def get_quantize_ops(self): from ..integrations.fp_quant import FpQuantQuantize + return FpQuantQuantize(self) def get_weight_conversions(self): from ..core_model_loading import WeightConverter from ..integrations.fp_quant import FpQuantDeserialize + if self.pre_quantized: if self.quantization_config.pseudoquantization: return [ From 2e02c2da83d1ce732b61bdcf06d2e5a2eeec8af6 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:47:09 +0000 Subject: [PATCH 07/11] clean --- src/transformers/core_model_loading.py | 4 +--- src/transformers/quantizers/quantizer_fp_quant.py | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index b65098090e86..5ab067a58fe9 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -905,9 +905,7 @@ def convert_and_load_state_dict_in_model( mapping.distributed_operation, hf_quantizer, ) - except SkipLayer as e: - print(e) - raise e + except SkipLayer: continue # Keep the current weight conversion mapping for later saving (in case it was coming directly from the user) diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index 
4f462bc83bd0..db2ceec321c3 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -107,7 +107,6 @@ def create_quantized_param( # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True` if param_name.endswith(".qweight"): - print(f"param_value qweight: {param_value.shape}") # Loading a real quantized checkpoint without master weights module.qweight = torch.nn.Parameter( param_value.to(target_device), @@ -118,7 +117,6 @@ def create_quantized_param( return if param_name.endswith(".dqweight"): - print(f"param_value dqweight: {param_value.shape}") # Loading a pseudo-quantized checkpoint without master weights module.dqweight = torch.nn.Parameter(param_value.to(target_device)) module.weight = None From 9072c057cf041ddebfa997f2081660537c2114b2 Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:48:28 +0000 Subject: [PATCH 08/11] reset --- src/transformers/core_model_loading.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 5ab067a58fe9..5f07030afbc9 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -905,6 +905,9 @@ def convert_and_load_state_dict_in_model( mapping.distributed_operation, hf_quantizer, ) + + # Cleanup the tensors + mapping.reset() except SkipLayer: continue From 97a82931d421a492f278c9136bedbb9e6ba3fe2b Mon Sep 17 00:00:00 2001 From: medmekk Date: Thu, 4 Dec 2025 10:49:04 +0000 Subject: [PATCH 09/11] style --- src/transformers/core_model_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/core_model_loading.py b/src/transformers/core_model_loading.py index 5f07030afbc9..673b0caf2dd0 100644 --- a/src/transformers/core_model_loading.py +++ b/src/transformers/core_model_loading.py @@ -905,7 +905,7 @@ def convert_and_load_state_dict_in_model( mapping.distributed_operation, hf_quantizer, ) - + # Cleanup the tensors mapping.reset() except SkipLayer: From 3b5077f02cccf3bdcb2b815208a42bf5ec609303 Mon Sep 17 00:00:00 2001 From: medmekk Date: Mon, 8 Dec 2025 09:05:49 +0000 Subject: [PATCH 10/11] rm duplicate --- src/transformers/quantizers/quantizer_fp_quant.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/transformers/quantizers/quantizer_fp_quant.py b/src/transformers/quantizers/quantizer_fp_quant.py index 4ae1d5b8a2aa..8b21b8a16694 100644 --- a/src/transformers/quantizers/quantizer_fp_quant.py +++ b/src/transformers/quantizers/quantizer_fp_quant.py @@ -121,16 +121,6 @@ def is_trainable(self, model: Optional["PreTrainedModel"] = None): def is_serializable(self, **kwargs): return True - def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool: - from fp_quant import FPQuantLinear - - module, tensor_name = get_module_from_name(model, param_name) - if isinstance(module, FPQuantLinear) and tensor_name in ["weight", "qweight", "dqweight"]: - # Only quantize weights of FPQuantLinear modules that are not already quantized - return True - else: - return False - def get_quantize_ops(self): from ..integrations.fp_quant import FpQuantQuantize From ebd8ad92141c2ceed3163bc5f91397463ab3c73a Mon Sep 17 00:00:00 2001 From: medmekk Date: Mon, 8 Dec 2025 20:56:09 +0000 Subject: [PATCH 11/11] ci: empty commit
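
Usage sketch: the series above routes FP-Quant checkpoints through the new conversion-ops loader. FpQuantQuantize quantizes full-precision weights on load via module.pre_forward(), while FpQuantDeserialize maps pre-quantized .qweight/.dqweight tensors onto the FPQuantLinear modules created in _process_model_before_weight_loading. The snippet below is a minimal, hypothetical end-to-end call, not part of this series: the checkpoint id and device are placeholders, and it assumes the fp_quant package plus a supported GPU are available.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id: any checkpoint saved with an FPQuantConfig quantization_config
# (real-quantized, pseudo-quantized, or unquantized master weights) is the assumption here.
model_id = "org/model-fp-quant"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# from_pretrained replaces eligible nn.Linear layers with FPQuantLinear, then the ops
# returned by get_quantize_ops() / get_weight_conversions() populate qweight/scales
# (or dqweight) as the state dict is streamed through convert_and_load_state_dict_in_model.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)

inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0]))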