From 9c6357c7d22d6e35605a1a0c8bb9e9a4f2c054ed Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Fri, 5 Dec 2025 21:23:24 +0000 Subject: [PATCH 1/7] support saving/loading multiple sub_processor of the same kind --- src/transformers/processing_utils.py | 110 ++++++++++++++++++---- tests/models/auto/test_processor_auto.py | 113 +++++++++++++++++++++++ 2 files changed, 204 insertions(+), 19 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index f54ddeb1b2a6..85eb5cdd3f9d 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -130,6 +130,26 @@ def keys(self): "video_processor": "BaseVideoProcessor", } + +def _get_modality_for_attribute(attribute_name: str) -> str: + """ + Get the canonical modality type for a given attribute name. + + For example: + - "image_processor" -> "image_processor" + - "encoder_image_processor" -> "image_processor" + - "text_tokenizer" -> "tokenizer" + - "my_feature_extractor" -> "feature_extractor" + """ + for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys(): + if modality in attribute_name: + return modality + raise ValueError( + f"Cannot determine modality for attribute '{attribute_name}'. " + f"Attribute name must contain one of: {list(MODALITY_TO_AUTOPROCESSOR_MAPPING.keys())}" + ) + + if sys.version_info >= (3, 11): Unpack = typing.Unpack else: @@ -664,8 +684,10 @@ def check_argument_for_proper_class(self, argument_name, argument): mismatch between expected and actual class, an error is raise. Otherwise, the proper retrieved class is returned. """ - if argument_name not in MODALITY_TO_BASE_CLASS_MAPPING and "tokenizer" in argument_name: - argument_name = "tokenizer" + # If the exact attribute name is not in the mapping, use its canonical modality + # (e.g., "encoder_tokenizer" -> "tokenizer") + if argument_name not in MODALITY_TO_BASE_CLASS_MAPPING: + argument_name = _get_modality_for_attribute(argument_name) class_name = MODALITY_TO_BASE_CLASS_MAPPING.get(argument_name) if isinstance(class_name, tuple): proper_class = tuple(self.get_possibly_dynamic_module(n) for n in class_name if n is not None) @@ -696,9 +718,13 @@ def to_dict(self) -> dict[str, Any]: # extra attributes to be kept attrs_to_save += ["auto_map"] + # Remove tokenizers from output - they have their own vocab files and are saved separately. + # All other sub-processors (image_processor, feature_extractor, etc.) are kept in processor_config.json. for attribute in self.__class__.get_attributes(): - if "tokenizer" in attribute and attribute in output: - del output[attribute] + if attribute in output: + modality = _get_modality_for_attribute(attribute) + if modality == "tokenizer": + del output[attribute] if "chat_template" in output: del output["chat_template"] @@ -820,13 +846,15 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): if hasattr(attribute, "_set_processor_class"): attribute._set_processor_class(self.__class__.__name__) - # Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json` - if attribute_name == "tokenizer": - attribute.save_pretrained(save_directory) - # if a model has multiple tokenizers, save the additional tokenizers in their own folders. - # Note that the additional tokenizers must have "tokenizer" in their attribute name. - elif "tokenizer" in attribute_name: - attribute.save_pretrained(os.path.join(save_directory, attribute_name)) + modality = _get_modality_for_attribute(attribute_name) + is_primary = attribute_name == modality + if modality == "tokenizer": + # Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json` + if is_primary: + attribute.save_pretrained(save_directory) + else: + # if a model has multiple tokenizers, save the additional tokenizers in their own folders. + attribute.save_pretrained(os.path.join(save_directory, attribute_name)) elif attribute._auto_class is not None: custom_object_save(attribute, save_directory, config=attribute) @@ -1394,8 +1422,9 @@ def from_pretrained( if token is not None: kwargs["token"] = token - args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) + # Get processor_dict first so we can use it to instantiate non-tokenizer sub-processors processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs) + args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, processor_dict, **kwargs) return cls.from_args_and_dict(args, processor_dict, **kwargs) @classmethod @@ -1406,7 +1435,7 @@ def get_attributes(cls): # don't treat audio_tokenizer as an attribute if sub_processor_type == "audio_tokenizer": continue - if sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING or "tokenizer" in sub_processor_type: + if any(modality in sub_processor_type for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys()): attributes.append(sub_processor_type) # Legacy processors may not override `__init__` and instead expose modality @@ -1420,7 +1449,7 @@ def get_attributes(cls): inferred_attribute = attribute_name[: -len("_class")] if inferred_attribute == "audio_tokenizer": continue - if inferred_attribute in MODALITY_TO_AUTOPROCESSOR_MAPPING or "tokenizer" in inferred_attribute: + if any(modality in inferred_attribute for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys()): attributes.append(inferred_attribute) return attributes @@ -1448,20 +1477,36 @@ def register_for_auto_class(cls, auto_class="AutoProcessor"): cls._auto_class = auto_class @classmethod - def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor_dict=None, **kwargs): """ Identify and instantiate the subcomponents of Processor classes, such as image processors, tokenizers, and feature extractors. This method inspects the processor's `__init__` signature to identify parameters that correspond to known modality types (image_processor, tokenizer, feature_extractor, etc.) or contain - "tokenizer" in their name. It then uses the appropriate Auto class (AutoImageProcessor, AutoTokenizer, etc.) - from `MODALITY_TO_AUTOPROCESSOR_MAPPING` to load each subcomponent via `.from_pretrained()`. For tokenizer-like - parameters not explicitly in the mapping, the method uses AutoTokenizer with a subfolder argument. + modality names in their attribute name. + + For tokenizers: Uses the appropriate Auto class (AutoTokenizer) to load via `.from_pretrained()`. + Additional tokenizers (e.g., "decoder_tokenizer") are loaded from subfolders. + + For other sub-processors (image_processor, feature_extractor, etc.): Primary ones are loaded via + Auto class. Additional ones are instantiated from the config stored in processor_config.json + (passed as processor_dict). + + Args: + pretrained_model_name_or_path: Path or model id to load from. + processor_dict: Optional dict containing processor config (from processor_config.json). + Required when loading additional non-tokenizer sub-processors. """ args = [] + processor_dict = processor_dict if processor_dict is not None else {} + # get args from processor init signature sub_processors = cls.get_attributes() for sub_processor_type in sub_processors: - if sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING: + modality = _get_modality_for_attribute(sub_processor_type) + is_primary = sub_processor_type == modality + + if is_primary: + # Primary non-tokenizer sub-processor: load via Auto class auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type] sub_processor = auto_processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs) args.append(sub_processor) @@ -1474,6 +1519,33 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs) ) args.append(sub_processor) + elif sub_processor_type in processor_dict: + # Additional non-tokenizer sub-processor: instantiate from config in processor_dict + sub_processor_config = processor_dict[sub_processor_type] + if isinstance(sub_processor_config, dict): + # Determine the class to instantiate + # Image processors have 'image_processor_type', feature extractors have 'feature_extractor_type' + type_key = f"{modality}_type" + class_name = sub_processor_config.get(type_key) + if class_name is None: + raise ValueError( + f"Cannot instantiate {sub_processor_type}: missing '{type_key}' in config. " + f"Config keys: {list(sub_processor_config.keys())}" + ) + processor_class = cls.get_possibly_dynamic_module(class_name) + sub_processor = processor_class(**sub_processor_config) + args.append(sub_processor) + else: + raise ValueError( + f"Expected dict for {sub_processor_type} in processor_config.json, " + f"got {type(sub_processor_config)}" + ) + else: + raise ValueError( + f"Cannot find config for {sub_processor_type} in processor_config.json. " + f"Available keys: {list(processor_dict.keys())}" + ) + return args @staticmethod diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py index 63f28d3dea9d..4e618ea0f9b5 100644 --- a/tests/models/auto/test_processor_auto.py +++ b/tests/models/auto/test_processor_auto.py @@ -35,6 +35,7 @@ AutoTokenizer, BaseVideoProcessor, BertTokenizer, + CLIPImageProcessorFast, FeatureExtractionMixin, ImageProcessingMixin, LlamaTokenizer, @@ -42,6 +43,7 @@ LlavaProcessor, ProcessorMixin, SiglipImageProcessor, + SiglipImageProcessorFast, Wav2Vec2Config, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, @@ -431,6 +433,117 @@ def test_auto_processor_save_load(self): second_processor = AutoProcessor.from_pretrained(tmp_dir) self.assertEqual(second_processor.__class__.__name__, processor.__class__.__name__) + def test_processor_with_multiple_tokenizers_save_load(self): + """Test that processors with multiple tokenizers save and load correctly.""" + + class DualTokenizerProcessor(ProcessorMixin): + """A processor with two tokenizers and an image processor.""" + + def __init__(self, tokenizer, decoder_tokenizer, image_processor): + super().__init__(tokenizer, decoder_tokenizer, image_processor) + + # Create processor with multiple tokenizers + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM") + decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") + image_processor = SiglipImageProcessor() + + processor = DualTokenizerProcessor( + tokenizer=tokenizer, + decoder_tokenizer=decoder_tokenizer, + image_processor=image_processor, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + processor.save_pretrained(tmp_dir) + + # Verify directory structure: primary tokenizer in root, additional in subfolder + self.assertTrue(os.path.exists(os.path.join(tmp_dir, "tokenizer_config.json"))) + self.assertTrue(os.path.isdir(os.path.join(tmp_dir, "decoder_tokenizer"))) + self.assertTrue(os.path.exists(os.path.join(tmp_dir, "decoder_tokenizer", "tokenizer_config.json"))) + + # Verify processor_config.json contains image_processor but not tokenizers + with open(os.path.join(tmp_dir, "processor_config.json")) as f: + processor_config = json.load(f) + self.assertIn("image_processor", processor_config) + self.assertNotIn("tokenizer", processor_config) + self.assertNotIn("decoder_tokenizer", processor_config) + + # Reload the full processor and verify all attributes + loaded_processor = DualTokenizerProcessor.from_pretrained(tmp_dir) + + # Verify the processor has all expected attributes + self.assertTrue(hasattr(loaded_processor, "tokenizer")) + self.assertTrue(hasattr(loaded_processor, "decoder_tokenizer")) + self.assertTrue(hasattr(loaded_processor, "image_processor")) + + # Verify tokenizers loaded correctly + self.assertEqual(loaded_processor.tokenizer.vocab_size, tokenizer.vocab_size) + self.assertEqual(loaded_processor.decoder_tokenizer.vocab_size, decoder_tokenizer.vocab_size) + + # Verify image processor loaded correctly + self.assertEqual(loaded_processor.image_processor.size, image_processor.size) + + def test_processor_with_multiple_image_processors_save_load(self): + """Test that processors with multiple image processors save and load correctly.""" + + class DualImageProcessorProcessor(ProcessorMixin): + """A processor with two image processors and a tokenizer.""" + + def __init__(self, tokenizer, image_processor, encoder_image_processor): + super().__init__(tokenizer, image_processor, encoder_image_processor) + + # Create processor with multiple image processors + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM") + image_processor = SiglipImageProcessorFast(size={"height": 224, "width": 224}) + encoder_image_processor = CLIPImageProcessorFast(size={"height": 384, "width": 384}) + + processor = DualImageProcessorProcessor( + tokenizer=tokenizer, + image_processor=image_processor, + encoder_image_processor=encoder_image_processor, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + processor.save_pretrained(tmp_dir) + + # Verify processor_config.json contains both image processors + with open(os.path.join(tmp_dir, "processor_config.json")) as f: + processor_config = json.load(f) + self.assertIn("image_processor", processor_config) + self.assertIn("encoder_image_processor", processor_config) + self.assertNotIn("tokenizer", processor_config) + + # Verify both image processors have the correct type key for instantiation + self.assertIn("image_processor_type", processor_config["image_processor"]) + self.assertIn("image_processor_type", processor_config["encoder_image_processor"]) + self.assertEqual(processor_config["image_processor"]["image_processor_type"], "SiglipImageProcessorFast") + self.assertEqual( + processor_config["encoder_image_processor"]["image_processor_type"], "CLIPImageProcessorFast" + ) + + # Verify the sizes are different (to ensure they're separate configs) + self.assertEqual(processor_config["image_processor"]["size"], {"height": 224, "width": 224}) + self.assertEqual(processor_config["encoder_image_processor"]["size"], {"height": 384, "width": 384}) + + # Reload the full processor and verify all attributes + loaded_processor = DualImageProcessorProcessor.from_pretrained(tmp_dir) + + # Verify the processor has all expected attributes + self.assertTrue(hasattr(loaded_processor, "tokenizer")) + self.assertTrue(hasattr(loaded_processor, "image_processor")) + self.assertTrue(hasattr(loaded_processor, "encoder_image_processor")) + + # Verify tokenizer loaded correctly + self.assertEqual(loaded_processor.tokenizer.vocab_size, tokenizer.vocab_size) + + # Verify image processors loaded correctly with their distinct sizes + self.assertEqual(loaded_processor.image_processor.size, {"height": 224, "width": 224}) + self.assertEqual(loaded_processor.encoder_image_processor.size, {"height": 384, "width": 384}) + + # Verify they are different types + self.assertIsInstance(loaded_processor.image_processor, SiglipImageProcessorFast) + self.assertIsInstance(loaded_processor.encoder_image_processor, CLIPImageProcessorFast) + @is_staging_test class ProcessorPushToHubTester(unittest.TestCase): From f3bd01c9156559ce829a7ba3a7e14c82cecaa985 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Fri, 5 Dec 2025 21:41:06 +0000 Subject: [PATCH 2/7] standardize all processors --- .../models/audioflamingo3/processing_audioflamingo3.py | 4 ---- src/transformers/models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/processing_auto.py | 2 ++ src/transformers/models/auto/tokenization_auto.py | 3 +++ .../models/phi4_multimodal/processing_phi4_multimodal.py | 2 -- src/transformers/models/pix2struct/processing_pix2struct.py | 4 ---- 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/audioflamingo3/processing_audioflamingo3.py b/src/transformers/models/audioflamingo3/processing_audioflamingo3.py index bc14f0d6cde4..b53dcd165464 100644 --- a/src/transformers/models/audioflamingo3/processing_audioflamingo3.py +++ b/src/transformers/models/audioflamingo3/processing_audioflamingo3.py @@ -74,10 +74,6 @@ class AudioFlamingo3Processor(ProcessorMixin): Special token used to represent audio inputs in the chat template. """ - attributes = ["feature_extractor", "tokenizer"] - feature_extractor_class = "WhisperFeatureExtractor" - tokenizer_class = "Qwen2TokenizerFast" - def __init__( self, feature_extractor, diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index a9008af06ab6..6963447b5b6f 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -38,6 +38,7 @@ FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( [ ("audio-spectrogram-transformer", "ASTFeatureExtractor"), + ("audioflamingo3", "WhisperFeatureExtractor"), ("clap", "ClapFeatureExtractor"), ("clvp", "ClvpFeatureExtractor"), ("csm", "EncodecFeatureExtractor"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 6d08bf37ebab..88dde801bba3 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -93,6 +93,8 @@ ("kosmos-2", "Kosmos2Processor"), ("kosmos-2.5", "Kosmos2_5Processor"), ("kyutai_speech_to_text", "KyutaiSpeechToTextProcessor"), + ("lasr_ctc", "LasrProcessor"), + ("lasr_encoder", "LasrProcessor"), ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), ("layoutxlm", "LayoutXLMProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 31c6a783726b..bf4de43e30df 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -70,6 +70,7 @@ ("align", "BertTokenizer" if is_tokenizers_available() else None), ("arcee", "LlamaTokenizerFast" if is_tokenizers_available() else None), ("aria", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("audioflamingo3", "Qwen2TokenizerFast" if is_tokenizers_available() else None), ("aya_vision", "CohereTokenizer" if is_tokenizers_available() else None), ("bark", "BertTokenizer" if is_tokenizers_available() else None), ("bart", "RobertaTokenizer" if is_tokenizers_available() else None), @@ -183,6 +184,8 @@ ("jetmoe", "LlamaTokenizerFast" if is_tokenizers_available() else None), ("kosmos-2", "XLMRobertaTokenizer" if is_tokenizers_available() else None), ("kosmos-2.5", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("lasr_ctc", "ParakeetTokenizerFast" if is_tokenizers_available() else None), + ("lasr_encoder", "ParakeetTokenizerFast" if is_tokenizers_available() else None), ("layoutlm", "BertTokenizer" if is_tokenizers_available() else None), ("layoutlmv2", "LayoutLMv2Tokenizer" if is_tokenizers_available() else None), ("layoutlmv3", "LayoutLMv3Tokenizer" if is_tokenizers_available() else None), diff --git a/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py index 8eec69b0448e..cde089821878 100644 --- a/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py @@ -58,8 +58,6 @@ class Phi4MultimodalProcessor(ProcessorMixin): The fake audio token pattern. """ - audio_processor_class = "Phi4MultimodalFeatureExtractor" - def __init__( self, image_processor, diff --git a/src/transformers/models/pix2struct/processing_pix2struct.py b/src/transformers/models/pix2struct/processing_pix2struct.py index 1fe236339a7c..3ce09bf9d7fc 100644 --- a/src/transformers/models/pix2struct/processing_pix2struct.py +++ b/src/transformers/models/pix2struct/processing_pix2struct.py @@ -61,10 +61,6 @@ class Pix2StructProcessor(ProcessorMixin): An instance of ['T5Tokenizer`]. The tokenizer is a required input. """ - attributes = ["image_processor", "tokenizer"] - image_processor_class = "Pix2StructImageProcessor" - tokenizer_class = ("T5Tokenizer",) - def __init__(self, image_processor, tokenizer): tokenizer.return_token_type_ids = False super().__init__(image_processor, tokenizer) From c84b5642dace2eeb70e08c15e6eaa74dda492154 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Fri, 5 Dec 2025 21:46:36 +0000 Subject: [PATCH 3/7] remove tokenizer_class from lasr --- src/transformers/models/lasr/processing_lasr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/lasr/processing_lasr.py b/src/transformers/models/lasr/processing_lasr.py index 3396986866e2..7a4661c6a6ce 100644 --- a/src/transformers/models/lasr/processing_lasr.py +++ b/src/transformers/models/lasr/processing_lasr.py @@ -47,8 +47,6 @@ class LasrProcessorKwargs(ProcessingKwargs, total=False): class LasrProcessor(ProcessorMixin): - tokenizer_class = "ParakeetTokenizerFast" - def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) From abd038d1886b7f759f593ece420a0df630bbde9c Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Fri, 5 Dec 2025 21:51:41 +0000 Subject: [PATCH 4/7] fix modular --- src/transformers/models/lasr/modular_lasr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/lasr/modular_lasr.py b/src/transformers/models/lasr/modular_lasr.py index c02b2ae0f1c3..75170f0009a5 100644 --- a/src/transformers/models/lasr/modular_lasr.py +++ b/src/transformers/models/lasr/modular_lasr.py @@ -97,7 +97,7 @@ def _decode( class LasrProcessor(ParakeetProcessor): - tokenizer_class = "ParakeetTokenizerFast" + pass class LasrEncoderConfig(ParakeetEncoderConfig): From 855627931f289da1a4b9d67dbc580fe28ebc0c42 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Fri, 5 Dec 2025 22:20:08 +0000 Subject: [PATCH 5/7] refactor + check init of parent classes --- src/transformers/processing_utils.py | 124 +++++++++++++++++---------- tests/test_processing_common.py | 16 ++-- 2 files changed, 87 insertions(+), 53 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 85eb5cdd3f9d..097d019051a6 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -119,9 +119,9 @@ def keys(self): return self._MAPPING_NAMES.keys() -MODALITY_TO_AUTOPROCESSOR_MAPPING = _LazyAutoProcessorMapping() +SUBPROCESSOR_TO_AUTO_CLASS_MAPPING = _LazyAutoProcessorMapping() -MODALITY_TO_BASE_CLASS_MAPPING = { +SUBPROCESSOR_TO_BASE_CLASS_MAPPING = { "audio_tokenizer": "DacModel", "audio_processor": "FeatureExtractionMixin", "tokenizer": ("PreTrainedTokenizerBase", "MistralCommonBackend"), @@ -131,9 +131,9 @@ def keys(self): } -def _get_modality_for_attribute(attribute_name: str) -> str: +def _get_subprocessor_type(attribute_name: str) -> str: """ - Get the canonical modality type for a given attribute name. + Get the canonical sub-processor type for a given attribute name. For example: - "image_processor" -> "image_processor" @@ -141,12 +141,13 @@ def _get_modality_for_attribute(attribute_name: str) -> str: - "text_tokenizer" -> "tokenizer" - "my_feature_extractor" -> "feature_extractor" """ - for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys(): - if modality in attribute_name: - return modality + subprocessor_types = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING.keys() + for subprocessor_type in subprocessor_types: + if subprocessor_type in attribute_name: + return subprocessor_type raise ValueError( - f"Cannot determine modality for attribute '{attribute_name}'. " - f"Attribute name must contain one of: {list(MODALITY_TO_AUTOPROCESSOR_MAPPING.keys())}" + f"Cannot determine sub-processor type for attribute '{attribute_name}'. " + f"Attribute name must contain one of: {list(subprocessor_types)}" ) @@ -684,11 +685,11 @@ def check_argument_for_proper_class(self, argument_name, argument): mismatch between expected and actual class, an error is raise. Otherwise, the proper retrieved class is returned. """ - # If the exact attribute name is not in the mapping, use its canonical modality + # If the exact attribute name is not in the mapping, use its canonical sub-processor type # (e.g., "encoder_tokenizer" -> "tokenizer") - if argument_name not in MODALITY_TO_BASE_CLASS_MAPPING: - argument_name = _get_modality_for_attribute(argument_name) - class_name = MODALITY_TO_BASE_CLASS_MAPPING.get(argument_name) + if argument_name not in SUBPROCESSOR_TO_BASE_CLASS_MAPPING: + argument_name = _get_subprocessor_type(argument_name) + class_name = SUBPROCESSOR_TO_BASE_CLASS_MAPPING.get(argument_name) if isinstance(class_name, tuple): proper_class = tuple(self.get_possibly_dynamic_module(n) for n in class_name if n is not None) else: @@ -722,8 +723,8 @@ def to_dict(self) -> dict[str, Any]: # All other sub-processors (image_processor, feature_extractor, etc.) are kept in processor_config.json. for attribute in self.__class__.get_attributes(): if attribute in output: - modality = _get_modality_for_attribute(attribute) - if modality == "tokenizer": + subprocessor_type = _get_subprocessor_type(attribute) + if subprocessor_type == "tokenizer": del output[attribute] if "chat_template" in output: @@ -846,9 +847,9 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): if hasattr(attribute, "_set_processor_class"): attribute._set_processor_class(self.__class__.__name__) - modality = _get_modality_for_attribute(attribute_name) - is_primary = attribute_name == modality - if modality == "tokenizer": + subprocessor_type = _get_subprocessor_type(attribute_name) + is_primary = attribute_name == subprocessor_type + if subprocessor_type == "tokenizer": # Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json` if is_primary: attribute.save_pretrained(save_directory) @@ -1429,30 +1430,63 @@ def from_pretrained( @classmethod def get_attributes(cls): - args_in_init = inspect.signature(cls.__init__).parameters.keys() + """ + Detect the sub-processor attributes for this processor class. + + Detection priority: + 1. Auto-detection from `__init__` signature parameters across the full class hierarchy (MRO) + 2. `_class` class attributes (legacy pattern, checks full class hierarchy) + + Returns: + List of attribute names corresponding to sub-processors. + """ + subprocessor_types = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING.keys() + + # Priority 1: Auto-detect from __init__ signatures across the full MRO + # This handles inheritance where child classes use *args/**kwargs attributes = [] - for sub_processor_type in args_in_init: - # don't treat audio_tokenizer as an attribute - if sub_processor_type == "audio_tokenizer": + seen_params = set() + for base_class in cls.__mro__: + if not hasattr(base_class, "__init__"): continue - if any(modality in sub_processor_type for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys()): - attributes.append(sub_processor_type) + sig = inspect.signature(base_class.__init__) + for param_name, param in sig.parameters.items(): + # Skip self, *args, **kwargs + if param_name in ("self",) or param.kind in ( + inspect.Parameter.VAR_POSITIONAL, + inspect.Parameter.VAR_KEYWORD, + ): + continue + if param_name in seen_params or param_name == "audio_tokenizer": + continue + seen_params.add(param_name) + if any(sp_type in param_name for sp_type in subprocessor_types): + attributes.append(param_name) + if attributes: + return attributes + + # Priority 2: Check for _class attributes in the full class hierarchy # Legacy processors may not override `__init__` and instead expose modality # attributes via `_class`. In that case, `args_in_init` only exposes # `*args`/`**kwargs`, so we need to infer the attributes from those class-level # hints to keep backward compatibility (e.g. dynamic processors stored on the Hub). - if not attributes: - for attribute_name, value in cls.__dict__.items(): - if value is None or attribute_name == "audio_tokenizer_class" or not attribute_name.endswith("_class"): + attributes_from_class_hints = [] + for base_class in cls.__mro__: + for attribute_name in base_class.__dict__: + if not attribute_name.endswith("_class") or attribute_name == "audio_tokenizer_class": + continue + value = getattr(base_class, attribute_name, None) + if value is None: continue inferred_attribute = attribute_name[: -len("_class")] if inferred_attribute == "audio_tokenizer": continue - if any(modality in inferred_attribute for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys()): - attributes.append(inferred_attribute) + if any(sp_type in inferred_attribute for sp_type in subprocessor_types): + if inferred_attribute not in attributes_from_class_hints: + attributes_from_class_hints.append(inferred_attribute) - return attributes + return attributes_from_class_hints @classmethod def register_for_auto_class(cls, auto_class="AutoProcessor"): @@ -1481,8 +1515,8 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor """ Identify and instantiate the subcomponents of Processor classes, such as image processors, tokenizers, and feature extractors. This method inspects the processor's `__init__` signature to identify parameters - that correspond to known modality types (image_processor, tokenizer, feature_extractor, etc.) or contain - modality names in their attribute name. + that correspond to known sub-processor types (image_processor, tokenizer, feature_extractor, etc.) or + contain sub-processor type names in their attribute name. For tokenizers: Uses the appropriate Auto class (AutoTokenizer) to load via `.from_pretrained()`. Additional tokenizers (e.g., "decoder_tokenizer") are loaded from subfolders. @@ -1501,35 +1535,35 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor # get args from processor init signature sub_processors = cls.get_attributes() - for sub_processor_type in sub_processors: - modality = _get_modality_for_attribute(sub_processor_type) - is_primary = sub_processor_type == modality + for sub_processor_name in sub_processors: + subprocessor_type = _get_subprocessor_type(sub_processor_name) + is_primary = sub_processor_name == subprocessor_type if is_primary: # Primary non-tokenizer sub-processor: load via Auto class - auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type] + auto_processor_class = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING[sub_processor_name] sub_processor = auto_processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs) args.append(sub_processor) - elif "tokenizer" in sub_processor_type: + elif "tokenizer" in sub_processor_name: # Special case: tokenizer-like parameters not in the mapping (e.g., "protein_tokenizer") # Load using AutoTokenizer with subfolder - auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"] + auto_processor_class = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING["tokenizer"] sub_processor = auto_processor_class.from_pretrained( - pretrained_model_name_or_path, subfolder=sub_processor_type, **kwargs + pretrained_model_name_or_path, subfolder=sub_processor_name, **kwargs ) args.append(sub_processor) - elif sub_processor_type in processor_dict: + elif sub_processor_name in processor_dict: # Additional non-tokenizer sub-processor: instantiate from config in processor_dict - sub_processor_config = processor_dict[sub_processor_type] + sub_processor_config = processor_dict[sub_processor_name] if isinstance(sub_processor_config, dict): # Determine the class to instantiate # Image processors have 'image_processor_type', feature extractors have 'feature_extractor_type' - type_key = f"{modality}_type" + type_key = f"{subprocessor_type}_type" class_name = sub_processor_config.get(type_key) if class_name is None: raise ValueError( - f"Cannot instantiate {sub_processor_type}: missing '{type_key}' in config. " + f"Cannot instantiate {sub_processor_name}: missing '{type_key}' in config. " f"Config keys: {list(sub_processor_config.keys())}" ) processor_class = cls.get_possibly_dynamic_module(class_name) @@ -1537,12 +1571,12 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor args.append(sub_processor) else: raise ValueError( - f"Expected dict for {sub_processor_type} in processor_config.json, " + f"Expected dict for {sub_processor_name} in processor_config.json, " f"got {type(sub_processor_config)}" ) else: raise ValueError( - f"Cannot find config for {sub_processor_type} in processor_config.json. " + f"Cannot find config for {sub_processor_name} in processor_config.json. " f"Available keys: {list(processor_dict.keys())}" ) diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 9e512f982049..d50c8e21c0c3 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -27,7 +27,7 @@ from parameterized import parameterized from transformers.processing_utils import ( - MODALITY_TO_AUTOPROCESSOR_MAPPING, + SUBPROCESSOR_TO_AUTO_CLASS_MAPPING, Unpack, ) from transformers.testing_utils import ( @@ -264,7 +264,7 @@ def _get_component_class_from_processor(cls, attribute, use_fast: bool = True): config_class = CONFIG_MAPPING[model_type] # Now get the component class from the appropriate Auto mapping - if attribute in MODALITY_TO_AUTOPROCESSOR_MAPPING: + if attribute in SUBPROCESSOR_TO_AUTO_CLASS_MAPPING: mapping_name = attribute elif "tokenizer" in attribute: mapping_name = "tokenizer" @@ -321,11 +321,11 @@ def prepare_processor_dict(): return {} def get_component(self, attribute, **kwargs): - if attribute not in MODALITY_TO_AUTOPROCESSOR_MAPPING and "tokenizer" in attribute: - auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"] + if attribute not in SUBPROCESSOR_TO_AUTO_CLASS_MAPPING and "tokenizer" in attribute: + auto_processor_class = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING["tokenizer"] component = auto_processor_class.from_pretrained(self.tmpdirname, subfolder=attribute, **kwargs) # noqa else: - auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute] + auto_processor_class = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING[attribute] component = auto_processor_class.from_pretrained(self.tmpdirname, **kwargs) # noqa if "tokenizer" in attribute and not component.pad_token: component.pad_token = "[TEST_PAD]" @@ -443,11 +443,11 @@ def test_processor_from_and_save_pretrained_as_nested_dict(self): # Try to load each attribute separately from saved directory for attribute in processor_first.get_attributes(): - if attribute not in MODALITY_TO_AUTOPROCESSOR_MAPPING and "tokenizer" in attribute: - auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"] + if attribute not in SUBPROCESSOR_TO_AUTO_CLASS_MAPPING and "tokenizer" in attribute: + auto_processor_class = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING["tokenizer"] attribute_reloaded = auto_processor_class.from_pretrained(tmpdirname, subfolder=attribute) else: - auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute] + auto_processor_class = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING[attribute] attribute_reloaded = auto_processor_class.from_pretrained(tmpdirname) attribute_first = getattr(processor_first, attribute) From 114a48bf658262b37aa2911cfacf0d71107547ac Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Fri, 5 Dec 2025 23:01:02 +0000 Subject: [PATCH 6/7] fix kwargs logic --- src/transformers/processing_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 85eb5cdd3f9d..d42aa05bd4c9 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -1423,9 +1423,9 @@ def from_pretrained( kwargs["token"] = token # Get processor_dict first so we can use it to instantiate non-tokenizer sub-processors - processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs) + processor_dict, instantiation_kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs) args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, processor_dict, **kwargs) - return cls.from_args_and_dict(args, processor_dict, **kwargs) + return cls.from_args_and_dict(args, processor_dict, **instantiation_kwargs) @classmethod def get_attributes(cls): @@ -1498,6 +1498,8 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor """ args = [] processor_dict = processor_dict if processor_dict is not None else {} + # Remove subfolder from kwargs to avoid duplicate keyword arguments + subfolder = kwargs.pop("subfolder", "") # get args from processor init signature sub_processors = cls.get_attributes() @@ -1508,14 +1510,17 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor if is_primary: # Primary non-tokenizer sub-processor: load via Auto class auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type] - sub_processor = auto_processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs) + sub_processor = auto_processor_class.from_pretrained( + pretrained_model_name_or_path, subfolder=subfolder, **kwargs + ) args.append(sub_processor) elif "tokenizer" in sub_processor_type: # Special case: tokenizer-like parameters not in the mapping (e.g., "protein_tokenizer") # Load using AutoTokenizer with subfolder auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"] + tokenizer_subfolder = os.path.join(subfolder, sub_processor_type) if subfolder else sub_processor_type sub_processor = auto_processor_class.from_pretrained( - pretrained_model_name_or_path, subfolder=sub_processor_type, **kwargs + pretrained_model_name_or_path, subfolder=tokenizer_subfolder, **kwargs ) args.append(sub_processor) From 69041020cc72b807b4a3794ff60f455d66f9c0f9 Mon Sep 17 00:00:00 2001 From: yonigozlan Date: Fri, 5 Dec 2025 23:31:14 +0000 Subject: [PATCH 7/7] add test --- tests/models/auto/test_processor_auto.py | 50 ++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py index 4e618ea0f9b5..ae197961b199 100644 --- a/tests/models/auto/test_processor_auto.py +++ b/tests/models/auto/test_processor_auto.py @@ -544,6 +544,56 @@ def __init__(self, tokenizer, image_processor, encoder_image_processor): self.assertIsInstance(loaded_processor.image_processor, SiglipImageProcessorFast) self.assertIsInstance(loaded_processor.encoder_image_processor, CLIPImageProcessorFast) + def test_processor_inheritance_correctly_detects_subprocessors(self): + """Test that sub-processor detection works correctly with inheritance. + + Verifies that get_attributes() detects sub-processors from both parent and child classes + when the child class uses *args/**kwargs. + """ + + class BaseMultimodalProcessor(ProcessorMixin): + def __init__(self, tokenizer, image_processor): + super().__init__(tokenizer, image_processor) + + class ExtendedMultimodalProcessor(BaseMultimodalProcessor): + def __init__(self, feature_extractor, *args, **kwargs): + ProcessorMixin.__init__(self, feature_extractor, *args, **kwargs) + + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM") + image_processor = SiglipImageProcessor() + feature_extractor = Wav2Vec2FeatureExtractor() + + attributes = ExtendedMultimodalProcessor.get_attributes() + self.assertIn("tokenizer", attributes) + self.assertIn("image_processor", attributes) + self.assertIn("feature_extractor", attributes) + self.assertEqual(len(attributes), 3) + + processor = ExtendedMultimodalProcessor( + feature_extractor=feature_extractor, + tokenizer=tokenizer, + image_processor=image_processor, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + processor.save_pretrained(tmp_dir) + + with open(os.path.join(tmp_dir, "processor_config.json")) as f: + processor_config = json.load(f) + self.assertIn("image_processor", processor_config) + self.assertIn("feature_extractor", processor_config) + self.assertNotIn("tokenizer", processor_config) + + loaded_processor = ExtendedMultimodalProcessor.from_pretrained(tmp_dir) + + self.assertTrue(hasattr(loaded_processor, "tokenizer")) + self.assertTrue(hasattr(loaded_processor, "image_processor")) + self.assertTrue(hasattr(loaded_processor, "feature_extractor")) + + self.assertIsInstance(loaded_processor.tokenizer, type(tokenizer)) + self.assertIsInstance(loaded_processor.image_processor, SiglipImageProcessor) + self.assertIsInstance(loaded_processor.feature_extractor, Wav2Vec2FeatureExtractor) + @is_staging_test class ProcessorPushToHubTester(unittest.TestCase):