@@ -74,10 +74,6 @@ class AudioFlamingo3Processor(ProcessorMixin):
Special token used to represent audio inputs in the chat template.
"""

attributes = ["feature_extractor", "tokenizer"]
feature_extractor_class = "WhisperFeatureExtractor"
tokenizer_class = "Qwen2TokenizerFast"

Comment on lines -77 to -80
Member (Author):
Cc @eustlb, moved this to the auto files to have one source of truth, and removed the class attributes since they are now auto-detected.
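As a quick illustration of the "one source of truth" point, loading through the Auto classes should now resolve AudioFlamingo3 checkpoints via the new mappings. This is a minimal sketch: the checkpoint path is hypothetical, and it assumes the checkpoint's own preprocessor/tokenizer configs don't override the type.

```python
from transformers import AutoFeatureExtractor, AutoTokenizer

# Hypothetical local checkpoint whose config declares model_type="audioflamingo3".
ckpt = "path/to/audioflamingo3-checkpoint"

feature_extractor = AutoFeatureExtractor.from_pretrained(ckpt)
tokenizer = AutoTokenizer.from_pretrained(ckpt)

# Per the auto mappings added in this PR, these should resolve to:
print(type(feature_extractor).__name__)  # WhisperFeatureExtractor
print(type(tokenizer).__name__)          # Qwen2TokenizerFast
```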

def __init__(
self,
feature_extractor,
1 change: 1 addition & 0 deletions src/transformers/models/auto/feature_extraction_auto.py
@@ -38,6 +38,7 @@
FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
[
("audio-spectrogram-transformer", "ASTFeatureExtractor"),
("audioflamingo3", "WhisperFeatureExtractor"),
("clap", "ClapFeatureExtractor"),
("clvp", "ClvpFeatureExtractor"),
("csm", "EncodecFeatureExtractor"),
2 changes: 2 additions & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -93,6 +93,8 @@
("kosmos-2", "Kosmos2Processor"),
("kosmos-2.5", "Kosmos2_5Processor"),
("kyutai_speech_to_text", "KyutaiSpeechToTextProcessor"),
("lasr_ctc", "LasrProcessor"),
("lasr_encoder", "LasrProcessor"),
("layoutlmv2", "LayoutLMv2Processor"),
("layoutlmv3", "LayoutLMv3Processor"),
("layoutxlm", "LayoutXLMProcessor"),
3 changes: 3 additions & 0 deletions src/transformers/models/auto/tokenization_auto.py
@@ -70,6 +70,7 @@
("align", "BertTokenizer" if is_tokenizers_available() else None),
("arcee", "LlamaTokenizerFast" if is_tokenizers_available() else None),
("aria", "LlamaTokenizerFast" if is_tokenizers_available() else None),
("audioflamingo3", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
("aya_vision", "CohereTokenizer" if is_tokenizers_available() else None),
("bark", "BertTokenizer" if is_tokenizers_available() else None),
("bart", "RobertaTokenizer" if is_tokenizers_available() else None),
@@ -183,6 +184,8 @@
("jetmoe", "LlamaTokenizerFast" if is_tokenizers_available() else None),
("kosmos-2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
("kosmos-2.5", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
("lasr_ctc", "ParakeetTokenizerFast" if is_tokenizers_available() else None),
("lasr_encoder", "ParakeetTokenizerFast" if is_tokenizers_available() else None),
("layoutlm", "BertTokenizer" if is_tokenizers_available() else None),
("layoutlmv2", "LayoutLMv2Tokenizer" if is_tokenizers_available() else None),
("layoutlmv3", "LayoutLMv3Tokenizer" if is_tokenizers_available() else None),
2 changes: 0 additions & 2 deletions src/transformers/models/lasr/processing_lasr.py
@@ -47,8 +47,6 @@ class LasrProcessorKwargs(ProcessingKwargs, total=False):


class LasrProcessor(ProcessorMixin):
tokenizer_class = "ParakeetTokenizerFast"
Member (Author):
Cc @eustlb, same here. Although, is this supposed to be the Parakeet or the LASR tokenizer by default?


def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)

@@ -58,8 +58,6 @@ class Phi4MultimodalProcessor(ProcessorMixin):
The fake audio token pattern.
"""

audio_processor_class = "Phi4MultimodalFeatureExtractor"

def __init__(
self,
image_processor,
4 changes: 0 additions & 4 deletions src/transformers/models/pix2struct/processing_pix2struct.py
@@ -61,10 +61,6 @@ class Pix2StructProcessor(ProcessorMixin):
An instance of ['T5Tokenizer`]. The tokenizer is a required input.
"""

attributes = ["image_processor", "tokenizer"]
image_processor_class = "Pix2StructImageProcessor"
tokenizer_class = ("T5Tokenizer",)

def __init__(self, image_processor, tokenizer):
tokenizer.return_token_type_ids = False
super().__init__(image_processor, tokenizer)
110 changes: 91 additions & 19 deletions src/transformers/processing_utils.py
@@ -130,6 +130,26 @@ def keys(self):
"video_processor": "BaseVideoProcessor",
}


def _get_modality_for_attribute(attribute_name: str) -> str:
"""
Get the canonical modality type for a given attribute name.

For example:
- "image_processor" -> "image_processor"
- "encoder_image_processor" -> "image_processor"
- "text_tokenizer" -> "tokenizer"
- "my_feature_extractor" -> "feature_extractor"
"""
for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys():
if modality in attribute_name:
return modality
raise ValueError(
f"Cannot determine modality for attribute '{attribute_name}'. "
f"Attribute name must contain one of: {list(MODALITY_TO_AUTOPROCESSOR_MAPPING.keys())}"
)
Comment on lines +133 to +149
Member:
IMO this oversimplifies things a lot. Users do not always name attributes following this pattern, and they may also want to use their own processing classes. There is a lot of inheritance and patching in custom code AFAIK, which can't be reduced to _get_modality_for_attribute, IMO.

Member (Author) @yonigozlan, Dec 8, 2025:
_get_modality_for_attribute is called on the "attribute" names obtained with cls.get_attributes(), which are already filtered down to the attributes corresponding to sub-processors. So I think this should be fine, unless we want users to be able to define sub-processors whose variable names don't contain the sub-processor type, but I don't know why we would want that, or how we would detect what kind of sub-processor they are in that case.

I guess attributes is really a misnomer here, and we should maybe rename it to subprocessors, but it was named this way before the refactor, so I didn't want to change it and break BC. It might be worth changing it though, as it's causing a lot of confusion.
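For illustration, a minimal standalone sketch of the resolution behaviour being discussed; the attribute names below are hypothetical, and the mapping keys are assumed to be the canonical modality names:

```python
# Assumed canonical modality names (keys of MODALITY_TO_AUTOPROCESSOR_MAPPING).
MODALITIES = ("image_processor", "video_processor", "feature_extractor", "tokenizer")

def get_modality(attribute_name: str) -> str:
    # Same substring rule as _get_modality_for_attribute in this PR.
    for modality in MODALITIES:
        if modality in attribute_name:
            return modality
    raise ValueError(f"Cannot determine modality for attribute '{attribute_name}'.")

# Hypothetical attribute names, as they might be returned by cls.get_attributes():
for name in ("tokenizer", "decoder_tokenizer", "image_processor", "encoder_image_processor"):
    print(f"{name} -> {get_modality(name)}")
# tokenizer -> tokenizer
# decoder_tokenizer -> tokenizer
# image_processor -> image_processor
# encoder_image_processor -> image_processor
```

A name containing no modality at all would raise here, but get_attributes() only returns attributes whose names contain a modality in the first place, which is why the two stay consistent.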

Member:
+1, the naming is indeed causing confusion. Probably I am thinking about too-niche cases, as I haven't personally seen many power users using a custom processor. I just realized that we're always strict with processor args/kwargs at initialization time 😄



if sys.version_info >= (3, 11):
Unpack = typing.Unpack
else:
@@ -664,8 +684,10 @@ def check_argument_for_proper_class(self, argument_name, argument):
mismatch between expected and actual class, an error is raise. Otherwise, the proper retrieved class
is returned.
"""
if argument_name not in MODALITY_TO_BASE_CLASS_MAPPING and "tokenizer" in argument_name:
argument_name = "tokenizer"
# If the exact attribute name is not in the mapping, use its canonical modality
# (e.g., "encoder_tokenizer" -> "tokenizer")
if argument_name not in MODALITY_TO_BASE_CLASS_MAPPING:
argument_name = _get_modality_for_attribute(argument_name)
class_name = MODALITY_TO_BASE_CLASS_MAPPING.get(argument_name)
if isinstance(class_name, tuple):
proper_class = tuple(self.get_possibly_dynamic_module(n) for n in class_name if n is not None)
@@ -696,9 +718,13 @@ def to_dict(self) -> dict[str, Any]:
# extra attributes to be kept
attrs_to_save += ["auto_map"]

# Remove tokenizers from output - they have their own vocab files and are saved separately.
# All other sub-processors (image_processor, feature_extractor, etc.) are kept in processor_config.json.
for attribute in self.__class__.get_attributes():
if "tokenizer" in attribute and attribute in output:
del output[attribute]
if attribute in output:
modality = _get_modality_for_attribute(attribute)
if modality == "tokenizer":
del output[attribute]

if "chat_template" in output:
del output["chat_template"]
@@ -820,13 +846,15 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)

# Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
if attribute_name == "tokenizer":
attribute.save_pretrained(save_directory)
# if a model has multiple tokenizers, save the additional tokenizers in their own folders.
# Note that the additional tokenizers must have "tokenizer" in their attribute name.
elif "tokenizer" in attribute_name:
attribute.save_pretrained(os.path.join(save_directory, attribute_name))
modality = _get_modality_for_attribute(attribute_name)
is_primary = attribute_name == modality
if modality == "tokenizer":
# Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
if is_primary:
attribute.save_pretrained(save_directory)
else:
# if a model has multiple tokenizers, save the additional tokenizers in their own folders.
attribute.save_pretrained(os.path.join(save_directory, attribute_name))
elif attribute._auto_class is not None:
custom_object_save(attribute, save_directory, config=attribute)

@@ -1394,8 +1422,9 @@ def from_pretrained(
if token is not None:
kwargs["token"] = token

args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
# Get processor_dict first so we can use it to instantiate non-tokenizer sub-processors
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, processor_dict, **kwargs)
return cls.from_args_and_dict(args, processor_dict, **kwargs)

@classmethod
@@ -1406,7 +1435,7 @@ def get_attributes(cls):
# don't treat audio_tokenizer as an attribute
if sub_processor_type == "audio_tokenizer":
continue
if sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING or "tokenizer" in sub_processor_type:
if any(modality in sub_processor_type for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys()):
attributes.append(sub_processor_type)

# Legacy processors may not override `__init__` and instead expose modality
Expand All @@ -1420,7 +1449,7 @@ def get_attributes(cls):
inferred_attribute = attribute_name[: -len("_class")]
if inferred_attribute == "audio_tokenizer":
continue
if inferred_attribute in MODALITY_TO_AUTOPROCESSOR_MAPPING or "tokenizer" in inferred_attribute:
if any(modality in inferred_attribute for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys()):
attributes.append(inferred_attribute)

return attributes
@@ -1448,20 +1477,36 @@ def register_for_auto_class(cls, auto_class="AutoProcessor"):
cls._auto_class = auto_class

@classmethod
def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor_dict=None, **kwargs):
"""
Identify and instantiate the subcomponents of Processor classes, such as image processors, tokenizers,
and feature extractors. This method inspects the processor's `__init__` signature to identify parameters
that correspond to known modality types (image_processor, tokenizer, feature_extractor, etc.) or contain
"tokenizer" in their name. It then uses the appropriate Auto class (AutoImageProcessor, AutoTokenizer, etc.)
from `MODALITY_TO_AUTOPROCESSOR_MAPPING` to load each subcomponent via `.from_pretrained()`. For tokenizer-like
parameters not explicitly in the mapping, the method uses AutoTokenizer with a subfolder argument.
modality names in their attribute name.

For tokenizers: Uses the appropriate Auto class (AutoTokenizer) to load via `.from_pretrained()`.
Additional tokenizers (e.g., "decoder_tokenizer") are loaded from subfolders.

For other sub-processors (image_processor, feature_extractor, etc.): Primary ones are loaded via
Auto class. Additional ones are instantiated from the config stored in processor_config.json
(passed as processor_dict).

Args:
pretrained_model_name_or_path: Path or model id to load from.
processor_dict: Optional dict containing processor config (from processor_config.json).
Required when loading additional non-tokenizer sub-processors.
"""
args = []
processor_dict = processor_dict if processor_dict is not None else {}

# get args from processor init signature
sub_processors = cls.get_attributes()
for sub_processor_type in sub_processors:
if sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING:
modality = _get_modality_for_attribute(sub_processor_type)
is_primary = sub_processor_type == modality

if is_primary:
# Primary non-tokenizer sub-processor: load via Auto class
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type]
sub_processor = auto_processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
args.append(sub_processor)
@@ -1474,6 +1519,33 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
)
args.append(sub_processor)

elif sub_processor_type in processor_dict:
# Additional non-tokenizer sub-processor: instantiate from config in processor_dict
sub_processor_config = processor_dict[sub_processor_type]
if isinstance(sub_processor_config, dict):
# Determine the class to instantiate
# Image processors have 'image_processor_type', feature extractors have 'feature_extractor_type'
type_key = f"{modality}_type"
class_name = sub_processor_config.get(type_key)
if class_name is None:
raise ValueError(
f"Cannot instantiate {sub_processor_type}: missing '{type_key}' in config. "
f"Config keys: {list(sub_processor_config.keys())}"
)
processor_class = cls.get_possibly_dynamic_module(class_name)
sub_processor = processor_class(**sub_processor_config)
args.append(sub_processor)
else:
raise ValueError(
f"Expected dict for {sub_processor_type} in processor_config.json, "
f"got {type(sub_processor_config)}"
)
else:
raise ValueError(
f"Cannot find config for {sub_processor_type} in processor_config.json. "
f"Available keys: {list(processor_dict.keys())}"
)
Comment on lines +1561 to +1565
Member:
also not sure it's a good idea to raise an error if the attribute has no config dict. One possible use case is when a processor has optional attributes that are not available on purpose (see #40447)

Member (Author):
Not sure I understood the use case in the linked issue :( Do you have an example? This code path would only be used for entries in processor_dict that correspond to a sub-processor.
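For context, a rough sketch of the kind of entry this branch expects in processor_config.json for an additional sub-processor; the attribute name and values here are made up:

```python
# Hypothetical excerpt of a loaded processor_dict (i.e. processor_config.json):
processor_dict = {
    "encoder_image_processor": {
        "image_processor_type": "CLIPImageProcessorFast",  # the "{modality}_type" key the code looks for
        "size": {"height": 384, "width": 384},
    },
}

# The new branch roughly does the following for a non-primary, non-tokenizer attribute:
sub_processor_config = processor_dict["encoder_image_processor"]
class_name = sub_processor_config.get("image_processor_type")  # -> "CLIPImageProcessorFast"
# ...then resolves the class by name and instantiates it from the stored config,
# raising only when the attribute has no entry or no "*_type" key at all.
```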

Member:
same as above, nvm. It's not breaking BC and I was thinking about a new feature which we don't have yet.


return args

@staticmethod
113 changes: 113 additions & 0 deletions tests/models/auto/test_processor_auto.py
@@ -35,13 +35,15 @@
AutoTokenizer,
BaseVideoProcessor,
BertTokenizer,
CLIPImageProcessorFast,
FeatureExtractionMixin,
ImageProcessingMixin,
LlamaTokenizer,
LlavaOnevisionVideoProcessor,
LlavaProcessor,
ProcessorMixin,
SiglipImageProcessor,
SiglipImageProcessorFast,
Wav2Vec2Config,
Wav2Vec2FeatureExtractor,
Wav2Vec2Processor,
@@ -431,6 +433,117 @@ def test_auto_processor_save_load(self):
second_processor = AutoProcessor.from_pretrained(tmp_dir)
self.assertEqual(second_processor.__class__.__name__, processor.__class__.__name__)

def test_processor_with_multiple_tokenizers_save_load(self):
"""Test that processors with multiple tokenizers save and load correctly."""

class DualTokenizerProcessor(ProcessorMixin):
"""A processor with two tokenizers and an image processor."""

def __init__(self, tokenizer, decoder_tokenizer, image_processor):
super().__init__(tokenizer, decoder_tokenizer, image_processor)

# Create processor with multiple tokenizers
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM")
decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
image_processor = SiglipImageProcessor()

processor = DualTokenizerProcessor(
tokenizer=tokenizer,
decoder_tokenizer=decoder_tokenizer,
image_processor=image_processor,
)

with tempfile.TemporaryDirectory() as tmp_dir:
processor.save_pretrained(tmp_dir)

# Verify directory structure: primary tokenizer in root, additional in subfolder
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "tokenizer_config.json")))
self.assertTrue(os.path.isdir(os.path.join(tmp_dir, "decoder_tokenizer")))
self.assertTrue(os.path.exists(os.path.join(tmp_dir, "decoder_tokenizer", "tokenizer_config.json")))

# Verify processor_config.json contains image_processor but not tokenizers
with open(os.path.join(tmp_dir, "processor_config.json")) as f:
processor_config = json.load(f)
self.assertIn("image_processor", processor_config)
self.assertNotIn("tokenizer", processor_config)
self.assertNotIn("decoder_tokenizer", processor_config)

# Reload the full processor and verify all attributes
loaded_processor = DualTokenizerProcessor.from_pretrained(tmp_dir)

# Verify the processor has all expected attributes
self.assertTrue(hasattr(loaded_processor, "tokenizer"))
self.assertTrue(hasattr(loaded_processor, "decoder_tokenizer"))
self.assertTrue(hasattr(loaded_processor, "image_processor"))

# Verify tokenizers loaded correctly
self.assertEqual(loaded_processor.tokenizer.vocab_size, tokenizer.vocab_size)
self.assertEqual(loaded_processor.decoder_tokenizer.vocab_size, decoder_tokenizer.vocab_size)

# Verify image processor loaded correctly
self.assertEqual(loaded_processor.image_processor.size, image_processor.size)

def test_processor_with_multiple_image_processors_save_load(self):
"""Test that processors with multiple image processors save and load correctly."""

class DualImageProcessorProcessor(ProcessorMixin):
"""A processor with two image processors and a tokenizer."""

def __init__(self, tokenizer, image_processor, encoder_image_processor):
super().__init__(tokenizer, image_processor, encoder_image_processor)

# Create processor with multiple image processors
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-BertForMaskedLM")
image_processor = SiglipImageProcessorFast(size={"height": 224, "width": 224})
encoder_image_processor = CLIPImageProcessorFast(size={"height": 384, "width": 384})

processor = DualImageProcessorProcessor(
tokenizer=tokenizer,
image_processor=image_processor,
encoder_image_processor=encoder_image_processor,
)

with tempfile.TemporaryDirectory() as tmp_dir:
processor.save_pretrained(tmp_dir)

# Verify processor_config.json contains both image processors
with open(os.path.join(tmp_dir, "processor_config.json")) as f:
processor_config = json.load(f)
self.assertIn("image_processor", processor_config)
self.assertIn("encoder_image_processor", processor_config)
self.assertNotIn("tokenizer", processor_config)

# Verify both image processors have the correct type key for instantiation
self.assertIn("image_processor_type", processor_config["image_processor"])
self.assertIn("image_processor_type", processor_config["encoder_image_processor"])
self.assertEqual(processor_config["image_processor"]["image_processor_type"], "SiglipImageProcessorFast")
self.assertEqual(
processor_config["encoder_image_processor"]["image_processor_type"], "CLIPImageProcessorFast"
)

# Verify the sizes are different (to ensure they're separate configs)
self.assertEqual(processor_config["image_processor"]["size"], {"height": 224, "width": 224})
self.assertEqual(processor_config["encoder_image_processor"]["size"], {"height": 384, "width": 384})

# Reload the full processor and verify all attributes
loaded_processor = DualImageProcessorProcessor.from_pretrained(tmp_dir)

# Verify the processor has all expected attributes
self.assertTrue(hasattr(loaded_processor, "tokenizer"))
self.assertTrue(hasattr(loaded_processor, "image_processor"))
self.assertTrue(hasattr(loaded_processor, "encoder_image_processor"))

# Verify tokenizer loaded correctly
self.assertEqual(loaded_processor.tokenizer.vocab_size, tokenizer.vocab_size)

# Verify image processors loaded correctly with their distinct sizes
self.assertEqual(loaded_processor.image_processor.size, {"height": 224, "width": 224})
self.assertEqual(loaded_processor.encoder_image_processor.size, {"height": 384, "width": 384})

# Verify they are different types
self.assertIsInstance(loaded_processor.image_processor, SiglipImageProcessorFast)
self.assertIsInstance(loaded_processor.encoder_image_processor, CLIPImageProcessorFast)


@is_staging_test
class ProcessorPushToHubTester(unittest.TestCase):