@@ -74,10 +74,6 @@ class AudioFlamingo3Processor(ProcessorMixin):
Special token used to represent audio inputs in the chat template.
"""

attributes = ["feature_extractor", "tokenizer"]
feature_extractor_class = "WhisperFeatureExtractor"
tokenizer_class = "Qwen2TokenizerFast"

def __init__(
self,
feature_extractor,
1 change: 1 addition & 0 deletions src/transformers/models/auto/feature_extraction_auto.py
@@ -38,6 +38,7 @@
FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
[
("audio-spectrogram-transformer", "ASTFeatureExtractor"),
("audioflamingo3", "WhisperFeatureExtractor"),
("clap", "ClapFeatureExtractor"),
("clvp", "ClvpFeatureExtractor"),
("csm", "EncodecFeatureExtractor"),
2 changes: 2 additions & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -93,6 +93,8 @@
("kosmos-2", "Kosmos2Processor"),
("kosmos-2.5", "Kosmos2_5Processor"),
("kyutai_speech_to_text", "KyutaiSpeechToTextProcessor"),
("lasr_ctc", "LasrProcessor"),
("lasr_encoder", "LasrProcessor"),
("layoutlmv2", "LayoutLMv2Processor"),
("layoutlmv3", "LayoutLMv3Processor"),
("layoutxlm", "LayoutXLMProcessor"),
3 changes: 3 additions & 0 deletions src/transformers/models/auto/tokenization_auto.py
@@ -70,6 +70,7 @@
("align", "BertTokenizer" if is_tokenizers_available() else None),
("arcee", "LlamaTokenizerFast" if is_tokenizers_available() else None),
("aria", "LlamaTokenizerFast" if is_tokenizers_available() else None),
("audioflamingo3", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
("aya_vision", "CohereTokenizer" if is_tokenizers_available() else None),
("bark", "BertTokenizer" if is_tokenizers_available() else None),
("bart", "RobertaTokenizer" if is_tokenizers_available() else None),
@@ -183,6 +184,8 @@
("jetmoe", "LlamaTokenizerFast" if is_tokenizers_available() else None),
("kosmos-2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
("kosmos-2.5", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
("lasr_ctc", "ParakeetTokenizerFast" if is_tokenizers_available() else None),
("lasr_encoder", "ParakeetTokenizerFast" if is_tokenizers_available() else None),
("layoutlm", "BertTokenizer" if is_tokenizers_available() else None),
("layoutlmv2", "LayoutLMv2Tokenizer" if is_tokenizers_available() else None),
("layoutlmv3", "LayoutLMv3Tokenizer" if is_tokenizers_available() else None),
2 changes: 1 addition & 1 deletion src/transformers/models/lasr/modular_lasr.py
@@ -97,7 +97,7 @@ def _decode(


class LasrProcessor(ParakeetProcessor):
tokenizer_class = "ParakeetTokenizerFast"
pass


class LasrEncoderConfig(ParakeetEncoderConfig):
2 changes: 0 additions & 2 deletions src/transformers/models/lasr/processing_lasr.py
@@ -47,8 +47,6 @@ class LasrProcessorKwargs(ProcessingKwargs, total=False):


class LasrProcessor(ProcessorMixin):
tokenizer_class = "ParakeetTokenizerFast"

def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)

@@ -58,8 +58,6 @@ class Phi4MultimodalProcessor(ProcessorMixin):
The fake audio token pattern.
"""

audio_processor_class = "Phi4MultimodalFeatureExtractor"

def __init__(
self,
image_processor,
4 changes: 0 additions & 4 deletions src/transformers/models/pix2struct/processing_pix2struct.py
@@ -61,10 +61,6 @@ class Pix2StructProcessor(ProcessorMixin):
An instance of [`T5Tokenizer`]. The tokenizer is a required input.
"""

attributes = ["image_processor", "tokenizer"]
image_processor_class = "Pix2StructImageProcessor"
tokenizer_class = ("T5Tokenizer",)

def __init__(self, image_processor, tokenizer):
tokenizer.return_token_type_ids = False
super().__init__(image_processor, tokenizer)
193 changes: 152 additions & 41 deletions src/transformers/processing_utils.py
@@ -119,9 +119,9 @@ def keys(self):
return self._MAPPING_NAMES.keys()


MODALITY_TO_AUTOPROCESSOR_MAPPING = _LazyAutoProcessorMapping()
SUBPROCESSOR_TO_AUTO_CLASS_MAPPING = _LazyAutoProcessorMapping()

MODALITY_TO_BASE_CLASS_MAPPING = {
SUBPROCESSOR_TO_BASE_CLASS_MAPPING = {
"audio_tokenizer": "DacModel",
"audio_processor": "FeatureExtractionMixin",
"tokenizer": ("PreTrainedTokenizerBase", "MistralCommonBackend"),
@@ -130,6 +130,27 @@ def keys(self):
"video_processor": "BaseVideoProcessor",
}


def _get_subprocessor_type(attribute_name: str) -> str:
"""
Get the canonical sub-processor type for a given attribute name.

For example:
- "image_processor" -> "image_processor"
- "encoder_image_processor" -> "image_processor"
- "text_tokenizer" -> "tokenizer"
- "my_feature_extractor" -> "feature_extractor"
"""
subprocessor_types = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING.keys()
for subprocessor_type in subprocessor_types:
if subprocessor_type in attribute_name:
return subprocessor_type
raise ValueError(
f"Cannot determine sub-processor type for attribute '{attribute_name}'. "
f"Attribute name must contain one of: {list(subprocessor_types)}"
)


if sys.version_info >= (3, 11):
Unpack = typing.Unpack
else:
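
As a rough usage sketch (not part of the diff, and assuming the helper stays a module-level function in `transformers.processing_utils` as shown above), the lookup is a substring match against the known sub-processor types, so prefixed attribute names resolve to their canonical type:

from transformers.processing_utils import _get_subprocessor_type

_get_subprocessor_type("image_processor")          # "image_processor"
_get_subprocessor_type("encoder_image_processor")  # "image_processor"
_get_subprocessor_type("protein_tokenizer")        # "tokenizer"
_get_subprocessor_type("audio_backbone")           # raises ValueError: no known sub-processor type in the name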
@@ -664,9 +685,11 @@ def check_argument_for_proper_class(self, argument_name, argument):
mismatch between expected and actual class, an error is raised. Otherwise, the proper retrieved class
is returned.
"""
if argument_name not in MODALITY_TO_BASE_CLASS_MAPPING and "tokenizer" in argument_name:
argument_name = "tokenizer"
class_name = MODALITY_TO_BASE_CLASS_MAPPING.get(argument_name)
# If the exact attribute name is not in the mapping, use its canonical sub-processor type
# (e.g., "encoder_tokenizer" -> "tokenizer")
if argument_name not in SUBPROCESSOR_TO_BASE_CLASS_MAPPING:
argument_name = _get_subprocessor_type(argument_name)
class_name = SUBPROCESSOR_TO_BASE_CLASS_MAPPING.get(argument_name)
if isinstance(class_name, tuple):
proper_class = tuple(self.get_possibly_dynamic_module(n) for n in class_name if n is not None)
else:
@@ -696,9 +719,13 @@ def to_dict(self) -> dict[str, Any]:
# extra attributes to be kept
attrs_to_save += ["auto_map"]

# Remove tokenizers from output - they have their own vocab files and are saved separately.
# All other sub-processors (image_processor, feature_extractor, etc.) are kept in processor_config.json.
for attribute in self.__class__.get_attributes():
if "tokenizer" in attribute and attribute in output:
del output[attribute]
if attribute in output:
subprocessor_type = _get_subprocessor_type(attribute)
if subprocessor_type == "tokenizer":
del output[attribute]

if "chat_template" in output:
del output["chat_template"]
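
A hedged sketch of the effect of that filtering (the attribute names and values below are hypothetical, reusing the `_get_subprocessor_type` helper from the earlier hunk): only attributes whose canonical type is "tokenizer" are stripped from the serialized dict, so they are persisted through their own vocab files instead of processor_config.json.

output = {
    "tokenizer": {"padding_side": "left"},
    "decoder_tokenizer": {"padding_side": "right"},
    "image_processor": {"size": 224},
}
for attribute in ["tokenizer", "decoder_tokenizer", "image_processor"]:
    if attribute in output and _get_subprocessor_type(attribute) == "tokenizer":
        del output[attribute]
# output now only contains "image_processor"; both tokenizers are saved separately.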
@@ -820,13 +847,15 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)

# Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
if attribute_name == "tokenizer":
attribute.save_pretrained(save_directory)
# if a model has multiple tokenizers, save the additional tokenizers in their own folders.
# Note that the additional tokenizers must have "tokenizer" in their attribute name.
elif "tokenizer" in attribute_name:
attribute.save_pretrained(os.path.join(save_directory, attribute_name))
subprocessor_type = _get_subprocessor_type(attribute_name)
is_primary = attribute_name == subprocessor_type
if subprocessor_type == "tokenizer":
# Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
if is_primary:
attribute.save_pretrained(save_directory)
else:
# if a model has multiple tokenizers, save the additional tokenizers in their own folders.
attribute.save_pretrained(os.path.join(save_directory, attribute_name))
elif attribute._auto_class is not None:
custom_object_save(attribute, save_directory, config=attribute)
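
For orientation, a hedged sketch of the resulting on-disk layout (the checkpoint id and attribute names are hypothetical): the primary tokenizer is written at the root of the save directory, each additional *_tokenizer attribute goes into a sub-folder named after it, and the remaining sub-processors end up in processor_config.json.

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("org/some-multimodal-checkpoint")  # hypothetical checkpoint
processor.save_pretrained("./saved_processor")
# ./saved_processor/
#     tokenizer_config.json, vocab files, ...   <- attribute named exactly "tokenizer"
#     decoder_tokenizer/                        <- any additional "*_tokenizer" attribute
#         tokenizer_config.json, ...
#     processor_config.json                     <- image processors, feature extractors, etc.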

@@ -1394,36 +1423,70 @@ def from_pretrained(
if token is not None:
kwargs["token"] = token

args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
return cls.from_args_and_dict(args, processor_dict, **kwargs)
# Get processor_dict first so we can use it to instantiate non-tokenizer sub-processors
processor_dict, instantiation_kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, processor_dict, **kwargs)
return cls.from_args_and_dict(args, processor_dict, **instantiation_kwargs)

@classmethod
def get_attributes(cls):
args_in_init = inspect.signature(cls.__init__).parameters.keys()
"""
Detect the sub-processor attributes for this processor class.

Detection priority:
1. Auto-detection from `__init__` signature parameters across the full class hierarchy (MRO)
2. `<subprocessor_type>_class` class attributes (legacy pattern, checks full class hierarchy)

Returns:
List of attribute names corresponding to sub-processors.
"""
subprocessor_types = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING.keys()

# Priority 1: Auto-detect from __init__ signatures across the full MRO
# This handles inheritance where child classes use *args/**kwargs
attributes = []
for sub_processor_type in args_in_init:
# don't treat audio_tokenizer as an attribute
if sub_processor_type == "audio_tokenizer":
seen_params = set()
for base_class in cls.__mro__:
if not hasattr(base_class, "__init__"):
continue
if sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING or "tokenizer" in sub_processor_type:
attributes.append(sub_processor_type)
sig = inspect.signature(base_class.__init__)
for param_name, param in sig.parameters.items():
# Skip self, *args, **kwargs
if param_name in ("self",) or param.kind in (
inspect.Parameter.VAR_POSITIONAL,
inspect.Parameter.VAR_KEYWORD,
):
continue
if param_name in seen_params or param_name == "audio_tokenizer":
continue
seen_params.add(param_name)
if any(sp_type in param_name for sp_type in subprocessor_types):
attributes.append(param_name)

if attributes:
return attributes

# Priority 2: Check for <subprocessor_type>_class attributes in the full class hierarchy
# Legacy processors may not override `__init__` and instead expose modality
attributes via `<attribute>_class`. In that case, the `__init__` signature only exposes
# `*args`/`**kwargs`, so we need to infer the attributes from those class-level
# hints to keep backward compatibility (e.g. dynamic processors stored on the Hub).
if not attributes:
for attribute_name, value in cls.__dict__.items():
if value is None or attribute_name == "audio_tokenizer_class" or not attribute_name.endswith("_class"):
attributes_from_class_hints = []
for base_class in cls.__mro__:
for attribute_name in base_class.__dict__:
if not attribute_name.endswith("_class") or attribute_name == "audio_tokenizer_class":
continue
value = getattr(base_class, attribute_name, None)
if value is None:
continue
inferred_attribute = attribute_name[: -len("_class")]
if inferred_attribute == "audio_tokenizer":
continue
if inferred_attribute in MODALITY_TO_AUTOPROCESSOR_MAPPING or "tokenizer" in inferred_attribute:
attributes.append(inferred_attribute)
if any(sp_type in inferred_attribute for sp_type in subprocessor_types):
if inferred_attribute not in attributes_from_class_hints:
attributes_from_class_hints.append(inferred_attribute)

return attributes
return attributes_from_class_hints

@classmethod
def register_for_auto_class(cls, auto_class="AutoProcessor"):
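
A minimal sketch of the two detection paths of `get_attributes` (the classes below are hypothetical and assume the method is available on `ProcessorMixin` as added in this hunk):

from transformers.processing_utils import ProcessorMixin

class ExplicitInitProcessor(ProcessorMixin):
    # Priority 1: sub-processors are read from the __init__ signature across the MRO.
    def __init__(self, image_processor, tokenizer, decoder_tokenizer=None):
        super().__init__(image_processor, tokenizer, decoder_tokenizer=decoder_tokenizer)

class LegacyHintsProcessor(ProcessorMixin):
    # Priority 2: no explicit __init__, so the *_class hints are inspected instead.
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

print(ExplicitInitProcessor.get_attributes())  # expected: ["image_processor", "tokenizer", "decoder_tokenizer"]
print(LegacyHintsProcessor.get_attributes())   # expected: ["image_processor", "tokenizer"]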
@@ -1448,32 +1511,80 @@ def register_for_auto_class(cls, auto_class="AutoProcessor"):
cls._auto_class = auto_class

@classmethod
def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor_dict=None, **kwargs):
"""
Identify and instantiate the subcomponents of Processor classes, such as image processors, tokenizers,
and feature extractors. This method inspects the processor's `__init__` signature to identify parameters
that correspond to known modality types (image_processor, tokenizer, feature_extractor, etc.) or contain
"tokenizer" in their name. It then uses the appropriate Auto class (AutoImageProcessor, AutoTokenizer, etc.)
from `MODALITY_TO_AUTOPROCESSOR_MAPPING` to load each subcomponent via `.from_pretrained()`. For tokenizer-like
parameters not explicitly in the mapping, the method uses AutoTokenizer with a subfolder argument.
that correspond to known sub-processor types (image_processor, tokenizer, feature_extractor, etc.) or
contain sub-processor type names in their attribute name.

For tokenizers: Uses the appropriate Auto class (AutoTokenizer) to load via `.from_pretrained()`.
Additional tokenizers (e.g., "decoder_tokenizer") are loaded from subfolders.

For other sub-processors (image_processor, feature_extractor, etc.): Primary ones are loaded via
Auto class. Additional ones are instantiated from the config stored in processor_config.json
(passed as processor_dict).

Args:
pretrained_model_name_or_path: Path or model id to load from.
processor_dict: Optional dict containing processor config (from processor_config.json).
Required when loading additional non-tokenizer sub-processors.
"""
args = []
processor_dict = processor_dict if processor_dict is not None else {}
# Remove subfolder from kwargs to avoid duplicate keyword arguments
subfolder = kwargs.pop("subfolder", "")

# get args from processor init signature
sub_processors = cls.get_attributes()
for sub_processor_type in sub_processors:
if sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING:
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type]
sub_processor = auto_processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
for sub_processor_name in sub_processors:
subprocessor_type = _get_subprocessor_type(sub_processor_name)
is_primary = sub_processor_name == subprocessor_type

if is_primary:
# Primary sub-processor (attribute name matches its canonical type): load via its Auto class
auto_processor_class = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING[sub_processor_name]
sub_processor = auto_processor_class.from_pretrained(
pretrained_model_name_or_path, subfolder=subfolder, **kwargs
)
args.append(sub_processor)
elif "tokenizer" in sub_processor_type:
elif "tokenizer" in sub_processor_name:
# Special case: tokenizer-like parameters not in the mapping (e.g., "protein_tokenizer")
# Load using AutoTokenizer with subfolder
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"]
auto_processor_class = SUBPROCESSOR_TO_AUTO_CLASS_MAPPING["tokenizer"]
tokenizer_subfolder = os.path.join(subfolder, sub_processor_name) if subfolder else sub_processor_name
sub_processor = auto_processor_class.from_pretrained(
pretrained_model_name_or_path, subfolder=sub_processor_type, **kwargs
pretrained_model_name_or_path, subfolder=tokenizer_subfolder, **kwargs
)
args.append(sub_processor)

elif sub_processor_name in processor_dict:
# Additional non-tokenizer sub-processor: instantiate from config in processor_dict
sub_processor_config = processor_dict[sub_processor_name]
if isinstance(sub_processor_config, dict):
# Determine the class to instantiate
# Image processors have 'image_processor_type', feature extractors have 'feature_extractor_type'
type_key = f"{subprocessor_type}_type"
class_name = sub_processor_config.get(type_key)
if class_name is None:
raise ValueError(
f"Cannot instantiate {sub_processor_name}: missing '{type_key}' in config. "
f"Config keys: {list(sub_processor_config.keys())}"
)
processor_class = cls.get_possibly_dynamic_module(class_name)
sub_processor = processor_class(**sub_processor_config)
args.append(sub_processor)
else:
raise ValueError(
f"Expected dict for {sub_processor_name} in processor_config.json, "
f"got {type(sub_processor_config)}"
)
else:
raise ValueError(
f"Cannot find config for {sub_processor_name} in processor_config.json. "
f"Available keys: {list(processor_dict.keys())}"
)

return args

@staticmethod
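
To make the last branch concrete (the config fragment below is hypothetical), an additional non-tokenizer sub-processor is rebuilt from its nested entry in processor_config.json, keyed by a `<subprocessor_type>_type` field that names the class to instantiate:

# Hypothetical processor_config.json content for an "encoder_feature_extractor" attribute.
processor_dict = {
    "encoder_feature_extractor": {
        "feature_extractor_type": "WhisperFeatureExtractor",
        "feature_size": 128,
        "sampling_rate": 16000,
    }
}
# The loop above then roughly does:
#     class_name = processor_dict["encoder_feature_extractor"]["feature_extractor_type"]
#     processor_class = cls.get_possibly_dynamic_module(class_name)   # resolves WhisperFeatureExtractor
#     sub_processor = processor_class(**processor_dict["encoder_feature_extractor"])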