@@ -74,10 +74,6 @@ class AudioFlamingo3Processor(ProcessorMixin):
Special token used to represent audio inputs in the chat template.
"""

attributes = ["feature_extractor", "tokenizer"]
feature_extractor_class = "WhisperFeatureExtractor"
tokenizer_class = "Qwen2TokenizerFast"

Comment on lines -77 to -80
Member Author


Cc @eustlb: moved these to the auto files to have a single source of truth, and removed the class attributes since they are now auto-detected.
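For anyone following along, a rough sketch of what the auto-detection relies on; this is a simplified illustration, not the exact code in processing_utils.py (the helper name and the modality tuple are made up):

import inspect

# Simplified: infer sub-processor attributes from the __init__ signature instead of a
# hardcoded `attributes` list; the real implementation also handles legacy `*_class`
# attributes and skips `audio_tokenizer`.
MODALITIES = ("image_processor", "feature_extractor", "tokenizer", "video_processor")

def sketch_get_attributes(processor_cls):
    params = inspect.signature(processor_cls.__init__).parameters
    return [name for name in params if name != "self" and any(m in name for m in MODALITIES)]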

def __init__(
self,
feature_extractor,
1 change: 1 addition & 0 deletions src/transformers/models/auto/feature_extraction_auto.py
@@ -38,6 +38,7 @@
FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict(
[
("audio-spectrogram-transformer", "ASTFeatureExtractor"),
("audioflamingo3", "WhisperFeatureExtractor"),
("clap", "ClapFeatureExtractor"),
("clvp", "ClvpFeatureExtractor"),
("csm", "EncodecFeatureExtractor"),
2 changes: 2 additions & 0 deletions src/transformers/models/auto/processing_auto.py
@@ -93,6 +93,8 @@
("kosmos-2", "Kosmos2Processor"),
("kosmos-2.5", "Kosmos2_5Processor"),
("kyutai_speech_to_text", "KyutaiSpeechToTextProcessor"),
("lasr_ctc", "LasrProcessor"),
("lasr_encoder", "LasrProcessor"),
("layoutlmv2", "LayoutLMv2Processor"),
("layoutlmv3", "LayoutLMv3Processor"),
("layoutxlm", "LayoutXLMProcessor"),
3 changes: 3 additions & 0 deletions src/transformers/models/auto/tokenization_auto.py
@@ -70,6 +70,7 @@
("align", "BertTokenizer" if is_tokenizers_available() else None),
("arcee", "LlamaTokenizerFast" if is_tokenizers_available() else None),
("aria", "LlamaTokenizerFast" if is_tokenizers_available() else None),
("audioflamingo3", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
("aya_vision", "CohereTokenizer" if is_tokenizers_available() else None),
("bark", "BertTokenizer" if is_tokenizers_available() else None),
("bart", "RobertaTokenizer" if is_tokenizers_available() else None),
@@ -183,6 +184,8 @@
("jetmoe", "LlamaTokenizerFast" if is_tokenizers_available() else None),
("kosmos-2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
("kosmos-2.5", "PreTrainedTokenizerFast" if is_tokenizers_available() else None),
("lasr_ctc", "ParakeetTokenizerFast" if is_tokenizers_available() else None),
("lasr_encoder", "ParakeetTokenizerFast" if is_tokenizers_available() else None),
("layoutlm", "BertTokenizer" if is_tokenizers_available() else None),
("layoutlmv2", "LayoutLMv2Tokenizer" if is_tokenizers_available() else None),
("layoutlmv3", "LayoutLMv3Tokenizer" if is_tokenizers_available() else None),
2 changes: 1 addition & 1 deletion src/transformers/models/lasr/modular_lasr.py
@@ -97,7 +97,7 @@ def _decode(


class LasrProcessor(ParakeetProcessor):
tokenizer_class = "ParakeetTokenizerFast"
pass


class LasrEncoderConfig(ParakeetEncoderConfig):
2 changes: 0 additions & 2 deletions src/transformers/models/lasr/processing_lasr.py
@@ -47,8 +47,6 @@ class LasrProcessorKwargs(ProcessingKwargs, total=False):


class LasrProcessor(ProcessorMixin):
tokenizer_class = "ParakeetTokenizerFast"
Member Author


Cc @eustlb: same here, although should this default to the Parakeet tokenizer or a LASR-specific one?
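For reference, with the new entries in tokenization_auto.py both lasr_ctc and lasr_encoder resolve to ParakeetTokenizerFast, so the current default is the Parakeet tokenizer; the repo id below is hypothetical:

from transformers import AutoTokenizer

# "lasr_ctc" and "lasr_encoder" both map to ParakeetTokenizerFast in TOKENIZER_MAPPING_NAMES,
# so AutoTokenizer should return that class for a LASR checkpoint.
tokenizer = AutoTokenizer.from_pretrained("some-org/lasr-ctc-checkpoint")  # hypothetical repo id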


def __init__(self, feature_extractor, tokenizer):
super().__init__(feature_extractor, tokenizer)

@@ -58,8 +58,6 @@ class Phi4MultimodalProcessor(ProcessorMixin):
The fake audio token pattern.
"""

audio_processor_class = "Phi4MultimodalFeatureExtractor"

def __init__(
self,
image_processor,
4 changes: 0 additions & 4 deletions src/transformers/models/pix2struct/processing_pix2struct.py
@@ -61,10 +61,6 @@ class Pix2StructProcessor(ProcessorMixin):
An instance of [`T5Tokenizer`]. The tokenizer is a required input.
"""

attributes = ["image_processor", "tokenizer"]
image_processor_class = "Pix2StructImageProcessor"
tokenizer_class = ("T5Tokenizer",)

Comment on lines -64 to -67
Member Author


Cc @itazap: just to make sure this is safe to remove, was it added back by mistake?
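If it is safe to remove, both sub-processors should still resolve through the existing auto mappings; using the public google/pix2struct-base checkpoint as an example (the expected classes in the comments assume the mappings are unchanged):

from transformers import AutoImageProcessor, AutoTokenizer

# Pix2Struct is already registered in the auto mappings, so the classes should be
# recoverable without the hardcoded attributes on the processor.
image_processor = AutoImageProcessor.from_pretrained("google/pix2struct-base")  # expected: Pix2StructImageProcessor
tokenizer = AutoTokenizer.from_pretrained("google/pix2struct-base")  # expected: T5TokenizerFast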

def __init__(self, image_processor, tokenizer):
tokenizer.return_token_type_ids = False
super().__init__(image_processor, tokenizer)
123 changes: 100 additions & 23 deletions src/transformers/processing_utils.py
@@ -130,6 +130,26 @@ def keys(self):
"video_processor": "BaseVideoProcessor",
}


def _get_modality_for_attribute(attribute_name: str) -> str:
"""
Get the canonical modality type for a given attribute name.

For example:
- "image_processor" -> "image_processor"
- "encoder_image_processor" -> "image_processor"
- "text_tokenizer" -> "tokenizer"
- "my_feature_extractor" -> "feature_extractor"
"""
for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys():
if modality in attribute_name:
return modality
raise ValueError(
f"Cannot determine modality for attribute '{attribute_name}'. "
f"Attribute name must contain one of: {list(MODALITY_TO_AUTOPROCESSOR_MAPPING.keys())}"
)


if sys.version_info >= (3, 11):
Unpack = typing.Unpack
else:
@@ -664,8 +684,10 @@ def check_argument_for_proper_class(self, argument_name, argument):
mismatch between expected and actual class, an error is raised. Otherwise, the proper retrieved class
is returned.
"""
if argument_name not in MODALITY_TO_BASE_CLASS_MAPPING and "tokenizer" in argument_name:
argument_name = "tokenizer"
# If the exact attribute name is not in the mapping, use its canonical modality
# (e.g., "encoder_tokenizer" -> "tokenizer")
if argument_name not in MODALITY_TO_BASE_CLASS_MAPPING:
argument_name = _get_modality_for_attribute(argument_name)
class_name = MODALITY_TO_BASE_CLASS_MAPPING.get(argument_name)
if isinstance(class_name, tuple):
proper_class = tuple(self.get_possibly_dynamic_module(n) for n in class_name if n is not None)
@@ -696,9 +718,13 @@ def to_dict(self) -> dict[str, Any]:
# extra attributes to be kept
attrs_to_save += ["auto_map"]

# Remove tokenizers from output - they have their own vocab files and are saved separately.
# All other sub-processors (image_processor, feature_extractor, etc.) are kept in processor_config.json.
for attribute in self.__class__.get_attributes():
if "tokenizer" in attribute and attribute in output:
del output[attribute]
if attribute in output:
modality = _get_modality_for_attribute(attribute)
if modality == "tokenizer":
del output[attribute]

if "chat_template" in output:
del output["chat_template"]
@@ -820,13 +846,15 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)

# Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
if attribute_name == "tokenizer":
attribute.save_pretrained(save_directory)
# if a model has multiple tokenizers, save the additional tokenizers in their own folders.
# Note that the additional tokenizers must have "tokenizer" in their attribute name.
elif "tokenizer" in attribute_name:
attribute.save_pretrained(os.path.join(save_directory, attribute_name))
modality = _get_modality_for_attribute(attribute_name)
is_primary = attribute_name == modality
if modality == "tokenizer":
# Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
if is_primary:
attribute.save_pretrained(save_directory)
else:
# if a model has multiple tokenizers, save the additional tokenizers in their own folders.
attribute.save_pretrained(os.path.join(save_directory, attribute_name))
elif attribute._auto_class is not None:
custom_object_save(attribute, save_directory, config=attribute)
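For a processor with a primary tokenizer plus an extra one (a made-up decoder_tokenizer attribute, hypothetical repo id), the resulting layout would roughly be:

from transformers import AutoProcessor

# Hypothetical checkpoint whose processor has a second tokenizer attribute named
# "decoder_tokenizer"; the repo id is made up, the point is only where files land.
processor = AutoProcessor.from_pretrained("some-org/model-with-two-tokenizers")
processor.save_pretrained("./saved")
# ./saved/tokenizer.json (etc.)             -> primary tokenizer at the root
# ./saved/decoder_tokenizer/tokenizer.json  -> additional tokenizer in its own folder
# ./saved/processor_config.json             -> remaining sub-processors serialized as config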

@@ -1394,9 +1422,10 @@ def from_pretrained(
if token is not None:
kwargs["token"] = token

args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
return cls.from_args_and_dict(args, processor_dict, **kwargs)
# Get processor_dict first so we can use it to instantiate non-tokenizer sub-processors
processor_dict, instantiation_kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, processor_dict, **kwargs)
return cls.from_args_and_dict(args, processor_dict, **instantiation_kwargs)

@classmethod
def get_attributes(cls):
@@ -1406,7 +1435,7 @@ def get_attributes(cls):
# don't treat audio_tokenizer as an attribute
if sub_processor_type == "audio_tokenizer":
continue
if sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING or "tokenizer" in sub_processor_type:
if any(modality in sub_processor_type for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys()):
attributes.append(sub_processor_type)

# Legacy processors may not override `__init__` and instead expose modality
@@ -1420,7 +1449,7 @@
inferred_attribute = attribute_name[: -len("_class")]
if inferred_attribute == "audio_tokenizer":
continue
if inferred_attribute in MODALITY_TO_AUTOPROCESSOR_MAPPING or "tokenizer" in inferred_attribute:
if any(modality in inferred_attribute for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys()):
attributes.append(inferred_attribute)

return attributes
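With the substring check, prefixed attribute names are picked up as well; a quick sketch with a made-up processor class (the expected output assumes this PR's behaviour):

from transformers import ProcessorMixin

# Made-up processor: "encoder_image_processor" contains "image_processor", so it is
# detected as an attribute alongside the usual ones.
class MyProcessor(ProcessorMixin):
    def __init__(self, image_processor, encoder_image_processor, tokenizer):
        super().__init__(image_processor, encoder_image_processor, tokenizer)

print(MyProcessor.get_attributes())
# expected: ['image_processor', 'encoder_image_processor', 'tokenizer']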
@@ -1448,32 +1477,80 @@ def register_for_auto_class(cls, auto_class="AutoProcessor"):
cls._auto_class = auto_class

@classmethod
def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor_dict=None, **kwargs):
"""
Identify and instantiate the subcomponents of Processor classes, such as image processors, tokenizers,
and feature extractors. This method inspects the processor's `__init__` signature to identify parameters
that correspond to known modality types (image_processor, tokenizer, feature_extractor, etc.) or contain
"tokenizer" in their name. It then uses the appropriate Auto class (AutoImageProcessor, AutoTokenizer, etc.)
from `MODALITY_TO_AUTOPROCESSOR_MAPPING` to load each subcomponent via `.from_pretrained()`. For tokenizer-like
parameters not explicitly in the mapping, the method uses AutoTokenizer with a subfolder argument.
modality names in their attribute name.

For tokenizers: Uses the appropriate Auto class (AutoTokenizer) to load via `.from_pretrained()`.
Additional tokenizers (e.g., "decoder_tokenizer") are loaded from subfolders.

For other sub-processors (image_processor, feature_extractor, etc.): Primary ones are loaded via
Auto class. Additional ones are instantiated from the config stored in processor_config.json
(passed as processor_dict).

Args:
pretrained_model_name_or_path: Path or model id to load from.
processor_dict: Optional dict containing processor config (from processor_config.json).
Required when loading additional non-tokenizer sub-processors.
"""
args = []
processor_dict = processor_dict if processor_dict is not None else {}
# Remove subfolder from kwargs to avoid duplicate keyword arguments
subfolder = kwargs.pop("subfolder", "")

# get args from processor init signature
sub_processors = cls.get_attributes()
for sub_processor_type in sub_processors:
if sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING:
modality = _get_modality_for_attribute(sub_processor_type)
is_primary = sub_processor_type == modality

if is_primary:
# Primary non-tokenizer sub-processor: load via Auto class
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type]
sub_processor = auto_processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
sub_processor = auto_processor_class.from_pretrained(
pretrained_model_name_or_path, subfolder=subfolder, **kwargs
)
args.append(sub_processor)
elif "tokenizer" in sub_processor_type:
# Special case: tokenizer-like parameters not in the mapping (e.g., "protein_tokenizer")
# Load using AutoTokenizer with subfolder
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"]
tokenizer_subfolder = os.path.join(subfolder, sub_processor_type) if subfolder else sub_processor_type
sub_processor = auto_processor_class.from_pretrained(
pretrained_model_name_or_path, subfolder=sub_processor_type, **kwargs
pretrained_model_name_or_path, subfolder=tokenizer_subfolder, **kwargs
)
args.append(sub_processor)

elif sub_processor_type in processor_dict:
# Additional non-tokenizer sub-processor: instantiate from config in processor_dict
sub_processor_config = processor_dict[sub_processor_type]
if isinstance(sub_processor_config, dict):
# Determine the class to instantiate
# Image processors have 'image_processor_type', feature extractors have 'feature_extractor_type'
type_key = f"{modality}_type"
class_name = sub_processor_config.get(type_key)
if class_name is None:
raise ValueError(
f"Cannot instantiate {sub_processor_type}: missing '{type_key}' in config. "
f"Config keys: {list(sub_processor_config.keys())}"
)
processor_class = cls.get_possibly_dynamic_module(class_name)
sub_processor = processor_class(**sub_processor_config)
args.append(sub_processor)
else:
raise ValueError(
f"Expected dict for {sub_processor_type} in processor_config.json, "
f"got {type(sub_processor_config)}"
)
else:
raise ValueError(
f"Cannot find config for {sub_processor_type} in processor_config.json. "
f"Available keys: {list(processor_dict.keys())}"
)

return args
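To illustrate the non-tokenizer branch above: an additional sub-processor is rebuilt from its config dict in processor_config.json, roughly like this (the attribute name, config values, and the hardcoded class are assumptions for the example):

from transformers import WhisperFeatureExtractor

# Assumed processor_config.json entry for an extra sub-processor such as
# "encoder_feature_extractor" (name and values are made up).
sub_processor_config = {"feature_extractor_type": "WhisperFeatureExtractor", "sampling_rate": 16000}

# The real code resolves the class dynamically via
# cls.get_possibly_dynamic_module(sub_processor_config["feature_extractor_type"]);
# the class is hardcoded here only for illustration.
sub_processor = WhisperFeatureExtractor(**sub_processor_config)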

@staticmethod