Commit 4bf1fb8
fixes
1 parent 76db8da commit 4bf1fb8

5 files changed: +235 additions, -161 deletions

src/transformers/cache_utils.py

Lines changed: 138 additions & 0 deletions
@@ -80,6 +80,34 @@ def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
         self.keys = self.keys.index_select(0, beam_idx.to(self.keys.device))
         self.values = self.values.index_select(0, beam_idx.to(self.values.device))
 
+    def align(
+        self,
+        new_seq_length: int,
+        copy_instructions: list[tuple[int, slice, slice]],
+    ) -> None:
+        """
+        Align this layer's cache based on copy instructions.
+
+        Args:
+            new_seq_length (`int`): The new sequence length for the aligned cache.
+            copy_instructions (`list[tuple[int, slice, slice]]`): List of (batch_idx, src_slice, dst_slice) tuples
+                specifying what to copy from the old cache to the new cache.
+        """
+        if not self.is_initialized:
+            return
+
+        B, H, _, D = self.keys.shape
+        new_keys = self.keys.new_zeros((B, H, new_seq_length, D))
+        new_values = self.values.new_zeros((B, H, new_seq_length, D))
+
+        # Execute the pre-calculated copy instructions
+        for i, src_slice, dst_slice in copy_instructions:
+            new_keys[i, :, dst_slice] = self.keys[i, :, src_slice]
+            new_values[i, :, dst_slice] = self.values[i, :, src_slice]
+
+        self.keys = new_keys
+        self.values = new_values
+
 
 class DynamicLayer(CacheLayerMixin):
     """
@@ -891,6 +919,90 @@ def __len__(self):
         # forward through all the layers
         return len(self.layers)
 
+    def align(
+        self,
+        new_ids: torch.LongTensor,
+        ids_in_cache: torch.LongTensor,
+        pad_token_id: int,
+        return_new_ids_in_cache: bool = False,
+    ):
+        """
+        Align the cache when input sequences change (e.g., when batching different sequences together).
+
+        Args:
+            new_ids (`torch.LongTensor`): The new input IDs after batching changes.
+            ids_in_cache (`torch.LongTensor`): The input IDs that were used to build the current cache.
+            pad_token_id (`int`): The padding token ID.
+            return_new_ids_in_cache (`bool`, *optional*, defaults to `False`): Whether to return the aligned input IDs.
+
+        Returns:
+            `None` if `return_new_ids_in_cache=False`, otherwise the aligned input IDs tensor.
+        """
+        # 1. Set up metadata (shape: [batch, num_heads, seq_length, head_dim])
+        # We access the first layer just to get shapes and device
+        if len(self.layers) == 0 or not self.layers[0].is_initialized:
+            raise ValueError("Cache is not initialized")
+
+        ref_layer = self.layers[0]
+        B, H, S_old, D = ref_layer.keys.shape
+        S_new = new_ids.shape[1] - 1  # the aligned cache holds every new token except the last one
+
+        # 2. Pre-compute what to copy for the whole batch once
+
+        # Find the first non-padding index per row (vectorized)
+        # Note: sum() assumes left-padding only.
+        old_start_indices = (ids_in_cache == pad_token_id).sum(dim=1)
+        new_start_indices = (new_ids == pad_token_id).sum(dim=1)
+
+        # Store the copy instructions here and apply them to every layer later
+        # Format: list of tuples (batch_idx, source_slice, dest_slice)
+        copy_instructions = []
+
+        # Loop over the batch (B) only once, not once per layer
+        for i in range(B):
+            # Identify the content without padding
+            # Plain Python index math here is cheap
+            o_start = old_start_indices[i].item()
+            n_start = new_start_indices[i].item()
+
+            # Get the actual token sequences (views, not copies)
+            # The comparison runs on the int64 ID tensors, which is cheap
+            trimmed_old = ids_in_cache[i, o_start:]
+            trimmed_new = new_ids[i, n_start:]
+
+            min_len = min(len(trimmed_old), len(trimmed_new))
+
+            # Compare only up to min_len
+            # Use .ne() (not equal) and find the first True instead of element-wise checks
+            if min_len == 0:
+                copy_len = 0
+            else:
+                # Find mismatches: (a != b)
+                mismatch = trimmed_old[:min_len].ne(trimmed_new[:min_len])
+                if not mismatch.any():
+                    copy_len = min_len
+                else:
+                    # argmax on the boolean mask gives the index of the first True
+                    copy_len = mismatch.int().argmax().item()
+
+            if copy_len > 0:
+                # Build the slice objects once instead of once per layer
+                src_slice = slice(o_start, o_start + copy_len)
+                # The reusable prefix is right-aligned in the new cache (-copy_len:)
+                dst_slice = slice(-copy_len, None)
+                copy_instructions.append((i, src_slice, dst_slice))
+
+        # 3. Apply the changes to every layer using the per-layer align method
+        for layer in self.layers:
+            layer.align(S_new, copy_instructions)
+
+        if return_new_ids_in_cache:
+            new_input_ids_in_cache = ids_in_cache.new_zeros((B, S_new))
+            # Apply the same copy instructions to the input IDs
+            for i, src_slice, dst_slice in copy_instructions:
+                new_input_ids_in_cache[i, dst_slice] = ids_in_cache[i, src_slice]
+            return new_input_ids_in_cache
+
 
 class DynamicCache(Cache):
     """
@@ -1277,6 +1389,32 @@ def batch_select_indices(self, indices: torch.Tensor):
         self.self_attention_cache.batch_select_indices(indices)
         self.cross_attention_cache.batch_select_indices(indices)
 
+    def align(
+        self,
+        new_ids: torch.LongTensor,
+        ids_in_cache: torch.LongTensor,
+        pad_token_id: int,
+        return_new_ids_in_cache: bool = False,
+    ):
+        """
+        Align the cache when input sequences change (e.g., when batching different sequences together).
+        Only the self-attention cache is realigned; the cross-attention cache follows the encoder sequence and is left unchanged.
+
+        Args:
+            new_ids (`torch.LongTensor`): The new input IDs after batching changes.
+            ids_in_cache (`torch.LongTensor`): The input IDs that were used to build the current cache.
+            pad_token_id (`int`): The padding token ID.
+            return_new_ids_in_cache (`bool`, *optional*, defaults to `False`): Whether to return the aligned input IDs.
+
+        Returns:
+            `None` if `return_new_ids_in_cache=False`, otherwise the aligned input IDs tensor.
+        """
+        if return_new_ids_in_cache:
+            aligned_ids = self.self_attention_cache.align(new_ids, ids_in_cache, pad_token_id, return_new_ids_in_cache)
+            return aligned_ids
+        else:
+            self.self_attention_cache.align(new_ids, ids_in_cache, pad_token_id, return_new_ids_in_cache)
+
     def get_max_cache_shape(self) -> int:
         """Returns the maximum sequence length (i.e. max capacity) of the cache object"""
         return self.self_attention_cache.get_max_cache_shape()
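As a usage sketch of the new API on this branch (hypothetical shapes and token IDs, not a test from the commit): align keeps, per row, only the prefix of cached entries whose IDs still match after re-batching, right-aligned into a freshly allocated buffer.

import torch
from transformers import DynamicCache

pad_token_id = 0
cache = DynamicCache()
# One decoder layer, batch 2, 1 head, 5 cached positions, head dim 4 (hypothetical)
cache.update(torch.randn(2, 1, 5, 4), torch.randn(2, 1, 5, 4), layer_idx=0)

# IDs the cache was built from, and the re-batched IDs (both left-padded)
ids_in_cache = torch.tensor([[0, 11, 12, 13, 14], [21, 22, 23, 24, 25]])
new_ids = torch.tensor([[0, 0, 11, 12, 13, 99], [21, 22, 23, 24, 25, 31]])

aligned_ids = cache.align(new_ids, ids_in_cache, pad_token_id, return_new_ids_in_cache=True)
print(aligned_ids)
# tensor([[ 0,  0, 11, 12, 13],   row 0: [11, 12, 13] survives, 14 diverged and is dropped
#         [21, 22, 23, 24, 25]])  row 1: the full prefix survives
# Uncopied slots are zero-filled by new_zeros (0 happens to be the pad ID here).

Internally this is the .ne() + argmax comparison from the code above: row 0's pad-stripped IDs [11, 12, 13, 14] first disagree with [11, 12, 13, 99] at index 3, so copy_len is 3 and only those three cache positions are copied, right-aligned, into the new buffer of every layer.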

src/transformers/generation/candidate_generator.py

Lines changed: 21 additions & 90 deletions
@@ -41,13 +41,16 @@
 class CandidateGenerator:
     """Abstract base class for all candidate generators that can be applied during assisted generation."""
 
-    def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, assistant_ids_in_cache: torch.LongTensor = None
+    ) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
         """
         Fetches the candidates to be tried for the current input.
 
         Args:
             input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                 Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+            assistant_ids_in_cache (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
 
         Return:
             `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be
@@ -248,6 +251,7 @@ def update_candidate_strategy(
         """
         # Handle backward compatibility: convert int to tensor
         if isinstance(num_matches, int):
+            assert input_ids.shape[0] == 1, "num_matches should be a tensor of shape (batch_size,) when batch_size > 1"
             num_matches = torch.tensor([num_matches], device=input_ids.device)
 
         batch_size = input_ids.shape[0]
@@ -332,13 +336,12 @@ def _update_past_and_masks(
         """Update past key values and attention masks for subsequent generation rounds."""
         has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None
         if has_past_key_values:
-            new_cache_size = input_ids.shape[-1] - 1 - remove_from_pkv
-            current_cache = self.assistant_kwargs["past_key_values"]
             if input_ids.shape[0] > 1:
-                self.assistant_kwargs["past_key_values"] = align_cache(
-                    current_cache, input_ids, assistant_ids_in_cache, self.generation_config.pad_token_id
+                self.assistant_kwargs["past_key_values"].align(
+                    input_ids, assistant_ids_in_cache, self.generation_config.pad_token_id
                 )
             else:
+                new_cache_size = input_ids.shape[-1] - 1 - remove_from_pkv
                 self.assistant_kwargs["past_key_values"].crop(new_cache_size - num_added_tokens)
             self.assistant_kwargs = _prepare_attention_mask(
                 self.assistant_kwargs,
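In the single-sequence branch the assistant cache is still just cropped back to the verified prefix; only the batched branch needs the new align. A minimal sketch of what crop does to a populated cache (hypothetical sizes; crop and get_seq_length are pre-existing Cache methods):

import torch
from transformers import DynamicCache

cache = DynamicCache()
cache.update(torch.randn(1, 1, 10, 4), torch.randn(1, 1, 10, 4), layer_idx=0)

cache.crop(7)                  # drop the trailing, unverified draft positions
print(cache.get_seq_length())  # 7

crop can only trim a suffix shared by the whole batch, which is why rows that keep different numbers of tokens have to go through align instead.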
@@ -561,7 +564,8 @@ def get_candidates(
         Args:
             input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                 Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
-
+            assistant_ids_in_cache (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the assistant vocabulary that are in the cache.
         Return:
             `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be
             assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length,
@@ -1116,13 +1120,17 @@ def __init__(
         if self.max_matching_ngram_size <= 0 or self.num_output_tokens <= 0:
             raise ValueError("Invalid max_matching_ngram_size or num_output_tokens")
 
-    def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, assistant_ids_in_cache: torch.LongTensor = None
+    ) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
         """
         Fetches the candidates to be tried for the current input.
 
         Args:
             input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                 Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+            assistant_ids_in_cache (`torch.LongTensor`, *optional*):
+                Assistant model input IDs that are already in the cache. Not used by prompt lookup decoding.
 
         Return:
             `torch.LongTensor` of shape `(num_candidates, candidate_length)`: The candidate sequences to be tried.
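Prompt lookup decoding itself is unchanged here: it only accepts the new argument for interface compatibility and ignores it, and it still targets single sequences. A usage sketch of its existing entry point (placeholder checkpoint, not from the commit):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The quick brown fox jumps over the", return_tensors="pt")
# Prompt lookup drafts candidate tokens from n-grams already present in the prompt
outputs = model.generate(**inputs, prompt_lookup_num_tokens=10, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))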
@@ -1277,12 +1285,16 @@ def __init__(
         self.assistant_early_exit = self.generation_config.assistant_early_exit
         self.generation_config.assistant_early_exit = None
 
-    def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, assistant_ids_in_cache: torch.LongTensor = None
+    ) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
         # Temporarily sets the number of hidden layers to the early exit value
         base_model = getattr(self.assistant_model, self.assistant_model.base_model_prefix)
         original_num_hidden_layers = base_model.config.num_hidden_layers
         base_model.config.num_hidden_layers = self.assistant_early_exit
-        candidate_ids, candidate_logits = super().get_candidates(input_ids)
+        candidate_ids, candidate_logits = super().get_candidates(
+            input_ids, assistant_ids_in_cache=assistant_ids_in_cache
+        )
         base_model.config.num_hidden_layers = original_num_hidden_layers
         return candidate_ids, candidate_logits
 
@@ -1345,84 +1357,3 @@ def _prepare_token_type_ids(model_kwargs: dict[str, Any], new_length: int) -> di
         token_type_copies = final_token_type.repeat(1, type_length_diff)
         model_kwargs["token_type_ids"] = torch.cat([model_kwargs["token_type_ids"], token_type_copies], dim=-1)
     return model_kwargs
-
-
-def align_cache(cache, new_ids, assistant_ids_in_cache, pad_token_id, apply_same_transform_on_old_ids: bool = False):
-    # 1. Setup metadata (Shape: [Batch, Heads, Sequence_Length, Dimension])
-    # We access the first layer just to get shapes and device
-
-    ref_layer = cache.layers[0]
-    old_ids = assistant_ids_in_cache
-    B, H, S_old, D = ref_layer.keys.shape
-    S_new = new_ids.shape[1] - 1  # Preserving your original sizing logic
-
-    # 2. Pre-calculate "What to copy" for the whole batch ONCE.
-    # This removes the logic calculation from the inner loop (32x-80x speedup on logic)
-
-    # Find start indices (Vectorized)
-    # Note: sum() assumes left-padding only.
-    old_start_indices = (old_ids == pad_token_id).sum(dim=1)
-    new_start_indices = (new_ids == pad_token_id).sum(dim=1)
-
-    # We will store the copy instructions here to apply to all layers later
-    # Format: List of tuples (batch_idx, source_slice, dest_slice, copy_len)
-    copy_instructions = []
-
-    # We still loop over batch (B), but only once, not B * Layers
-    for i in range(B):
-        # Identify the content without padding
-        # We use standard python slicing here as it's just index math, very fast
-        o_start = old_start_indices[i].item()
-        n_start = new_start_indices[i].item()
-
-        # Get the actual token sequences (views, not copies)
-        # We perform the comparison on the ID tensors (int64), which is cheap
-        trimmed_old = old_ids[i, o_start:]
-        trimmed_new = new_ids[i, n_start:]
-
-        min_len = min(len(trimmed_old), len(trimmed_new))
-
-        # Compare only up to min_len
-        # Using .ne() (not equal) and finding the first true is faster than checks
-        if min_len == 0:
-            copy_len = 0
-        else:
-            # Find mismatch: (a != b)
-            mismatch = trimmed_old[:min_len].ne(trimmed_new[:min_len])
-            if not mismatch.any():
-                copy_len = min_len
-            else:
-                # argmax on boolean gives index of first True
-                copy_len = mismatch.int().argmax().item()
-
-        if copy_len > 0:
-            # Define the slice objects now so we don't recreate them 32 times
-            src_slice = slice(o_start, o_start + copy_len)
-            # You align to the right (-length:)
-            dst_slice = slice(-copy_len, None)
-            copy_instructions.append((i, src_slice, dst_slice))
-
-    # 3. Apply changes to all layers
-    # We allocate new tensors and copy in bulk based on pre-calculated instructions
-    new_input_ids_in_cache = None
-    for layer in cache.layers:
-        # Allocation (This is the heavy GPU/Memory op)
-        new_keys = layer.keys.new_zeros((B, H, S_new, D))
-        new_values = layer.values.new_zeros((B, H, S_new, D))
-        if apply_same_transform_on_old_ids and new_input_ids_in_cache is None:
-            new_input_ids_in_cache = assistant_ids_in_cache.new_zeros((B, S_new))
-        # Execute the pre-calculated copy instructions
-        for i, src_slice, dst_slice in copy_instructions:
-            # Copy Keys/Values
-            new_keys[i, :, dst_slice] = layer.keys[i, :, src_slice]
-            new_values[i, :, dst_slice] = layer.values[i, :, src_slice]
-
-            if apply_same_transform_on_old_ids and new_input_ids_in_cache is not None:
-                new_input_ids_in_cache[i, dst_slice] = assistant_ids_in_cache[i, src_slice]
-        # Update the layer
-        layer.keys = new_keys
-        layer.values = new_values
-
-    if apply_same_transform_on_old_ids:
-        return cache, new_input_ids_in_cache
-    return cache

src/transformers/generation/utils.py

Lines changed: 12 additions & 6 deletions
@@ -3649,8 +3649,16 @@ def _assisted_decoding(
 
         # keep track of which sequences are already finished
         batch_size, cur_len = input_ids.shape[:2]
-        if batch_size > 1 and assistant_tokenizer is not None:
-            raise ValueError("assisted generate is only supported for batch_size > 1 if assistant_tokenizer is None")
+        if batch_size > 1:
+            if assistant_tokenizer is not None:
+                raise ValueError(
+                    "assisted generate is only supported for batch_size > 1 if assistant_tokenizer is None"
+                )
+            if generation_config.prompt_lookup_num_tokens is not None:
+                raise ValueError(
+                    "assisted generate is only supported for batch_size > 1 if prompt_lookup_num_tokens is None"
+                )
+
         unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
        model_kwargs = self._get_initial_cache_position(cur_len, input_ids.device, model_kwargs)
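With these checks, batched assisted generation requires the target and assistant models to share a tokenizer and cannot be combined with prompt lookup decoding. A usage sketch on this branch (checkpoint names are placeholders, not from the commit):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoints; any main/assistant pair sharing a tokenizer behaves the same way
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2-large")
assistant = AutoModelForCausalLM.from_pretrained("gpt2")

tokenizer.pad_token = tokenizer.eos_token  # a pad token is needed for batching
tokenizer.padding_side = "left"            # the cache alignment logic assumes left padding

prompts = ["The capital of France is", "In a shocking turn of events,"]
inputs = tokenizer(prompts, return_tensors="pt", padding=True)

# batch_size > 1 works as long as neither assistant_tokenizer nor prompt_lookup_num_tokens is passed
outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))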

@@ -4046,7 +4054,7 @@ def repadd_batch_and_fix_cache(input_ids, past_key_values, accepted_tokens_padde
     padding_mask = cache_input_ids[:, :-1] == pad_token_id
 
     for layer in past_key_values.layers:
-        layer = compress_and_repad_cache(layer, padding_mask, pad_token_id)
+        compress_and_repad_cache(layer, padding_mask)
     # 1. Filter out current padding and repad to minimum length.
     next_input_ids_clean = [row[row != pad_token_id] for row in next_input_ids]
     next_input_ids_padded = pad_sequence(
@@ -4055,7 +4063,7 @@ def repadd_batch_and_fix_cache(input_ids, past_key_values, accepted_tokens_padde
     return next_input_ids_padded, past_key_values
 
 
-def compress_and_repad_cache(layer, padding_mask, pad_token_id):
+def compress_and_repad_cache(layer, padding_mask):
     # padding_mask: True = Pad, False = Keep
     B, H, S, D = layer.keys.shape
 
@@ -4095,5 +4103,3 @@ def compress_and_repad_cache(layer, padding_mask, pad_token_id):
     # 5. Assign back
     layer.keys = out_keys
     layer.values = out_values
-
-    return layer
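The diff shows only the signature change and the dropped return, but the surrounding code (a padding_mask where True marks slots to drop, followed by an "Assign back" step) suggests the helper compresses out padded cache positions per row and re-pads the batch to the longest surviving row. A self-contained sketch of that general compress-and-left-repad idea on one layer's keys (hypothetical shapes, not the function's actual body, which this diff does not include):

import torch

# Hypothetical layer: batch 2, 1 head, 5 cached positions, head dim 3
B, H, S, D = 2, 1, 5, 3
keys = torch.randn(B, H, S, D)
# True = padded slot to drop, False = keep (same convention as repadd_batch_and_fix_cache)
padding_mask = torch.tensor([[True, True, False, False, False],
                             [True, False, False, False, False]])

keep_lengths = (~padding_mask).sum(dim=1)  # tokens kept per row: tensor([3, 4])
S_new = int(keep_lengths.max())            # re-pad every row to the longest surviving one

out_keys = keys.new_zeros((B, H, S_new, D))
for i in range(B):
    kept = keys[i][:, ~padding_mask[i]]            # (H, keep_lengths[i], D)
    out_keys[i, :, S_new - kept.shape[1]:] = kept  # zeros on the left, content right-aligned

Removing pad_token_id from the signature and dropping the trailing return layer reflect that the helper works purely from padding_mask and mutates the layer in place, so callers no longer need to rebind it.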
