@@ -58,7 +58,9 @@ def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor,
             f"{self.__class__} is an abstract class. Only classes inheriting this class can call `get_candidates`."
         )
 
-    def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int, assistant_used: bool = True):
+    def update_candidate_strategy(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: torch.LongTensor | int, assistant_used: bool = True
+    ):
         """
         Updates the candidate generation strategy based on the outcomes.
 
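For orientation, here is a minimal sketch of the two-method contract this base class defines, using the int-or-tensor `num_matches` convention documented below. The class name and the repeat-last-token strategy are hypothetical, purely illustrative:

```python
import torch


class RepeatLastTokenCandidateGenerator:
    """Toy generator: proposes `n` copies of the last token (hypothetical)."""

    def __init__(self, n: int = 5):
        self.n = n

    def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor, None]:
        # Deliberately weak candidates: repeat the final token n times.
        tail = input_ids[:, -1:].expand(-1, self.n)
        return torch.cat([input_ids, tail], dim=-1), None

    def update_candidate_strategy(self, input_ids, scores, num_matches, assistant_used=True):
        # `num_matches` may be an int (batch_size=1) or a per-item tensor.
        if isinstance(num_matches, int):
            num_matches = torch.tensor([num_matches], device=input_ids.device)
        # Propose one more token next round if, on average, everything was accepted.
        if num_matches.float().mean().item() >= self.n:
            self.n += 1
```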
@@ -199,7 +201,9 @@ def __init__(
         self.matches = []
         self.clean_probs = []
 
-    def get_candidates(self, input_ids: torch.LongTensor, assistant_ids_in_cache: torch.LongTensor = None) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, assistant_ids_in_cache: torch.LongTensor | None = None
+    ) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
         """
         Fetches the candidates to be tried for the current input.
 
@@ -224,7 +228,9 @@ def get_candidates(self, input_ids: torch.LongTensor, assistant_ids_in_cache: to
         candidate_ids, candidate_logits = self._generate_candidates(generation_args)
         return candidate_ids, candidate_logits
 
-    def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int, assistant_used: bool = True):
+    def update_candidate_strategy(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: torch.LongTensor | int, assistant_used: bool = True
+    ):
         """
         Updates the candidate generation strategy based on the outcomes.
 
@@ -239,21 +245,20 @@ def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.F
                 If `int`, assumes `batch_size=1` for backward compatibility.
             assistant_used (`bool`):
                 Whether the assistant was used to generate the candidates. Assistant was not used if max_new_tokens is 0.
-        """
+        """
         # Handle backward compatibility: convert int to tensor
         if isinstance(num_matches, int):
             num_matches = torch.tensor([num_matches], device=input_ids.device)
-
+
         batch_size = input_ids.shape[0]
-
+
         # Adjust the max number of assistant tokens to use in the next iteration. This is a simple heuristic,
         # probably can be improved -- we want to balance the benefits of getting assistant tokens correct with the
         # cost of forecasting incorrect assistant tokens.
         if self.assistant_model.generation_config.num_assistant_tokens_schedule in {
             "heuristic",
             "heuristic_transient",
         }:
-
             # For batch processing, we can use different strategies:
             # Option 1: Use average matches across batch
             avg_matches = num_matches.float().mean().item()
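The next hunk applies the schedule's threshold rule to this batch average. As a self-contained sketch, assuming the usual single-sequence heuristic (grow by 2 on a full match, otherwise shrink by 1, never below 1); the function name is illustrative:

```python
import torch


def adjust_num_assistant_tokens(num_assistant_tokens: float, num_matches: torch.Tensor) -> float:
    # Option 1 from the comment above: average the per-item match counts.
    avg_matches = num_matches.float().mean().item()
    if avg_matches >= int(num_assistant_tokens):
        return num_assistant_tokens + 2.0  # all draft tokens accepted: draft more next round
    return max(1.0, num_assistant_tokens - 1.0)  # a rejection occurred: draft fewer
```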
@@ -268,7 +273,7 @@ def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.F
                 self.num_assistant_tokens = max(1.0, self.num_assistant_tokens - 1.0)
 
         # The assistant's confidence threshold is adjusted throughout the speculative iterations to reduce the number of unnecessary draft and target forward passes.
-        # The costs are estimated based on the ROC curve, which considers the probability of the draft token and its match with the target.
+        # The costs are estimated based on the ROC curve, which considers the probability of the draft token and its match with the target.
         # A cost of 25% is assigned to false positives and 75% to false negatives.
         # This adaptation is not compatible with UAG, as it relies on the number of matched tokens based on the draft vocabulary, which is unavailable in UAG.
         if (
@@ -287,13 +292,13 @@ def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.F
                 # this means we reject a token.
                 item_matches.append(0)
                 # take only the relevant probabilities: those of the accepted tokens and the first rejected token.
-                self.clean_probs.extend([self.probs[len(self.matches)][:len(item_matches)]])
+                self.clean_probs.extend([self.probs[len(self.matches)][: len(item_matches)]])
                 self.matches.extend([item_matches])
-
+
             assert len(self.matches) == len(self.clean_probs), "matches and probs must have the same length"
             clean_matches = np.concatenate(self.matches)
             clean_probs = np.concatenate(self.clean_probs)
-
+
             # calculate ROC curve and update threshold if we have enough samples
             if (
                 len(clean_probs) > 5 and {0, 1}.issubset(clean_matches)
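For reference, a standalone sketch of the threshold selection this condition guards, assuming the 25%/75% costs named in the comment above map directly onto the ROC curve's false-positive and false-negative rates; the helper name is illustrative:

```python
import numpy as np
from sklearn.metrics import roc_curve


def pick_confidence_threshold(clean_matches: np.ndarray, clean_probs: np.ndarray) -> float:
    # clean_matches: 1 where the target model accepted the draft token, else 0.
    # clean_probs: the draft model's probability for each of those tokens.
    fpr, tpr, thresholds = roc_curve(clean_matches, clean_probs)
    cost = 0.25 * fpr + 0.75 * (1.0 - tpr)  # false negatives cost three times as much
    return float(thresholds[np.argmin(cost)])
```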
@@ -318,20 +323,29 @@ def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> tuple[int, int]:
         return min_new_tokens, max_new_tokens
 
     def _update_past_and_masks(
-        self, input_ids: torch.LongTensor, remove_from_pkv: int = 0, num_added_tokens: int = 1, assistant_ids_in_cache: torch.LongTensor = None
+        self,
+        input_ids: torch.LongTensor,
+        remove_from_pkv: int = 0,
+        num_added_tokens: int = 1,
+        assistant_ids_in_cache: torch.LongTensor | None = None,
     ) -> bool:
         """Update past key values and attention masks for subsequent generation rounds."""
         has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None
         if has_past_key_values:
             new_cache_size = input_ids.shape[-1] - 1 - remove_from_pkv
             current_cache = self.assistant_kwargs["past_key_values"]
-            if (batch_size := input_ids.shape[0]) > 1:
-                self.assistant_kwargs["past_key_values"] = align_cache(current_cache, input_ids, assistant_ids_in_cache,
-                                                                       self.generation_config.pad_token_id)
+            if input_ids.shape[0] > 1:
+                self.assistant_kwargs["past_key_values"] = align_cache(
+                    current_cache, input_ids, assistant_ids_in_cache, self.generation_config.pad_token_id
+                )
             else:
                 self.assistant_kwargs["past_key_values"].crop(new_cache_size - num_added_tokens)
             self.assistant_kwargs = _prepare_attention_mask(
-                self.assistant_kwargs, input_ids.shape[-1], self.assistant_model.config.is_encoder_decoder, input_ids, self.generation_config.pad_token_id
+                self.assistant_kwargs,
+                input_ids.shape[-1],
+                self.assistant_model.config.is_encoder_decoder,
+                input_ids,
+                self.generation_config.pad_token_id,
             )
             self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, input_ids.shape[-1])
 
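The batch > 1 branch exists because, with left-padded batches, each row's valid tokens begin at a different offset, so a single `crop` length cannot be correct for every row. A toy illustration of the per-row offsets `align_cache` computes below (`pad_token_id = 0` is an arbitrary choice for the demo):

```python
import torch

pad_token_id = 0
ids = torch.tensor([
    [0, 0, 5, 6, 7],  # two pad tokens, three real tokens
    [0, 9, 9, 9, 9],  # one pad token, four real tokens
])
start = (ids == pad_token_id).sum(dim=1)  # left-padding only, as align_cache assumes
print(start.tolist())  # [2, 1]: each row's content starts at a different index
```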
@@ -355,15 +369,23 @@ def _generate_candidates(self, generation_args: dict) -> tuple[torch.LongTensor,
         """Generate candidate sequences using the assistant model."""
         assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs)
         self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values
-        candidate_logits = torch.stack(assistant_output.scores, dim=1)  # shape: (batch_size, candidate_length, vocab_size)
+        candidate_logits = torch.stack(
+            assistant_output.scores, dim=1
+        )  # shape: (batch_size, candidate_length, vocab_size)
         if (
             is_sklearn_available()
             and self.assistant_model.generation_config.assistant_confidence_threshold
             and type(self) is AssistedCandidateGenerator
         ):
-            scores_softmax = torch.softmax(candidate_logits, dim=-1)  # shape: (batch_size, candidate_length, vocab_size)
-            ids = assistant_output.sequences[:, -len(assistant_output.scores):]  # shape: (batch_size, candidate_length)
-            p = torch.gather(scores_softmax, dim=-1, index=ids.unsqueeze(-1)).squeeze(-1)  # shape: (batch_size, candidate_length)
+            scores_softmax = torch.softmax(
+                candidate_logits, dim=-1
+            )  # shape: (batch_size, candidate_length, vocab_size)
+            ids = assistant_output.sequences[
+                :, -len(assistant_output.scores) :
+            ]  # shape: (batch_size, candidate_length)
+            p = torch.gather(scores_softmax, dim=-1, index=ids.unsqueeze(-1)).squeeze(
+                -1
+            )  # shape: (batch_size, candidate_length)
             self.probs.extend(p.tolist())
         candidate_ids = assistant_output.sequences
         return candidate_ids, candidate_logits
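A standalone sketch of the probability bookkeeping above: stack the per-step scores, softmax over the vocabulary, and gather each generated token's own probability (shapes chosen arbitrarily for the demo):

```python
import torch

batch, steps, vocab = 2, 3, 11
scores = [torch.randn(batch, vocab) for _ in range(steps)]  # one score tensor per generated token
sequences = torch.randint(vocab, (batch, 8))  # prompt + generated ids

logits = torch.stack(scores, dim=1)  # (batch, steps, vocab)
probs = torch.softmax(logits, dim=-1)
gen_ids = sequences[:, -steps:]  # the generated suffix
p = torch.gather(probs, -1, gen_ids.unsqueeze(-1)).squeeze(-1)  # (batch, steps)
print(p.shape)  # torch.Size([2, 3])
```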
@@ -530,7 +552,9 @@ def convert_source_tokens_to_target_tokens(
         dest_ids = destination_tokenizer(text, add_special_tokens=True, return_tensors="pt")["input_ids"]
         return dest_ids.to(input_ids.device)
 
-    def get_candidates(self, input_ids: torch.LongTensor, assistant_ids_in_cache: torch.LongTensor = None) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, assistant_ids_in_cache: torch.LongTensor | None = None
+    ) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
         """
         Fetches the candidates to be tried for the current input.
 
@@ -555,7 +579,9 @@ def get_candidates(self, input_ids: torch.LongTensor, assistant_ids_in_cache: to
 
         min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - assistant_input_ids.shape[-1]), 0)
 
-        self._update_past_and_masks(assistant_input_ids, remove_from_pkv, assistant_ids_in_cache=assistant_ids_in_cache)
+        self._update_past_and_masks(
+            assistant_input_ids, remove_from_pkv, assistant_ids_in_cache=assistant_ids_in_cache
+        )
         generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens)
         self.assistant_kwargs.pop("attention_mask", None)
 
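For context, `assistant_input_ids` above comes from the `convert_source_tokens_to_target_tokens` hunk earlier, which round-trips through text: decode with the source tokenizer, re-encode with the destination one. A minimal sketch, assuming batch_size=1 and Hugging Face-style tokenizers:

```python
import torch


def convert_tokens(input_ids: torch.LongTensor, source_tokenizer, destination_tokenizer) -> torch.LongTensor:
    # Decode to text with the draft model's tokenizer...
    text = source_tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0]
    # ...then re-encode with the target model's tokenizer.
    dest_ids = destination_tokenizer(text, add_special_tokens=True, return_tensors="pt")["input_ids"]
    return dest_ids.to(input_ids.device)
```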
@@ -955,7 +981,9 @@ def __init__(
         self._target_seq_len_with_candidates: int = 0
         self._prev_assistant_ids: torch.LongTensor | None = None
 
-    def get_candidates(self, input_ids: torch.LongTensor, assistant_ids_in_cache: torch.LongTensor = None) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
+    def get_candidates(
+        self, input_ids: torch.LongTensor, assistant_ids_in_cache: torch.LongTensor | None = None
+    ) -> tuple[torch.LongTensor, torch.FloatTensor | None]:
         """
         Simplified version of get_candidates that uses the translator cache for token conversion.
         """
@@ -966,7 +994,9 @@ def get_candidates(self, input_ids: torch.LongTensor, assistant_ids_in_cache: to
         if max_new_tokens == 0:
             return input_ids, None
 
-        self._update_past_and_masks(assistant_input_ids, num_added_tokens=num_added_tokens, assistant_ids_in_cache=assistant_ids_in_cache)
+        self._update_past_and_masks(
+            assistant_input_ids, num_added_tokens=num_added_tokens, assistant_ids_in_cache=assistant_ids_in_cache
+        )
         generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens)
 
         # Ensure scores are returned
@@ -987,7 +1017,12 @@ def get_candidates(self, input_ids: torch.LongTensor, assistant_ids_in_cache: to
 
         return target_candidate_ids, target_candidate_logits
 
-    def _update_past_and_masks(self, assistant_input_ids: torch.LongTensor, num_added_tokens: int = 1, assistant_ids_in_cache: torch.LongTensor = None) -> bool:
+    def _update_past_and_masks(
+        self,
+        assistant_input_ids: torch.LongTensor,
+        num_added_tokens: int = 1,
+        assistant_ids_in_cache: torch.LongTensor | None = None,
+    ) -> bool:
         if self._prev_assistant_ids is None:
             # Prepare attention mask for the first generation.
             # For subsequent generations, the attention mask is updated in super()._update_past_and_masks().
@@ -1175,7 +1210,9 @@ def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor,
         # assisted_generation expects logits as well, but we don't have those here, so returning None
         return candidate_input_ids, None
 
-    def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int, assistant_used: bool = True):
+    def update_candidate_strategy(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: torch.LongTensor | int, assistant_used: bool = True
+    ):
         """
         Updates the candidate generation strategy based on the outcomes.
 
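Since this class drafts candidates without an assistant model, a compact sketch of the underlying prompt-lookup idea may help: reuse the continuation of an earlier occurrence of the trailing n-gram. The naive scan below is illustrative only (the real implementation is vectorized) and assumes batch_size=1:

```python
import torch


def prompt_lookup(input_ids: torch.LongTensor, ngram_size: int = 3, num_candidates: int = 10) -> torch.LongTensor:
    ids = input_ids[0]  # assume batch_size=1 for the sketch
    ngram = ids[-ngram_size:]
    # Scan earlier positions, latest first, for a matching n-gram.
    for start in range(ids.shape[0] - ngram_size - 1, -1, -1):
        if torch.equal(ids[start : start + ngram_size], ngram):
            continuation = ids[start + ngram_size : start + ngram_size + num_candidates]
            if continuation.numel() > 0:
                return torch.cat([input_ids, continuation.unsqueeze(0)], dim=-1)
    return input_ids  # no match found: propose nothing new
```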
@@ -1250,7 +1287,13 @@ def get_candidates(self, input_ids: torch.LongTensor) -> tuple[torch.LongTensor,
         return candidate_ids, candidate_logits
 
 
-def _prepare_attention_mask(model_kwargs: dict[str, Any], new_length: int, is_encoder_decoder: bool, input_ids: torch.LongTensor = None, pad_token_id: int = None) -> dict[str, Any]:
+def _prepare_attention_mask(
+    model_kwargs: dict[str, Any],
+    new_length: int,
+    is_encoder_decoder: bool,
+    input_ids: torch.LongTensor | None = None,
+    pad_token_id: int | None = None,
+) -> dict[str, Any]:
     """Expands or crops the model's mask for decoding purposes, to the defined length"""
 
     mask_key = "decoder_attention_mask" if is_encoder_decoder else "attention_mask"
@@ -1261,7 +1304,7 @@ def _prepare_attention_mask(model_kwargs: dict[str, Any], new_length: int, is_en
     mask_length_diff = new_length - mask.shape[1]
     if input_ids is not None and pad_token_id is not None:
         model_kwargs[mask_key] = (input_ids != pad_token_id).to(mask.dtype)
-    elif mask_length_diff < 0: # not sure when we get into this case
+    elif mask_length_diff < 0:  # not sure when we get into this case
         model_kwargs[mask_key] = mask[:, :mask_length_diff]
     elif mask_length_diff > 0:
         model_kwargs[mask_key] = torch.cat([mask, mask.new_ones((mask.shape[0], mask_length_diff))], dim=-1)
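In toy form, the crop/extend branches above behave as follows (the first branch simply rebuilds the mask as `input_ids != pad_token_id`, so it is skipped here):

```python
import torch

mask = torch.ones(1, 4, dtype=torch.long)
new_length = 6
diff = new_length - mask.shape[1]
if diff > 0:  # extend with ones on the right
    mask = torch.cat([mask, mask.new_ones((mask.shape[0], diff))], dim=-1)
elif diff < 0:  # crop from the right
    mask = mask[:, :diff]
print(mask.shape)  # torch.Size([1, 6])
```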
@@ -1307,38 +1350,38 @@ def _prepare_token_type_ids(model_kwargs: dict[str, Any], new_length: int) -> di
 def align_cache(cache, new_ids, assistant_ids_in_cache, pad_token_id, apply_same_transform_on_old_ids: bool = False):
     # 1. Setup metadata (Shape: [Batch, Heads, Sequence_Length, Dimension])
     # We access the first layer just to get shapes and device
-
+
     ref_layer = cache.layers[0]
     old_ids = assistant_ids_in_cache
     B, H, S_old, D = ref_layer.keys.shape
-    S_new = new_ids.shape[1] - 1 # Preserving the original sizing logic
-
+    S_new = new_ids.shape[1] - 1  # Preserving the original sizing logic
+
     # 2. Pre-calculate what to copy for the whole batch once.
     # This removes the logic calculation from the inner loop (32x-80x speedup on logic)
-
+
     # Find start indices (vectorized)
-    # Note: sum() assumes left-padding only.
+    # Note: sum() assumes left-padding only.
     old_start_indices = (old_ids == pad_token_id).sum(dim=1)
     new_start_indices = (new_ids == pad_token_id).sum(dim=1)
-
+
     # We will store the copy instructions here and apply them to all layers later
     # Format: list of tuples (batch_idx, source_slice, dest_slice)
     copy_instructions = []
-
+
     # We still loop over the batch (B), but only once, not B * num_layers times
     for i in range(B):
         # Identify the content without padding
         # We use standard Python slicing here as it's just index math, very fast
         o_start = old_start_indices[i].item()
         n_start = new_start_indices[i].item()
-
+
         # Get the actual token sequences (views, not copies)
         # We perform the comparison on the ID tensors (int64), which is cheap
         trimmed_old = old_ids[i, o_start:]
         trimmed_new = new_ids[i, n_start:]
-
+
         min_len = min(len(trimmed_old), len(trimmed_new))
-
+
         # Compare only up to min_len
         # Using .ne() (not equal) and taking the first hit is faster than elementwise Python checks
         if min_len == 0:
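The loop's next step (in the following hunk) reduces to finding the length of the common prefix of the two trimmed rows. A self-contained sketch of that computation, assuming the `.ne()`-based search the comment describes; the helper name is illustrative:

```python
import torch


def common_prefix_len(trimmed_old: torch.Tensor, trimmed_new: torch.Tensor) -> int:
    min_len = min(len(trimmed_old), len(trimmed_new))
    if min_len == 0:
        return 0
    mismatches = trimmed_old[:min_len].ne(trimmed_new[:min_len]).nonzero()
    return int(mismatches[0]) if mismatches.numel() else min_len
```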
@@ -1356,7 +1399,7 @@ def align_cache(cache, new_ids, assistant_ids_in_cache, pad_token_id, apply_same
         # Define the slice objects once so we don't recreate them for every layer
         src_slice = slice(o_start, o_start + copy_len)
         # Align to the right (-copy_len:)
-        dst_slice = slice(-copy_len, None)
+        dst_slice = slice(-copy_len, None)
         copy_instructions.append((i, src_slice, dst_slice))
 
 
@@ -1373,7 +1416,7 @@ def align_cache(cache, new_ids, assistant_ids_in_cache, pad_token_id, apply_same
             # Copy keys/values
             new_keys[i, :, dst_slice] = layer.keys[i, :, src_slice]
             new_values[i, :, dst_slice] = layer.values[i, :, src_slice]
-
+
             if apply_same_transform_on_old_ids and new_input_ids_in_cache is not None:
                 new_input_ids_in_cache[i, dst_slice] = assistant_ids_in_cache[i, src_slice]
         # Update the layer
@@ -1382,4 +1425,4 @@ def align_cache(cache, new_ids, assistant_ids_in_cache, pad_token_id, apply_same
 
     if apply_same_transform_on_old_ids:
         return cache, new_input_ids_in_cache
-    return cache
+    return cache
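A hypothetical toy invocation, assuming only what the shape comment at the top of `align_cache` states (layers exposing `.keys`/`.values` as `(B, H, S, D)` tensors). Everything below is fabricated scaffolding for illustration, not a real `Cache` object, and may need adjusting to the parts of the function outside this diff:

```python
import torch
from types import SimpleNamespace

B, H, S, D = 2, 1, 5, 4
layer = SimpleNamespace(keys=torch.zeros(B, H, S, D), values=torch.zeros(B, H, S, D))
cache = SimpleNamespace(layers=[layer])
old_ids = torch.tensor([[0, 0, 5, 6, 7], [0, 9, 9, 9, 9]])  # left-padded with pad_token_id=0
new_ids = torch.tensor([[0, 0, 5, 6, 8, 3], [0, 9, 9, 9, 9, 3]])
cache = align_cache(cache, new_ids, old_ids, pad_token_id=0)
```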