Skip to content

Commit fda2d73

Browse files
efazalSunMarcgithub-actions[bot]
authored
feat(trainer): Just-in-time (JIT) asynchronous checkpointing using SIGTERM signals (#41723)
* Just-in-time (JIT) asynchronous checkpointing using SIGTERM signals and cuda streams. * Fix failing ci tests * Update JIT checkpoint code to remove CUDA streams and async checkpointing. Introduce sentinel file to identify incomplete checkpoints. Update trainer arg doc and tests. * Fix sentinel file save path to checkpoint folder, update checkpoint related envs with HF_ prefix. * Refactor JIT checkpoint logic: rename methods and variables for clarity, improve SIGTERM handling, and update related tests. * Remove support for environment variable overrides in `TrainingArguments` and update related documentation. * Apply style fixes --------- Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent b083169 commit fda2d73

File tree

4 files changed

+570
-1
lines changed

4 files changed

+570
-1
lines changed

src/transformers/trainer.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,16 @@ def __init__(
642642
"You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method."
643643
)
644644
default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
645+
646+
# Add JIT checkpoint callback if enabled
647+
if self.args.enable_jit_checkpoint:
648+
from .trainer_jit_checkpoint import JITCheckpointCallback
649+
650+
jit_callback = JITCheckpointCallback()
651+
default_callbacks = default_callbacks + [jit_callback]
652+
# Set trainer reference for JIT callback after initialization
653+
jit_callback.set_trainer(self)
654+
645655
callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
646656
self.callback_handler = CallbackHandler(
647657
callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import os
2+
import signal
3+
import threading
4+
from typing import Optional
5+
6+
from .trainer_callback import TrainerCallback
7+
from .trainer_utils import PREFIX_CHECKPOINT_DIR
8+
from .utils import logging
9+
10+
11+
logger = logging.get_logger(__name__)
12+
13+
14+
class CheckpointManager:
    """
    Handles Just-In-Time (JIT) checkpointing when a SIGTERM signal is received.

    On SIGTERM, a timer waits `kill_wait` seconds (to distinguish a graceful
    SIGTERM from an imminent SIGKILL) before setting `is_checkpoint_requested`,
    which the `JITCheckpointCallback` hooks poll from the training loop.
    """

    def __init__(self, trainer, kill_wait: int = 3):
        """
        Initialize the CheckpointManager for Just-In-Time checkpoint handling.

        Args:
            trainer: The Trainer instance that will be used to save checkpoints when SIGTERM is received.
            kill_wait (`int`, *optional*, defaults to 3): Grace period to distinguish between SIGTERM and SIGKILL.
        """
        self.trainer = trainer
        self.is_checkpoint_requested = False
        self._original_sigterm_handler = None
        self.kill_wait = kill_wait
        # Guards against spawning one grace-period timer per signal when
        # SIGTERM is delivered repeatedly within the kill_wait window.
        self._timer_started = False

    def setup_signal_handler(self):
        # NOTE(review): signal.signal must run in the main thread of the main
        # interpreter, otherwise it raises ValueError — callers are expected to
        # construct the Trainer in the main thread.
        self._original_sigterm_handler = signal.signal(signal.SIGTERM, self._sigterm_handler)
        logger.info("JIT checkpoint signal handler registered for SIGTERM")

    def _sigterm_handler(self, signum, frame):
        # Ignore repeated SIGTERMs: either a checkpoint was already requested
        # or the grace-period timer is already counting down.
        if self.is_checkpoint_requested or self._timer_started:
            return

        logger.info(f"SIGTERM received, will request JIT checkpoint after {self.kill_wait}s")
        self._timer_started = True
        timer = threading.Timer(self.kill_wait, self._enable_checkpoint)
        # Daemon timer so a pending grace period never blocks interpreter shutdown.
        timer.daemon = True
        timer.start()

    def _enable_checkpoint(self):
        # Timer callback: flip the flag that the callback hooks poll.
        logger.info("Kill wait period elapsed, requesting checkpoint")
        self.is_checkpoint_requested = True

    def execute_jit_checkpoint(self):
        """
        Save a checkpoint immediately, marking it with a sentinel file while incomplete.

        The sentinel file is written before the save and removed after it
        succeeds, so a surviving sentinel identifies an interrupted checkpoint.

        Raises:
            Exception: re-raises any error from the underlying checkpoint save.
        """
        try:
            # Set checkpoint flag to False to avoid multiple checkpoints getting triggered by other callbacks
            self.is_checkpoint_requested = False

            logger.info("Starting JIT checkpointing...")
            current_step = self.trainer.state.global_step
            logger.info(f"Saving JIT checkpoint at step {current_step}")

            output_dir = self.trainer._get_output_dir(trial=None)
            checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{current_step}"
            checkpoint_path = os.path.join(output_dir, checkpoint_folder)

            # Create checkpoint directory
            os.makedirs(checkpoint_path, exist_ok=True)

            # Create a sentinel file to indicate checkpointing is in progress
            # (reuse checkpoint_path instead of re-joining output_dir + folder).
            sentinel_file = os.path.join(checkpoint_path, "checkpoint-is-incomplete.txt")
            with open(sentinel_file, "w") as f:
                f.write(f"Checkpoint started at step {current_step} and in progress...")
            logger.info(f"Created checkpoint progress sentinel marker file: {sentinel_file}")

            # Invoke the trainer's checkpoint method directly
            self.trainer._save_checkpoint(self.trainer.model, trial=None)

            # Remove sentinel file upon successful checkpointing
            if os.path.exists(sentinel_file):
                os.remove(sentinel_file)
                logger.info("Sentinel marker file removed")

            logger.info("Immediate JIT checkpoint completed successfully")

        except Exception as e:
            logger.error(f"Failed to save JIT checkpoint: {e}")
            raise
78+
79+
80+
class JITCheckpointCallback(TrainerCallback):
    """
    Callback for Just-In-Time checkpointing on SIGTERM signals.

    When SIGTERM is received, the checkpoint manager sets `is_checkpoint_requested=True`.
    The callbacks detect this flag and set `control.should_training_stop=True`, which signals
    the Trainer's training loop to exit gracefully after saving the checkpoint.
    """

    def __init__(self):
        # Wired up later via `set_trainer`, once the Trainer exists.
        self.trainer = None
        self.jit_manager: Optional[CheckpointManager] = None

    def set_trainer(self, trainer):
        """Attach the Trainer and, when enabled, install the SIGTERM handler."""
        self.trainer = trainer
        if not trainer.args.enable_jit_checkpoint:
            return
        self.jit_manager = CheckpointManager(trainer=trainer)
        self.jit_manager.setup_signal_handler()
        logger.info("JIT checkpointing enabled")

    def _checkpoint_pending(self):
        # True when a manager exists and a SIGTERM-triggered checkpoint was requested.
        return self.jit_manager is not None and self.jit_manager.is_checkpoint_requested

    def on_pre_optimizer_step(self, args, state, control, **kwargs):
        if self._checkpoint_pending():
            control.should_training_stop = True
            self.jit_manager.execute_jit_checkpoint()

    def on_step_begin(self, args, state, control, **kwargs):
        if self._checkpoint_pending():
            control.should_training_stop = True
            self.jit_manager.execute_jit_checkpoint()

    def on_step_end(self, args, state, control, **kwargs):
        if self._checkpoint_pending():
            # Suppress the Trainer's regular save — the JIT checkpoint replaces it.
            control.should_save = False
            control.should_training_stop = True
            self.jit_manager.execute_jit_checkpoint()

    def on_epoch_end(self, args, state, control, **kwargs):
        if self._checkpoint_pending():
            control.should_save = False
            control.should_training_stop = True
            self.jit_manager.execute_jit_checkpoint()

    def on_train_end(self, args, state, control, **kwargs):
        # Restore original SIGTERM handler
        if self.jit_manager and self.jit_manager._original_sigterm_handler is not None:
            signal.signal(signal.SIGTERM, self.jit_manager._original_sigterm_handler)
            logger.info("Restored original SIGTERM handler after training completion")

src/transformers/training_args.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,17 @@ class TrainingArguments:
340340
`save_total_limit=5` and `load_best_model_at_end`, the four last checkpoints will always be retained
341341
alongside the best model. When `save_total_limit=1` and `load_best_model_at_end`, it is possible that two
342342
checkpoints are saved: the last one and the best one (if they are different).
343+
enable_jit_checkpoint (`bool`, *optional*, defaults to `False`):
344+
Whether to enable Just-In-Time (JIT) checkpointing on SIGTERM signal. When enabled, training will
345+
checkpoint upon receiving SIGTERM, allowing for graceful termination without losing
346+
progress. This is particularly useful for shared clusters with preemptible workloads (e.g., Kueue).
347+
**Important**: You must configure your orchestrator's graceful shutdown period to allow sufficient time
348+
for checkpoint completion. For Kubernetes, set `terminationGracePeriodSeconds` in your job definition
349+
(method varies by cloud-native trainer: Kubeflow, Ray, etc.). Note: the default is only 30 seconds,
350+
which is typically insufficient. For Slurm, use `--signal=TERM@<seconds>` in your sbatch script to send
351+
SIGTERM with adequate time before the job time limit. Calculate the required grace period as: longest
352+
possible iteration time + checkpoint saving time. For example, if an iteration takes 2 minutes and
353+
checkpoint saving takes 2 minutes, set at least 4 minutes (240 seconds) of grace time.
343354
save_safetensors (`bool`, *optional*, defaults to `True`):
344355
Use [safetensors](https://huggingface.co/docs/safetensors) saving and loading for state dicts instead of
345356
default `torch.load` and `torch.save`.
@@ -929,7 +940,23 @@ class TrainingArguments:
929940
" for `save_total_limit=5` and `load_best_model_at_end=True`, the four last checkpoints will always be"
930941
" retained alongside the best model. When `save_total_limit=1` and `load_best_model_at_end=True`,"
931942
" it is possible that two checkpoints are saved: the last one and the best one (if they are different)."
932-
" Default is unlimited checkpoints"
943+
" Default is unlimited checkpoints."
944+
)
945+
},
946+
)
947+
enable_jit_checkpoint: bool = field(
948+
default=False,
949+
metadata={
950+
"help": (
951+
"Whether to enable Just-In-Time (JIT) checkpointing on SIGTERM signal. "
952+
"When enabled, training will checkpoint upon receiving SIGTERM, "
953+
"allowing for graceful termination without losing progress. "
954+
"This is particularly useful for shared clusters with preemptible workloads (Kueue). "
955+
"IMPORTANT: You must configure your orchestrator's graceful shutdown period. "
956+
"Kubernetes: set terminationGracePeriodSeconds (default 30s is insufficient!) in your job definition. "
957+
"Slurm: use --signal=USR1@<seconds> in sbatch to send SIGTERM before time limit. "
958+
"Calculate required grace period as: iteration time + checkpoint saving time. "
959+
"Example: 2min iteration + 2min checkpoint = 240 seconds minimum."
933960
)
934961
},
935962
)

0 commit comments

Comments
 (0)