
Commit 57f8554

support gradient checkpoint in forward_intermediates

1 parent a0a30a6 · commit 57f8554

25 files changed: +117 / -45 lines
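Every hunk below follows the same pattern: inside `forward_intermediates`, each block (or stage) is routed through activation checkpointing when `self.grad_checkpointing` is set, with a guard to skip it under TorchScript, since `torch.utils.checkpoint` is not scriptable. A minimal standalone sketch of the pattern (illustrative only; it calls `torch.utils.checkpoint.checkpoint` directly, whereas the diffs use timm's `checkpoint` wrapper from `._manipulate`, and the model here is a made-up toy):

```python
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint


class TinyModel(nn.Module):
    def __init__(self, depth: int = 4, dim: int = 8):
        super().__init__()
        self.blocks = nn.ModuleList(nn.Linear(dim, dim) for _ in range(depth))
        self.grad_checkpointing = False

    def forward_intermediates(self, x, take_indices=(1, 3)):
        intermediates = []
        for i, blk in enumerate(self.blocks):
            if self.grad_checkpointing and not torch.jit.is_scripting():
                # recompute this block's activations in backward to save memory
                x = checkpoint(blk, x, use_reentrant=False)
            else:
                x = blk(x)
            if i in take_indices:
                intermediates.append(x)
        return x, intermediates
```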

timm/models/beit.py

Lines changed: 4 additions & 1 deletion

@@ -451,7 +451,10 @@ def forward_intermediates(
         else:
             blocks = self.blocks[:max_index + 1]
         for i, blk in enumerate(blocks):
-            x = blk(x, shared_rel_pos_bias=rel_pos_bias)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias)
+            else:
+                x = blk(x, shared_rel_pos_bias=rel_pos_bias)
             if i in take_indices:
                 # normalize intermediates with final norm layer if enabled
                 intermediates.append(self.norm(x) if norm else x)

timm/models/byobnet.py

Lines changed: 5 additions & 2 deletions

@@ -44,7 +44,7 @@
 )
 from ._builder import build_model_with_cfg
 from ._features import feature_take_indices
-from ._manipulate import named_apply, checkpoint_seq
+from ._manipulate import checkpoint, checkpoint_seq, named_apply
 from ._registry import generate_default_cfgs, register_model

 __all__ = ['ByobNet', 'ByoModelCfg', 'ByoBlockCfg', 'create_byob_stem', 'create_block']
@@ -1384,7 +1384,10 @@ def forward_intermediates(
         stages = self.stages[:max_index]
         for stage in stages:
             feat_idx += 1
-            x = stage(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(stage, x)
+            else:
+                x = stage(x)
             if not exclude_final_conv and feat_idx == last_idx:
                 # default feature_info for this model uses final_conv as the last feature output (if present)
                 x = self.final_conv(x)

timm/models/cait.py

Lines changed: 5 additions & 2 deletions

@@ -18,7 +18,7 @@
 from timm.layers import PatchEmbed, Mlp, DropPath, trunc_normal_, use_fused_attn
 from ._builder import build_model_with_cfg
 from ._features import feature_take_indices
-from ._manipulate import checkpoint_seq
+from ._manipulate import checkpoint, checkpoint_seq
 from ._registry import register_model, generate_default_cfgs

 __all__ = ['Cait', 'ClassAttn', 'LayerScaleBlockClassAttn', 'LayerScaleBlock', 'TalkingHeadAttn']
@@ -373,7 +373,10 @@ def forward_intermediates(
         else:
             blocks = self.blocks[:max_index + 1]
         for i, blk in enumerate(blocks):
-            x = blk(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(blk, x)
+            else:
+                x = blk(x)
             if i in take_indices:
                 # normalize intermediates with final norm layer if enabled
                 intermediates.append(self.norm(x) if norm else x)

timm/models/davit.py

Lines changed: 5 additions & 2 deletions

@@ -25,7 +25,7 @@
 from ._builder import build_model_with_cfg
 from ._features import feature_take_indices
 from ._features_fx import register_notrace_function
-from ._manipulate import checkpoint_seq
+from ._manipulate import checkpoint, checkpoint_seq
 from ._registry import generate_default_cfgs, register_model

 __all__ = ['DaVit']
@@ -671,7 +671,10 @@ def forward_intermediates(
         stages = self.stages[:max_index + 1]

         for feat_idx, stage in enumerate(stages):
-            x = stage(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(stage, x)
+            else:
+                x = stage(x)
             if feat_idx in take_indices:
                 if norm and feat_idx == last_idx:
                     x_inter = self.norm_pre(x)  # applying final norm to last intermediate

timm/models/efficientnet.py

Lines changed: 5 additions & 3 deletions

@@ -210,9 +210,11 @@ def forward_intermediates(
             blocks = self.blocks
         else:
             blocks = self.blocks[:max_index]
-        for blk in blocks:
-            feat_idx += 1
-            x = blk(x)
+        for feat_idx, blk in enumerate(blocks, start=1):
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(blk, x)
+            else:
+                x = blk(x)
             if feat_idx in take_indices:
                 intermediates.append(x)
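This hunk also folds the manual `feat_idx += 1` counter into `enumerate(blocks, start=1)`. The two forms yield identical `(index, block)` pairs, as this standalone sketch shows (names are illustrative):

```python
blocks = ['stem', 'stage1', 'stage2']

# before: counter incremented at the top of each iteration
feat_idx = 0
before = []
for blk in blocks:
    feat_idx += 1
    before.append((feat_idx, blk))

# after: enumerate with start=1 produces the same pairs
after = list(enumerate(blocks, start=1))
assert before == after  # (1, 'stem'), (2, 'stage1'), (3, 'stage2')
```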

timm/models/efficientvit_mit.py

Lines changed: 9 additions & 3 deletions

@@ -19,7 +19,7 @@
 from ._builder import build_model_with_cfg
 from ._features import feature_take_indices
 from ._features_fx import register_notrace_module
-from ._manipulate import checkpoint_seq
+from ._manipulate import checkpoint, checkpoint_seq
 from ._registry import register_model, generate_default_cfgs

@@ -789,7 +789,10 @@ def forward_intermediates(
         stages = self.stages[:max_index + 1]

         for feat_idx, stage in enumerate(stages):
-            x = stage(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(stage, x)
+            else:
+                x = stage(x)
             if feat_idx in take_indices:
                 intermediates.append(x)

@@ -943,7 +946,10 @@ def forward_intermediates(
         stages = self.stages[:max_index + 1]

         for feat_idx, stage in enumerate(stages):
-            x = stage(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(stage, x)
+            else:
+                x = stage(x)
             if feat_idx in take_indices:
                 intermediates.append(x)

timm/models/efficientvit_msra.py

Lines changed: 5 additions & 2 deletions

@@ -18,7 +18,7 @@
 from timm.layers import SqueezeExcite, SelectAdaptivePool2d, trunc_normal_, _assert
 from ._builder import build_model_with_cfg
 from ._features import feature_take_indices
-from ._manipulate import checkpoint_seq
+from ._manipulate import checkpoint, checkpoint_seq
 from ._registry import register_model, generate_default_cfgs

@@ -510,7 +510,10 @@ def forward_intermediates(
         stages = self.stages[:max_index + 1]

         for feat_idx, stage in enumerate(stages):
-            x = stage(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(stage, x)
+            else:
+                x = stage(x)
             if feat_idx in take_indices:
                 intermediates.append(x)

timm/models/eva.py

Lines changed: 4 additions & 1 deletion

@@ -716,7 +716,10 @@ def forward_intermediates(
         else:
             blocks = self.blocks[:max_index + 1]
         for i, blk in enumerate(blocks):
-            x = blk(x, rope=rot_pos_embed)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(blk, x, rope=rot_pos_embed)
+            else:
+                x = blk(x, rope=rot_pos_embed)
             if i in take_indices:
                 intermediates.append(self.norm(x) if norm else x)

timm/models/hiera.py

Lines changed: 5 additions & 2 deletions

@@ -24,7 +24,7 @@
 # --------------------------------------------------------
 import math
 from functools import partial
-from typing import Callable, Dict, List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Tuple, Type, Union

 import torch
 import torch.nn as nn
@@ -719,7 +719,10 @@ def forward_intermediates(
         else:
             blocks = self.blocks[:max_index + 1]
         for i, blk in enumerate(blocks):
-            x = blk(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(blk, x)
+            else:
+                x = blk(x)
             if i in take_indices:
                 x_int = self.reroll(x, i, mask=mask)
                 intermediates.append(x_int.permute(0, 3, 1, 2) if output_fmt == 'NCHW' else x_int)

timm/models/levit.py

Lines changed: 5 additions & 2 deletions

@@ -34,7 +34,7 @@
 from timm.layers import to_ntuple, to_2tuple, get_act_layer, DropPath, trunc_normal_, ndgrid
 from ._builder import build_model_with_cfg
 from ._features import feature_take_indices
-from ._manipulate import checkpoint_seq
+from ._manipulate import checkpoint, checkpoint_seq
 from ._registry import generate_default_cfgs, register_model

 __all__ = ['Levit']
@@ -671,7 +671,10 @@ def forward_intermediates(
         else:
             stages = self.stages[:max_index + 1]
         for feat_idx, stage in enumerate(stages):
-            x = stage(x)
+            if self.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(stage, x)
+            else:
+                x = stage(x)
             if feat_idx in take_indices:
                 if self.use_conv:
                     intermediates.append(x)
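With these changes, models that expose `forward_intermediates` respect the existing gradient checkpointing toggle there as well, not just in the regular forward path. A usage sketch (model name and tensor shapes are illustrative, not part of this commit):

```python
import timm
import torch

model = timm.create_model('beit_base_patch16_224', pretrained=False)
model.set_grad_checkpointing(True)  # sets self.grad_checkpointing on the model

x = torch.randn(2, 3, 224, 224, requires_grad=True)
final, intermediates = model.forward_intermediates(x)

# backward now recomputes block activations instead of storing them all,
# trading extra compute for lower peak memory
loss = final.mean() + sum(t.mean() for t in intermediates)
loss.backward()
```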
