fixed intermediate output indices

dillonalaird · rwightman · commit 63ee54853c1d · 2023-11-22T16:32:41.000-08:00
diff --git a/timm/models/fastvit.py b/timm/models/fastvit.py
@@ -1164,8 +1164,10 @@ def __init__(
 
         # For segmentation and detection, extract intermediate output
         if self.fork_feat:
-            # add a norm layer for each output
-            self.out_indices = [0, 2, 4, 6]
+            # Add a norm layer for each output. self.stages differs slightly from self.network
+            # in the original code: the PatchEmbed layer is part of self.stages in this code, whereas
+            # it was part of self.network in the original code, so we do not need to skip out indices.
+            self.out_indices = [0, 1, 2, 3]
             for i_emb, i_layer in enumerate(self.out_indices):
                 if i_emb == 0 and os.environ.get("FORK_LAST3", None):
                     """For RetinaNet, `start_level=1`. The first norm layer will not used.
@@ -1416,4 +1418,4 @@ def fastvit_ma36(pretrained=False, **kwargs):
         pos_embs=(None, None, None, partial(RepConditionalPosEnc, spatial_shape=(7, 7))),
         token_mixers=("repmixer", "repmixer", "repmixer", "attention")
     )
-    return _create_fastvit('fastvit_ma36', pretrained=pretrained, **dict(model_args, **kwargs))
+    return _create_fastvit('fastvit_ma36', pretrained=pretrained, **dict(model_args, **kwargs))