@@ -1164,8 +1164,10 @@ def __init__(
11641164
11651165 # For segmentation and detection, extract intermediate output
11661166 if self .fork_feat :
1167- # add a norm layer for each output
1168- self .out_indices = [0 , 2 , 4 , 6 ]
1167+ # Add a norm layer for each output. self.stages differs slightly from self.network in the
1168+ # original code: there, the PatchEmbed layers were separate entries of self.network (hence out
1169+ # indices 0, 2, 4, 6); here each PatchEmbed layer is part of its stage in self.stages, so the
1169+ # out indices are consecutive and we do not need to skip any.
1170+ self .out_indices = [0 , 1 , 2 , 3 ]
11691171 for i_emb , i_layer in enumerate (self .out_indices ):
11701172 if i_emb == 0 and os .environ .get ("FORK_LAST3" , None ):
11711173 """For RetinaNet, `start_level=1`. The first norm layer will not used.
@@ -1416,4 +1418,4 @@ def fastvit_ma36(pretrained=False, **kwargs):
14161418 pos_embs = (None , None , None , partial (RepConditionalPosEnc , spatial_shape = (7 , 7 ))),
14171419 token_mixers = ("repmixer" , "repmixer" , "repmixer" , "attention" )
14181420 )
1419- return _create_fastvit ('fastvit_ma36' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
1421+ return _create_fastvit ('fastvit_ma36' , pretrained = pretrained , ** dict (model_args , ** kwargs ))
0 commit comments