
Commit dca1f2c

Hanxian97 authored and facebook-github-bot committed
support skip atten in export
Summary:
Support export for llama model variants with attention layer skipping. We only need to specify the attention skip pattern via the "layer_types" field in config.json, e.g.:

"layer_types": [
  "full_attention",
  "full_attention",
  "full_attention",
  "skip_attention",
  "skip_attention",
  "skip_attention"
]

Differential Revision: D88399533
1 parent 4014597 commit dca1f2c
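
For reference, the sketch below shows how such a "layer_types" list could be read from a model's config.json and mapped to per-layer attention kinds. The summarize_layer_types helper and the config path are illustrative only; the actual wiring goes through ModelArgs and construct_transformer in llama_transformer.py.

import json

def summarize_layer_types(config_path: str) -> None:
    # Illustrative helper (not part of the repo): reports which layers the
    # "layer_types" entry would build with skip attention.
    with open(config_path) as f:
        config = json.load(f)
    layer_types = config.get("layer_types")
    if not layer_types:
        print("no layer_types entry: every layer keeps full attention")
        return
    for layer_id, kind in enumerate(layer_types):
        if kind == "skip_attention":
            print(f"layer {layer_id}: attention skipped (AttentionSkip)")
        else:
            print(f"layer {layer_id}: full attention")

# Example call, assuming a config.json containing the pattern from the summary:
# summarize_layer_types("config.json")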

File tree

1 file changed: +7 −0 lines


examples/models/llama/llama_transformer.py

Lines changed: 7 additions & 0 deletions
@@ -132,6 +132,9 @@ def forward(self, x, freqs_cos, freqs_sin, attn_options: ForwardOptions): # x:
             )
         if not isinstance(self.attention, AttentionSkip):
             h = x + h
+        else:
+            h = x
+            attn_options_update = None
 
         if hasattr(self, "block_sparse_moe"):
             out = h + self.block_sparse_moe(self.ffn_norm(h))

@@ -272,6 +275,10 @@ def construct_transformer(model_args: ModelArgs) -> Transformer:
                     norm_eps=model_args.norm_eps,
                 )
             )
+        elif model_args.layer_types and model_args.layer_types[layer_id] == "skip_attention":
+            attention = AttentionSkip()
+            transformer_block = TransformerBlock(model_args, attention)
+            layers.append(transformer_block)
         else:
             attention = cls(
                 model_args, layer_id, rope, **model_args.attention_kwargs
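
As a rough, self-contained illustration of the forward-path change in the first hunk: attention is still invoked per block, but when the block's attention module is the skip variant, the residual add of the attention output is bypassed and the hidden state flows through unchanged to the FFN. The classes below are simplified stand-ins, not the repo's AttentionSkip or TransformerBlock.

import torch
import torch.nn as nn

class SkipAttentionStandIn(nn.Module):
    # Stand-in for AttentionSkip: no attention is computed.
    def forward(self, x, *args, **kwargs):
        return x, None

class FullAttentionStandIn(nn.Module):
    # Toy single-head self-attention, no masking or rotary embeddings.
    def __init__(self, dim: int):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads=1, batch_first=True)

    def forward(self, x):
        out, _ = self.attn(x, x, x)
        return out, None

class BlockStandIn(nn.Module):
    def __init__(self, dim: int, skip_attention: bool):
        super().__init__()
        self.attention = SkipAttentionStandIn() if skip_attention else FullAttentionStandIn(dim)
        self.attention_norm = nn.LayerNorm(dim)
        self.ffn_norm = nn.LayerNorm(dim)
        self.ffn = nn.Sequential(nn.Linear(dim, dim), nn.SiLU(), nn.Linear(dim, dim))

    def forward(self, x):
        h, _ = self.attention(self.attention_norm(x))
        # Mirrors the branch added above: the residual add only applies when
        # attention was actually computed; otherwise x passes through as-is.
        if not isinstance(self.attention, SkipAttentionStandIn):
            h = x + h
        else:
            h = x
        return h + self.ffn(self.ffn_norm(h))

x = torch.randn(1, 4, 8)
for skip in (False, True):
    print(skip, BlockStandIn(dim=8, skip_attention=skip)(x).shape)  # both: torch.Size([1, 4, 8])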

0 commit comments
