
Commit d2252aa

Add Gemma-3n (#50)
* Initial commit for Gemma-3n.
* Fixes to Gemma implementation.
* Add Gemma to __init__.
* Updated requirements for Gemma.
* Minor changes to benchmark and evaluate.
* Default for layer_idx set to None for set_mlp_train and set_mlp_inference in modeling_skip.
* Updated activation capture to work with refactor.
* Activation capture fix.

Signed-off-by: Kira Selby <kaselby@uwaterloo.ca>
1 parent 491a7cc commit d2252aa

15 files changed: +682 -32 lines

benchmark.py

Lines changed: 5 additions & 0 deletions
@@ -23,6 +23,8 @@ def parse_args() -> argparse.Namespace:
                         help='Verbose output')
     parser.add_argument('--config', type=str, default='configs/llama_skip_causal_3b.json',
                         help='Config file')
+    parser.add_argument('--max_response_length', type=int, default=-1,
+                        help='Maximum response tokens per prompt.')
     return parser.parse_args()
 
 
@@ -400,6 +402,9 @@ def main():
 
     # Get test prompts
     test_prompts = get_diverse_test_prompts()
+    if args.max_response_length > 0:
+        for prompt in test_prompts:
+            prompt['max_tokens'] = min(prompt['max_tokens'], args.max_response_length)
 
     print(f"\n🎯 Running comprehensive benchmark with {len(test_prompts)} diverse prompts...")
     print(f"📝 Test prompts: {[p['description'] for p in test_prompts]}")
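A minimal sketch of what the new --max_response_length flag does: it clamps each prompt's generation budget. The prompt dicts below are hypothetical; only the 'max_tokens' field mirrors the diff above.

# Hypothetical prompt dicts; only 'max_tokens' matters here.
test_prompts = [
    {"description": "short factual question", "max_tokens": 128},
    {"description": "long-form story", "max_tokens": 2048},
]

max_response_length = 256  # e.g. python benchmark.py --max_response_length 256
if max_response_length > 0:
    for prompt in test_prompts:
        prompt["max_tokens"] = min(prompt["max_tokens"], max_response_length)

print([p["max_tokens"] for p in test_prompts])  # [128, 256]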
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
{
  "_name_or_path": "google/gemma-3n-E2B",
  "sparsity": 0.3,
  "architectures": ["Gemma3nSkipConnectionForCausalLM"],
  "activation_sparsity_pattern": [0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95,
                                  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                                  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
  "altup_active_idx": 0,
  "altup_coef_clip": 120.0,
  "altup_correct_scale": true,
  "altup_lr_multiplier": 1.0,
  "altup_num_inputs": 4,
  "attention_bias": false,
  "attention_dropout": 0.0,
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2048,
  "hidden_size_per_layer_input": 256,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "laurel_rank": 64,
  "layer_types": ["sliding_attention", "sliding_attention", "sliding_attention", "sliding_attention", "full_attention",
                  "sliding_attention", "sliding_attention", "sliding_attention", "sliding_attention", "full_attention",
                  "sliding_attention", "sliding_attention", "sliding_attention", "sliding_attention", "full_attention",
                  "sliding_attention", "sliding_attention", "sliding_attention", "sliding_attention", "full_attention",
                  "sliding_attention", "sliding_attention", "sliding_attention", "sliding_attention", "full_attention",
                  "sliding_attention", "sliding_attention", "sliding_attention", "sliding_attention", "full_attention"],
  "max_position_embeddings": 32768,
  "model_type": "gemma3n-skip",
  "num_attention_heads": 8,
  "num_hidden_layers": 30,
  "num_key_value_heads": 2,
  "num_kv_shared_layers": 10,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_local_base_freq": 10000.0,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": 512,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.0.dev0",
  "use_cache": true,
  "vocab_size": 262400,
  "vocab_size_per_layer_input": 262144
}
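A quick sanity check for this config is to load it through the AutoConfig registration added in src/models/gemma3n/__init__.py below. A minimal sketch, assuming the JSON above is saved under a hypothetical path in configs/:

from transformers import AutoConfig
import src.models.gemma3n  # noqa: F401 -- registers "gemma3n-skip" with AutoConfig

# Hypothetical file path for the JSON above.
cfg = AutoConfig.from_pretrained("configs/gemma3n_skip_causal_e2b.json")
assert cfg.model_type == "gemma3n-skip"
# One sparsity value and one attention type per decoder layer.
assert len(cfg.activation_sparsity_pattern) == cfg.num_hidden_layers == 30
assert len(cfg.layer_types) == cfg.num_hidden_layers
print(cfg.sparsity, cfg.num_kv_shared_layers)  # 0.3 10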

evaluate.py

Lines changed: 2 additions & 1 deletion
@@ -60,8 +60,9 @@ def main():
 
     wrapped_model = HFLM(
         pretrained=model,
+        backend="causal",
         batch_size=args.batch_size,
-        device=device
+        device=device,
     )
 
     logging.info("Beginning evaluation...")
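For context, HFLM is lm-evaluation-harness's wrapper around a Hugging Face model; pinning backend="causal" avoids architecture auto-detection for the custom skip classes. A rough standalone sketch of the same wrapping, with a tiny public model standing in for the preloaded skip model and an arbitrary task/limit choice:

import lm_eval
from lm_eval.models.huggingface import HFLM
from transformers import AutoModelForCausalLM

# Stand-in model; evaluate.py wraps the already-loaded skip-connection model instead.
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
wrapped_model = HFLM(
    pretrained=model,
    backend="causal",   # skip auto-detection, as in the diff above
    batch_size=8,
    device="cpu",
)
results = lm_eval.simple_evaluate(model=wrapped_model, tasks=["lambada_openai"], limit=10)
print(results["results"])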

requirements.txt

Lines changed: 4 additions & 2 deletions
@@ -1,6 +1,6 @@
 # Core ML/AI packages
 # conda install pytorch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 pytorch-cuda=12.4 -c pytorch -c nvidia
-transformers==4.52.4
+transformers==4.53.0
 numpy
 psutil
 optimum
@@ -11,4 +11,6 @@ datasets
 sentencepiece
 protobuf
 wandb
-ninja
+ninja
+timm
+pillow

src/activation_capture.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ class ActivationCaptureDefault(ActivationCapture):
     has_up_proj: bool = True
 
     def get_layers(self):
-        return self.model.model.layers
+        return self.model.get_decoder().layers
 
     def _create_mlp_hook(self, layer_idx, proj_type):
        def hook(module, input, output):

src/modeling_skip.py

Lines changed: 19 additions & 19 deletions
@@ -52,14 +52,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return self.up(self.down(x))
 
 class SkipMLP(nn.Module):
-    def __init__(self, hidden_size: int, intermediate_size: int, sparsity: float, bias: bool = False):
+    def __init__(self, hidden_size: int, intermediate_size: int, sparsity: float, bias: bool = False, act_fn="silu"):
         super().__init__()
         self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
         self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
         self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=bias)
         self.sparsity = sparsity
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
+        self.act_fn = act_fn
 
         # Initialize mask but defer WeightCache creation until post_init
         self.init_mask = torch.ones(intermediate_size, dtype=torch.bool)
@@ -101,7 +102,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             self.weight_cache.get_active_down_weight(),  # type: ignore
             self.down_proj_buffer,
             self.combined_proj_buffer,
-            "silu"
+            self.act_fn
         )
         return out
 
@@ -110,16 +111,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 class SkipDecoderLayer(ABC, GradientCheckpointingLayer):
     def __init__(self, config: PretrainedConfig, layer_idx: int):
         super().__init__()
+        self.config = config
         self.hidden_size = config.hidden_size
         self.layer_idx = layer_idx
         self.sparsity = config.sparsity
 
         self._init_components(config, layer_idx)
 
-        self.lora_size = int(config.intermediate_size * 0.04)
+        intermediate_size = config.intermediate_size[layer_idx] if isinstance(config.intermediate_size, list) \
+            else config.intermediate_size
+        self.lora_size = int(intermediate_size * 0.04)
         self.mlp_lora_proj = FastLoRAProjection(
             config.hidden_size,
-            config.intermediate_size,
+            intermediate_size,
             self.lora_size
         )
 
@@ -128,20 +132,20 @@ def __init__(self, config: PretrainedConfig, layer_idx: int):
         # Only initialize predictor training components if explicitly enabled
         if self.is_training_config:
             # Standard MLP for ground truth collection during training
-            self._set_mlp_train(config)
+            self._set_mlp_train(config, layer_idx)
         else:
-            self._set_mlp_inference(config)
+            self._set_mlp_inference(config, layer_idx)
 
     @abstractmethod
     def _init_components(self, config, layer_idx):
         pass
 
     @abstractmethod
-    def _set_mlp_train(self, config):
+    def _set_mlp_train(self, config, layer_idx=None):
         pass
 
     @abstractmethod
-    def _set_mlp_inference(self, config):
+    def _set_mlp_inference(self, config, layer_idx=None):
         pass
 
     @property
@@ -199,6 +203,12 @@ def forward(
         return outputs
 
 
+'''
+Note:
+    Now that the intermediate losses have been removed, almost all the actual changes are confined to SkipDecoderLayer and SkipMLP.
+    SkipConnectionModel/SkipConnectionForCausalLM may not even be necessary. It's possible at some point in the future we might want
+    to attempt a refactor here to simply extend from e.g. LlamaModel and just override the initialization.
+'''
 def build_skip_connection_model(pretrained_model_class: type[PreTrainedModel]) -> type[PreTrainedModel]:
     class SkipConnectionModel(ABC, pretrained_model_class):
         def __init__(self, config: PretrainedConfig):
@@ -336,17 +346,7 @@ def forward(
                 hidden_states=all_hidden_states,  # type: ignore
                 attentions=all_self_attns,
             )
-
-        @abstractmethod
-        def _update_causal_mask(
-            self,
-            attention_mask: Union[torch.Tensor, "BlockMask"],  # type: ignore
-            input_tensor: torch.Tensor,
-            cache_position: torch.Tensor,
-            past_key_values: Cache,
-            output_attentions: bool = False,
-        ):
-            pass
+
     return SkipConnectionModel
 
 
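The main wrinkle Gemma-3n adds in this file is that config.intermediate_size may be a per-layer list rather than a single int, so the LoRA predictor dimension has to be resolved per layer (the isinstance check in the diff above). A minimal sketch of that resolution logic in isolation, with illustrative values:

# Mirrors the per-layer intermediate_size resolution added above; values are illustrative.
def resolve_intermediate_size(intermediate_size, layer_idx):
    return intermediate_size[layer_idx] if isinstance(intermediate_size, list) else intermediate_size

scalar_cfg = 8192                     # e.g. the Gemma-3n E2B config in this commit
list_cfg = [8192] * 10 + [4096] * 20  # hypothetical per-layer variant

for layer_idx in (0, 15):
    size = resolve_intermediate_size(list_cfg, layer_idx)
    lora_size = int(size * 0.04)       # same 4% rule used for mlp_lora_proj
    print(layer_idx, size, lora_size)  # 0 8192 327, then 15 4096 163

print(resolve_intermediate_size(scalar_cfg, 0))  # 8192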

src/models/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -2,4 +2,5 @@
 from . import qwen2
 from . import mistral
 from . import phi3
+from . import gemma3n
 # from . import dia

src/models/gemma3n/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+from . import configuration_gemma_skip
+from . import modelling_gemma_skip
+
+from transformers import AutoConfig, AutoModelForCausalLM
+from .configuration_gemma_skip import Gemma3nSkipConnectionConfig
+from .modelling_gemma_skip import Gemma3nSkipConnectionForCausalLM
+AutoConfig.register("gemma3n-skip", Gemma3nSkipConnectionConfig)
+AutoModelForCausalLM.register(Gemma3nSkipConnectionConfig, Gemma3nSkipConnectionForCausalLM)
+
+__all__ = ["configuration_gemma_skip", "modelling_gemma_skip"]
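With these registrations in place, the skip variant can be constructed through the regular Auto classes. A hedged sketch (the config path is hypothetical, and from_config gives a randomly initialized model):

from transformers import AutoConfig, AutoModelForCausalLM
import src.models.gemma3n  # noqa: F401 -- performs the registrations shown above

cfg = AutoConfig.from_pretrained("configs/gemma3n_skip_causal_e2b.json")  # hypothetical path
model = AutoModelForCausalLM.from_config(cfg)  # dispatches to Gemma3nSkipConnectionForCausalLM
print(type(model).__name__)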
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+from src.activation_capture import ActivationCaptureDefault
+
+
+class ActivationCaptureGemma3n(ActivationCaptureDefault):
+    """Helper class to capture activations from model layers."""
+
+    def _register_gate_hook(self, layer_idx, layer):
+        handle = layer.mlp.act_fn.register_forward_hook(
+            self._create_mlp_hook(layer_idx, 'gate')
+        )
+        return handle
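The 'gate' capture for Gemma-3n hooks the output of layer.mlp.act_fn, i.e. the activation applied to the gate projection, rather than gate_proj itself. A standalone sketch of the same forward-hook pattern on a toy module (the class and field names below are illustrative, not the project's):

import torch
from torch import nn

class ToyMLP(nn.Module):
    # Toy stand-in for layer.mlp; only gate_proj -> act_fn matters for the hook.
    def __init__(self, d=8, h=16):
        super().__init__()
        self.gate_proj = nn.Linear(d, h)
        self.act_fn = nn.GELU()

    def forward(self, x):
        return self.act_fn(self.gate_proj(x))

captured = {}

def make_hook(layer_idx, proj_type):
    def hook(module, inputs, output):
        captured[(layer_idx, proj_type)] = output.detach()
    return hook

mlp = ToyMLP()
handle = mlp.act_fn.register_forward_hook(make_hook(0, "gate"))
mlp(torch.randn(2, 8))
assert captured[(0, "gate")].shape == (2, 16)
handle.remove()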
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+from transformers import Gemma3nTextConfig
+from src.configuration_skip import build_skip_config
+
+Gemma3nSkipConnectionConfig = build_skip_config(Gemma3nTextConfig, "gemma3n-skip")
