Commit b49f827

-- added attention pooling for AutoInt
-- added backbone features to ret_logits prediction
-- added AutoInt attention pooling test cases
1 parent b5d033a commit b49f827

6 files changed: +32 -7 lines changed

examples/to_test_regression.py

Lines changed: 2 additions & 1 deletion
@@ -65,7 +65,8 @@
 # )
 # # model_config.validate()
 # model_config = CategoryEmbeddingModelConfig(task="regression")
-model_config = AutoIntConfig(task="regression", deep_layers=True, embedding_dropout=0.2, batch_norm_continuous_input=True)
+model_config = AutoIntConfig(task="regression", deep_layers=True, embedding_dropout=0.2,
+    batch_norm_continuous_input=True, attention_pooling=True)
 trainer_config = TrainerConfig(checkpoints=None, max_epochs=25, gpus=1, profiler=None, fast_dev_run=False, auto_lr_find=True)
 # experiment_config = ExperimentConfig(
 #     project_name="DeepGMM_test",

pytorch_tabular/models/autoint/autoint.py

Lines changed: 12 additions & 1 deletion
@@ -72,11 +72,16 @@ def _build_network(self):
         )
         if self.hparams.has_residuals:
             self.V_res_embedding = torch.nn.Linear(
-                _curr_units, self.hparams.attn_embed_dim
+                _curr_units,
+                self.hparams.attn_embed_dim * self.hparams.num_attn_blocks
+                if self.hparams.attention_pooling
+                else self.hparams.attn_embed_dim,
             )
         self.output_dim = (
             self.hparams.continuous_dim + self.hparams.categorical_dim
         ) * self.hparams.attn_embed_dim
+        if self.hparams.attention_pooling:
+            self.output_dim = self.output_dim * self.hparams.num_attn_blocks

     def forward(self, x: Dict):
         # (B, N)
@@ -109,8 +114,14 @@ def forward(self, x: Dict):
         x = self.linear_layers(x)
         # (N, B, E*) --> E* is the Attn Dimention
         cross_term = self.attn_proj(x).transpose(0, 1)
+        if self.hparams.attention_pooling:
+            attention_ops = []
         for self_attn in self.self_attns:
             cross_term, _ = self_attn(cross_term, cross_term, cross_term)
+            if self.hparams.attention_pooling:
+                attention_ops.append(cross_term)
+        if self.hparams.attention_pooling:
+            cross_term = torch.cat(attention_ops, dim=-1)
         # (B, N, E*)
         cross_term = cross_term.transpose(0, 1)
         if self.hparams.has_residuals:
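To make the change above easier to follow, here is a minimal standalone sketch of the attention-pooling idea (not the library code): the output of every self-attention block is collected and concatenated along the embedding dimension, so the downstream head sees features from all blocks instead of only the last one. The module and its default sizes are hypothetical; only the names attn_embed_dim, num_attn_blocks and attention_pooling mirror the hparams in the diff.

import torch
import torch.nn as nn

class AttentionPoolingStack(nn.Module):
    """Hypothetical stand-in for the attention stack: concatenate every
    block's output (attention pooling) instead of keeping only the last."""

    def __init__(self, attn_embed_dim=16, num_heads=2, num_attn_blocks=3, attention_pooling=True):
        super().__init__()
        self.attention_pooling = attention_pooling
        self.self_attns = nn.ModuleList(
            [nn.MultiheadAttention(attn_embed_dim, num_heads) for _ in range(num_attn_blocks)]
        )

    def forward(self, x):
        # x: (N, B, E) -- N field embeddings, batch B, embedding dim E
        attention_ops = []
        cross_term = x
        for self_attn in self.self_attns:
            cross_term, _ = self_attn(cross_term, cross_term, cross_term)
            attention_ops.append(cross_term)
        if self.attention_pooling:
            # (N, B, E * num_attn_blocks): every block contributes features
            return torch.cat(attention_ops, dim=-1)
        return cross_term  # (N, B, E): only the last block, as before

# Shape check: 8 fields, batch of 4, E=16, 3 blocks -> last dim 16 * 3 = 48
print(AttentionPoolingStack()(torch.randn(8, 4, 16)).shape)  # torch.Size([8, 4, 48])

This also explains the dimension changes in _build_network: with pooling enabled the flattened output grows by a factor of num_attn_blocks, which is why both V_res_embedding and output_dim are scaled accordingly.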

pytorch_tabular/models/autoint/config.py

Lines changed: 9 additions & 2 deletions
@@ -39,7 +39,8 @@ class AutoIntConfig(ModelConfig):
             Defaults to ReLU
         dropout (float): probability of an classification element to be zeroed in the deep MLP. Defaults to 0.0
         use_batch_norm (bool): Flag to include a BatchNorm layer after each Linear Layer+DropOut. Defaults to False
-        batch_norm_continuous_input (bool): If True, we will normalize the contiinuous layer by passing it through a BatchNorm layer
+        batch_norm_continuous_input (bool): If True, we will normalize the contiinuous layer by passing it through a BatchNorm layer. Defaults to False
+        attention_pooling (bool): If True, will combine the attention outputs of each block for final prediction. Defaults to False
         initialization (str): Initialization scheme for the linear layers. Defaults to `kaiming`.
             Choices are: [`kaiming`,`xavier`,`random`].
@@ -122,7 +123,13 @@ class AutoIntConfig(ModelConfig):
     batch_norm_continuous_input: bool = field(
         default=False,
         metadata={
-            "help": "If True, we will normalize the continuous layer by passing it through a BatchNorm layer"
+            "help": "If True, we will normalize the continuous layer by passing it through a BatchNorm layer. Defaults to False"
+        },
+    )
+    attention_pooling: bool = field(
+        default=False,
+        metadata={
+            "help": "If True, will combine the attention outputs of each block for final prediction. Defaults to False"
         },
     )
     initialization: str = field(
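As a quick reference, a hedged end-to-end sketch of enabling the new flag, assuming the library's usual DataConfig/TrainerConfig/OptimizerConfig/TabularModel entry points; the dataframe and column names are placeholders, not part of this commit.

import pandas as pd
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models import AutoIntConfig

# Placeholder data: two continuous features and a regression target
train_df = pd.DataFrame({"feat_1": range(100), "feat_2": range(100), "target": range(100)})

data_config = DataConfig(target=["target"], continuous_cols=["feat_1", "feat_2"])
model_config = AutoIntConfig(task="regression", attention_pooling=True)  # new flag from this commit
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=OptimizerConfig(),
    trainer_config=TrainerConfig(max_epochs=3, checkpoints=None, gpus=0),
)
# tabular_model.fit(train=train_df)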

pytorch_tabular/models/base_model.py

Lines changed: 1 addition & 0 deletions
@@ -73,6 +73,7 @@ def _setup_metrics(self):
                 raise e
         else:
             self.metrics = self.custom_metrics
+            self.hparams.metrics = [m.__name__ for m in self.custom_metrics]

     def calculate_loss(self, y, y_hat, tag):
         if (self.hparams.task == "regression") and (self.hparams.output_dim > 1):

pytorch_tabular/tabular_model.py

Lines changed: 4 additions & 2 deletions
@@ -627,8 +627,8 @@ def predict(
             y_hat, ret_value = self.model.predict(batch, ret_model_output=True)
             if ret_logits:
                 for k, v in ret_value.items():
-                    if k == "backbone_features":
-                        continue
+                    # if k == "backbone_features":
+                    #     continue
                     logits_predictions[k].append(v.detach().cpu())
             point_predictions.append(y_hat.detach().cpu())
             if is_probabilistic:
@@ -751,6 +751,8 @@ def load_from_checkpoint(cls, dir: str):
         tabular_model.model = model
         tabular_model.datamodule = datamodule
         tabular_model.callbacks = callbacks
+        #TODO max_epochs and min_epochs, make it optional
+        #TODO custom model and custom metrics need to be dealt with separately
         tabular_model._prepare_trainer()
         tabular_model.trainer.model = model
         tabular_model.logger = logger
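Given the predict change above (backbone_features is no longer skipped when ret_logits is requested), a hedged usage sketch of what the commit message calls "backbone features also to ret_logits prediction"; the exact column naming in the returned dataframe is an assumption, not confirmed by the diff.

# Assumes `tabular_model` has been fit and `test` is a pandas DataFrame
pred_df = tabular_model.predict(test, ret_logits=True)

# Before this commit only logits were returned; now the values collected under
# the "backbone_features" key should also be present in the output
backbone_cols = [c for c in pred_df.columns if "backbone_features" in c]
print(len(backbone_cols))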

tests/test_autoint.py

Lines changed: 4 additions & 1 deletion
@@ -30,6 +30,7 @@
 @pytest.mark.parametrize("target_range", [True, False])
 @pytest.mark.parametrize("deep_layers", [True, False])
 @pytest.mark.parametrize("batch_norm_continuous_input", [True, False])
+@pytest.mark.parametrize("attention_pooling", [True, False])
 def test_regression(
     regression_data,
     multi_target,
@@ -39,7 +40,8 @@ def test_regression(
     normalize_continuous_features,
     target_range,
     deep_layers,
-    batch_norm_continuous_input
+    batch_norm_continuous_input,
+    attention_pooling
 ):
     (train, test, target) = regression_data
     if len(continuous_cols) + len(categorical_cols) == 0:
@@ -65,6 +67,7 @@ def test_regression(
     model_config_params["target_range"] = _target_range
     model_config_params["deep_layers"] = deep_layers
     model_config_params["batch_norm_continuous_input"] = batch_norm_continuous_input
+    model_config_params["attention_pooling"] = attention_pooling
     model_config = AutoIntConfig(**model_config_params)
     trainer_config = TrainerConfig(
         max_epochs=3, checkpoints=None, early_stopping=None, gpus=0
