
Commit 75dc0cd

committed
- added shared embeddings for TabTransformer
- added extract_embeddings for NODE, AutoInt
- updated README
1 parent d6e80c5 commit 75dc0cd

7 files changed: +147 −38 lines changed

README.md

Lines changed: 11 additions & 5 deletions

@@ -69,6 +69,7 @@ For complete Documentation with tutorials visit []
 * [TabNet: Attentive Interpretable Tabular Learning](https://arxiv.org/abs/1908.07442) is another model coming out of Google Research which uses Sparse Attention in multiple steps of decision making to model the output.
 * [Mixture Density Networks](https://publications.aston.ac.uk/id/eprint/373/1/NCRG_94_004.pdf) is a regression model which uses gaussian components to approximate the target function and provide a probabilistic prediction out of the box.
 * [AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks](https://arxiv.org/abs/1810.11921) is a model which tries to learn interactions between the features in an automated way and create a better representation and then use this representation in downstream task
+* [TabTransformer](https://arxiv.org/abs/2012.06678) is an adaptation of the Transformer model for Tabular Data which creates contextual representations for categorical features.

 To implement new models, see the [How to implement new models tutorial](https://github.com/manujosephv/pytorch_tabular/blob/main/docs/04-Implementing%20New%20Architectures.ipynb). It covers basic as well as advanced architectures.

@@ -112,9 +113,9 @@ loaded_model = TabularModel.load_from_checkpoint("examples/basic")
 ```
 ## Blogs

-[PyTorch Tabular – A Framework for Deep Learning for Tabular Data](https://deep-and-shallow.com/2021/01/27/pytorch-tabular-a-framework-for-deep-learning-for-tabular-data/)
-[Neural Oblivious Decision Ensembles(NODE) – A State-of-the-Art Deep Learning Algorithm for Tabular Data](https://deep-and-shallow.com/2021/02/25/neural-oblivious-decision-ensemblesnode-a-state-of-the-art-deep-learning-algorithm-for-tabular-data/)
-[Mixture Density Networks: Probabilistic Regression for Uncertainty Estimation](https://deep-and-shallow.com/2021/03/20/mixture-density-networks-probabilistic-regression-for-uncertainty-estimation/)
+- [PyTorch Tabular – A Framework for Deep Learning for Tabular Data](https://deep-and-shallow.com/2021/01/27/pytorch-tabular-a-framework-for-deep-learning-for-tabular-data/)
+- [Neural Oblivious Decision Ensembles (NODE) – A State-of-the-Art Deep Learning Algorithm for Tabular Data](https://deep-and-shallow.com/2021/02/25/neural-oblivious-decision-ensemblesnode-a-state-of-the-art-deep-learning-algorithm-for-tabular-data/)
+- [Mixture Density Networks: Probabilistic Regression for Uncertainty Estimation](https://deep-and-shallow.com/2021/03/20/mixture-density-networks-probabilistic-regression-for-uncertainty-estimation/)

 ## Future Roadmap(Contributions are Welcome)

@@ -124,8 +125,13 @@ loaded_model = TabularModel.load_from_checkpoint("examples/basic")
 4. Add Fourier Encoding for cyclic time variables
 5. Integrate Optuna Hyperparameter Tuning
 6. Add Text and Image Modalities for mixed modal problems
-7. Integrate Wide and Deep model
-8. Integrate TabTransformer
+7. Add Variable Importance
+8. Integrate SHAP for interpretability
+**DL Models**
+9. [DNF-Net: A Neural Architecture for Tabular Data](https://www.semanticscholar.org/paper/DNF-Net%3A-A-Neural-Architecture-for-Tabular-Data-Abutbul-Elidan/99c49f3a917815eed2144bfb5d064623ff09ade5)
+10. [Attention augmented differentiable forest for tabular data](https://www.semanticscholar.org/paper/Attention-augmented-differentiable-forest-for-data-Chen/57990b40affc5f34f4029dab39bc78e44e7d3b10)
+11. [XBNet: An Extremely Boosted Neural Network](https://arxiv.org/abs/2106.05239v2)
+12. [Revisiting Deep Learning Models for Tabular Data](https://arxiv.org/abs/2106.11959)
 ## Citation
 If you use PyTorch Tabular for a scientific publication, we would appreciate citations to the published software and the following paper:


examples/to_test_classification.py

Lines changed: 12 additions & 9 deletions

@@ -90,19 +90,22 @@
     normalize_continuous_features=False,
 )
 # model_config = CategoryEmbeddingModelConfig(task="classification", metrics=["f1","accuracy"], metrics_params=[{"num_classes":num_classes},{}])
-model_config = NodeConfig(
-    task="classification",
-    depth=4,
-    num_trees=1024,
-    input_dropout=0.0,
-    metrics=["f1", "accuracy"],
-    metrics_params=[{"num_classes": num_classes, "average": "macro"}, {}],
-)
-# model_config = TabTransformerConfig(
+# model_config = NodeConfig(
 #     task="classification",
+#     depth=4,
+#     num_trees=1024,
+#     input_dropout=0.0,
 #     metrics=["f1", "accuracy"],
 #     metrics_params=[{"num_classes": num_classes, "average": "macro"}, {}],
 # )
+model_config = TabTransformerConfig(
+    task="classification",
+    metrics=["f1", "accuracy"],
+    share_embedding=True,
+    share_embedding_strategy="fraction",
+    shared_embedding_fraction=0.25,
+    metrics_params=[{"num_classes": num_classes, "average": "macro"}, {}],
+)
 trainer_config = TrainerConfig(gpus=-1, auto_select_gpus=True, fast_dev_run=False, max_epochs=5, batch_size=512)
 experiment_config = ExperimentConfig(project_name="PyTorch Tabular Example",
                                      run_name="node_forest_cov",
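For orientation, here is a minimal end-to-end sketch (not part of this commit) of how the new shared-embedding options plug into TabularModel; the DataConfig column names and the `train` DataFrame are placeholders:

```python
# Minimal sketch, assuming the standard pytorch_tabular workflow; column names
# and the `train` DataFrame are hypothetical.
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models import TabTransformerConfig

data_config = DataConfig(
    target=["target"],                    # hypothetical target column
    continuous_cols=["num_1", "num_2"],   # hypothetical continuous columns
    categorical_cols=["cat_1", "cat_2"],  # hypothetical categorical columns
)
model_config = TabTransformerConfig(
    task="classification",
    share_embedding=True,                 # turn on the new shared embeddings
    share_embedding_strategy="fraction",  # reserve a slice of each embedding
    shared_embedding_fraction=0.25,       # 25% of input_embed_dim is shared per column
)
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=OptimizerConfig(),
    trainer_config=TrainerConfig(max_epochs=5, batch_size=512),
)
tabular_model.fit(train=train)            # `train` is a pandas DataFrame
```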

pytorch_tabular/models/autoint/autoint.py

Lines changed: 6 additions & 0 deletions

@@ -165,3 +165,9 @@ def forward(self, x: Dict):
                 y_min, y_max = self.hparams.target_range[i]
                 y_hat[:, i] = y_min + nn.Sigmoid()(y_hat[:, i]) * (y_max - y_min)
         return {"logits": y_hat, "backbone_features": x}
+
+    def extract_embedding(self):
+        if len(self.hparams.categorical_cols) > 0:
+            return self.backbone.cat_embedding_layers
+        else:
+            raise ValueError("Model has been trained with no categorical feature and therefore can't be used as a Categorical Encoder")
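The new `extract_embedding` hook makes a trained AutoInt model usable as a categorical encoder. Below is a hedged sketch of reading the learned embeddings back out; it assumes `tabular_model` is a fitted TabularModel that exposes the underlying network as `tabular_model.model` and the merged config as `tabular_model.config`:

```python
# Sketch only: pull the learned categorical embeddings out of a trained AutoInt model.
embedding_layers = tabular_model.model.extract_embedding()

# One nn.Embedding per categorical column, in the order of categorical_cols.
for col, layer in zip(tabular_model.config.categorical_cols, embedding_layers):
    print(col, tuple(layer.weight.shape))  # (cardinality, embedding_dim)
```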

pytorch_tabular/models/node/node_model.py

Lines changed: 7 additions & 0 deletions

@@ -142,3 +142,10 @@ def forward(self, x: Dict):
                 y_min, y_max = self.hparams.target_range[i]
                 y_hat[:, i] = y_min + nn.Sigmoid()(y_hat[:, i]) * (y_max - y_min)
         return {"logits": y_hat, "backbone_features": x}
+
+    def extract_embedding(self):
+        if self.hparams.embed_categorical:
+            if self.embedding_cat_dim != 0:
+                return self.embedding_layers
+        else:
+            raise ValueError("Model has been trained with no categorical feature and therefore can't be used as a Categorical Encoder")
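Unlike AutoInt, NODE only builds its own embedding layers when categorical embedding is switched on, so `extract_embedding` is only meaningful for such a configuration. A hedged sketch of the relevant config, with all other NodeConfig parameters left at their defaults:

```python
# Sketch only: NODE must be configured to embed categorical features for
# extract_embedding to return anything; otherwise it raises ValueError.
from pytorch_tabular.models import NodeConfig

model_config = NodeConfig(
    task="classification",
    embed_categorical=True,  # build nn.Embedding layers for categorical columns
)
```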

pytorch_tabular/models/tab_transformer/components.py

Lines changed: 31 additions & 3 deletions

@@ -44,9 +44,37 @@ def forward(self, x):
         out = rearrange(out, "b h n d -> b n (h d)", h=h)
         return self.to_out(out)

-
-# transformer
-
+# Shamelessly copied with slight adaptation from https://github.com/jrzaurin/pytorch-widedeep/blob/b487b06721c5abe56ac68c8a38580b95e0897fd4/pytorch_widedeep/models/tab_transformer.py
+class SharedEmbeddings(nn.Module):
+    def __init__(
+        self,
+        num_embed: int,
+        embed_dim: int,
+        add_shared_embed: bool = False,
+        frac_shared_embed: float = 0.25,
+    ):
+        super(SharedEmbeddings, self).__init__()
+        assert (
+            frac_shared_embed < 1
+        ), "'frac_shared_embed' must be less than 1"
+
+        self.add_shared_embed = add_shared_embed
+        self.embed = nn.Embedding(num_embed, embed_dim, padding_idx=0)
+        self.embed.weight.data.clamp_(-2, 2)
+        if add_shared_embed:
+            col_embed_dim = embed_dim
+        else:
+            col_embed_dim = int(embed_dim * frac_shared_embed)
+        self.shared_embed = nn.Parameter(torch.empty(1, col_embed_dim).uniform_(-1, 1))
+
+    def forward(self, X: torch.Tensor) -> torch.Tensor:
+        out = self.embed(X)
+        shared_embed = self.shared_embed.expand(out.shape[0], -1)
+        if self.add_shared_embed:
+            out += shared_embed
+        else:
+            out[:, : shared_embed.shape[1]] = shared_embed
+        return out

 class TransformerEncoderBlock(nn.Module):
     def __init__(
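A hedged usage sketch of `SharedEmbeddings` for a single column (not part of the diff; it assumes the class is importable from its module path):

```python
# Sketch only: what SharedEmbeddings does for one categorical column with
# 10 categories and an 8-dimensional embedding.
import torch
from pytorch_tabular.models.tab_transformer.components import SharedEmbeddings

x = torch.tensor([1, 3, 7])  # encoded category ids for 3 rows

# "fraction" strategy: the first 25% of each embedding (2 of 8 dims here)
# is overwritten by a column-level vector shared across every category.
frac = SharedEmbeddings(num_embed=10, embed_dim=8, add_shared_embed=False, frac_shared_embed=0.25)
out = frac(x)   # shape (3, 8); out[:, :2] is identical for all three rows

# "add" strategy: a full 8-dim column-level vector is added to each value embedding.
add = SharedEmbeddings(num_embed=10, embed_dim=8, add_shared_embed=True, frac_shared_embed=0.25)
out = add(x)    # shape (3, 8)
```

The `fraction` strategy trades a slice of every value embedding for a column identifier, while `add` keeps the full value embedding and superimposes the column vector on top of it.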

pytorch_tabular/models/tab_transformer/config.py

Lines changed: 25 additions & 6 deletions

@@ -54,6 +54,31 @@ class TabTransformerConfig(ModelConfig):
             "help": "The embedding dimension for the input categorical features. Defaults to 32"
         },
     )
+    embedding_dropout: float = field(
+        default=0.1,
+        metadata={
+            "help": "Dropout to be applied to the Categorical Embedding. Defaults to 0.1"
+        },
+    )
+    share_embedding: bool = field(
+        default=False,
+        metadata={
+            "help": "The flag turns on shared embeddings in the input embedding process. The key idea here is to have an embedding for the feature as a whole along with embeddings for each unique value of that column. For more details refer to Appendix A of the TabTransformer paper. Defaults to False"
+        },
+    )
+    share_embedding_strategy: Optional[str] = field(
+        default="fraction",
+        metadata={
+            "help": "There are two strategies for adding shared embeddings. 1. `add` - A separate embedding for the feature is added to the embedding of the unique values of the feature. 2. `fraction` - A fraction of the input embedding is reserved for the shared embedding of the feature. Defaults to fraction.",
+            "choices": ["add", "fraction"],
+        },
+    )
+    shared_embedding_fraction: float = field(
+        default=0.25,
+        metadata={
+            "help": "Fraction of the input_embed_dim to be reserved by the shared embedding. Should be less than one. Defaults to 0.25"
+        },
+    )
     num_heads: int = field(
         default=8,
         metadata={
@@ -72,12 +97,6 @@ class TabTransformerConfig(ModelConfig):
             "help": "The number of hidden units in the Multi-Headed Attention layers. Defaults to None and will be same as input_dim."
         },
     )
-    embedding_dropout: float = field(
-        default=0.1,
-        metadata={
-            "help": "Dropout to be applied to the Categorical Embedding. Defaults to 0.1"
-        },
-    )
    attn_dropout: float = field(
        default=0.1,
        metadata={
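A hedged sketch of choosing the alternative `add` strategy (not in the diff); with `add`, the shared vector spans the full `input_embed_dim`, so `shared_embedding_fraction` is not used. It assumes the config is exported from `pytorch_tabular.models`, as the example script does:

```python
# Sketch only: shared embeddings with the `add` strategy; the column-level
# vector is added to each value's embedding rather than replacing a slice of it.
from pytorch_tabular.models import TabTransformerConfig

model_config = TabTransformerConfig(
    task="classification",
    share_embedding=True,
    share_embedding_strategy="add",
)
```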

pytorch_tabular/models/tab_transformer/tab_transformer.py

Lines changed: 55 additions & 15 deletions

@@ -1,7 +1,16 @@
 # Pytorch Tabular
 # Author: Manu Joseph <manujoseph@gmail.com>
 # For license information, see LICENSE.TXT
-# Inspired by https://github.com/lucidrains/tab-transformer-pytorch/blob/main/tab_transformer_pytorch/tab_transformer_pytorch.py
+# Inspired by the following implementations:
+# 1. lucidrains - https://github.com/lucidrains/tab-transformer-pytorch/
+#    If you are interested in Transformers, you should definitely check out his repositories.
+# 2. PyTorch Wide and Deep - https://github.com/jrzaurin/pytorch-widedeep/
+#    It is another library for tabular data which supports multi-modal problems.
+#    Check out the library if you haven't already.
+# 3. AutoGluon - https://github.com/awslabs/autogluon
+#    AutoGluon is an AutoML library which supports tabular data as well. It is from Amazon Research and is in MXNet.
+# 4. LabML Annotated Deep Learning Papers - The position-wise FF was shamelessly copied from
+#    https://github.com/labmlai/annotated_deep_learning_paper_implementations/tree/master/labml_nn/transformers
 """TabTransformer Model"""
 import logging
 from typing import Dict, OrderedDict
@@ -13,29 +22,46 @@
 from einops import rearrange

 from pytorch_tabular.utils import _initialize_layers, _linear_dropout_bn
-from .components import TransformerEncoderBlock
+from .components import TransformerEncoderBlock, SharedEmbeddings

 from ..base_model import BaseModel

 logger = logging.getLogger(__name__)

+
 class TabTransformerBackbone(pl.LightningModule):
     def __init__(self, config: DictConfig):
         super().__init__()
+        assert config.share_embedding_strategy in [
+            "add",
+            "fraction",
+        ], f"`share_embedding_strategy` should be one of `add` or `fraction`, not {config.share_embedding_strategy}"
        self.save_hyperparameters(config)
        self._build_network()
-        #TODO Add output_dim

     def _build_network(self):
         if len(self.hparams.categorical_cols) > 0:
             # Category Embedding layers
-            # self.embedding_dropout = nn.Dropout(self.hparams.embedding_dropout)
-            self.cat_embedding_layers = nn.ModuleList(
-                [
-                    nn.Embedding(cardinality, self.hparams.input_embed_dim)
-                    for cardinality in self.hparams.categorical_cardinality
-                ]
-            )
+            if self.hparams.share_embedding:
+                self.cat_embedding_layers = nn.ModuleList(
+                    [
+                        SharedEmbeddings(
+                            cardinality,
+                            self.hparams.input_embed_dim,
+                            add_shared_embed=self.hparams.share_embedding_strategy
+                            == "add",
+                            frac_shared_embed=self.hparams.shared_embedding_fraction,
+                        )
+                        for cardinality in self.hparams.categorical_cardinality
+                    ]
+                )
+            else:
+                self.cat_embedding_layers = nn.ModuleList(
+                    [
+                        nn.Embedding(cardinality, self.hparams.input_embed_dim)
+                        for cardinality in self.hparams.categorical_cardinality
+                    ]
+                )
         if self.hparams.embedding_dropout != 0:
             self.embed_dropout = nn.Dropout(self.hparams.embedding_dropout)
         self.transformer_blocks = OrderedDict()
@@ -44,17 +70,20 @@ def _build_network(self):
                 input_embed_dim=self.hparams.input_embed_dim,
                 num_heads=self.hparams.num_heads,
                 ff_hidden_multiplier=self.hparams.ff_hidden_multiplier,
-                ff_activation = self.hparams.transformer_activation,
+                ff_activation=self.hparams.transformer_activation,
                 attn_dropout=self.hparams.attn_dropout,
                 ff_dropout=self.hparams.ff_dropout,
                 add_norm_dropout=self.hparams.add_norm_dropout,
             )
         self.transformer_blocks = nn.Sequential(self.transformer_blocks)
-
+        self.attention_weights = [None] * self.hparams.num_attn_blocks
         if self.hparams.batch_norm_continuous_input:
             self.normalizing_batch_norm = nn.BatchNorm1d(self.hparams.continuous_dim)
         # Final MLP Layers
-        _curr_units = self.hparams.input_embed_dim*len(self.hparams.categorical_cols) + self.hparams.continuous_dim
+        _curr_units = (
+            self.hparams.input_embed_dim * len(self.hparams.categorical_cols)
+            + self.hparams.continuous_dim
+        )
         # Linear Layers
         layers = []
         for units in self.hparams.out_ff_layers.split("-"):
@@ -87,7 +116,7 @@ def forward(self, x: Dict):
             x = self.embed_dropout(x)
         for i, block in enumerate(self.transformer_blocks):
             x = block(x)
-        #Flatten (Batch, N_Categorical, Hidden) --> (Batch, N_CategoricalxHidden)
+        # Flatten (Batch, N_Categorical, Hidden) --> (Batch, N_CategoricalxHidden)
         x = rearrange(x, "b n h -> b (n h)")
         if self.hparams.continuous_dim > 0:
             if self.hparams.batch_norm_continuous_input:
@@ -99,6 +128,7 @@ def forward(self, x: Dict):
         x = self.linear_layers(x)
         return x

+
 class TabTransformerModel(BaseModel):
     def __init__(self, config: DictConfig, **kwargs):
         super().__init__(config, **kwargs)
@@ -111,7 +141,11 @@ def _build_network(self):
         self.output_layer = nn.Linear(
             self.backbone.output_dim, self.hparams.output_dim
         )  # output_dim auto-calculated from other config
-        _initialize_layers(self.hparams.out_ff_activation, self.hparams.out_ff_initialization, self.output_layer)
+        _initialize_layers(
+            self.hparams.out_ff_activation,
+            self.hparams.out_ff_initialization,
+            self.output_layer,
+        )

     def forward(self, x: Dict):
         x = self.backbone(x)
@@ -124,3 +158,9 @@ def forward(self, x: Dict):
                 y_min, y_max = self.hparams.target_range[i]
                 y_hat[:, i] = y_min + nn.Sigmoid()(y_hat[:, i]) * (y_max - y_min)
         return {"logits": y_hat, "backbone_features": x}
+
+    def extract_embedding(self):
+        if len(self.hparams.categorical_cols) > 0:
+            return self.cat_embedding_layers
+        else:
+            raise ValueError("Model has been trained with no categorical feature and therefore can't be used as a Categorical Encoder")
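A hedged sketch (not in the diff) of inspecting the per-column shared embeddings of a fitted TabTransformer. It assumes `tabular_model` is a trained TabularModel built with `share_embedding=True`, and that the underlying network and its backbone are reachable as `tabular_model.model.backbone`:

```python
# Sketch only: the backbone holds one SharedEmbeddings module per categorical column.
layers = tabular_model.model.backbone.cat_embedding_layers

for col, layer in zip(tabular_model.config.categorical_cols, layers):
    per_value = layer.embed.weight  # (cardinality, input_embed_dim) per-value embeddings
    shared = layer.shared_embed     # (1, shared_dim) column-level shared vector
    print(col, tuple(per_value.shape), tuple(shared.shape))
```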
