Commit 86c71bc

-- style formatting for TabTransformer
-- DocStrings update for TabTransformer
1 parent 75dc0cd commit 86c71bc

3 files changed: +117 -71 lines changed

pytorch_tabular/models/tab_transformer/components.py

Lines changed: 44 additions & 36 deletions
@@ -1,12 +1,30 @@
+# Pytorch Tabular
+# Author: Manu Joseph <manujoseph@gmail.com>
+# For license information, see LICENSE.TXT
+# Inspired by implementations
+# 1. lucidrains - https://github.com/lucidrains/tab-transformer-pytorch/
+# If you are interested in Transformers, you should definitely check out his repositories.
+# 2. PyTorch Wide and Deep - https://github.com/jrzaurin/pytorch-widedeep/
+# It is another library for tabular data, which supports multi modal problems.
+# Check out the library if you haven't already.
+# 3. AutoGluon - https://github.com/awslabs/autogluon
+# AutoGluon is an AuttoML library which supports Tabular data as well. it is from Amazon Research and is in MXNet
+# 4. LabML Annotated Deep Learning Papers - The position-wise FF was shamelessly copied from
+# https://github.com/labmlai/annotated_deep_learning_paper_implementations/tree/master/labml_nn/transformers
 from typing import Optional
+
 import torch
 import torch.nn.functional as F
-from torch import nn, einsum
-from pytorch_tabular.models import common #import PositionWiseFeedForward, GEGLU, ReGLU, SwiGLU
 from einops import rearrange
+from torch import einsum, nn
+
+from pytorch_tabular.models import common


 class AddNorm(nn.Module):
+    """
+    Applies LayerNorm, Dropout and adds to input. Standard AddNorm operations in Transformers
+    """
     def __init__(self, input_dim: int, dropout: float):
         super(AddNorm, self).__init__()
         self.dropout = nn.Dropout(dropout)
@@ -17,11 +35,16 @@ def forward(self, X: torch.Tensor, Y: torch.Tensor) -> torch.Tensor:


 class MultiHeadedAttention(nn.Module):
+    """
+    Multi Headed Attention Block in Transformers
+    """
     def __init__(
         self, input_dim: int, num_heads: int = 8, head_dim: int = 16, dropout: int = 0.1
     ):
         super().__init__()
-        assert input_dim % num_heads == 0, "'input_dim' must be multiples of 'num_heads'"
+        assert (
+            input_dim % num_heads == 0
+        ), "'input_dim' must be multiples of 'num_heads'"
         inner_dim = head_dim * num_heads
         self.n_heads = num_heads
         self.scale = head_dim ** -0.5
@@ -44,19 +67,21 @@ def forward(self, x):
         out = rearrange(out, "b h n d -> b n (h d)", h=h)
         return self.to_out(out)

-#Shamelessly copied with slight adaptation from https://github.com/jrzaurin/pytorch-widedeep/blob/b487b06721c5abe56ac68c8a38580b95e0897fd4/pytorch_widedeep/models/tab_transformer.py
+
+# Slight adaptation from https://github.com/jrzaurin/pytorch-widedeep which in turn adapted from AutoGluon
 class SharedEmbeddings(nn.Module):
+    """
+    Enables different values in a categorical feature to share some embeddings across
+    """
     def __init__(
         self,
         num_embed: int,
         embed_dim: int,
         add_shared_embed: bool = False,
-        frac_shared_embed: float=0.25,
+        frac_shared_embed: float = 0.25,
     ):
         super(SharedEmbeddings, self).__init__()
-        assert (
-            frac_shared_embed < 1
-        ), "'frac_shared_embed' must be less than 1"
+        assert frac_shared_embed < 1, "'frac_shared_embed' must be less than 1"

         self.add_shared_embed = add_shared_embed
         self.embed = nn.Embedding(num_embed, embed_dim, padding_idx=0)
@@ -76,7 +101,10 @@ def forward(self, X: torch.Tensor) -> torch.Tensor:
         out[:, : shared_embed.shape[1]] = shared_embed
         return out

+
 class TransformerEncoderBlock(nn.Module):
+    """A single Transformer Encoder Block
+    """
     def __init__(
         self,
         input_embed_dim: int,
@@ -97,17 +125,19 @@ def __init__(
             else transformer_head_dim,
             dropout=attn_dropout,
         )
-
+
         try:
-            self.pos_wise_ff = getattr(common, ff_activation)(d_model=input_embed_dim,
-                                                              d_ff=input_embed_dim * ff_hidden_multiplier,
-                                                              dropout=ff_dropout)
+            self.pos_wise_ff = getattr(common, ff_activation)(
+                d_model=input_embed_dim,
+                d_ff=input_embed_dim * ff_hidden_multiplier,
+                dropout=ff_dropout,
+            )
         except AttributeError:
             self.pos_wise_ff = getattr(common, "PositionWiseFeedForward")(
                 d_model=input_embed_dim,
                 d_ff=input_embed_dim * ff_hidden_multiplier,
                 dropout=ff_dropout,
-                activation = getattr(nn, self.hparams.ff_activation)
+                activation=getattr(nn, self.hparams.ff_activation),
             )
         self.attn_add_norm = AddNorm(input_embed_dim, add_norm_dropout)
         self.ff_add_norm = AddNorm(input_embed_dim, add_norm_dropout)
@@ -116,26 +146,4 @@ def forward(self, x):
         y = self.mha(x)
         x = self.attn_add_norm(x, y)
         y = self.pos_wise_ff(y)
-        return self.ff_add_norm(x, y)
-
-
-# class MLP(nn.Module):
-#     def __init__(self, dims, act=None):
-#         super().__init__()
-#         dims_pairs = list(zip(dims[:-1], dims[1:]))
-#         layers = []
-#         for ind, (dim_in, dim_out) in enumerate(dims_pairs):
-#             is_last = ind >= (len(dims) - 1)
-#             linear = nn.Linear(dim_in, dim_out)
-#             layers.append(linear)
-
-#             if is_last:
-#                 continue
-
-#             act = default(act, nn.ReLU())
-#             layers.append(act)
-
-#         self.mlp = nn.Sequential(*layers)
-
-#     def forward(self, x):
-#         return self.mlp(x)
+        return self.ff_add_norm(x, y)
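For readers skimming the diff, the new SharedEmbeddings docstring above is terse, so here is a hypothetical, self-contained sketch of the idea it describes: every value of a categorical column gets its own embedding, and a single column-level embedding is either added to it or overwrites a reserved slice of it. The class and variable names below are illustrative only, not the library's API.

import torch
from torch import nn


class ToySharedEmbedding(nn.Module):
    # Illustrative re-implementation of the sharing idea above, not the library class.
    def __init__(self, num_embed: int, embed_dim: int, add_shared: bool = False, frac_shared: float = 0.25):
        super().__init__()
        assert frac_shared < 1, "'frac_shared' must be less than 1"
        self.add_shared = add_shared
        # per-value embeddings for the column
        self.embed = nn.Embedding(num_embed, embed_dim)
        # one column-level embedding shared by every value of the feature
        shared_dim = embed_dim if add_shared else int(embed_dim * frac_shared)
        self.shared = nn.Parameter(torch.empty(1, shared_dim).uniform_(-0.1, 0.1))

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        out = self.embed(idx).clone()                      # (batch, embed_dim)
        if self.add_shared:
            out = out + self.shared                        # 'add' strategy: add the shared vector
        else:
            out[:, : self.shared.shape[1]] = self.shared   # 'fraction' strategy: overwrite the reserved slice
        return out


emb = ToySharedEmbedding(num_embed=10, embed_dim=16, frac_shared=0.25)
print(emb(torch.randint(0, 10, (4,))).shape)               # torch.Size([4, 16])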

pytorch_tabular/models/tab_transformer/config.py

Lines changed: 64 additions & 28 deletions
@@ -12,36 +12,72 @@
 class TabTransformerConfig(ModelConfig):
     """Tab Transformer configuration
     Args:
-        task (str): Specify whether the problem is regression of classification.Choices are: regression classification
+        task (str): Specify whether the problem is regression of classification.
+            Choices are: [`regression`,`classification`].
+        embedding_dims (Union[List[int], NoneType]): The dimensions of the embedding for
+            each categorical column as a list of tuples (cardinality, embedding_dim).
+            If left empty, will infer using the cardinality of the categorical column using
+            the rule min(50, (x + 1) // 2)
         learning_rate (float): The learning rate of the model
-        loss (Union[str, NoneType]): The loss function to be applied.
-            By Default it is MSELoss for regression and CrossEntropyLoss for classification.
-            Unless you are sure what you are doing, leave it at MSELoss or L1Loss for regression and CrossEntropyLoss for classification
-        metrics (Union[List[str], NoneType]): the list of metrics you need to track during training.
-            The metrics should be one of the metrics implemented in PyTorch Lightning.
-            By default, it is Accuracy if classification and MeanSquaredLogError for regression
-        metrics_params (Union[List, NoneType]): The parameters to be passed to the Metrics initialized
-        target_range (Union[List, NoneType]): The range in which we should limit the output variable. Currently ignored for multi-target regression
-            Typically used for Regression problems. If left empty, will not apply any restrictions
+        loss (Union[str, NoneType]): The loss function to be applied.
+            By Default it is MSELoss for regression and CrossEntropyLoss for classification.
+            Unless you are sure what you are doing, leave it at MSELoss or L1Loss for regression
+            and CrossEntropyLoss for classification
+        metrics (Union[List[str], NoneType]): the list of metrics you need to track during training.
+            The metrics should be one of the functional metrics implemented in ``torchmetrics``.
+            By default, it is accuracy if classification and mean_squared_error for regression
+        metrics_params (Union[List, NoneType]): The parameters to be passed to the metrics function
+        target_range (Union[List, NoneType]): The range in which we should limit the output variable.
+            Currently ignored for multi-target regression. Typically used for Regression problems.
+            If left empty, will not apply any restrictions

-        attn_embed_dim (int): The number of hidden units in the Multi-Headed Attention layers. Defaults to 32
-        num_heads (int): The number of heads in the Multi-Headed Attention layer. Defaults to 2
-        num_attn_blocks (int): The number of layers of stacked Multi-Headed Attention layers. Defaults to 2
-        attn_dropouts (float): Dropout between layers of Multi-Headed Attention Layers. Defaults to 0.0
-        has_residuals (bool): Flag to have a residual connect from enbedded output to attention layer output.
-            Defaults to True
-        embedding_dim (int): The dimensions of the embedding for continuous and categorical columns. Defaults to 16
-        embedding_dropout (float): probability of an embedding element to be zeroed. Defaults to 0.0
-        deep_layers (bool): Flag to enable a deep MLP layer before the Multi-Headed Attention layer. Defaults to False
-        layers (str): Hyphen-separated number of layers and units in the deep MLP. Defaults to 128-64-32
-        activation (str): The activation type in the deep MLP. The default activaion in PyTorch like
-            ReLU, TanH, LeakyReLU, etc. https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity.
+        input_embed_dim (int): The embedding dimension for the input categorical features.
+            Defaults to 32
+        embedding_dropout (float): Dropout to be applied to the Categorical Embedding.
+            Defaults to 0.1
+        share_embedding (bool): The flag turns on shared embeddings in the input embedding process.
+            The key idea here is to have an embedding for the feature as a whole along with embeddings of
+            each unique values of that column. For more details refer to Appendix A of the TabTransformer paper.
+            Defaults to False
+        share_embedding_strategy (Union[str, NoneType]): There are two strategies in adding shared embeddings.
+            1. `add` - A separate embedding for the feature is added to the embedding of the unique values of the feature.
+            2. `fraction` - A fraction of the input embedding is reserved for the shared embedding of the feature.
+            Defaults to fraction.
+            Choices are: [`add`,`fraction`].
+        shared_embedding_fraction (float): Fraction of the input_embed_dim to be reserved by the shared embedding.
+            Should be less than one. Defaults to 0.25
+        num_heads (int): The number of heads in the Multi-Headed Attention layer.
+            Defaults to 8
+        num_attn_blocks (int): The number of layers of stacked Multi-Headed Attention layers.
+            Defaults to 6
+        transformer_head_dim (Union[int, NoneType]): The number of hidden units in the Multi-Headed Attention layers.
+            Defaults to None and will be same as input_dim.
+        attn_dropout (float): Dropout to be applied after Multi headed Attention.
+            Defaults to 0.1
+        add_norm_dropout (float): Dropout to be applied in the AddNorm Layer.
+            Defaults to 0.1
+        ff_dropout (float): Dropout to be applied in the Positionwise FeedForward Network.
+            Defaults to 0.1
+        ff_hidden_multiplier (int): Multiple by which the Positionwise FF layer scales the input.
+            Defaults to 4
+        transformer_activation (str): The activation type in the transformer feed forward layers.
+            In addition to the default activation in PyTorch like ReLU, TanH, LeakyReLU, etc.
+            https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity,
+            GEGLU, ReGLU and SwiGLU are also implemented(https://arxiv.org/pdf/2002.05202.pdf).
+            Defaults to GEGLU
+        out_ff_layers (str): Hyphen-separated number of layers and units in the deep MLP.
+            Defaults to 128-64-32
+        out_ff_activation (str): The activation type in the deep MLP. The default activaion in PyTorch like ReLU, TanH, LeakyReLU, etc.
+            https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity.
             Defaults to ReLU
-        dropout (float): probability of an classification element to be zeroed in the deep MLP. Defaults to 0.0
-        use_batch_norm (bool): Flag to include a BatchNorm layer after each Linear Layer+DropOut. Defaults to False
-        batch_norm_continuous_input (bool): If True, we will normalize the contiinuous layer by passing it through a BatchNorm layer. Defaults to False
-        attention_pooling (bool): If True, will combine the attention outputs of each block for final prediction. Defaults to False
-        initialization (str): Initialization scheme for the linear layers. Defaults to `kaiming`.
+        out_ff_dropout (float): Probability of an classification element to be zeroed in the deep MLP.
+            Defaults to 0.0
+        use_batch_norm (bool): Flag to include a BatchNorm layer after each Linear Layer+DropOut.
+            Defaults to False
+        batch_norm_continuous_input (bool): If True, we will normalize the continuous layer by passing it through a BatchNorm layer.
+            Defaults to False
+        out_ff_initialization (str): Initialization scheme for the linear layers.
+            Defaults to `kaiming`.
             Choices are: [`kaiming`,`xavier`,`random`].

     Raises:
@@ -170,7 +206,7 @@ class TabTransformerConfig(ModelConfig):
     _config_name: str = field(default="TabTransformerConfig")


-# cls = AutoIntConfig
+# cls = TabTransformerConfig
 # desc = "Configuration for Data."
 # doc_str = f"{desc}\nArgs:"
 # for key in cls.__dataclass_fields__.keys():
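The rewritten docstring enumerates the model's hyperparameters; as a rough orientation, a config object built from those fields might look like the sketch below. The field names are taken directly from the Args section above; the import path simply mirrors the file location shown in this diff and may differ from the package's public re-exports.

from pytorch_tabular.models.tab_transformer.config import TabTransformerConfig

# Minimal sketch: only a few of the documented fields are set explicitly,
# everything else falls back to the defaults described in the docstring.
model_config = TabTransformerConfig(
    task="classification",            # or "regression"
    input_embed_dim=32,               # embedding dim of each categorical feature
    share_embedding=True,             # turn on the shared-embedding scheme
    share_embedding_strategy="fraction",
    shared_embedding_fraction=0.25,   # must be < 1
    num_heads=8,
    num_attn_blocks=6,
    transformer_activation="GEGLU",   # GEGLU / ReGLU / SwiGLU or a torch.nn activation name
    out_ff_layers="128-64-32",        # hyphen-separated deep MLP spec
)
print(model_config)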

pytorch_tabular/models/tab_transformer/tab_transformer.py

Lines changed: 9 additions & 7 deletions
@@ -1,15 +1,15 @@
 # Pytorch Tabular
 # Author: Manu Joseph <manujoseph@gmail.com>
 # For license information, see LICENSE.TXT
-# Inspired by implementations
+# Inspired by implementations
 # 1. lucidrains - https://github.com/lucidrains/tab-transformer-pytorch/
 # If you are interested in Transformers, you should definitely check out his repositories.
-# 2. PyTorch Wide and Deep - https://github.com/jrzaurin/pytorch-widedeep/
-# It is another library for tabular data, which supports multi modal problems.
+# 2. PyTorch Wide and Deep - https://github.com/jrzaurin/pytorch-widedeep/
+# It is another library for tabular data, which supports multi modal problems.
 # Check out the library if you haven't already.
 # 3. AutoGluon - https://github.com/awslabs/autogluon
 # AutoGluon is an AuttoML library which supports Tabular data as well. it is from Amazon Research and is in MXNet
-# 4. LabML Annotated Deep Learning Papers - The position-wise FF was shamelessly copied from
+# 4. LabML Annotated Deep Learning Papers - The position-wise FF was shamelessly copied from
 # https://github.com/labmlai/annotated_deep_learning_paper_implementations/tree/master/labml_nn/transformers
 """TabTransformer Model"""
 import logging
@@ -18,13 +18,13 @@
 import pytorch_lightning as pl
 import torch
 import torch.nn as nn
-from omegaconf import DictConfig
 from einops import rearrange
+from omegaconf import DictConfig

 from pytorch_tabular.utils import _initialize_layers, _linear_dropout_bn
-from .components import TransformerEncoderBlock, SharedEmbeddings

 from ..base_model import BaseModel
+from .components import SharedEmbeddings, TransformerEncoderBlock

 logger = logging.getLogger(__name__)

@@ -163,4 +163,6 @@ def extract_embedding(self):
         if len(self.hparams.categorical_cols) > 0:
             return self.cat_embedding_layers
         else:
-            raise ValueError("Model has been trained with no categorical feature and therefore can't be used as a Categorical Encoder")
+            raise ValueError(
+                "Model has been trained with no categorical feature and therefore can't be used as a Categorical Encoder"
+            )
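The last hunk only reflows the ValueError call in extract_embedding, which hands back the learned categorical embedding layers so a trained model can be reused as a categorical encoder. A hypothetical, self-contained sketch of that guard-and-return pattern is below; ToyEncoder is a stand-in for illustration, not the library's TabTransformer class.

import torch
from torch import nn


class ToyEncoder(nn.Module):
    # Stand-in module with the same guard as extract_embedding above.
    def __init__(self, cardinalities, embed_dim=32):
        super().__init__()
        self.cat_embedding_layers = nn.ModuleList(
            nn.Embedding(card, embed_dim) for card in cardinalities
        )

    def extract_embedding(self):
        if len(self.cat_embedding_layers) > 0:
            return self.cat_embedding_layers
        raise ValueError(
            "Model has been trained with no categorical feature and therefore can't be used as a Categorical Encoder"
        )


enc = ToyEncoder(cardinalities=[5, 12])
for layer in enc.extract_embedding():
    print(tuple(layer.weight.shape))   # (5, 32) then (12, 32)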
