Commit d6e80c5 (parent: 014e619)

-- working TabTransformer
-- made some utility functions independent of config

10 files changed: 358 additions, 222 deletions

examples/to_test_classification.py

18 additions, 10 deletions

@@ -1,3 +1,4 @@
+from pytorch_tabular.models.tab_transformer.config import TabTransformerConfig
 import torch
 import numpy as np
 from torch.functional import norm
@@ -88,16 +89,21 @@
     continuous_feature_transform=None,#"quantile_normal",
     normalize_continuous_features=False,
 )
-model_config = CategoryEmbeddingModelConfig(task="classification", metrics=["f1","accuracy"], metrics_params=[{"num_classes":num_classes},{}])
-# model_config = NodeConfig(
+# model_config = CategoryEmbeddingModelConfig(task="classification", metrics=["f1","accuracy"], metrics_params=[{"num_classes":num_classes},{}])
+model_config = NodeConfig(
+    task="classification",
+    depth=4,
+    num_trees=1024,
+    input_dropout=0.0,
+    metrics=["f1", "accuracy"],
+    metrics_params=[{"num_classes": num_classes, "average": "macro"}, {}],
+)
+# model_config = TabTransformerConfig(
 #     task="classification",
-#     depth=4,
-#     num_trees=1024,
-#     input_dropout=0.0,
 #     metrics=["f1", "accuracy"],
 #     metrics_params=[{"num_classes": num_classes, "average": "macro"}, {}],
 # )
-trainer_config = TrainerConfig(gpus=-1, auto_select_gpus=True, fast_dev_run=False, max_epochs=5, batch_size=1024)
+trainer_config = TrainerConfig(gpus=-1, auto_select_gpus=True, fast_dev_run=False, max_epochs=5, batch_size=512)
 experiment_config = ExperimentConfig(project_name="PyTorch Tabular Example",
     run_name="node_forest_cov",
     exp_watch="gradients",
@@ -130,8 +136,10 @@
 result = tabular_model.evaluate(test)
 print(result)
 # test.drop(columns=target_name, inplace=True)
-# pred_df = tabular_model.predict(test)
+pred_df = tabular_model.predict(test)
+print(pred_df.head())
 # pred_df.to_csv("output/temp2.csv")
-# tabular_model.save_model("test_save")
-# new_model = TabularModel.load_from_checkpoint("test_save")
-# result = new_model.evaluate(test)
+tabular_model.save_model("test_save")
+new_model = TabularModel.load_from_checkpoint("test_save")
+result = new_model.evaluate(test)
+print(result)
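Since the commit message says the TabTransformer is now working, a minimal sketch of running it through this same example script is shown below. Only task, metrics, and metrics_params are taken from the commented-out block in the diff; the variable names data_config, optimizer_config, trainer_config, train, val, and test are assumed to be the objects already defined earlier in the script, and every other TabTransformerConfig argument is left at its default.

# Hedged sketch: swaps the NodeConfig above for the TabTransformerConfig imported at the top of the script.
from pytorch_tabular import TabularModel
from pytorch_tabular.models.tab_transformer.config import TabTransformerConfig

model_config = TabTransformerConfig(
    task="classification",
    metrics=["f1", "accuracy"],
    metrics_params=[{"num_classes": num_classes, "average": "macro"}, {}],
)
tabular_model = TabularModel(
    data_config=data_config,            # assumed: DataConfig defined earlier in the script
    model_config=model_config,
    optimizer_config=optimizer_config,  # assumed: OptimizerConfig defined earlier in the script
    trainer_config=trainer_config,
)
tabular_model.fit(train=train, validation=val)
print(tabular_model.evaluate(test))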

pytorch_tabular/models/__init__.py

4 additions, 0 deletions

@@ -12,6 +12,7 @@
     AutoIntMDNConfig
 )
 from .autoint import AutoIntConfig, AutoIntModel
+from .tab_transformer import TabTransformerConfig, TabTransformerModel
 from .base_model import BaseModel
 from . import category_embedding, node, mixture_density, tabnet, autoint

@@ -33,9 +34,12 @@
     "AutoIntMDNConfig",
     "AutoIntConfig",
     "AutoIntModel",
+    "TabTransformerConfig",
+    "TabTransformerModel",
     "category_embedding",
     "node",
     "mixture_density",
     "tabnet",
     "autoint",
+    "tab_transformer"
 ]

pytorch_tabular/models/autoint/autoint.py

5 additions, 5 deletions

@@ -46,24 +46,24 @@ def _build_network(self):
         # Deep Layers
         _curr_units = self.hparams.embedding_dim
         if self.hparams.deep_layers:
-            activation = getattr(nn, self.hparams.activation)
             # Linear Layers
             layers = []
             for units in self.hparams.layers.split("-"):
                 layers.extend(
                     _linear_dropout_bn(
-                        self.hparams,
+                        self.hparams.activation,
+                        self.hparams.initialization,
+                        self.hparams.use_batch_norm,
                         _curr_units,
                         int(units),
-                        activation,
                         self.hparams.dropout,
                     )
                 )
                 _curr_units = int(units)
             self.linear_layers = nn.Sequential(*layers)
         # Projection to Multi-Headed Attention Dims
         self.attn_proj = nn.Linear(_curr_units, self.hparams.attn_embed_dim)
-        _initialize_layers(self.hparams, self.attn_proj)
+        _initialize_layers(self.hparams.activation, self.hparams.initialization, self.attn_proj)
         # Multi-Headed Attention Layers
         self.self_attns = nn.ModuleList(
             [
@@ -152,7 +152,7 @@ def _build_network(self):
         self.output_layer = nn.Linear(
             self.backbone.output_dim, self.hparams.output_dim
         )  # output_dim auto-calculated from other config
-        _initialize_layers(self.hparams, self.output_layer)
+        _initialize_layers(self.hparams.activation, self.hparams.initialization, self.output_layer)

     def forward(self, x: Dict):
         x = self.backbone(x)
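The refactor above is what the commit message calls making the utility functions independent of config: instead of receiving the whole self.hparams object, _initialize_layers and _linear_dropout_bn now take the individual values they need (activation, initialization, use_batch_norm, and so on). A minimal sketch of what the decoupled _initialize_layers could look like follows; the signature is read off the call sites in this diff, but the body is an assumption and the real helper in pytorch_tabular may differ.

import torch.nn as nn

def _initialize_layers(activation: str, initialization: str, layer: nn.Module):
    # Illustrative sketch only: pick an initializer by name and apply it to the layer weights.
    if initialization == "kaiming":
        nonlinearity = "leaky_relu" if activation == "LeakyReLU" else "relu"
        nn.init.kaiming_normal_(layer.weight, nonlinearity=nonlinearity)
    elif initialization == "xavier":
        nn.init.xavier_normal_(layer.weight)
    elif initialization == "random":
        nn.init.normal_(layer.weight)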

pytorch_tabular/models/category_embedding/category_embedding_model.py

4 additions, 4 deletions

@@ -24,7 +24,6 @@ def __init__(self, config: DictConfig, **kwargs):
         self._build_network()

     def _build_network(self):
-        activation = getattr(nn, self.hparams.activation)
         # Linear Layers
         layers = []
         _curr_units = self.embedding_cat_dim + self.hparams.continuous_dim
@@ -33,10 +32,11 @@ def _build_network(self):
         for units in self.hparams.layers.split("-"):
             layers.extend(
                 _linear_dropout_bn(
-                    self.hparams,
+                    self.hparams.activation,
+                    self.hparams.initialization,
+                    self.hparams.use_batch_norm,
                     _curr_units,
                     int(units),
-                    activation,
                     self.hparams.dropout,
                 )
             )
@@ -69,7 +69,7 @@ def _build_network(self):
         self.output_layer = nn.Linear(
             self.backbone.output_dim, self.hparams.output_dim
         )  # output_dim auto-calculated from other config
-        _initialize_layers(self.hparams, self.output_layer)
+        _initialize_layers(self.hparams.activation, self.hparams.initialization, self.output_layer)

     def unpack_input(self, x: Dict):
         continuous_data, categorical_data = x["continuous"], x["categorical"]
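The same decoupling applies to _linear_dropout_bn. Its new argument order can be read off the call sites above (activation, initialization, use_batch_norm, input units, output units, dropout), and since the result is passed to layers.extend(...) it must return a list of modules. A hedged sketch, reusing the _initialize_layers sketch from the previous section; the actual pytorch_tabular implementation may differ in details such as layer ordering.

import torch.nn as nn

def _linear_dropout_bn(activation, initialization, use_batch_norm, in_units, out_units, dropout):
    # Illustrative sketch: Linear -> activation, optionally followed by BatchNorm1d and Dropout.
    _activation = getattr(nn, activation)      # e.g. "ReLU" -> nn.ReLU
    linear = nn.Linear(in_units, out_units)
    _initialize_layers(activation, initialization, linear)  # sketch defined above
    layers = [linear, _activation()]
    if use_batch_norm:
        layers.append(nn.BatchNorm1d(num_features=out_units))
    if dropout != 0:
        layers.append(nn.Dropout(dropout))
    return layers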

pytorch_tabular/models/common.py

120 additions, 0 deletions

@@ -0,0 +1,120 @@
+import torch
+import torch.nn.functional as F
+from torch import nn, einsum
+
+from einops import rearrange
+
+
+class Residual(nn.Module):
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+
+    def forward(self, x, **kwargs):
+        return self.fn(x, **kwargs) + x
+
+
+# https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/feed_forward.py
+class PositionWiseFeedForward(nn.Module):
+    """
+    title: Position-wise Feed-Forward Network (FFN)
+    summary: Documented reusable implementation of the position-wise feedforward network.
+
+    # Position-wise Feed-Forward Network (FFN)
+    This is a [PyTorch](https://pytorch.org) implementation
+    of the position-wise feedforward network used in transformers.
+    The FFN consists of two fully connected layers.
+    The number of dimensions in the hidden layer, $d_{ff}$, is generally set to around
+    four times that of the token embedding, $d_{model}$,
+    so it is sometimes also called the expand-and-contract network.
+    There is an activation at the hidden layer, which is
+    usually set to the ReLU (Rectified Linear Unit) activation, $$\max(0, x)$$
+    That is, the FFN function is
+    $$FFN(x, W_1, W_2, b_1, b_2) = \max(0, x W_1 + b_1) W_2 + b_2$$
+    where $W_1$, $W_2$, $b_1$ and $b_2$ are learnable parameters.
+    Sometimes the
+    GELU (Gaussian Error Linear Unit) activation is used instead of ReLU:
+    $$x \Phi(x)$$ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$
+    ### Gated Linear Units
+    This is a generic implementation that supports different variants including
+    [Gated Linear Units](https://arxiv.org/abs/2002.05202) (GLU).
+    We have also implemented experiments on these:
+    * [experiment that uses `labml.configs`](glu_variants/experiment.html)
+    * [simpler version from scratch](glu_variants/simple.html)
+    """
+
+    def __init__(self, d_model: int, d_ff: int,
+                 dropout: float = 0.1,
+                 activation=nn.ReLU(),
+                 is_gated: bool = False,
+                 bias1: bool = True,
+                 bias2: bool = True,
+                 bias_gate: bool = True):
+        """
+        * `d_model` is the number of features in a token embedding
+        * `d_ff` is the number of features in the hidden layer of the FFN
+        * `dropout` is the dropout probability for the hidden layer
+        * `is_gated` specifies whether the hidden layer is gated
+        * `bias1` specifies whether the first fully connected layer should have a learnable bias
+        * `bias2` specifies whether the second fully connected layer should have a learnable bias
+        * `bias_gate` specifies whether the fully connected layer for the gate should have a learnable bias
+        """
+        super().__init__()
+        # Layer one, parameterized by weight $W_1$ and bias $b_1$
+        self.layer1 = nn.Linear(d_model, d_ff, bias=bias1)
+        # Layer two, parameterized by weight $W_2$ and bias $b_2$
+        self.layer2 = nn.Linear(d_ff, d_model, bias=bias2)
+        # Hidden layer dropout
+        self.dropout = nn.Dropout(dropout)
+        # Activation function $f$
+        self.activation = activation
+        # Whether there is a gate
+        self.is_gated = is_gated
+        if is_gated:
+            # If there is a gate, the linear layer to transform inputs to
+            # be multiplied by the gate, parameterized by weight $V$ and bias $c$
+            self.linear_v = nn.Linear(d_model, d_ff, bias=bias_gate)
+
+    def forward(self, x: torch.Tensor):
+        # $f(x W_1 + b_1)$
+        g = self.activation(self.layer1(x))
+        # If gated, $f(x W_1 + b_1) \otimes (x V + b)$
+        if self.is_gated:
+            x = g * self.linear_v(x)
+        # Otherwise
+        else:
+            x = g
+        # Apply dropout
+        x = self.dropout(x)
+        # $(f(x W_1 + b_1) \otimes (x V + b)) W_2 + b_2$ or $f(x W_1 + b_1) W_2 + b_2$
+        # depending on whether it is gated
+        return self.layer2(x)
+
+# GLU Variants Improve Transformer: https://arxiv.org/pdf/2002.05202.pdf
+class GEGLU(nn.Module):
+    def __init__(self, d_model: int, d_ff: int,
+                 dropout: float = 0.1):
+        super().__init__()
+        self.ffn = PositionWiseFeedForward(d_model, d_ff, dropout, nn.GELU(), True, False, False, False)
+
+    def forward(self, x: torch.Tensor):
+        return self.ffn(x)
+
+class ReGLU(nn.Module):
+    def __init__(self, d_model: int, d_ff: int,
+                 dropout: float = 0.1):
+        super().__init__()
+        self.ffn = PositionWiseFeedForward(d_model, d_ff, dropout, nn.ReLU(), True, False, False, False)
+
+    def forward(self, x: torch.Tensor):
+        return self.ffn(x)
+
+class SwiGLU(nn.Module):
+    def __init__(self, d_model: int, d_ff: int,
+                 dropout: float = 0.1):
+        super().__init__()
+        self.ffn = PositionWiseFeedForward(d_model, d_ff, dropout, nn.SiLU(), True, False, False, False)
+
+    def forward(self, x: torch.Tensor):
+        return self.ffn(x)
+
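The three gated variants above differ only in the activation passed to PositionWiseFeedForward (GELU, ReLU, SiLU), each with is_gated=True and no biases on the second and gate layers. A quick shape-only smoke test, assuming the new file is importable as pytorch_tabular.models.common as the diff header suggests:

import torch
from pytorch_tabular.models.common import GEGLU, PositionWiseFeedForward

x = torch.randn(32, 16, 64)       # (batch, n_tokens, d_model)
ffn = PositionWiseFeedForward(d_model=64, d_ff=256)
geglu = GEGLU(d_model=64, d_ff=256)
assert ffn(x).shape == x.shape    # plain FFN maps back to d_model
assert geglu(x).shape == x.shape  # the gated variant does too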
3 additions, 3 deletions

@@ -1,4 +1,4 @@
-from .autoint import AutoIntBackbone, AutoIntModel
-from .config import AutoIntConfig
+from .tab_transformer import TabTransformerBackbone, TabTransformerModel
+from .config import TabTransformerConfig

-__all__ = ["AutoIntModel", "AutoIntBackbone", "AutoIntConfig"]
+__all__ = ["TabTransformerBackbone", "TabTransformerModel", "TabTransformerConfig"]
