1 | 1 | # W605 |
2 | 2 | import torch |
| 3 | +import torch.nn as nn |
3 | 4 | import torch.nn.functional as F |
4 | | -from entmax import entmax15, sparsemax |
5 | 5 | from torch import Tensor |
6 | 6 | from torch.autograd import Function |
7 | 7 | from torch.jit import script |
@@ -43,11 +43,6 @@ def sparsemoid(input): |
43 | 43 | return (0.5 * input + 0.5).clamp_(0, 1) |
44 | 44 |
45 | 45 |
46 | | -entmoid15 = Entmoid15.apply |
47 | | -entmax15 = entmax15 |
48 | | -sparsemax = sparsemax |
49 | | - |
50 | | - |
51 | 46 | def t_softmax(input: Tensor, t: Tensor = None, dim: int = -1) -> Tensor: |
52 | 47 | if t is None: |
53 | 48 | t = torch.tensor(0.5, device=input.device) |
@@ -98,3 +93,321 @@ def calculate_t(cls, input: Tensor, r: Tensor, dim: int = -1, eps: float = 1e-8) |
98 | 93 | def forward(self, input: Tensor, r: Tensor): |
99 | 94 | t = RSoftmax.calculate_t(input, r, self.dim, self.eps) |
100 | 95 | return self.tsoftmax(input, t) |
| 96 | + |
| 97 | + |
| 98 | +""" |
| 99 | +An implementation of entmax (Peters et al., 2019). See |
| 100 | +https://arxiv.org/pdf/1905.05702 for a detailed description.
| 101 | +
| 102 | +This builds on previous work with sparsemax (Martins & Astudillo, 2016). |
| 103 | +See https://arxiv.org/pdf/1602.02068. |
| 104 | +""" |
| 105 | + |
| 106 | +# Author: Ben Peters |
| 107 | +# Author: Vlad Niculae <vlad@vene.ro> |
| 108 | +# License: MIT |
| 109 | +# Vendored here (instead of importing from the entmax package) for conda compatibility
| 110 | + |
| 111 | + |
| 112 | +def _make_ix_like(X, dim): |
| 113 | + d = X.size(dim) |
| 114 | + rho = torch.arange(1, d + 1, device=X.device, dtype=X.dtype) |
| 115 | + view = [1] * X.dim() |
| 116 | + view[0] = -1 |
| 117 | + return rho.view(view).transpose(0, dim) |
| 118 | + |
| 119 | + |
| 120 | +def _roll_last(X, dim): |
| 121 | + if dim == -1: |
| 122 | + return X |
| 123 | + elif dim < 0: |
| 124 | + dim = X.dim() + dim  # map a negative dim to its non-negative index
| 125 | + |
| 126 | + perm = [i for i in range(X.dim()) if i != dim] + [dim] |
| 127 | + return X.permute(perm) |
| 128 | + |
| 129 | + |
| 130 | +def _sparsemax_threshold_and_support(X, dim=-1, k=None): |
| 131 | + """Core computation for sparsemax: optimal threshold and support size. |
| 132 | +
| 133 | + Parameters |
| 134 | + ---------- |
| 135 | + X : torch.Tensor |
| 136 | + The input tensor to compute thresholds over. |
| 137 | +
| 138 | + dim : int |
| 139 | + The dimension along which to apply sparsemax. |
| 140 | +
| 141 | + k : int or None |
| 142 | + number of largest elements to partial-sort over. For optimal |
| 143 | + performance, should be slightly bigger than the expected number of |
| 144 | + nonzeros in the solution. If the solution is more than k-sparse, |
| 145 | + this function is recursively called with a 2*k schedule. |
| 146 | + If `None`, full sorting is performed from the beginning. |
| 147 | +
| 148 | + Returns |
| 149 | + ------- |
| 150 | + tau : torch.Tensor like `X`, with all but the `dim` dimension intact |
| 151 | + the threshold value for each vector |
| 152 | + support_size : torch LongTensor, shape like `tau` |
| 153 | + the number of nonzeros in each vector. |
| 154 | + """ |
| 155 | + |
| 156 | + if k is None or k >= X.shape[dim]: # do full sort |
| 157 | + topk, _ = torch.sort(X, dim=dim, descending=True) |
| 158 | + else: |
| 159 | + topk, _ = torch.topk(X, k=k, dim=dim) |
| 160 | + |
| 161 | + topk_cumsum = topk.cumsum(dim) - 1 |
| 162 | + rhos = _make_ix_like(topk, dim) |
| 163 | + support = rhos * topk > topk_cumsum |
| 164 | + |
| 165 | + support_size = support.sum(dim=dim).unsqueeze(dim) |
| 166 | + tau = topk_cumsum.gather(dim, support_size - 1) |
| 167 | + tau /= support_size.to(X.dtype) |
| 168 | + |
| 169 | + if k is not None and k < X.shape[dim]: |
| 170 | + unsolved = (support_size == k).squeeze(dim) |
| 171 | + |
| 172 | + if torch.any(unsolved): |
| 173 | + in_ = _roll_last(X, dim)[unsolved] |
| 174 | + tau_, ss_ = _sparsemax_threshold_and_support(in_, dim=-1, k=2 * k) |
| 175 | + _roll_last(tau, dim)[unsolved] = tau_ |
| 176 | + _roll_last(support_size, dim)[unsolved] = ss_ |
| 177 | + |
| 178 | + return tau, support_size |
| 179 | + |
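| | +# Worked example (editorial addition, not in the original source): for the row
| | +# x = [1.0, 0.5, -1.0], the sorted cumulative sums minus one are [0.0, 0.5, -0.5],
| | +# the condition rho * x_sorted > cumsum - 1 holds for the first two entries,
| | +# so support_size = 2 and tau = 0.5 / 2 = 0.25.
| | +#
| | +#   >>> x = torch.tensor([[1.0, 0.5, -1.0]])
| | +#   >>> tau, support = _sparsemax_threshold_and_support(x, dim=-1)
| | +#   >>> tau, support            # (tensor([[0.2500]]), tensor([[2]]))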
| 180 | + |
| 181 | +def _entmax_threshold_and_support(X, dim=-1, k=None): |
| 182 | + """Core computation for 1.5-entmax: optimal threshold and support size. |
| 183 | +
| 184 | + Parameters |
| 185 | + ---------- |
| 186 | + X : torch.Tensor |
| 187 | + The input tensor to compute thresholds over. |
| 188 | +
| 189 | + dim : int |
| 190 | + The dimension along which to apply 1.5-entmax. |
| 191 | +
| 192 | + k : int or None |
| 193 | + number of largest elements to partial-sort over. For optimal |
| 194 | + performance, should be slightly bigger than the expected number of |
| 195 | + nonzeros in the solution. If the solution is more than k-sparse, |
| 196 | + this function is recursively called with a 2*k schedule. |
| 197 | + If `None`, full sorting is performed from the beginning. |
| 198 | +
| 199 | + Returns |
| 200 | + ------- |
| 201 | + tau : torch.Tensor like `X`, with all but the `dim` dimension intact |
| 202 | + the threshold value for each vector |
| 203 | + support_size : torch LongTensor, shape like `tau` |
| 204 | + the number of nonzeros in each vector. |
| 205 | + """ |
| 206 | + |
| 207 | + if k is None or k >= X.shape[dim]: # do full sort |
| 208 | + Xsrt, _ = torch.sort(X, dim=dim, descending=True) |
| 209 | + else: |
| 210 | + Xsrt, _ = torch.topk(X, k=k, dim=dim) |
| 211 | + |
| 212 | + rho = _make_ix_like(Xsrt, dim) |
| 213 | + mean = Xsrt.cumsum(dim) / rho |
| 214 | + mean_sq = (Xsrt**2).cumsum(dim) / rho |
| 215 | + ss = rho * (mean_sq - mean**2) |
| 216 | + delta = (1 - ss) / rho |
| 217 | + |
| 218 | + # NOTE this is not exactly the same as in reference algo |
| 219 | + # Fortunately it seems the clamped values never wrongly |
| 220 | + # get selected by tau <= sorted_z. Prove this! |
| 221 | + delta_nz = torch.clamp(delta, 0) |
| 222 | + tau = mean - torch.sqrt(delta_nz) |
| 223 | + |
| 224 | + support_size = (tau <= Xsrt).sum(dim).unsqueeze(dim) |
| 225 | + tau_star = tau.gather(dim, support_size - 1) |
| 226 | + |
| 227 | + if k is not None and k < X.shape[dim]: |
| 228 | + unsolved = (support_size == k).squeeze(dim) |
| 229 | + |
| 230 | + if torch.any(unsolved): |
| 231 | + X_ = _roll_last(X, dim)[unsolved] |
| 232 | + tau_, ss_ = _entmax_threshold_and_support(X_, dim=-1, k=2 * k) |
| 233 | + _roll_last(tau_star, dim)[unsolved] = tau_ |
| 234 | + _roll_last(support_size, dim)[unsolved] = ss_ |
| 235 | + |
| 236 | + return tau_star, support_size |
| 237 | + |
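| | +# Worked example (editorial addition, not in the original source): the per-prefix
| | +# threshold is tau_i = mean_i - sqrt(delta_i), and tau_star is the value at the
| | +# largest prefix that still satisfies tau <= sorted value. For the already
| | +# shifted-and-halved row [0.0, -0.5, -1.5] the support ends up with 2 entries.
| | +#
| | +#   >>> x = torch.tensor([[0.0, -0.5, -1.5]])
| | +#   >>> tau_star, support = _entmax_threshold_and_support(x, dim=-1)
| | +#   >>> support                 # tensor([[2]])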
| 238 | + |
| 239 | +class SparsemaxFunction(Function): |
| 240 | + @classmethod |
| 241 | + def forward(cls, ctx, X, dim=-1, k=None): |
| 242 | + ctx.dim = dim |
| 243 | + max_val, _ = X.max(dim=dim, keepdim=True) |
| 244 | + X = X - max_val # same numerical stability trick as softmax |
| 245 | + tau, supp_size = _sparsemax_threshold_and_support(X, dim=dim, k=k) |
| 246 | + output = torch.clamp(X - tau, min=0) |
| 247 | + ctx.save_for_backward(supp_size, output) |
| 248 | + return output |
| 249 | + |
| 250 | + @classmethod |
| 251 | + def backward(cls, ctx, grad_output): |
| 252 | + supp_size, output = ctx.saved_tensors |
| 253 | + dim = ctx.dim |
| 254 | + grad_input = grad_output.clone() |
| 255 | + grad_input[output == 0] = 0 |
| 256 | + |
| 257 | + v_hat = grad_input.sum(dim=dim) / supp_size.to(output.dtype).squeeze(dim) |
| 258 | + v_hat = v_hat.unsqueeze(dim) |
| 259 | + grad_input = torch.where(output != 0, grad_input - v_hat, grad_input) |
| 260 | + return grad_input, None, None |
| 261 | + |
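| | +# Usage sketch (editorial addition, not in the original source): the backward
| | +# pass zeroes the gradient outside the support and subtracts the support mean
| | +# inside it, which is the Jacobian-vector product of sparsemax.
| | +#
| | +#   >>> x = torch.tensor([[1.0, 0.5, -1.0]], requires_grad=True)
| | +#   >>> y = SparsemaxFunction.apply(x, -1, None)   # [[0.75, 0.25, 0.00]]
| | +#   >>> y[0, 0].backward()
| | +#   >>> x.grad                                     # [[0.5, -0.5, 0.0]]; zero outside the support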
| 262 | + |
| 263 | +class Entmax15Function(Function): |
| 264 | + @classmethod |
| 265 | + def forward(cls, ctx, X, dim=0, k=None): |
| 266 | + ctx.dim = dim |
| 267 | + |
| 268 | + max_val, _ = X.max(dim=dim, keepdim=True) |
| 269 | + X = X - max_val # same numerical stability trick as for softmax |
| 270 | + X = X / 2 # divide by 2 to solve actual Entmax |
| 271 | + |
| 272 | + tau_star, _ = _entmax_threshold_and_support(X, dim=dim, k=k) |
| 273 | + |
| 274 | + Y = torch.clamp(X - tau_star, min=0) ** 2 |
| 275 | + ctx.save_for_backward(Y) |
| 276 | + return Y |
| 277 | + |
| 278 | + @classmethod |
| 279 | + def backward(cls, ctx, dY): |
| 280 | + (Y,) = ctx.saved_tensors |
| 281 | + gppr = Y.sqrt() # = 1 / g'' (Y) |
| 282 | + dX = dY * gppr |
| 283 | + q = dX.sum(ctx.dim) / gppr.sum(ctx.dim) |
| 284 | + q = q.unsqueeze(ctx.dim) |
| 285 | + dX -= q * gppr |
| 286 | + return dX, None, None |
| 287 | + |
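| | +# Usage sketch (editorial addition, not in the original source): backward weights
| | +# the incoming gradient by sqrt(Y), so entries outside the support (Y == 0)
| | +# receive exactly zero gradient, mirroring SparsemaxFunction above.
| | +#
| | +#   >>> x = torch.tensor([[2.0, 1.0, -1.0]], requires_grad=True)
| | +#   >>> y = Entmax15Function.apply(x, -1, None)    # roughly [[0.83, 0.17, 0.00]]
| | +#   >>> y[0, 0].backward()
| | +#   >>> x.grad[0, 2].item()                        # 0.0 for the pruned entry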
| 288 | + |
| 289 | +def sparsemax(X, dim=-1, k=None): |
| 290 | + """sparsemax: normalizing sparse transform (a la softmax). |
| 291 | +
| 292 | + Solves the projection: |
| 293 | +
| 294 | + min_p ||x - p||_2 s.t. p >= 0, sum(p) == 1. |
| 295 | +
| 296 | + Parameters |
| 297 | + ---------- |
| 298 | + X : torch.Tensor |
| 299 | + The input tensor. |
| 300 | +
| 301 | + dim : int |
| 302 | + The dimension along which to apply sparsemax. |
| 303 | +
| 304 | + k : int or None |
| 305 | + number of largest elements to partial-sort over. For optimal |
| 306 | + performance, should be slightly bigger than the expected number of |
| 307 | + nonzeros in the solution. If the solution is more than k-sparse, |
| 308 | + this function is recursively called with a 2*k schedule. |
| 309 | + If `None`, full sorting is performed from the beginning. |
| 310 | +
| 311 | + Returns |
| 312 | + ------- |
| 313 | + P : torch tensor, same shape as X |
| 314 | + The projection result, such that P.sum(dim=dim) == 1 elementwise. |
| 315 | + """ |
| 316 | + |
| 317 | + return SparsemaxFunction.apply(X, dim, k) |
| 318 | + |
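| | +# Usage sketch (editorial addition, not in the original source): unlike softmax,
| | +# sparsemax can assign exact zeros while the row still sums to one along `dim`.
| | +#
| | +#   >>> x = torch.tensor([[1.0, 0.5, -1.0]])
| | +#   >>> sparsemax(x, dim=-1)          # [[0.75, 0.25, 0.00]]
| | +#   >>> sparsemax(x, dim=-1, k=2)     # same result, using a partial sort over the top 2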
| 319 | + |
| 320 | +def entmax15(X, dim=-1, k=None): |
| 321 | + """1.5-entmax: normalizing sparse transform (a la softmax). |
| 322 | +
| 323 | + Solves the optimization problem: |
| 324 | +
| 325 | + max_p <x, p> + H_1.5(p) s.t. p >= 0, sum(p) == 1.
| 326 | +
| 327 | + where H_1.5(p) is the Tsallis alpha-entropy with alpha=1.5. |
| 328 | +
| 329 | + Parameters |
| 330 | + ---------- |
| 331 | + X : torch.Tensor |
| 332 | + The input tensor. |
| 333 | +
| 334 | + dim : int |
| 335 | + The dimension along which to apply 1.5-entmax. |
| 336 | +
| 337 | + k : int or None |
| 338 | + number of largest elements to partial-sort over. For optimal |
| 339 | + performance, should be slightly bigger than the expected number of |
| 340 | + nonzeros in the solution. If the solution is more than k-sparse, |
| 341 | + this function is recursively called with a 2*k schedule. |
| 342 | + If `None`, full sorting is performed from the beginning. |
| 343 | +
| 344 | + Returns |
| 345 | + ------- |
| 346 | + P : torch tensor, same shape as X |
| 347 | + The projection result, such that P.sum(dim=dim) == 1 elementwise. |
| 348 | + """ |
| 349 | + |
| 350 | + return Entmax15Function.apply(X, dim, k) |
| 351 | + |
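| | +# Usage sketch (editorial addition, not in the original source): 1.5-entmax
| | +# interpolates between softmax (alpha=1, dense) and sparsemax (alpha=2), and it
| | +# can still return exact zeros for low-scoring entries.
| | +#
| | +#   >>> x = torch.tensor([[2.0, 1.0, -1.0]])
| | +#   >>> entmax15(x, dim=-1)           # roughly [[0.83, 0.17, 0.00]]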
| 352 | + |
| 353 | +class Sparsemax(nn.Module): |
| 354 | + def __init__(self, dim=-1, k=None): |
| 355 | + """sparsemax: normalizing sparse transform (a la softmax). |
| 356 | +
| 357 | + Solves the projection: |
| 358 | +
| 359 | + min_p ||x - p||_2 s.t. p >= 0, sum(p) == 1. |
| 360 | +
| 361 | + Parameters |
| 362 | + ---------- |
| 363 | + dim : int |
| 364 | + The dimension along which to apply sparsemax. |
| 365 | +
| 366 | + k : int or None |
| 367 | + number of largest elements to partial-sort over. For optimal |
| 368 | + performance, should be slightly bigger than the expected number of |
| 369 | + nonzeros in the solution. If the solution is more than k-sparse, |
| 370 | + this function is recursively called with a 2*k schedule. |
| 371 | + If `None`, full sorting is performed from the beginning. |
| 372 | + """ |
| 373 | + self.dim = dim |
| 374 | + self.k = k |
| 375 | + super().__init__() |
| 376 | + |
| 377 | + def forward(self, X): |
| 378 | + return sparsemax(X, dim=self.dim, k=self.k) |
| 379 | + |
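| | +# Usage sketch (editorial addition, not in the original source): the module form
| | +# is a drop-in replacement for nn.Softmax wherever a sparse distribution over
| | +# features or attention scores is wanted.
| | +#
| | +#   >>> head = nn.Sequential(nn.Linear(16, 8), Sparsemax(dim=-1))
| | +#   >>> probs = head(torch.randn(4, 16))   # each row sums to 1, some entries exactly 0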
| 380 | + |
| 381 | +class Entmax15(nn.Module): |
| 382 | + def __init__(self, dim=-1, k=None): |
| 383 | + """1.5-entmax: normalizing sparse transform (a la softmax). |
| 384 | +
| 385 | + Solves the optimization problem: |
| 386 | +
| 387 | + max_p <x, p> + H_1.5(p) s.t. p >= 0, sum(p) == 1.
| 388 | +
| 389 | + where H_1.5(p) is the Tsallis alpha-entropy with alpha=1.5. |
| 390 | +
| 391 | + Parameters |
| 392 | + ---------- |
| 393 | + dim : int |
| 394 | + The dimension along which to apply 1.5-entmax. |
| 395 | +
| 396 | + k : int or None |
| 397 | + number of largest elements to partial-sort over. For optimal |
| 398 | + performance, should be slightly bigger than the expected number of |
| 399 | + nonzeros in the solution. If the solution is more than k-sparse, |
| 400 | + this function is recursively called with a 2*k schedule. |
| 401 | + If `None`, full sorting is performed from the beginning. |
| 402 | + """ |
| 403 | + self.dim = dim |
| 404 | + self.k = k |
| 405 | + super().__init__() |
| 406 | + |
| 407 | + def forward(self, X): |
| 408 | + return entmax15(X, dim=self.dim, k=self.k) |
| 409 | + |
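| | +# Usage sketch (editorial addition, not in the original source): Entmax15 is used
| | +# the same way as Sparsemax; it typically keeps more entries nonzero while still
| | +# pruning the clearly irrelevant ones.
| | +#
| | +#   >>> head = nn.Sequential(nn.Linear(16, 8), Entmax15(dim=-1))
| | +#   >>> probs = head(torch.randn(4, 16))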
| 410 | + |
| 411 | +entmoid15 = Entmoid15.apply