Commit 0621230

[lwm] Add draft of Learning without Memorizing.
1 parent 14938b5 commit 0621230

3 files changed: +266, -18 lines

inclearn/lib/losses/distillation.py

Lines changed: 34 additions & 2 deletions
@@ -1,6 +1,8 @@
 import torch
 from torch.nn import functional as F

+from inclearn.lib import vizualization
+

 def mer_loss(new_logits, old_logits):
     """Distillation loss that is less important if the new model is unconfident.
@@ -145,7 +147,7 @@ def perceptual_features_reconstruction(list_attentions_a, list_attentions_b, fac
         a = F.normalize(a, p=2, dim=-1)
         b = F.normalize(b, p=2, dim=-1)

-        layer_loss = (F.pairwise_distance(a, b, p=2) ** 2) / (c * w * h)
+        layer_loss = (F.pairwise_distance(a, b, p=2)**2) / (c * w * h)
         loss += torch.mean(layer_loss)

     return factor * (loss / len(list_attentions_a))
@@ -163,7 +165,37 @@ def perceptual_style_reconstruction(list_attentions_a, list_attentions_b, factor
         gram_a = torch.bmm(a, a.transpose(2, 1)) / (c * w * h)
         gram_b = torch.bmm(b, b.transpose(2, 1)) / (c * w * h)

-        layer_loss = torch.frobenius_norm(gram_a - gram_b, dim=(1, 2)) ** 2
+        layer_loss = torch.frobenius_norm(gram_a - gram_b, dim=(1, 2))**2
         loss += layer_loss.mean()

     return factor * (loss / len(list_attentions_a))
+
+
+def gradcam_distillation(gradients_a, gradients_b, activations_a, activations_b, factor=1):
+    """Distillation loss between gradcam-generated attentions of two models.
+
+    References:
+        * Dhar et al.
+          Learning without Memorizing
+          CVPR 2019
+
+    :param gradients_a: Grad-CAM gradients of the new model's last conv layer.
+    :param gradients_b: Grad-CAM gradients of the old model's last conv layer.
+    :param activations_a: Activations of the new model's last conv layer.
+    :param activations_b: Activations of the old model's last conv layer.
+    :return: A scalar attention-distillation loss, scaled by factor (defaults to 1).
+    """
+    attentions_a = gradients_a * activations_a
+    attentions_b = gradients_b * activations_b
+
+    assert len(attentions_a.shape) == len(attentions_b.shape) == 4
+    assert attentions_a.shape == attentions_b.shape
+
+    batch_size = attentions_a.shape[0]
+
+    flat_attention_a = F.normalize(attentions_a.view(batch_size, -1), p=2, dim=-1)
+    flat_attention_b = F.normalize(attentions_b.view(batch_size, -1), p=2, dim=-1)
+
+    distances = F.pairwise_distance(flat_attention_a, flat_attention_b, p=1)
+
+    return factor * torch.mean(distances)
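
To make the tensor contract of the new loss concrete, here is a small, self-contained usage sketch. The random tensors and the helper name `gradcam_distillation_sketch` are illustrative stand-ins, not the repository's API; in the trainer, the gradients and activations come from the Grad-CAM hooks on the last convolutional layer.

```python
import torch
from torch.nn import functional as F

# Toy Grad-CAM gradients/activations for a batch of 4 samples, 16 channels, 7x7 maps.
# In the trainer these come from the hooks on `last_conv`; here they are random stand-ins.
gradients_new = torch.randn(4, 16, 7, 7)
gradients_old = torch.randn(4, 16, 7, 7)
activations_new = torch.randn(4, 16, 7, 7)
activations_old = torch.randn(4, 16, 7, 7)


def gradcam_distillation_sketch(grad_a, grad_b, act_a, act_b, factor=1):
    """Mirrors the committed loss: L1 distance between L2-normalized attention maps."""
    att_a = (grad_a * act_a).view(grad_a.shape[0], -1)   # attention = gradient * activation
    att_b = (grad_b * act_b).view(grad_b.shape[0], -1)
    att_a = F.normalize(att_a, p=2, dim=-1)
    att_b = F.normalize(att_b, p=2, dim=-1)
    return factor * F.pairwise_distance(att_a, att_b, p=1).mean()


loss = gradcam_distillation_sketch(gradients_new, gradients_old, activations_new, activations_old)
print(loss)  # a scalar tensor, differentiable w.r.t. the new model's tensors
```

The attention maps are formed as gradient times activation, flattened per sample, L2-normalized, and compared with an L1 distance averaged over the batch, matching the function added above.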

inclearn/lib/network/basenet.py

Lines changed: 45 additions & 16 deletions
@@ -26,6 +26,7 @@ def __init__(
         classifier_no_act=False,
         attention_hook=False,
         rotations_predictor=False,
+        gradcam_hook=False,
         dropout=0.
     ):
         super(BasicNet, self).__init__()
@@ -74,12 +75,17 @@ def __init__(
         self.extract_no_act = extract_no_act
         self.classifier_no_act = classifier_no_act
         self.attention_hook = attention_hook
+        self.gradcam_hook = gradcam_hook
         self.device = device

+        if self.gradcam_hook:
+            self._hooks = [None, None]
+            logger.info("Setting gradcam hook for gradients + activations of last conv.")
+            self.set_gradcam_hook()
         if self.extract_no_act:
-            print("Features will be extracted without the last ReLU.")
+            logger.info("Features will be extracted without the last ReLU.")
         if self.classifier_no_act:
-            print("No ReLU will be applied on features before feeding the classifier.")
+            logger.info("No ReLU will be applied on features before feeding the classifier.")

         self.to(self.device)

@@ -96,20 +102,19 @@ def on_epoch_end(self):
         self.post_processor.on_epoch_end()

     def forward(self, x):
-        outputs = self.convnet(x, attention_hook=self.attention_hook)
-        selected_outputs = outputs[0] if self.classifier_no_act else outputs[1]
-        logits = self.classifier(self.dropout(selected_outputs))
+        outputs = self.convnet(x)

-        outputs = {"logits": logits}
+        if self.classifier_no_act:
+            selected_features = outputs["raw_features"]
+        else:
+            selected_features = outputs["features"]
+        logits = self.classifier(self.dropout(selected_features))

-        if self.return_features:
-            if self.extract_no_act:
-                outputs["features"] = outputs[0]
-            else:
-                outputs["features"] = outputs[1]
+        outputs["logits"] = logits

-        if self.attention_hook:
-            outputs["attention_maps"] = outputs[2]
+        if self.gradcam_hook:
+            outputs["gradcam_gradients"] = self._gradcam_gradients
+            outputs["gradcam_activations"] = self._gradcam_activations

         return outputs

@@ -135,10 +140,10 @@ def add_custom_weights(self, weights):
         self.classifier.add_custom_weights(weights)

     def extract(self, x):
-        raw_features, features = self.convnet(x)
+        outputs = self.convnet(x)
         if self.extract_no_act:
-            return raw_features
-        return features
+            return outputs["raw_features"]
+        return outputs["features"]

     def predict_rotations(self, inputs):
         if self.rotations_predictor is None:
@@ -160,6 +165,9 @@ def freeze(self, trainable=False, model="all"):

         for param in model.parameters():
             param.requires_grad = trainable
+        if self.gradcam_hook and model == "convnet":
+            for param in self.convnet.last_conv.parameters():
+                param.requires_grad = True

         if not trainable:
             model.eval()
@@ -185,3 +193,24 @@ def copy(self):
     @property
     def n_classes(self):
         return self.classifier.n_classes
+
+    def unset_gradcam_hook(self):
+        self._hooks[0].remove()
+        self._hooks[1].remove()
+        self._hooks[0] = None
+        self._hooks[1] = None
+        self._gradcam_gradients, self._gradcam_activations = [None], [None]
+
+    def set_gradcam_hook(self):
+        self._gradcam_gradients, self._gradcam_activations = [None], [None]
+
+        def backward_hook(module, grad_input, grad_output):
+            self._gradcam_gradients[0] = grad_output[0]
+            return None
+
+        def forward_hook(module, input, output):
+            self._gradcam_activations[0] = output
+            return None
+
+        self._hooks[0] = self.convnet.last_conv.register_backward_hook(backward_hook)
+        self._hooks[1] = self.convnet.last_conv.register_forward_hook(forward_hook)
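
For readers unfamiliar with the hook mechanism `set_gradcam_hook` relies on, the sketch below shows the idea in isolation: a forward hook caches the last conv layer's output and a backward hook caches the gradient flowing into it, so gradient times activation yields a Grad-CAM style attention map. `TinyConvNet` and every name in it are toy stand-ins, not the repo's classes; newer PyTorch versions would prefer `register_full_backward_hook`, but the older `register_backward_hook` used in the commit still works for a single conv layer.

```python
import torch
from torch import nn


class TinyConvNet(nn.Module):
    """Toy stand-in for the repo's convnet: anything exposing a `last_conv` layer works."""

    def __init__(self):
        super().__init__()
        self.last_conv = nn.Conv2d(3, 8, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(8, 5)

    def forward(self, x):
        x = torch.relu(self.last_conv(x))
        return self.fc(self.pool(x).flatten(1))


net = TinyConvNet()
gradients, activations = [None], [None]


def backward_hook(module, grad_input, grad_output):
    # Called during backward: grad_output[0] is the gradient w.r.t. last_conv's output.
    gradients[0] = grad_output[0]


def forward_hook(module, inputs, output):
    # Called during forward: output is the last conv layer's feature maps.
    activations[0] = output


handles = [
    net.last_conv.register_backward_hook(backward_hook),
    net.last_conv.register_forward_hook(forward_hook),
]

logits = net(torch.randn(2, 3, 32, 32))
# Backpropagate a one-hot "gradient" for the top class, as LwM does for the old classes.
one_hot = torch.zeros_like(logits).scatter_(1, logits.argmax(dim=1, keepdim=True), 1.0)
logits.backward(gradient=one_hot)

attention = gradients[0] * activations[0]  # Grad-CAM style maps, shape (2, 8, 32, 32)
for handle in handles:
    handle.remove()
```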

inclearn/models/lwm.py

Lines changed: 187 additions & 0 deletions
@@ -0,0 +1,187 @@
+import logging
+import pdb
+
+import numpy as np
+import torch
+from torch.nn import functional as F
+
+from inclearn.lib import factory, loops, losses, network, utils
+from inclearn.models import IncrementalLearner
+
+EPSILON = 1e-8
+
+logger = logging.getLogger(__name__)
+
+
+class LwM(IncrementalLearner):
+
+    def __init__(self, args):
+        self._device = args["device"][0]
+        self._multiple_devices = args["device"]
+
+        self._opt_name = args["optimizer"]
+        self._lr = args["lr"]
+        self._lr_decay = args["lr_decay"]
+        self._weight_decay = args["weight_decay"]
+        self._n_epochs = args["epochs"]
+        self._scheduling = args["scheduling"]
+
+        self._distillation_config = args["distillation_config"]
+        self._attention_config = args.get("attention_config", {})
+
+        logger.info("Initializing LwM")
+
+        self._network = network.BasicNet(
+            args["convnet"],
+            convnet_kwargs=args.get("convnet_config", {}),
+            classifier_kwargs=args.get("classifier_config", {
+                "type": "fc",
+                "use_bias": True
+            }),
+            device=self._device,
+            gradcam_hook=True
+        )
+
+        self._n_classes = 0
+        self._old_model = None
+
+    @property
+    def network(self):
+        return self._network
+
+    @network.setter
+    def network(self, network_path):
+        if self._network is not None:
+            del self._network
+
+    def eval(self):
+        self._network.eval()
+
+    def train(self):
+        self._network.train()
+
+    def _before_task(self, data_loader, val_loader):
+        self._n_classes += self._task_size
+        self._network.add_classes(self._task_size)
+
+        self._optimizer = factory.get_optimizer(
+            self._network.parameters(), self._opt_name, self._lr, self._weight_decay
+        )
+        if self._scheduling is None:
+            self._scheduler = None
+        else:
+            self._scheduler = torch.optim.lr_scheduler.MultiStepLR(
+                self._optimizer, self._scheduling, gamma=self._lr_decay
+            )
+
+    def _train_task(self, train_loader, val_loader):
+        loops.single_loop(
+            train_loader,
+            val_loader,
+            self._multiple_devices,
+            self._network,
+            self._n_epochs,
+            self._optimizer,
+            scheduler=self._scheduler,
+            train_function=self._forward_loss,
+            eval_function=self._accuracy,
+            task=self._task,
+            n_tasks=self._n_tasks
+        )
+
+    def _after_task(self, inc_dataset):
+        self._network.zero_grad()
+        self._network.unset_gradcam_hook()
+        self._old_model = self._network.copy().eval().to(self._device)
+        self._network.on_task_end()
+
+        self._network.set_gradcam_hook()
+        self._old_model.set_gradcam_hook()
+
+    def _eval_task(self, loader):
+        ypred, ytrue = [], []
+
+        for input_dict in loader:
+            with torch.no_grad():
+                logits = self._network(input_dict["inputs"].to(self._device))["logits"]
+
+            ytrue.append(input_dict["targets"].numpy())
+            ypred.append(torch.softmax(logits, dim=1).cpu().numpy())
+
+        ytrue = np.concatenate(ytrue)
+        ypred = np.concatenate(ypred)
+
+        return ypred, ytrue
+
+    def _accuracy(self, loader):
+        ypred, ytrue = self._eval_task(loader)
+        ypred = ypred.argmax(axis=1)
+
+        return 100 * round(np.mean(ypred == ytrue), 3)
+
+    def _forward_loss(self, training_network, inputs, targets, memory_flags, metrics):
+        inputs, targets = inputs.to(self._device), targets.to(self._device)
+        onehot_targets = utils.to_onehot(targets, self._n_classes).to(self._device)
+
+        outputs = training_network(inputs)
+
+        loss = self._compute_loss(inputs, outputs, targets, onehot_targets, memory_flags, metrics)
+
+        if not utils.check_loss(loss):
+            raise ValueError("Loss became invalid ({}).".format(loss))
+
+        metrics["loss"] += loss.item()
+
+        return loss
+
+    def _compute_loss(self, inputs, outputs, targets, onehot_targets, memory_flags, metrics):
+        logits = outputs["logits"]
+
+        if self._old_model is None:
+            # Classification loss
+            loss = F.cross_entropy(logits, targets)
+            metrics["clf"] += loss.item()
+        else:
+            self._old_model.zero_grad()
+            old_outputs = self._old_model(inputs)
+            old_logits = old_outputs["logits"]
+
+            # Classification loss
+            loss = F.cross_entropy(
+                logits[..., -self._task_size:], (targets - self._n_classes + self._task_size)
+            )
+            metrics["clf"] += loss.item()
+
+            # Distillation on probabilities
+            distill_loss = self._distillation_config["factor"] * F.binary_cross_entropy_with_logits(
+                logits[..., :-self._task_size], torch.sigmoid(old_logits.detach())
+            )
+            metrics["dis"] += distill_loss.item()
+            loss += distill_loss
+
+            # Distillation on gradcam-generated attentions
+            if self._attention_config:
+                top_logits_indexes = logits[..., :-self._task_size].argmax(dim=1)
+                onehot_top_logits = utils.to_onehot(
+                    top_logits_indexes, self._n_classes - self._task_size
+                ).to(self._device)
+
+                logits[..., :-self._task_size].backward(
+                    gradient=onehot_top_logits, retain_graph=True, create_graph=True
+                )
+                old_logits.backward(
+                    gradient=onehot_top_logits, retain_graph=True, create_graph=True
+                )
+
+                attention_loss = losses.gradcam_distillation(
+                    outputs["gradcam_gradients"][0], old_outputs["gradcam_gradients"][0].detach(),
+                    outputs["gradcam_activations"][0],
+                    old_outputs["gradcam_activations"][0].detach(), **self._attention_config
+                )
+                metrics["ad"] += attention_loss.item()
+                loss += attention_loss
+
+                self._old_model.zero_grad()
+                self._network.zero_grad()
+
+        return loss
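
Finally, a hedged sketch of the kind of `args` dictionary this draft class appears to expect, inferred from the keys read in `__init__` and `_compute_loss`. The values and the convnet name are placeholders, not the repository's official configuration, and the snippet assumes the `inclearn` package from this repository is importable.

```python
import torch

from inclearn.models.lwm import LwM  # module added by this commit

# Keys mirror what LwM.__init__ and _compute_loss look up; the values are illustrative.
args = {
    "device": [torch.device("cpu")],          # first entry is used as the main device
    "optimizer": "sgd",
    "lr": 0.1,
    "lr_decay": 0.1,                          # gamma for MultiStepLR
    "weight_decay": 5e-4,
    "epochs": 70,
    "scheduling": [50, 60],                   # MultiStepLR milestones, or None to disable
    "convnet": "resnet18",                    # placeholder convnet name
    "convnet_config": {},
    "classifier_config": {"type": "fc", "use_bias": True},
    "distillation_config": {"factor": 1.0},   # weight of the logit-distillation term
    "attention_config": {"factor": 1.0},      # weight of the Grad-CAM attention term
}

model = LwM(args)
```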
