import os.path
import sys

import torch

from torchbenchmark import REPO_PATH
from torchbenchmark.tasks import NLP

from ...util.model import BenchmarkModel

# lit-llama is vendored as a git submodule; it has to be on sys.path before
# anything from the lit_llama package can be imported.
LIT_LLAMA_PATH = os.path.join(REPO_PATH, "submodules", "lit-llama")
sys.path.insert(0, LIT_LLAMA_PATH)

from lit_llama import LLaMA
from lit_llama.lora import lora, mark_only_lora_as_trainable
from lit_llama.utils import EmptyInitOnDevice, lazy_load, llama_model_lookup

class Model(BenchmarkModel):
    task = NLP.LANGUAGE_MODELING
    DEFAULT_EVAL_BSIZE = 1
    DEFAULT_TRAIN_BSIZE = 4  # micro_batch_size in lit-llama's finetune/lora.py

    def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]):
        super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args)

        # Hyperparameters taken from lit-llama's finetune/lora.py
        lora_r = 8
        lora_alpha = 16
        lora_dropout = 0.05

        checkpoint_path = os.path.join(LIT_LLAMA_PATH, "checkpoints/lit-llama/7B/lit-llama.pth")
        if not os.path.exists(checkpoint_path):
            raise NotImplementedError(f"lit-llama 7B checkpoint not found at {checkpoint_path}")
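
        # `lazy_load` defers reading checkpoint tensors until they are accessed,
        # and the `lora` context manager makes LLaMA build its attention
        # projections with LoRA adapters (rank `lora_r`) attached.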
        with lazy_load(checkpoint_path) as checkpoint, lora(r=lora_r, alpha=lora_alpha, dropout=lora_dropout, enabled=True):
            name = llama_model_lookup(checkpoint)

            with EmptyInitOnDevice(device=device):
                model = LLaMA.from_name(name)
                # strict=False: the LoRA weights are not part of the base checkpoint.
                model.load_state_dict(checkpoint, strict=False)

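            # Freeze the base model; only the LoRA adapter parameters remain trainable.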
            mark_only_lora_as_trainable(model)

        self.model = model
        self.seq_len = 32
        self.max_seq_len = 64
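        # Example inputs for the model's forward pass: dummy token ids of shape
        # [batch_size, seq_len] plus the maximum sequence length.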
        self.example_inputs = (
            torch.ones([self.batch_size, self.seq_len], dtype=torch.int32, device=self.device),
            self.max_seq_len,
        )

    def get_module(self):
        return self.model, self.example_inputs

    def train(self):
        logits = self.model(*self.example_inputs)
        # Summing the logits gives a scalar proxy loss: it exercises the forward
        # and backward passes without needing labels or a real fine-tuning objective.
        logits.sum().backward()

    def eval(self):
        self.model.eval()
        with torch.no_grad():
            logits = self.model(*self.example_inputs)
        return (logits,)