From eb2db8b0b6f15e708b6bd2a16c1d348ad2a2b9b9 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 20 Nov 2024 12:27:14 -0800 Subject: [PATCH 1/6] feat: TRTLLM API handle tokenizers without pad_id (e.g., tiktoken) (#399) Signed-off-by: Terry Kong Signed-off-by: NeMo-Aligner CI Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo_aligner/utils/trt_llm.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/nemo_aligner/utils/trt_llm.py b/nemo_aligner/utils/trt_llm.py index 1f879064d..9f1a9345f 100644 --- a/nemo_aligner/utils/trt_llm.py +++ b/nemo_aligner/utils/trt_llm.py @@ -44,8 +44,9 @@ def append_and_repad_list(list_of_items, item_to_append, pad_id): class GPTGenerateTRTLLM: - # If a tokenizer does not have a pad_id, we use a large negative number and replace - # with self.eos_id after generation. + # Use a reserved negative number since there is variation between tokenizers if + # they (1) have a pad_id (2) don't have a pad_id or (3) have None as the pad_id. + # This pad_id is replaced with eos_id after generation. DEFAULT_PAD_ID = -42 def __init__( @@ -72,12 +73,6 @@ def __init__( "You are trying to use NeMo-Aligner's TensorRT-LLM acceleration for LLM generation. Please build the dockerfile to enable this feature: https://github.com/NVIDIA/NeMo-Aligner/blob/main/Dockerfile" ) - # If this assert turns out to be a blocker with some tokenizers, potential workarounds could be to: - # - add a config option to allow specifying which token we pass as `end_id` to TRT-LLM (should - # be a token that the model is guaranteed to never generate) - assert ( - tokenizer.pad_id != tokenizer.eos_id - ), f"We require tokenizers to have a different {tokenizer.pad_id=} than {tokenizer.eos_id=} when using TRT-LLM. This is to make sure all code goes into the same path and include the eos_id when the response lengths are computed" assert max_input_len > 0 assert max_generation_length > 0 assert ( @@ -104,7 +99,7 @@ def __init__( rng_generator.manual_seed(seed) self.rng_generator = rng_generator - self.pad_id = tokenizer.pad_id if tokenizer.pad_id is not None else GPTGenerateTRTLLM.DEFAULT_PAD_ID + self.pad_id = GPTGenerateTRTLLM.DEFAULT_PAD_ID self.eos_id = tokenizer.eos_id end_strings = list(end_strings) From 52700819882e871ef1deaeed8f974f92c083725e Mon Sep 17 00:00:00 2001 From: arendu Date: Wed, 30 Oct 2024 21:00:54 +0000 Subject: [PATCH 2/6] feat: nemotron-5 features wip Signed-off-by: arendu docs: 0.5.0 documentation updates (#346) Signed-off-by: ashors1 ci: Sign-off cherry pick (#366) Signed-off-by: Oliver Koenig docs: main readme and sft docs (#367) Signed-off-by: Oleksii Kuchaiev Co-authored-by: Gerald Shen <119401249+gshennvm@users.noreply.github.com> docs: fix code block rendering (#369) Signed-off-by: ashors1 dpo and sft Signed-off-by: arendu dpo support Signed-off-by: root mamba padding Signed-off-by: arendu convenience script to remove old format of DPO data Signed-off-by: adithyare pad to mult 256 Signed-off-by: arendu copy dpo style cfg overrides Signed-off-by: arendu remove _modify_config Signed-off-by: arendu fix config issue Signed-off-by: Jiaqi Zeng fix mamba config issue Signed-off-by: Jiaqi Zeng is mamba default false Signed-off-by: arendu revert cherry-pick-release-commit Signed-off-by: Terry Kong Revert "revert cherry-pick-release-commit" This reverts commit 911337c626fd51688a342377ed98a9e3d0f226c8. 
undo .github/workflows Signed-off-by: Terry Kong revert docs changes that weren't supposed to be there Signed-off-by: Terry Kong --- examples/nlp/gpt/conf/gpt_dpo.yaml | 1 + examples/nlp/gpt/conf/gpt_sft.yaml | 2 +- examples/nlp/gpt/train_gpt_dpo.py | 4 +- examples/nlp/gpt/train_gpt_sft.py | 82 ++----------------- nemo_aligner/data/nlp/builders.py | 3 +- .../data/nlp/scripts/undo_special_tokens.py | 70 ++++++++++++++++ nemo_aligner/models/nlp/gpt/gpt_sft_model.py | 6 ++ .../models/nlp/gpt/megatron_gpt_dpo_model.py | 7 ++ 8 files changed, 97 insertions(+), 78 deletions(-) create mode 100644 nemo_aligner/data/nlp/scripts/undo_special_tokens.py diff --git a/examples/nlp/gpt/conf/gpt_dpo.yaml b/examples/nlp/gpt/conf/gpt_dpo.yaml index 192265244..4a240bc38 100644 --- a/examples/nlp/gpt/conf/gpt_dpo.yaml +++ b/examples/nlp/gpt/conf/gpt_dpo.yaml @@ -57,6 +57,7 @@ model: micro_batch_size: 1 global_batch_size: 64 megatron_amp_O2: True + mamba_hybrid: False dpo: # This default value ensures there are no numeric differences beween trained and reference policies when computing log probs. diff --git a/examples/nlp/gpt/conf/gpt_sft.yaml b/examples/nlp/gpt/conf/gpt_sft.yaml index bdd757f31..745f6ae01 100644 --- a/examples/nlp/gpt/conf/gpt_sft.yaml +++ b/examples/nlp/gpt/conf/gpt_sft.yaml @@ -191,7 +191,7 @@ model: output_original_text: True # needed for the proper metrics support optim: - name: distributed_fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. + name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. lr: 3e-5 weight_decay: 0.01 betas: diff --git a/examples/nlp/gpt/train_gpt_dpo.py b/examples/nlp/gpt/train_gpt_dpo.py index f16a9dacf..4fcac6700 100644 --- a/examples/nlp/gpt/train_gpt_dpo.py +++ b/examples/nlp/gpt/train_gpt_dpo.py @@ -21,7 +21,7 @@ from nemo.utils.exp_manager import exp_manager from nemo_aligner.algorithms.dpo import DPOTrainer, dpo_custom_collate from nemo_aligner.data.nlp.builders import build_dataloader, build_train_valid_test_dpo_datasets, identity_collate -from nemo_aligner.models.nlp.gpt.megatron_gpt_dpo_model import MegatronGPTDPOModel +from nemo_aligner.models.nlp.gpt.megatron_gpt_dpo_model import MegatronGPTDPOModel, MegatronMambaDPOModel from nemo_aligner.utils.distributed import Timer from nemo_aligner.utils.train_script_utils import ( CustomLoggerWrapper, @@ -53,7 +53,7 @@ def main(cfg) -> None: logger = CustomLoggerWrapper(trainer.loggers) ptl_model = load_from_nemo( - MegatronGPTDPOModel, + MegatronMambaDPOModel if cfg.model.mamba_hybrid else MegatronGPTDPOModel, cfg.model, trainer, strict=True, diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py index 371c0f5aa..7a2b4a2fb 100644 --- a/examples/nlp/gpt/train_gpt_sft.py +++ b/examples/nlp/gpt/train_gpt_sft.py @@ -27,7 +27,7 @@ from nemo.utils.exp_manager import exp_manager from nemo_aligner.algorithms.supervised import SupervisedTrainer from nemo_aligner.data.nlp.builders import build_dataloader, build_sft_dataset -from nemo_aligner.models.nlp.gpt.gpt_sft_model import GPTSFTModel +from nemo_aligner.models.nlp.gpt.gpt_sft_model import GPTSFTModel, MambaSFTModel from nemo_aligner.utils.distributed import Timer from nemo_aligner.utils.train_script_utils import ( CustomLoggerWrapper, @@ -39,7 +39,7 @@ resolve_and_create_trainer, retrieve_custom_trainer_state_dict, ) -from 
nemo_aligner.utils.utils import load_from_nemo +from nemo_aligner.utils.utils import load_and_override_model_config, load_from_nemo """Script to start SFT training""" @@ -49,75 +49,9 @@ mp.set_start_method("spawn", force=True) -def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False): - """ - This function modifies the original gpt pre-training config (gpt_cfg) with attributes from the finetuning config (cfg). - The `add_cfg_to_tree` arg adds `cfg` to the top of the yaml tree which is needed for all `hparams.yaml` files when passed as an arg to `load_from_checkpoint()`. - """ - OmegaConf.set_struct(gpt_cfg, True) - OmegaConf.resolve(cfg) - with open_dict(gpt_cfg): - gpt_cfg.megatron_amp_O2 = cfg.model.get("megatron_amp_O2", False) - gpt_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size - gpt_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size - gpt_cfg.sequence_parallel = cfg.model.get("sequence_parallel", False) - gpt_cfg.activations_checkpoint_granularity = cfg.model.get("activations_checkpoint_granularity", None) - gpt_cfg.activations_checkpoint_num_layers = cfg.model.get("activations_checkpoint_num_layers", None) - gpt_cfg.activations_checkpoint_method = cfg.model.get("activations_checkpoint_method", None) - gpt_cfg.activations_checkpoint_layers_per_pipeline = cfg.model.get( - "activations_checkpoint_layers_per_pipeline", None - ) - gpt_cfg.peft = cfg.model.peft - gpt_cfg.data = cfg.model.data - gpt_cfg.optim = cfg.model.optim - gpt_cfg.precision = cfg.trainer.precision - gpt_cfg.answer_only_loss = cfg.model.answer_only_loss - gpt_cfg.restore_from_path = cfg.model.restore_from_path - gpt_cfg.resume_from_checkpoint = cfg.model.resume_from_checkpoint - gpt_cfg.save_nemo_on_validation_end = cfg.model.save_nemo_on_validation_end - gpt_cfg.gradient_as_bucket_view = cfg.model.gradient_as_bucket_view - gpt_cfg.hidden_dropout = cfg.model.get("hidden_dropout", 0.0) - gpt_cfg.attention_dropout = cfg.model.get("attention_dropout", 0.0) - gpt_cfg.ffn_dropout = cfg.model.ffn_dropout - gpt_cfg.use_flash_attention = cfg.model.get("use_flash_attention", False) - # if TP/PP size is -1, use default TP/PP size as original model - if cfg.model.get("tensor_model_parallel_size", 1) > 0: - gpt_cfg.tensor_model_parallel_size = cfg.model.get("tensor_model_parallel_size", 1) - if cfg.model.get("pipeline_model_parallel_size", 1) > 0: - gpt_cfg.pipeline_model_parallel_size = cfg.model.get("pipeline_model_parallel_size", 1) - gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0) - - if cfg.model.data.get("chat", False): - # chat model, overwrite the prompt template - prompt_template = get_prompt_template_example(cfg.model.data.chat_prompt_tokens) - gpt_cfg.data.train_ds.prompt_template = prompt_template - gpt_cfg.data.validation_ds.prompt_template = prompt_template - - sft_cls = GPTSFTModel - gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}" - - if cfg.model.get("use_flash_attention", None) is not None: - gpt_cfg.use_flash_attention = cfg.model.use_flash_attention - - if cfg.model.get("seq_len_interpolation_factor", None) is not None: - gpt_cfg.seq_len_interpolation_factor = cfg.model.seq_len_interpolation_factor - - if cfg.model.get("dist_ckpt_load_strictness", None) is not None: - gpt_cfg.dist_ckpt_load_strictness = cfg.model.dist_ckpt_load_strictness - - gpt_cfg.inference = cfg.model.get("inference", {}) - - # This is needed when modifying a hparam file directly to load `.ckpt` files. 
- # This is not needed to modify the cfg in `.nemo` files. - if add_cfg_to_tree: - OmegaConf.resolve(gpt_cfg) - gpt_cfg.cfg = gpt_cfg - - return gpt_cfg - - @hydra_runner(config_path="conf", config_name="gpt_sft") def main(cfg) -> None: + cfg.model = load_and_override_model_config(cfg.model.restore_from_path, cfg.model) logging.info("\n\n************** Experiment configuration ***********") logging.info(f"\n{OmegaConf.to_yaml(cfg)}") @@ -129,17 +63,15 @@ def main(cfg) -> None: with open_dict(cfg): cfg.model.precision = cfg.trainer.precision - ptl_model, updated_cfg = load_from_nemo( - GPTSFTModel, + ptl_model = load_from_nemo( + MambaSFTModel if cfg.model.get("mamba_hybrid", False) else GPTSFTModel, cfg, trainer, strict=True, - modify_config_fn=_modify_config, restore_path=cfg.model.restore_from_path, - return_updated_cfg=True, ) - init_peft(ptl_model, updated_cfg) + init_peft(ptl_model, cfg.model) with open_dict(cfg): # overwrite the model config with the config from the checkpoint @@ -173,6 +105,7 @@ def main(cfg) -> None: train_data_cfg, ptl_model.tokenizer, num_samples, + is_mamba=cfg.model.get("mamba_hybrid", False), answer_only_loss=True, is_chat=cfg.model.data.chat, special_tokens=cfg.model.data.chat_prompt_tokens, @@ -185,6 +118,7 @@ def main(cfg) -> None: val_data_cfg, ptl_model.tokenizer, num_samples, + is_mamba=cfg.model.get("mamba_hybrid", False), answer_only_loss=True, is_chat=cfg.model.data.chat, special_tokens=cfg.model.data.chat_prompt_tokens, diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py index 97b68ffe4..ef5c02438 100644 --- a/nemo_aligner/data/nlp/builders.py +++ b/nemo_aligner/data/nlp/builders.py @@ -379,7 +379,7 @@ def build_dataset(index, name): ) -def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None): +def build_sft_dataset(data_cfg, tokenizer, num_samples, is_mamba=False, answer_only_loss=True, is_chat=True, special_tokens=None): packed_sequence = data_cfg.get("packed_sequence", False) dataset_kwargs = {} @@ -411,6 +411,7 @@ def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, i answer_only_loss=answer_only_loss, truncation_field=data_cfg.get("truncation_field", "text"), pad_to_max_length=data_cfg.get("pad_to_max_length", False), + pad_seq_length_to_mult=256 if is_mamba else 16, index_mapping_dir=data_cfg.get("index_mapping_dir", None), prompt_template=data_cfg.get("prompt_template", None), virtual_tokens=0, diff --git a/nemo_aligner/data/nlp/scripts/undo_special_tokens.py b/nemo_aligner/data/nlp/scripts/undo_special_tokens.py new file mode 100644 index 000000000..3b06f9c8a --- /dev/null +++ b/nemo_aligner/data/nlp/scripts/undo_special_tokens.py @@ -0,0 +1,70 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Script to remove special tokens from dpo datasets +and convert them into list of messages format""" + +import json +import re +import sys +input_jsonl = sys.argv[1] +output_jsonl = input_jsonl.replace(".jsonl", ".no_special_toks.jsonl") + +def format_conversation(input_string): + # Define roles and patterns + role_patterns = { + "System": "system", + "User": "user", + "Assistant": "assistant" + } + + # Initialize an empty output list + conversation = [] + + # Use regex to find each segment's role and content + segments = re.findall(r"([^\n]+)\n(.*?)((?=")[0] + rejected = rejected.split("\n")[0] + chosen_message = {"role": empty_assistant["role"], "content": chosen} + rejected_message = {"role": empty_assistant["role"], "content": rejected} + j_out = {"prompt": undo_spl_prompt, "chosen_response": chosen_message, "rejected_response": rejected_message, "chosen_reward": j["chosen_reward"], "rejected_reward": j["rejected_reward"]} + w.write(json.dumps(j_out) + "\n") + +old_format_example="""System\n\nUser\nDoes GPT-4 use RNN or Transformer models, and which one is better for this type of project?\nAssistant\nGPT-4 uses a transformer architecture, not a recurrent neural network. Both models are commonly used for natural language processing tasks, and both have advantages and disadvantages, so it depends on the specific application of the model.\nUser\nCould you explain in detail both the advantages and disadvantages from different perspectives?\nAssistant\nYes, here are a few different perspectives on the pros and cons of RNN and transformer models:\n \n\n The RNN model has better sequential memory and can better model sequences of data, whereas the transformer model has better parallel computation and can better model large input datasets. This makes RNNs better for applications like speech recognition and machine translation, where the input is a sequence of sounds or words, and transformers better for applications like natural language processing and text classification, where the input is a large collection of documents or social media posts.\n \n\n RNNs are more straightforward and easier to understand and debug, whereas transformers are more flexible and can be used for more diverse applications. This makes RNNs better for teaching and research, and transformers better for actual applications in industry and the real world.\n \n\n RNNs are more efficient and faster to run, whereas transformers are more accurate and produce better results. This makes RNNs better for fast prototyping and testing, and transformers better for actual deployment in production applications.\nUser\ncould you add more in a table\nAssistant\nHere are some more pros and cons of RNN and transformer models:\n \n\n Pros of RNNs:\n \n\n - Better sequential memory and handling of sequences\n - Simpler and easier to understand\n - Faster to run\n - Good for teaching and research\n \n\n Cons of RNNs:\n \n\n - Limited parallelism\n - Poor handling of large input datasets\n - Not as accurate as transformers\n - Not as flexible as transformers\n \n\n Pros of transformers:\n \n\n - Better parallelism and handling of large input datasets\n - More flexible and versatile\n - Higher accuracy\n - Good for actual applications in industry and the real world\n \n\n Cons of transformers:\n \n\n - More complex and harder to understand and debug\n - Slower to run\n - Not as good for teaching and research\nUser\nFor a chat-based music recommendation platform, which model is recommended? 
Please mention any model that works well, including those mentioned above or any others.\nAssistant\nFor a chat-based music recommendation platform, I would recommend using a transformer architecture, such as the GPT-4 model. Transformers are better for handling large input datasets, and they are more flexible and versatile than RNNs. They are also more accurate, so they are better for actually recommending music to users in a real-world application. RNNs are better for teaching and research, as they are simpler and easier to understand and debug.\nUser\ncould you add more?\nAssistant\n""" + +print("****************************************") +print("old format prompt") +print("****************************************") +print(old_format_example) +print("****************************************") +print("new format prompt") +print("****************************************") +new_format_example, _ = format_conversation(old_format_example) +print(json.dumps(new_format_example, indent=2)) diff --git a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py index d3a615500..15bc69c00 100644 --- a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py +++ b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py @@ -22,6 +22,7 @@ from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split from nemo.collections.nlp.modules.common.text_generation_strategy import TextGenerationStrategy from nemo.collections.nlp.modules.common.text_generation_utils import ( @@ -225,3 +226,8 @@ def finish_inference(self): self._restore_activation_checkpointing_args() self._restore_sequence_parallelism_args() set_train(self) + + +class MambaSFTModel(MegatronMambaModel, GPTSFTModel): + def __init__(self, cfg: DictConfig, trainer: Trainer): + super().__init__(cfg, trainer=trainer) \ No newline at end of file diff --git a/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py b/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py index 952b4e897..d7e69d7ef 100644 --- a/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py +++ b/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py @@ -16,6 +16,8 @@ from functools import partial import torch +from megatron.core import parallel_state +from megatron.core.models.mamba import MambaModel from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.utils import divide @@ -23,6 +25,7 @@ from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel from nemo.collections.nlp.modules.common.megatron.utils import ( average_losses_across_data_parallel_group, get_iterator_k_split, @@ -460,3 +463,7 @@ def get_ref_policy_logprobs(self, batch): # return in GPU, trainer needs to move to cpu return ref_log_probs + +class MegatronMambaDPOModel(MegatronMambaModel, MegatronGPTDPOModel): # @adithyare inherence order matters + def __init__(self, cfg: DictConfig, trainer: Trainer): + super().__init__(cfg, trainer=trainer) \ No newline at end of file From 6394adb4c51b54fbe0357f3c24cd98b33a90e4d5 Mon Sep 17 00:00:00 2001 
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 21 Nov 2024 23:30:20 +0000 Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: NeMo-Aligner CI --- nemo_aligner/data/nlp/builders.py | 4 ++- .../data/nlp/scripts/undo_special_tokens.py | 27 +++++++++++-------- nemo_aligner/models/nlp/gpt/gpt_sft_model.py | 2 +- .../models/nlp/gpt/megatron_gpt_dpo_model.py | 5 ++-- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py index ef5c02438..5f51ab951 100644 --- a/nemo_aligner/data/nlp/builders.py +++ b/nemo_aligner/data/nlp/builders.py @@ -379,7 +379,9 @@ def build_dataset(index, name): ) -def build_sft_dataset(data_cfg, tokenizer, num_samples, is_mamba=False, answer_only_loss=True, is_chat=True, special_tokens=None): +def build_sft_dataset( + data_cfg, tokenizer, num_samples, is_mamba=False, answer_only_loss=True, is_chat=True, special_tokens=None +): packed_sequence = data_cfg.get("packed_sequence", False) dataset_kwargs = {} diff --git a/nemo_aligner/data/nlp/scripts/undo_special_tokens.py b/nemo_aligner/data/nlp/scripts/undo_special_tokens.py index 3b06f9c8a..602f106a2 100644 --- a/nemo_aligner/data/nlp/scripts/undo_special_tokens.py +++ b/nemo_aligner/data/nlp/scripts/undo_special_tokens.py @@ -18,32 +18,31 @@ import json import re import sys + input_jsonl = sys.argv[1] output_jsonl = input_jsonl.replace(".jsonl", ".no_special_toks.jsonl") + def format_conversation(input_string): # Define roles and patterns - role_patterns = { - "System": "system", - "User": "user", - "Assistant": "assistant" - } - + role_patterns = {"System": "system", "User": "user", "Assistant": "assistant"} + # Initialize an empty output list conversation = [] - + # Use regex to find each segment's role and content segments = re.findall(r"([^\n]+)\n(.*?)((?=")[0] chosen_message = {"role": empty_assistant["role"], "content": chosen} rejected_message = {"role": empty_assistant["role"], "content": rejected} - j_out = {"prompt": undo_spl_prompt, "chosen_response": chosen_message, "rejected_response": rejected_message, "chosen_reward": j["chosen_reward"], "rejected_reward": j["rejected_reward"]} + j_out = { + "prompt": undo_spl_prompt, + "chosen_response": chosen_message, + "rejected_response": rejected_message, + "chosen_reward": j["chosen_reward"], + "rejected_reward": j["rejected_reward"], + } w.write(json.dumps(j_out) + "\n") -old_format_example="""System\n\nUser\nDoes GPT-4 use RNN or Transformer models, and which one is better for this type of project?\nAssistant\nGPT-4 uses a transformer architecture, not a recurrent neural network. Both models are commonly used for natural language processing tasks, and both have advantages and disadvantages, so it depends on the specific application of the model.\nUser\nCould you explain in detail both the advantages and disadvantages from different perspectives?\nAssistant\nYes, here are a few different perspectives on the pros and cons of RNN and transformer models:\n \n\n The RNN model has better sequential memory and can better model sequences of data, whereas the transformer model has better parallel computation and can better model large input datasets. 
This makes RNNs better for applications like speech recognition and machine translation, where the input is a sequence of sounds or words, and transformers better for applications like natural language processing and text classification, where the input is a large collection of documents or social media posts.\n \n\n RNNs are more straightforward and easier to understand and debug, whereas transformers are more flexible and can be used for more diverse applications. This makes RNNs better for teaching and research, and transformers better for actual applications in industry and the real world.\n \n\n RNNs are more efficient and faster to run, whereas transformers are more accurate and produce better results. This makes RNNs better for fast prototyping and testing, and transformers better for actual deployment in production applications.\nUser\ncould you add more in a table\nAssistant\nHere are some more pros and cons of RNN and transformer models:\n \n\n Pros of RNNs:\n \n\n - Better sequential memory and handling of sequences\n - Simpler and easier to understand\n - Faster to run\n - Good for teaching and research\n \n\n Cons of RNNs:\n \n\n - Limited parallelism\n - Poor handling of large input datasets\n - Not as accurate as transformers\n - Not as flexible as transformers\n \n\n Pros of transformers:\n \n\n - Better parallelism and handling of large input datasets\n - More flexible and versatile\n - Higher accuracy\n - Good for actual applications in industry and the real world\n \n\n Cons of transformers:\n \n\n - More complex and harder to understand and debug\n - Slower to run\n - Not as good for teaching and research\nUser\nFor a chat-based music recommendation platform, which model is recommended? Please mention any model that works well, including those mentioned above or any others.\nAssistant\nFor a chat-based music recommendation platform, I would recommend using a transformer architecture, such as the GPT-4 model. Transformers are better for handling large input datasets, and they are more flexible and versatile than RNNs. They are also more accurate, so they are better for actually recommending music to users in a real-world application. RNNs are better for teaching and research, as they are simpler and easier to understand and debug.\nUser\ncould you add more?\nAssistant\n""" +old_format_example = """System\n\nUser\nDoes GPT-4 use RNN or Transformer models, and which one is better for this type of project?\nAssistant\nGPT-4 uses a transformer architecture, not a recurrent neural network. Both models are commonly used for natural language processing tasks, and both have advantages and disadvantages, so it depends on the specific application of the model.\nUser\nCould you explain in detail both the advantages and disadvantages from different perspectives?\nAssistant\nYes, here are a few different perspectives on the pros and cons of RNN and transformer models:\n \n\n The RNN model has better sequential memory and can better model sequences of data, whereas the transformer model has better parallel computation and can better model large input datasets. 
This makes RNNs better for applications like speech recognition and machine translation, where the input is a sequence of sounds or words, and transformers better for applications like natural language processing and text classification, where the input is a large collection of documents or social media posts.\n \n\n RNNs are more straightforward and easier to understand and debug, whereas transformers are more flexible and can be used for more diverse applications. This makes RNNs better for teaching and research, and transformers better for actual applications in industry and the real world.\n \n\n RNNs are more efficient and faster to run, whereas transformers are more accurate and produce better results. This makes RNNs better for fast prototyping and testing, and transformers better for actual deployment in production applications.\nUser\ncould you add more in a table\nAssistant\nHere are some more pros and cons of RNN and transformer models:\n \n\n Pros of RNNs:\n \n\n - Better sequential memory and handling of sequences\n - Simpler and easier to understand\n - Faster to run\n - Good for teaching and research\n \n\n Cons of RNNs:\n \n\n - Limited parallelism\n - Poor handling of large input datasets\n - Not as accurate as transformers\n - Not as flexible as transformers\n \n\n Pros of transformers:\n \n\n - Better parallelism and handling of large input datasets\n - More flexible and versatile\n - Higher accuracy\n - Good for actual applications in industry and the real world\n \n\n Cons of transformers:\n \n\n - More complex and harder to understand and debug\n - Slower to run\n - Not as good for teaching and research\nUser\nFor a chat-based music recommendation platform, which model is recommended? Please mention any model that works well, including those mentioned above or any others.\nAssistant\nFor a chat-based music recommendation platform, I would recommend using a transformer architecture, such as the GPT-4 model. Transformers are better for handling large input datasets, and they are more flexible and versatile than RNNs. They are also more accurate, so they are better for actually recommending music to users in a real-world application. 
RNNs are better for teaching and research, as they are simpler and easier to understand and debug.\nUser\ncould you add more?\nAssistant\n""" print("****************************************") print("old format prompt") diff --git a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py index 15bc69c00..ed0e7d476 100644 --- a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py +++ b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py @@ -230,4 +230,4 @@ def finish_inference(self): class MambaSFTModel(MegatronMambaModel, GPTSFTModel): def __init__(self, cfg: DictConfig, trainer: Trainer): - super().__init__(cfg, trainer=trainer) \ No newline at end of file + super().__init__(cfg, trainer=trainer) diff --git a/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py b/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py index d7e69d7ef..5b52efb3d 100644 --- a/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py +++ b/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py @@ -464,6 +464,7 @@ def get_ref_policy_logprobs(self, batch): # return in GPU, trainer needs to move to cpu return ref_log_probs -class MegatronMambaDPOModel(MegatronMambaModel, MegatronGPTDPOModel): # @adithyare inherence order matters + +class MegatronMambaDPOModel(MegatronMambaModel, MegatronGPTDPOModel): # @adithyare inherence order matters def __init__(self, cfg: DictConfig, trainer: Trainer): - super().__init__(cfg, trainer=trainer) \ No newline at end of file + super().__init__(cfg, trainer=trainer) From 77238793d4e73e284c2d544f682264c82ffb74df Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 21 Nov 2024 17:48:59 -0800 Subject: [PATCH 4/6] feat: dpo dataset new openai chat completion format --- examples/nlp/gpt/train_gpt_sft.py | 1 + nemo_aligner/data/nlp/datasets.py | 103 ++++++++++++++++-- .../data/nlp/scripts/undo_special_tokens.py | 63 +++++------ setup/requirements.txt | 1 + tests/test_datasets.py | 56 ++++++++++ 5 files changed, 181 insertions(+), 43 deletions(-) diff --git a/examples/nlp/gpt/train_gpt_sft.py b/examples/nlp/gpt/train_gpt_sft.py index 7a2b4a2fb..0e2759f56 100644 --- a/examples/nlp/gpt/train_gpt_sft.py +++ b/examples/nlp/gpt/train_gpt_sft.py @@ -52,6 +52,7 @@ @hydra_runner(config_path="conf", config_name="gpt_sft") def main(cfg) -> None: cfg.model = load_and_override_model_config(cfg.model.restore_from_path, cfg.model) + logging.info("\n\n************** Experiment configuration ***********") logging.info(f"\n{OmegaConf.to_yaml(cfg)}") diff --git a/nemo_aligner/data/nlp/datasets.py b/nemo_aligner/data/nlp/datasets.py index b605942b0..a07bf61a1 100644 --- a/nemo_aligner/data/nlp/datasets.py +++ b/nemo_aligner/data/nlp/datasets.py @@ -15,13 +15,19 @@ """Custom datasets for RLHF training""" import os +from typing import Dict, List import numpy as np import scipy import torch +from omegaconf import OmegaConf from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import _create_ltor_masks_and_position_ids -from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import GPTSFTChatDataset +from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import ( + GPTSFTChatDataset, + _get_header_conversation_type_mask_role, + get_prompt_template_example, +) from nemo.core import Dataset from nemo.utils import logging @@ -344,16 +350,97 @@ def encode(self, text, append_eod=False): return text_ids, len(text_ids) + @staticmethod + def _convert_messages( + input_list: List[Dict[str, str]] + ) -> Dict: # TODO: (@adithyare) this method 
should live elsewhere. + """ + args: + input_list: is a list of dicts in the openai format + for example: + [{"role": "system", "content": "you are helpful"}, + {"role": "user", "content": "Why is the sky blue?"}, + {"role": "assistant", "content": "Because blablabla"}, + ...] + returns: + output_dict: a dict in nemo's format {"system": "system prompt", + "conversations": [], + ... + } + """ + output_dict = { + "system": "", + "conversations": [], + "mask": "User", + "type": "VALUE_TO_TEXT", + } + + # Extract the system message + num_system_msg = 0 + for msg in input_list: + if msg["role"] == "system": + output_dict["system"] = msg["content"] + num_system_msg += 1 + if num_system_msg > 1: + raise RuntimeError("Multiple system messages seen, please consolidate into a single system message.") + + # Build the conversations list + for msg in input_list: + if msg["role"] != "system": + conversation_entry = { + "from": msg["role"].capitalize(), # Capitalize 'user' and 'assistant' + "value": msg["content"], + "label": None, + } + output_dict["conversations"].append(conversation_entry) + + return output_dict + + def convert(self, messages): + """ + args: + messages: is a list of dicts in the openai format + for example: + [{"role": "system", "content": "you are helpful"}, + {"role": "user", "content": "Why is the sky blue?"}, + {"role": "assistant", "content": "Because blablabla"}, + ...] + returns: + conversation: is a string formatted with the chat template + """ + if OmegaConf.select(self.cfg, "data.chat_prompt_tokens") is None: + raise RuntimeError( + "You don't have a model (model_config.yaml) which has chat_prompt_tokens, are you sure this is a Chat/Instruction model?" + ) + special_tokens = self.cfg.data.chat_prompt_tokens + nemo_source = self._convert_messages(messages) + header, conversation, data_type, mask_role = _get_header_conversation_type_mask_role( + nemo_source, special_tokens + ) + return conversation + def __getitem__(self, idx): """Returns a pair of chosen/rejected pairs, their respective lengths, and labels.""" payload = self.data[idx] - prompt, prompt_len = self.encode(payload["prompt"], append_eod=False) - chosen, chosen_len = self.encode( - payload["prompt"] + payload["chosen_response"], append_eod=self.cfg.data.get("append_eod", False) - ) - reject, reject_len = self.encode( - payload["prompt"] + payload["rejected_response"], append_eod=self.cfg.data.get("append_eod", False) - ) + + if isinstance(payload["prompt"], str): + # (@adithyare) format with hardcoded chat tokens + # will allow this for the time being. + prompt_fmtd = payload["prompt"] + chosen_fmtd = payload["prompt"] + payload["chosen_response"] + rejected_fmtd = payload["prompt"] + payload["rejected_response"] + logging.warning( + "Pre-formatting chat conversation as string with hardcoded chat tokens will be deprecated." + ) # (@adithyare) this will spam the console for now.
+ else: + prompt_fmtd = self.convert(payload["prompt"]) # (@adithyare) read var as "prompt formatted" + chosen_fmtd = self.convert(payload["prompt"] + [payload["chosen_response"]]) + rejected_fmtd = self.convert(payload["prompt"] + [payload["rejected_response"]]) + + prompt, prompt_len = self.encode(prompt_fmtd, append_eod=False) + chosen, chosen_len = self.encode(chosen_fmtd, append_eod=self.cfg.data.get("append_eod", False)) + reject, reject_len = self.encode(rejected_fmtd, append_eod=self.cfg.data.get("append_eod", False)) + # chosen_response_only, chosen_response_len = self.encode(payload['chosen_response']) # reject_response_only, reject_response_len = self.encode(payload['rejected_response']) chosen_labels = ([-100] * prompt_len) + chosen[prompt_len:] diff --git a/nemo_aligner/data/nlp/scripts/undo_special_tokens.py b/nemo_aligner/data/nlp/scripts/undo_special_tokens.py index 602f106a2..591fabd6a 100644 --- a/nemo_aligner/data/nlp/scripts/undo_special_tokens.py +++ b/nemo_aligner/data/nlp/scripts/undo_special_tokens.py @@ -15,12 +15,9 @@ """Script to remove special tokens from dpo datasets and convert them into list of messages format""" +import argparse import json import re -import sys - -input_jsonl = sys.argv[1] -output_jsonl = input_jsonl.replace(".jsonl", ".no_special_toks.jsonl") def format_conversation(input_string): @@ -38,38 +35,34 @@ def format_conversation(input_string): role = role_patterns.get(role_tag.strip(), "unknown") conversation.append({"role": role, "content": content.strip()}) - empty_asst = conversation.pop() - - return conversation, empty_asst + return conversation -with open(input_jsonl, "r") as f, open(output_jsonl, "w") as w: - for line in f: - j = json.loads(line) - prompt = j["prompt"] - undo_spl_prompt, empty_assistant = format_conversation(prompt) - chosen, rejected = j["chosen_response"], j["rejected_response"] - chosen = chosen.split("\n")[0] - rejected = rejected.split("\n")[0] - chosen_message = {"role": empty_assistant["role"], "content": chosen} - rejected_message = {"role": empty_assistant["role"], "content": rejected} - j_out = { - "prompt": undo_spl_prompt, - "chosen_response": chosen_message, - "rejected_response": rejected_message, - "chosen_reward": j["chosen_reward"], - "rejected_reward": j["rejected_reward"], - } - w.write(json.dumps(j_out) + "\n") +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process a JSONL file.") + parser.add_argument("input_jsonl", type=str, help="Path to the input JSONL file.") + # Parse the arguments + args = parser.parse_args() -old_format_example = """System\n\nUser\nDoes GPT-4 use RNN or Transformer models, and which one is better for this type of project?\nAssistant\nGPT-4 uses a transformer architecture, not a recurrent neural network. Both models are commonly used for natural language processing tasks, and both have advantages and disadvantages, so it depends on the specific application of the model.\nUser\nCould you explain in detail both the advantages and disadvantages from different perspectives?\nAssistant\nYes, here are a few different perspectives on the pros and cons of RNN and transformer models:\n \n\n The RNN model has better sequential memory and can better model sequences of data, whereas the transformer model has better parallel computation and can better model large input datasets. 
This makes RNNs better for applications like speech recognition and machine translation, where the input is a sequence of sounds or words, and transformers better for applications like natural language processing and text classification, where the input is a large collection of documents or social media posts.\n \n\n RNNs are more straightforward and easier to understand and debug, whereas transformers are more flexible and can be used for more diverse applications. This makes RNNs better for teaching and research, and transformers better for actual applications in industry and the real world.\n \n\n RNNs are more efficient and faster to run, whereas transformers are more accurate and produce better results. This makes RNNs better for fast prototyping and testing, and transformers better for actual deployment in production applications.\nUser\ncould you add more in a table\nAssistant\nHere are some more pros and cons of RNN and transformer models:\n \n\n Pros of RNNs:\n \n\n - Better sequential memory and handling of sequences\n - Simpler and easier to understand\n - Faster to run\n - Good for teaching and research\n \n\n Cons of RNNs:\n \n\n - Limited parallelism\n - Poor handling of large input datasets\n - Not as accurate as transformers\n - Not as flexible as transformers\n \n\n Pros of transformers:\n \n\n - Better parallelism and handling of large input datasets\n - More flexible and versatile\n - Higher accuracy\n - Good for actual applications in industry and the real world\n \n\n Cons of transformers:\n \n\n - More complex and harder to understand and debug\n - Slower to run\n - Not as good for teaching and research\nUser\nFor a chat-based music recommendation platform, which model is recommended? Please mention any model that works well, including those mentioned above or any others.\nAssistant\nFor a chat-based music recommendation platform, I would recommend using a transformer architecture, such as the GPT-4 model. Transformers are better for handling large input datasets, and they are more flexible and versatile than RNNs. They are also more accurate, so they are better for actually recommending music to users in a real-world application. 
RNNs are better for teaching and research, as they are simpler and easier to understand and debug.\nUser\ncould you add more?\nAssistant\n""" + input_jsonl = args.input_jsonl + output_jsonl = input_jsonl.replace(".jsonl", ".no_special_toks.jsonl") -print("****************************************") -print("old format prompt") -print("****************************************") -print(old_format_example) -print("****************************************") -print("new format prompt") -print("****************************************") -new_format_example, _ = format_conversation(old_format_example) -print(json.dumps(new_format_example, indent=2)) + with open(input_jsonl, "r") as f, open(output_jsonl, "w") as w: + for line in f: + j = json.loads(line) + prompt = j["prompt"] + undo_spl_prompt = format_conversation(prompt) + empty_assistant = undo_spl_prompt.pop() + chosen, rejected = j["chosen_response"], j["rejected_response"] + chosen = chosen.split("\n")[0] + rejected = rejected.split("\n")[0] + chosen_message = {"role": empty_assistant["role"], "content": chosen} + rejected_message = {"role": empty_assistant["role"], "content": rejected} + j_out = { + "prompt": undo_spl_prompt, + "chosen_response": chosen_message, + "rejected_response": rejected_message, + "chosen_reward": j["chosen_reward"], + "rejected_reward": j["rejected_reward"], + } + w.write(json.dumps(j_out) + "\n") diff --git a/setup/requirements.txt b/setup/requirements.txt index 198d2e07a..d074f3672 100644 --- a/setup/requirements.txt +++ b/setup/requirements.txt @@ -1,3 +1,4 @@ +Jinja2~=3.1.4 jsonlines megatron_core>=0.8 nemo_toolkit[nlp] diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 01425357b..701ff33e5 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -22,8 +22,16 @@ from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo_aligner.algorithms.dpo import dpo_custom_collate from nemo_aligner.data.nlp.builders import build_dataloader, build_train_valid_test_dpo_datasets +from nemo_aligner.data.nlp.scripts.undo_special_tokens import format_conversation from nemo_aligner.utils import parallel_state +try: + from jinja2 import Template + + HAS_JINJA2 = True +except: + HAS_JINJA2 = False + @pytest.fixture def llama3_tokenizer(): @@ -136,6 +144,54 @@ def test_dpo_loader(init_model_parallel, make_tmp_jsonl, llama3_tokenizer): assert num_mini_batches == 2 +@pytest.mark.run_only_on("GPU") +def test_dpo_dataset_conversion(): + prompt_str = """System\n\nUser\nDoes GPT-4 use RNN or Transformer models, and which one is better for this type of project?\nAssistant\nGPT-4 uses a transformer architecture, not a recurrent neural network. Both models are commonly used for natural language processing tasks, and both have advantages and disadvantages, so it depends on the specific application of the model.\nUser\nCould you explain in detail both the advantages and disadvantages from different perspectives?\nAssistant\nYes, here are a few different perspectives on the pros and cons of RNN and transformer models:\n \n\n The RNN model has better sequential memory and can better model sequences of data, whereas the transformer model has better parallel computation and can better model large input datasets. 
This makes RNNs better for applications like speech recognition and machine translation, where the input is a sequence of sounds or words, and transformers better for applications like natural language processing and text classification, where the input is a large collection of documents or social media posts.\n \n\n RNNs are more straightforward and easier to understand and debug, whereas transformers are more flexible and can be used for more diverse applications. This makes RNNs better for teaching and research, and transformers better for actual applications in industry and the real world.\n \n\n RNNs are more efficient and faster to run, whereas transformers are more accurate and produce better results. This makes RNNs better for fast prototyping and testing, and transformers better for actual deployment in production applications.\nUser\ncould you add more in a table\nAssistant\n""" + + expected_oai_messages = [ + {"role": "system", "content": ""}, + { + "role": "user", + "content": "Does GPT-4 use RNN or Transformer models, and which one is better for this type of project?", + }, + { + "role": "assistant", + "content": "GPT-4 uses a transformer architecture, not a recurrent neural network. Both models are commonly used for natural language processing tasks, and both have advantages and disadvantages, so it depends on the specific application of the model.", + }, + { + "role": "user", + "content": "Could you explain in detail both the advantages and disadvantages from different perspectives?", + }, + { + "role": "assistant", + "content": """Yes, here are a few different perspectives on the pros and cons of RNN and transformer models:\n \n\n The RNN model has better sequential memory and can better model sequences of data, whereas the transformer model has better parallel computation and can better model large input datasets. This makes RNNs better for applications like speech recognition and machine translation, where the input is a sequence of sounds or words, and transformers better for applications like natural language processing and text classification, where the input is a large collection of documents or social media posts.\n \n\n RNNs are more straightforward and easier to understand and debug, whereas transformers are more flexible and can be used for more diverse applications. This makes RNNs better for teaching and research, and transformers better for actual applications in industry and the real world.\n \n\n RNNs are more efficient and faster to run, whereas transformers are more accurate and produce better results. This makes RNNs better for fast prototyping and testing, and transformers better for actual deployment in production applications.""", + }, + {"role": "user", "content": "could you add more in a table"}, + {"role": "assistant", "content": ""}, + ] + + oai_messages_prompt = format_conversation(prompt_str) + assert expected_oai_messages == oai_messages_prompt + + if HAS_JINJA2: + # (@adithyare) bonus test! 
convert oai style messages back into a string using Jinja + + def remove_trailing(s, t): + if s.endswith(t): + s = s[: -len(t)] + return s + + jinja_template = """{% for message in conversation %}{%- if message.role == "system" -%}System\n{{ message.content }}\n{% elif message.role == "user" -%}User\n{{ message.content }}\n{% elif message.role == "assistant" -%}Assistant\n{{ message.content }}\n{% endif %}{% endfor %}""" + jinja_template = Template(jinja_template) + prompt_str_jinja_rendered = jinja_template.render(conversation=oai_messages_prompt) + prompt_str_jinja_rendered = remove_trailing( + prompt_str_jinja_rendered, "\n" + ) # (@adithyare) jinja will add the ending of message token which we should remove to make a prompt. + assert prompt_str == prompt_str_jinja_rendered + + return True + + @pytest.mark.run_only_on("GPU") def test_dpo_loader_original(init_model_parallel, make_tmp_jsonl, llama3_tokenizer): init_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=1) From d97f29929e0be2d861a89224c01760dc62cfb554 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Mon, 2 Dec 2024 23:22:14 -0800 Subject: [PATCH 5/6] remove dockerfile patches for nemotron-5 since nemo includes PRs needed Signed-off-by: Terry Kong --- Dockerfile | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/Dockerfile b/Dockerfile index 44a9f8651..b3df1844b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -121,26 +121,3 @@ RUN cd /opt/NeMo-Aligner && \ RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch -# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs -RUN <<"EOF" bash -exu -cd NeMo -# Ensures we don't cherry-pick "future" origin/main commits -git fetch -a -# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651 -# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652 -# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863 -# (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654 -for pr_and_commit in \ - "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \ - "10652 60e677423667c029dd05875da72bf0719774f844" \ - "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \ -; do - pr=$(cut -f1 -d' ' <<<"$pr_and_commit") - head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit") - git fetch origin $head_pr_commit:PR-${pr} - # cherry-picks all commits between main and the top of the PR - git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr} - # Tag cherry-picks to help - git tag cherry-pick-PR-${pr} -done -EOF From a9c8b0a4d1209f328bb730684b18d1ce93b84d70 Mon Sep 17 00:00:00 2001 From: arendu Date: Tue, 17 Dec 2024 20:23:45 +0000 Subject: [PATCH 6/6] meta tokens support Signed-off-by: arendu --- nemo_aligner/data/nlp/builders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py index 5f51ab951..72f0eb1fa 100644 --- a/nemo_aligner/data/nlp/builders.py +++ b/nemo_aligner/data/nlp/builders.py @@ -417,6 +417,7 @@ def build_sft_dataset( index_mapping_dir=data_cfg.get("index_mapping_dir", None), prompt_template=data_cfg.get("prompt_template", None), virtual_tokens=0, + meta_tokens=data_cfg.get("meta_tokens", 0), 
memmap_workers=data_cfg.get( "memmap_workers", None ), # used to set num. of workers to create the memmap index files
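
Usage note (illustrative only, not part of the patches above): PATCH 4 moves DPO prompts from pre-formatted chat strings to OpenAI-style message lists, converted offline with nemo_aligner/data/nlp/scripts/undo_special_tokens.py. The minimal sketch below shows what a converted record is expected to look like; the input file name and the reward values are hypothetical assumptions, and it only exercises the record layout the script writes out (prompt, chosen_response, rejected_response, chosen_reward, rejected_reward).

# Minimal sketch, assuming a NeMo-Aligner checkout and a hypothetical
# old-format DPO file "train.jsonl" in the current directory.
import json
import subprocess

# The script takes the input path as a positional argument and writes the
# converted file next to it with a ".no_special_toks.jsonl" suffix.
subprocess.run(
    ["python", "nemo_aligner/data/nlp/scripts/undo_special_tokens.py", "train.jsonl"],
    check=True,
)

with open("train.no_special_toks.jsonl") as f:
    record = json.loads(f.readline())

# "prompt" is now an OpenAI-style list of {"role", "content"} messages, while
# the chosen/rejected responses are single assistant messages; the reward
# fields are carried over unchanged from the old-format record.
assert isinstance(record["prompt"], list)
assert record["chosen_response"]["role"] == "assistant"
assert record["rejected_response"]["role"] == "assistant"
print(record["chosen_reward"], record["rejected_reward"])

When the DPO dataset's __getitem__ receives a list-valued prompt like this, it renders it with the model's configured chat_prompt_tokens via convert(); plain string prompts still work but emit the deprecation warning added in the same patch.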