diff --git a/Dockerfile b/Dockerfile
index 44a9f8651..b3df1844b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -121,26 +121,3 @@ RUN cd /opt/NeMo-Aligner && \
 RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch
 
-# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
-RUN <<"EOF" bash -exu
-cd NeMo
-# Ensures we don't cherry-pick "future" origin/main commits
-git fetch -a
-# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
-# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
-# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
-# (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
-for pr_and_commit in \
-  "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
-  "10652 60e677423667c029dd05875da72bf0719774f844" \
-  "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
-; do
-  pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
-  head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
-  git fetch origin $head_pr_commit:PR-${pr}
-  # cherry-picks all commits between main and the top of the PR
-  git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
-  # Tag cherry-picks to help
-  git tag cherry-pick-PR-${pr}
-done
-EOF
 
diff --git a/examples/nlp/gpt/conf/gpt_dpo.yaml b/examples/nlp/gpt/conf/gpt_dpo.yaml
index 192265244..4a240bc38 100644
--- a/examples/nlp/gpt/conf/gpt_dpo.yaml
+++ b/examples/nlp/gpt/conf/gpt_dpo.yaml
@@ -57,6 +57,7 @@ model:
   micro_batch_size: 1
   global_batch_size: 64
   megatron_amp_O2: True
+  mamba_hybrid: False
 
   dpo:
     # This default value ensures there are no numeric differences beween trained and reference policies when computing log probs.
diff --git a/examples/nlp/gpt/conf/gpt_sft.yaml b/examples/nlp/gpt/conf/gpt_sft.yaml
index bdd757f31..745f6ae01 100644
--- a/examples/nlp/gpt/conf/gpt_sft.yaml
+++ b/examples/nlp/gpt/conf/gpt_sft.yaml
@@ -191,7 +191,7 @@ model:
       output_original_text: True # needed for the proper metrics support
 
   optim:
-    name: distributed_fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
+    name: fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
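A note on how the new flag is consumed: `gpt_dpo.yaml` gains an explicit `mamba_hybrid: False`, while `gpt_sft.yaml` is left untouched, so the SFT script (below) reads the flag with `cfg.model.get("mamba_hybrid", False)`. A minimal sketch of the two lookup styles, using toy stand-in configs rather than the real YAMLs:

```python
from omegaconf import OmegaConf

# Toy stand-ins for the two model configs above.
dpo_model_cfg = OmegaConf.create({"mamba_hybrid": False, "micro_batch_size": 1})
sft_model_cfg = OmegaConf.create({"micro_batch_size": 1})  # no mamba_hybrid key

# The DPO script can read the key directly because its YAML defines it ...
assert dpo_model_cfg.mamba_hybrid is False
# ... while the SFT script must guard with .get() since the key may be absent.
assert sft_model_cfg.get("mamba_hybrid", False) is False
```

Either way, enabling the Mamba path is a one-key config override (e.g. `model.mamba_hybrid=true` on the command line).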
     lr: 3e-5
     weight_decay: 0.01
     betas:
diff --git a/examples/nlp/gpt/train_gpt_dpo.py b/examples/nlp/gpt/train_gpt_dpo.py
index f16a9dacf..4fcac6700 100644
--- a/examples/nlp/gpt/train_gpt_dpo.py
+++ b/examples/nlp/gpt/train_gpt_dpo.py
@@ -21,7 +21,7 @@ from nemo.utils.exp_manager import exp_manager
 
 from nemo_aligner.algorithms.dpo import DPOTrainer, dpo_custom_collate
 from nemo_aligner.data.nlp.builders import build_dataloader, build_train_valid_test_dpo_datasets, identity_collate
-from nemo_aligner.models.nlp.gpt.megatron_gpt_dpo_model import MegatronGPTDPOModel
+from nemo_aligner.models.nlp.gpt.megatron_gpt_dpo_model import MegatronGPTDPOModel, MegatronMambaDPOModel
 from nemo_aligner.utils.distributed import Timer
 from nemo_aligner.utils.train_script_utils import (
     CustomLoggerWrapper,
@@ -53,7 +53,7 @@ def main(cfg) -> None:
     logger = CustomLoggerWrapper(trainer.loggers)
 
     ptl_model = load_from_nemo(
-        MegatronGPTDPOModel,
+        MegatronMambaDPOModel if cfg.model.mamba_hybrid else MegatronGPTDPOModel,
         cfg.model,
         trainer,
         strict=True,
- """ - OmegaConf.set_struct(gpt_cfg, True) - OmegaConf.resolve(cfg) - with open_dict(gpt_cfg): - gpt_cfg.megatron_amp_O2 = cfg.model.get("megatron_amp_O2", False) - gpt_cfg.micro_batch_size = cfg.model.data.train_ds.micro_batch_size - gpt_cfg.global_batch_size = cfg.model.data.train_ds.global_batch_size - gpt_cfg.sequence_parallel = cfg.model.get("sequence_parallel", False) - gpt_cfg.activations_checkpoint_granularity = cfg.model.get("activations_checkpoint_granularity", None) - gpt_cfg.activations_checkpoint_num_layers = cfg.model.get("activations_checkpoint_num_layers", None) - gpt_cfg.activations_checkpoint_method = cfg.model.get("activations_checkpoint_method", None) - gpt_cfg.activations_checkpoint_layers_per_pipeline = cfg.model.get( - "activations_checkpoint_layers_per_pipeline", None - ) - gpt_cfg.peft = cfg.model.peft - gpt_cfg.data = cfg.model.data - gpt_cfg.optim = cfg.model.optim - gpt_cfg.precision = cfg.trainer.precision - gpt_cfg.answer_only_loss = cfg.model.answer_only_loss - gpt_cfg.restore_from_path = cfg.model.restore_from_path - gpt_cfg.resume_from_checkpoint = cfg.model.resume_from_checkpoint - gpt_cfg.save_nemo_on_validation_end = cfg.model.save_nemo_on_validation_end - gpt_cfg.gradient_as_bucket_view = cfg.model.gradient_as_bucket_view - gpt_cfg.hidden_dropout = cfg.model.get("hidden_dropout", 0.0) - gpt_cfg.attention_dropout = cfg.model.get("attention_dropout", 0.0) - gpt_cfg.ffn_dropout = cfg.model.ffn_dropout - gpt_cfg.use_flash_attention = cfg.model.get("use_flash_attention", False) - # if TP/PP size is -1, use default TP/PP size as original model - if cfg.model.get("tensor_model_parallel_size", 1) > 0: - gpt_cfg.tensor_model_parallel_size = cfg.model.get("tensor_model_parallel_size", 1) - if cfg.model.get("pipeline_model_parallel_size", 1) > 0: - gpt_cfg.pipeline_model_parallel_size = cfg.model.get("pipeline_model_parallel_size", 1) - gpt_cfg.pipeline_model_parallel_split_rank = cfg.model.get("pipeline_model_parallel_split_rank", 0) - - if cfg.model.data.get("chat", False): - # chat model, overwrite the prompt template - prompt_template = get_prompt_template_example(cfg.model.data.chat_prompt_tokens) - gpt_cfg.data.train_ds.prompt_template = prompt_template - gpt_cfg.data.validation_ds.prompt_template = prompt_template - - sft_cls = GPTSFTModel - gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}" - - if cfg.model.get("use_flash_attention", None) is not None: - gpt_cfg.use_flash_attention = cfg.model.use_flash_attention - - if cfg.model.get("seq_len_interpolation_factor", None) is not None: - gpt_cfg.seq_len_interpolation_factor = cfg.model.seq_len_interpolation_factor - - if cfg.model.get("dist_ckpt_load_strictness", None) is not None: - gpt_cfg.dist_ckpt_load_strictness = cfg.model.dist_ckpt_load_strictness - - gpt_cfg.inference = cfg.model.get("inference", {}) - - # This is needed when modifying a hparam file directly to load `.ckpt` files. - # This is not needed to modify the cfg in `.nemo` files. 
diff --git a/nemo_aligner/data/nlp/builders.py b/nemo_aligner/data/nlp/builders.py
index 97b68ffe4..72f0eb1fa 100644
--- a/nemo_aligner/data/nlp/builders.py
+++ b/nemo_aligner/data/nlp/builders.py
@@ -379,7 +379,9 @@ def build_dataset(index, name):
     )
 
 
-def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, is_chat=True, special_tokens=None):
+def build_sft_dataset(
+    data_cfg, tokenizer, num_samples, is_mamba=False, answer_only_loss=True, is_chat=True, special_tokens=None
+):
     packed_sequence = data_cfg.get("packed_sequence", False)
     dataset_kwargs = {}
 
@@ -411,9 +413,11 @@ def build_sft_dataset(data_cfg, tokenizer, num_samples, answer_only_loss=True, i
         answer_only_loss=answer_only_loss,
         truncation_field=data_cfg.get("truncation_field", "text"),
         pad_to_max_length=data_cfg.get("pad_to_max_length", False),
+        pad_seq_length_to_mult=256 if is_mamba else 16,
         index_mapping_dir=data_cfg.get("index_mapping_dir", None),
         prompt_template=data_cfg.get("prompt_template", None),
         virtual_tokens=0,
+        meta_tokens=data_cfg.get("meta_tokens", 0),
         memmap_workers=data_cfg.get(
             "memmap_workers", None
         ),  # used to set num. of workers to create the memmap index files
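The behavioral switch here is `pad_seq_length_to_mult=256 if is_mamba else 16`: the padding multiple is raised for Mamba, presumably because the hybrid state-space kernels want sequence lengths divisible by a larger block than attention kernels do (an assumption about the rationale; the diff itself only sets the value). The arithmetic is the usual round-up:

```python
def pad_seq_length_to_multiple(seq_len: int, multiple: int) -> int:
    # Round seq_len up to the nearest multiple of `multiple`.
    return ((seq_len + multiple - 1) // multiple) * multiple


# With the values used above, the Mamba path pads more aggressively:
assert pad_seq_length_to_multiple(1000, 256) == 1024  # is_mamba=True
assert pad_seq_length_to_multiple(1000, 16) == 1008   # is_mamba=False
```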
diff --git a/nemo_aligner/data/nlp/datasets.py b/nemo_aligner/data/nlp/datasets.py
index b605942b0..a07bf61a1 100644
--- a/nemo_aligner/data/nlp/datasets.py
+++ b/nemo_aligner/data/nlp/datasets.py
@@ -15,13 +15,19 @@
 """Custom datasets for RLHF training"""
 
 import os
+from typing import Dict, List
 
 import numpy as np
 import scipy
 import torch
+from omegaconf import OmegaConf
 
 from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import _create_ltor_masks_and_position_ids
-from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import GPTSFTChatDataset
+from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_chat_dataset import (
+    GPTSFTChatDataset,
+    _get_header_conversation_type_mask_role,
+    get_prompt_template_example,
+)
 from nemo.core import Dataset
 from nemo.utils import logging
@@ -344,16 +350,97 @@ def encode(self, text, append_eod=False):
 
         return text_ids, len(text_ids)
 
+    @staticmethod
+    def _convert_messages(
+        input_list: List[Dict[str, str]]
+    ) -> Dict:  # TODO: (@adithyare) this method should live elsewhere..
+        """
+        args:
+            input_list: a list of dicts in the openai format
+                for example:
+                [{"role": "system", "content": "you are helpful"},
+                 {"role": "user", "content": "Why is the sky blue?"},
+                 {"role": "assistant", "content": "Because blablabla"},
+                 ...]
+        returns:
+            output_dict: a dict in nemo's format {"system": "system prompt",
+                                                  "conversations": [],
+                                                  ...
+                                                 }
+        """
+        output_dict = {
+            "system": "",
+            "conversations": [],
+            "mask": "User",
+            "type": "VALUE_TO_TEXT",
+        }
+
+        # Extract the system message
+        num_system_msg = 0
+        for msg in input_list:
+            if msg["role"] == "system":
+                output_dict["system"] = msg["content"]
+                num_system_msg += 1
+        if num_system_msg > 1:
+            raise RuntimeError("Multiple system messages seen, please consolidate into a single system message.")
+
+        # Build the conversations list
+        for msg in input_list:
+            if msg["role"] != "system":
+                conversation_entry = {
+                    "from": msg["role"].capitalize(),  # Capitalize 'user' and 'assistant'
+                    "value": msg["content"],
+                    "label": None,
+                }
+                output_dict["conversations"].append(conversation_entry)
+
+        return output_dict
+
+    def convert(self, messages):
+        """
+        args:
+            messages: a list of dicts in the openai format
+                for example:
+                [{"role": "system", "content": "you are helpful"},
+                 {"role": "user", "content": "Why is the sky blue?"},
+                 {"role": "assistant", "content": "Because blablabla"},
+                 ...]
+        returns:
+            conversation: a string formatted with the chat template
+        """
+        if OmegaConf.select(self.cfg, "data.chat_prompt_tokens") is None:
+            raise RuntimeError(
+                "You don't have a model (model_config.yaml) which has chat_prompt_tokens, are you sure this is a Chat/Instruction model?"
+            )
+        special_tokens = self.cfg.data.chat_prompt_tokens
+        nemo_source = self._convert_messages(messages)
+        header, conversation, data_type, mask_role = _get_header_conversation_type_mask_role(
+            nemo_source, special_tokens
+        )
+        return conversation
+
     def __getitem__(self, idx):
         """Returns a pair of chosen/rejected pairs, their respective lengths, and labels."""
         payload = self.data[idx]
-        prompt, prompt_len = self.encode(payload["prompt"], append_eod=False)
-        chosen, chosen_len = self.encode(
-            payload["prompt"] + payload["chosen_response"], append_eod=self.cfg.data.get("append_eod", False)
-        )
-        reject, reject_len = self.encode(
-            payload["prompt"] + payload["rejected_response"], append_eod=self.cfg.data.get("append_eod", False)
-        )
+
+        if isinstance(payload["prompt"], str):
+            # (@adithyare) format with hardcoded chat tokens
+            # will allow this for the time being.
+            prompt_fmtd = payload["prompt"]
+            chosen_fmtd = payload["prompt"] + payload["chosen_response"]
+            rejected_fmtd = payload["prompt"] + payload["rejected_response"]
+            logging.warning(
+                "Pre-formatting chat conversation as string with hardcoded chat tokens will be deprecated."
+            )  # (@adithyare) this will spam the console for now.
+        else:
+            prompt_fmtd = self.convert(payload["prompt"])  # (@adithyare) read var as "prompt formatted"
+            chosen_fmtd = self.convert(payload["prompt"] + [payload["chosen_response"]])
+            rejected_fmtd = self.convert(payload["prompt"] + [payload["rejected_response"]])
+
+        prompt, prompt_len = self.encode(prompt_fmtd, append_eod=False)
+        chosen, chosen_len = self.encode(chosen_fmtd, append_eod=self.cfg.data.get("append_eod", False))
+        reject, reject_len = self.encode(rejected_fmtd, append_eod=self.cfg.data.get("append_eod", False))
+
         # chosen_response_only, chosen_response_len = self.encode(payload['chosen_response'])
         # reject_response_only, reject_response_len = self.encode(payload['rejected_response'])
         chosen_labels = ([-100] * prompt_len) + chosen[prompt_len:]
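For concreteness, here is the shape of the two payload formats the new `__getitem__` accepts, plus the intermediate dict `_convert_messages` builds from the messages form (illustrative values only; the final rendered string depends on the model's `chat_prompt_tokens`):

```python
# Legacy format: prompt/responses are pre-formatted strings with hardcoded chat tokens.
legacy_payload = {
    "prompt": "<extra_id_0>System\n\n<extra_id_1>User\nWhy is the sky blue?\n<extra_id_1>Assistant\n",
    "chosen_response": "Rayleigh scattering.",
    "rejected_response": "Magic.",
}

# New format: the prompt is a list of OpenAI-style messages and each response is a
# message, so __getitem__ renders payload["prompt"] + [payload["chosen_response"]]
# through convert() instead of concatenating raw strings.
messages_payload = {
    "prompt": [
        {"role": "system", "content": ""},
        {"role": "user", "content": "Why is the sky blue?"},
    ],
    "chosen_response": {"role": "assistant", "content": "Rayleigh scattering."},
    "rejected_response": {"role": "assistant", "content": "Magic."},
}

# _convert_messages(messages_payload["prompt"]) then produces NeMo's internal form:
# {"system": "",
#  "conversations": [{"from": "User", "value": "Why is the sky blue?", "label": None}],
#  "mask": "User", "type": "VALUE_TO_TEXT"}
```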
diff --git a/nemo_aligner/data/nlp/scripts/undo_special_tokens.py b/nemo_aligner/data/nlp/scripts/undo_special_tokens.py
new file mode 100644
index 000000000..591fabd6a
--- /dev/null
+++ b/nemo_aligner/data/nlp/scripts/undo_special_tokens.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Script to remove special tokens from dpo datasets
+and convert them into list of messages format"""
+
+import argparse
+import json
+import re
+
+
+def format_conversation(input_string):
+    # Define roles and patterns
+    role_patterns = {"<extra_id_0>System": "system", "<extra_id_1>User": "user", "<extra_id_1>Assistant": "assistant"}
+
+    # Initialize an empty output list
+    conversation = []
+
+    # Use regex to find each segment's role and content
+    segments = re.findall(r"(<extra_id_[01]>[^\n]+)\n(.*?)((?=<extra_id_)|$)", input_string, re.DOTALL)
+
+    for role_tag, content, _ in segments:
+        role = role_patterns.get(role_tag, "unknown")
+        conversation.append({"role": role, "content": content.strip()})
+
+    return conversation
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Remove special tokens from a DPO jsonl dataset and convert it to a list-of-messages format"
+    )
+    parser.add_argument("input_jsonl", help="path to the input jsonl file")
+    args = parser.parse_args()
+
+    output_jsonl = args.input_jsonl.replace(".jsonl", ".no_special_toks.jsonl")
+    with open(args.input_jsonl, "r") as r, open(output_jsonl, "w") as w:
+        for line in r:
+            j = json.loads(line)
+            undo_spl_prompt = format_conversation(j["prompt"])
+            empty_assistant = undo_spl_prompt.pop()
+            chosen, rejected = j["chosen_response"], j["rejected_response"]
+            chosen = chosen.split("\n<extra_id_1>")[0]
+            rejected = rejected.split("\n<extra_id_1>")[0]
+            chosen_message = {"role": empty_assistant["role"], "content": chosen}
+            rejected_message = {"role": empty_assistant["role"], "content": rejected}
+            j_out = {
+                "prompt": undo_spl_prompt,
+                "chosen_response": chosen_message,
+                "rejected_response": rejected_message,
+                "chosen_reward": j["chosen_reward"],
+                "rejected_reward": j["rejected_reward"],
+            }
+            w.write(json.dumps(j_out) + "\n")
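A quick illustration of the transformation this script performs (the same one exercised by `test_dpo_dataset_conversion` further down): a flat prompt string carrying NeMo's `<extra_id_*>` turn tokens becomes a list of OpenAI-style messages.

```python
from nemo_aligner.data.nlp.scripts.undo_special_tokens import format_conversation

prompt = "<extra_id_0>System\n\n<extra_id_1>User\nWhy is the sky blue?\n<extra_id_1>Assistant\n"
print(format_conversation(prompt))
# [{'role': 'system', 'content': ''},
#  {'role': 'user', 'content': 'Why is the sky blue?'},
#  {'role': 'assistant', 'content': ''}]
```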
+ +"""Script to remove special tokens from dpo datasets +and convert them into list of messages format""" + +import argparse +import json +import re + + +def format_conversation(input_string): + # Define roles and patterns + role_patterns = {"System": "system", "User": "user", "Assistant": "assistant"} + + # Initialize an empty output list + conversation = [] + + # Use regex to find each segment's role and content + segments = re.findall(r"([^\n]+)\n(.*?)((?=")[0] + rejected = rejected.split("\n")[0] + chosen_message = {"role": empty_assistant["role"], "content": chosen} + rejected_message = {"role": empty_assistant["role"], "content": rejected} + j_out = { + "prompt": undo_spl_prompt, + "chosen_response": chosen_message, + "rejected_response": rejected_message, + "chosen_reward": j["chosen_reward"], + "rejected_reward": j["rejected_reward"], + } + w.write(json.dumps(j_out) + "\n") diff --git a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py index d3a615500..ed0e7d476 100644 --- a/nemo_aligner/models/nlp/gpt/gpt_sft_model.py +++ b/nemo_aligner/models/nlp/gpt/gpt_sft_model.py @@ -22,6 +22,7 @@ from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split from nemo.collections.nlp.modules.common.text_generation_strategy import TextGenerationStrategy from nemo.collections.nlp.modules.common.text_generation_utils import ( @@ -225,3 +226,8 @@ def finish_inference(self): self._restore_activation_checkpointing_args() self._restore_sequence_parallelism_args() set_train(self) + + +class MambaSFTModel(MegatronMambaModel, GPTSFTModel): + def __init__(self, cfg: DictConfig, trainer: Trainer): + super().__init__(cfg, trainer=trainer) diff --git a/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py b/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py index 952b4e897..5b52efb3d 100644 --- a/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py +++ b/nemo_aligner/models/nlp/gpt/megatron_gpt_dpo_model.py @@ -16,6 +16,8 @@ from functools import partial import torch +from megatron.core import parallel_state +from megatron.core.models.mamba import MambaModel from megatron.core.num_microbatches_calculator import get_num_microbatches from megatron.core.pipeline_parallel.schedules import get_forward_backward_func from megatron.core.utils import divide @@ -23,6 +25,7 @@ from pytorch_lightning.trainer.trainer import Trainer from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.models.language_modeling.megatron_mamba_model import MegatronMambaModel from nemo.collections.nlp.modules.common.megatron.utils import ( average_losses_across_data_parallel_group, get_iterator_k_split, @@ -460,3 +463,8 @@ def get_ref_policy_logprobs(self, batch): # return in GPU, trainer needs to move to cpu return ref_log_probs + + +class MegatronMambaDPOModel(MegatronMambaModel, MegatronGPTDPOModel): # @adithyare inherence order matters + def __init__(self, cfg: DictConfig, trainer: Trainer): + super().__init__(cfg, trainer=trainer) diff --git a/nemo_aligner/utils/trt_llm.py b/nemo_aligner/utils/trt_llm.py index 1f879064d..9f1a9345f 100644 --- a/nemo_aligner/utils/trt_llm.py +++ b/nemo_aligner/utils/trt_llm.py @@ -44,8 +44,9 @@ def 
diff --git a/nemo_aligner/utils/trt_llm.py b/nemo_aligner/utils/trt_llm.py
index 1f879064d..9f1a9345f 100644
--- a/nemo_aligner/utils/trt_llm.py
+++ b/nemo_aligner/utils/trt_llm.py
@@ -44,8 +44,9 @@ def append_and_repad_list(list_of_items, item_to_append, pad_id):
 
 
 class GPTGenerateTRTLLM:
-    # If a tokenizer does not have a pad_id, we use a large negative number and replace
-    # with self.eos_id after generation.
+    # Use a reserved negative number because tokenizers vary: some have a pad_id,
+    # some have no pad_id, and some set pad_id to None. This pad_id is replaced
+    # with eos_id after generation.
     DEFAULT_PAD_ID = -42
 
     def __init__(
@@ -72,12 +73,6 @@ def __init__(
             "You are trying to use NeMo-Aligner's TensorRT-LLM acceleration for LLM generation. Please build the dockerfile to enable this feature: https://github.com/NVIDIA/NeMo-Aligner/blob/main/Dockerfile"
         )
 
-        # If this assert turns out to be a blocker with some tokenizers, potential workarounds could be to:
-        # - add a config option to allow specifying which token we pass as `end_id` to TRT-LLM (should
-        #   be a token that the model is guaranteed to never generate)
-        assert (
-            tokenizer.pad_id != tokenizer.eos_id
-        ), f"We require tokenizers to have a different {tokenizer.pad_id=} than {tokenizer.eos_id=} when using TRT-LLM. This is to make sure all code goes into the same path and include the eos_id when the response lengths are computed"
         assert max_input_len > 0
         assert max_generation_length > 0
         assert (
@@ -104,7 +99,7 @@
             rng_generator.manual_seed(seed)
             self.rng_generator = rng_generator
 
-        self.pad_id = tokenizer.pad_id if tokenizer.pad_id is not None else GPTGenerateTRTLLM.DEFAULT_PAD_ID
+        self.pad_id = GPTGenerateTRTLLM.DEFAULT_PAD_ID
         self.eos_id = tokenizer.eos_id
         end_strings = list(end_strings)
 
diff --git a/setup/requirements.txt b/setup/requirements.txt
index 198d2e07a..d074f3672 100644
--- a/setup/requirements.txt
+++ b/setup/requirements.txt
@@ -1,3 +1,4 @@
+Jinja2~=3.1.4
 jsonlines
 megatron_core>=0.8
 nemo_toolkit[nlp]
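Since `self.pad_id` is now always the reserved `DEFAULT_PAD_ID`, generated token ids must be rewritten before anything downstream interprets them. A minimal sketch of the post-generation swap the class comment describes (hypothetical tensor values, not the class's actual code):

```python
import torch

DEFAULT_PAD_ID = -42  # reserved value no real tokenizer uses
eos_id = 0            # hypothetical tokenizer eos id

# Hypothetical generation output, right-padded with the reserved id.
output_ids = torch.tensor([[11, 12, eos_id, DEFAULT_PAD_ID, DEFAULT_PAD_ID]])

# Swap the reserved pad id for eos so response-length logic, which scans for
# eos, sees a consistent token stream regardless of the tokenizer's pad_id.
output_ids = torch.where(output_ids == DEFAULT_PAD_ID, eos_id, output_ids)
print(output_ids)  # tensor([[11, 12,  0,  0,  0]])
```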
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 01425357b..701ff33e5 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -22,8 +22,16 @@
 from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
 from nemo_aligner.algorithms.dpo import dpo_custom_collate
 from nemo_aligner.data.nlp.builders import build_dataloader, build_train_valid_test_dpo_datasets
+from nemo_aligner.data.nlp.scripts.undo_special_tokens import format_conversation
 from nemo_aligner.utils import parallel_state
 
+try:
+    from jinja2 import Template
+
+    HAS_JINJA2 = True
+except ImportError:
+    HAS_JINJA2 = False
+
 
 @pytest.fixture
 def llama3_tokenizer():
@@ -136,6 +144,54 @@
     assert num_mini_batches == 2
 
 
+@pytest.mark.run_only_on("GPU")
+def test_dpo_dataset_conversion():
+    prompt_str = """<extra_id_0>System\n\n<extra_id_1>User\nDoes GPT-4 use RNN or Transformer models, and which one is better for this type of project?\n<extra_id_1>Assistant\nGPT-4 uses a transformer architecture, not a recurrent neural network. Both models are commonly used for natural language processing tasks, and both have advantages and disadvantages, so it depends on the specific application of the model.\n<extra_id_1>User\nCould you explain in detail both the advantages and disadvantages from different perspectives?\n<extra_id_1>Assistant\nYes, here are a few different perspectives on the pros and cons of RNN and transformer models:\n \n\n The RNN model has better sequential memory and can better model sequences of data, whereas the transformer model has better parallel computation and can better model large input datasets. This makes RNNs better for applications like speech recognition and machine translation, where the input is a sequence of sounds or words, and transformers better for applications like natural language processing and text classification, where the input is a large collection of documents or social media posts.\n \n\n RNNs are more straightforward and easier to understand and debug, whereas transformers are more flexible and can be used for more diverse applications. This makes RNNs better for teaching and research, and transformers better for actual applications in industry and the real world.\n \n\n RNNs are more efficient and faster to run, whereas transformers are more accurate and produce better results. This makes RNNs better for fast prototyping and testing, and transformers better for actual deployment in production applications.\n<extra_id_1>User\ncould you add more in a table\n<extra_id_1>Assistant\n"""
+
+    expected_oai_messages = [
+        {"role": "system", "content": ""},
+        {
+            "role": "user",
+            "content": "Does GPT-4 use RNN or Transformer models, and which one is better for this type of project?",
+        },
+        {
+            "role": "assistant",
+            "content": "GPT-4 uses a transformer architecture, not a recurrent neural network. Both models are commonly used for natural language processing tasks, and both have advantages and disadvantages, so it depends on the specific application of the model.",
+        },
+        {
+            "role": "user",
+            "content": "Could you explain in detail both the advantages and disadvantages from different perspectives?",
+        },
+        {
+            "role": "assistant",
+            "content": """Yes, here are a few different perspectives on the pros and cons of RNN and transformer models:\n \n\n The RNN model has better sequential memory and can better model sequences of data, whereas the transformer model has better parallel computation and can better model large input datasets. This makes RNNs better for applications like speech recognition and machine translation, where the input is a sequence of sounds or words, and transformers better for applications like natural language processing and text classification, where the input is a large collection of documents or social media posts.\n \n\n RNNs are more straightforward and easier to understand and debug, whereas transformers are more flexible and can be used for more diverse applications. This makes RNNs better for teaching and research, and transformers better for actual applications in industry and the real world.\n \n\n RNNs are more efficient and faster to run, whereas transformers are more accurate and produce better results. This makes RNNs better for fast prototyping and testing, and transformers better for actual deployment in production applications.""",
+        },
+        {"role": "user", "content": "could you add more in a table"},
+        {"role": "assistant", "content": ""},
+    ]
+
+    oai_messages_prompt = format_conversation(prompt_str)
+    assert expected_oai_messages == oai_messages_prompt
+
+    if HAS_JINJA2:
+        # (@adithyare) bonus test! convert oai style messages back into a string using Jinja
+
+        def remove_trailing(s, t):
+            if s.endswith(t):
+                s = s[: -len(t)]
+            return s
+
+        jinja_template = """{% for message in conversation %}{%- if message.role == "system" -%}<extra_id_0>System\n{{ message.content }}\n{% elif message.role == "user" -%}<extra_id_1>User\n{{ message.content }}\n{% elif message.role == "assistant" -%}<extra_id_1>Assistant\n{{ message.content }}\n{% endif %}{% endfor %}"""
+        jinja_template = Template(jinja_template)
+        prompt_str_jinja_rendered = jinja_template.render(conversation=oai_messages_prompt)
+        prompt_str_jinja_rendered = remove_trailing(
+            prompt_str_jinja_rendered, "\n"
+        )  # (@adithyare) jinja will add the ending of message token which we should remove to make a prompt.
+        assert prompt_str == prompt_str_jinja_rendered
+
+    return True
+
+
 @pytest.mark.run_only_on("GPU")
 def test_dpo_loader_original(init_model_parallel, make_tmp_jsonl, llama3_tokenizer):
     init_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)