From a7119b7fda07d30a5dd6ef2a9eb6350e6de6ea99 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Wed, 22 Jan 2025 06:35:11 +0000
Subject: [PATCH] chore: update dockerfile deps to 24.12 state

Signed-off-by: Terry Kong
---
 Dockerfile         |  12 +---
 setup/trtllm.patch | 175 ---------------------------------------------
 2 files changed, 3 insertions(+), 184 deletions(-)
 delete mode 100644 setup/trtllm.patch

diff --git a/Dockerfile b/Dockerfile
index 40ca71d96..13de668ec 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,13 +12,13 @@ ARG MAX_JOBS=8
 
 # Git refs for dependencies
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
-ARG PYTRITON_VERSION=0.5.10
+ARG PYTRITON_VERSION=0.5.13
 ARG NEMO_TAG=633cb602777bffefbe12066b0c915c87e7b469e9 # On: v2.1.0
 ARG MLM_TAG=d15cec53beb283e7127b7d594e1c46b8a0719b6d # On: core_r0.10.0
 ARG ALIGNER_COMMIT=main
-ARG TRTLLM_VERSION=v0.13.0
+ARG TRTLLM_VERSION=v0.15.0
 ARG PROTOBUF_VERSION=4.24.4
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.10-py3
 
 FROM ${BASE_IMAGE} AS aligner-bump
 ARG ALIGNER_COMMIT
@@ -70,10 +70,6 @@ RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
     pip install -e .
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/
 
-# TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a
-# breaking change. The last known working verison is 11.5.3
-RUN pip install pynvml==11.5.3
-
 # install TransformerEngine
 ARG MAX_JOBS
 ARG TE_TAG
@@ -136,8 +132,6 @@ COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
 RUN cd /opt/NeMo-Aligner && \
     pip install --no-deps -e .
 
-RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch
-
 # NOTE: Comment this layer out if it is not needed
 # NOTE: This section exists to allow cherry-picking PRs in cases where
 # we do not wish to simply update to the top-of-tree. Sometimes PRs
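The pynvml==11.5.3 pin dropped above was only needed while on TRT-LLM v0.13, whose pynvml>=11.5.0 requirement could resolve to the breaking pynvml==12.0.0 release; bumping TRTLLM_VERSION to v0.15.0 removes the need for it. As an illustrative smoke test (not part of this patch) of the module-level pynvml API the pin was protecting:

    # Hypothetical check, for illustration only; not part of the Dockerfile.
    # Exercises the classic module-level pynvml entry points that the
    # 11.5.3 pin kept available.
    import pynvml

    pynvml.nvmlInit()
    try:
        print("visible GPUs:", pynvml.nvmlDeviceGetCount())
    finally:
        pynvml.nvmlShutdown()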
diff --git a/setup/trtllm.patch b/setup/trtllm.patch
deleted file mode 100644
index 27dacae48..000000000
--- a/setup/trtllm.patch
+++ /dev/null
@@ -1,175 +0,0 @@
-diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py
-index 527f8ccfd..222c69fc6 100644
---- a/tensorrt_llm/builder.py
-+++ b/tensorrt_llm/builder.py
-@@ -660,10 +660,14 @@ class EngineConfig:
-     @classmethod
-     def from_json_file(cls, config_file):
-         with open(config_file) as f:
--            config = json.load(f)
--            return cls(PretrainedConfig.from_dict(config['pretrained_config']),
--                       BuildConfig.from_dict(config['build_config']),
--                       config['version'])
-+            return cls.from_json_str(f.read())
-+
-+    @classmethod
-+    def from_json_str(cls, config_str):
-+        config = json.loads(config_str)
-+        return cls(PretrainedConfig.from_dict(config['pretrained_config']),
-+                   BuildConfig.from_dict(config['build_config']),
-+                   config['version'])
- 
-     def to_dict(self):
-         build_config = self.build_config.to_dict()
-@@ -770,6 +774,15 @@ class Engine:
- 
-         return cls(config, engine_buffer, managed_weights)
- 
-+    @classmethod
-+    def from_buffer(cls,
-+                    engine_buffer: Union[trt.IHostMemory, bytes],
-+                    json_config_str: str,
-+                    rank: int = 0):
-+        config = EngineConfig.from_json_str(json_config_str)
-+        config.pretrained_config.set_rank(rank)
-+        return cls(config, engine_buffer)
-+
- 
- def get_engine_version(engine_dir: str) -> Union[None, str]:
-     engine_dir = Path(engine_dir)
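The builder.py hunks above add EngineConfig.from_json_str and Engine.from_buffer so an engine and its config can be deserialized entirely from memory rather than from an engine directory on disk. A hypothetical usage sketch (the file paths are illustrative placeholders, not names from the patch):

    # Illustration only: construct an Engine from in-memory artifacts via the
    # from_buffer classmethod added by the patch being deleted in this commit.
    from pathlib import Path

    from tensorrt_llm.builder import Engine

    engine_bytes = Path("engine_dir/rank0.engine").read_bytes()  # hypothetical path
    config_json = Path("engine_dir/config.json").read_text()     # hypothetical path
    engine = Engine.from_buffer(engine_buffer=engine_bytes,
                                json_config_str=config_json,
                                rank=0)  # selects this rank's pretrained config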
-diff --git a/tensorrt_llm/runtime/generation.py b/tensorrt_llm/runtime/generation.py
-index 983d458b8..af8eceb7f 100755
---- a/tensorrt_llm/runtime/generation.py
-+++ b/tensorrt_llm/runtime/generation.py
-@@ -15,6 +15,7 @@
- 
- import copy
- import math
-+import os
- import platform
- from collections import Counter
- from dataclasses import dataclass, field
-@@ -47,6 +48,10 @@ from ..quantization import QuantMode
- from .kv_cache_manager import GenerationSequence, KVCacheUpdater
- from .session import _scoped_stream
- 
-+# When variable is set, this will disable torch.cuda.set_device(...) calls
-+# Useful in situations where device is already assigned by another library, i.e., megatron.
-+DISABLE_TORCH_DEVICE_SET = os.environ.get("DISABLE_TORCH_DEVICE_SET", False)
-+
- 
- def decode_words_list(word_dict: List[List[str]],
-                       tokenizer=None,
-@@ -247,8 +252,11 @@ class _Runtime(object):
-     def __prepare(self, mapping: Mapping, engine_buffer):
-         self.runtime_rank = mapping.rank
-         local_rank = self.runtime_rank % mapping.gpus_per_node
--        torch.cuda.set_device(local_rank)
--        CUASSERT(cudart.cudaSetDevice(local_rank))
-+        if DISABLE_TORCH_DEVICE_SET:
-+            CUASSERT(cudart.cudaSetDevice(torch.cuda.current_device()))
-+        else:
-+            torch.cuda.set_device(local_rank)
-+            CUASSERT(cudart.cudaSetDevice(local_rank))
- 
-         self.runtime = trt.Runtime(logger.trt_logger)
-         self.engine = self.runtime.deserialize_cuda_engine(engine_buffer)
-@@ -535,8 +543,10 @@ class SamplingConfig:
-     sink_token_length: Optional[int] = field(default=None)
-     output_sequence_lengths: bool = field(default=False)
-     return_dict: bool = field(default=False)
--    stop_words_list: Optional[torch.Tensor] = field(default=None)
--    bad_words_list: Optional[torch.Tensor] = field(default=None)
-+    stop_words_list: Optional[Union[list, np.ndarray,
-+                                    torch.Tensor]] = field(default=None)
-+    bad_words_list: Optional[Union[list, np.ndarray,
-+                                   torch.Tensor]] = field(default=None)
- 
-     temperature: Union[float, torch.Tensor] = field(default=1.0)
-     top_k: Union[int, torch.Tensor] = field(default=1)
-@@ -698,9 +708,12 @@ class GenerationSession(object):
-         self._model_config = model_config
-         self.mapping = mapping
-         self.runtime = _Runtime(engine_buffer, mapping)
--        self.device = torch.device(
--            f'cuda:{self.runtime.runtime_rank % mapping.gpus_per_node}')
--        torch.cuda.set_device(self.device)
-+        if DISABLE_TORCH_DEVICE_SET:
-+            self.device = torch.device(f'cuda:{torch.cuda.current_device()}')
-+        else:
-+            self.device = torch.device(
-+                f'cuda:{self.runtime.runtime_rank % mapping.gpus_per_node}')
-+            torch.cuda.set_device(self.device)
-         # dynamic_decoder currently use torch's current stream, so must let TRT enqueue use same stream here
-         self.stream = stream
-         if self.stream is None:
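The generation.py hunks above gate every torch.cuda.set_device call behind DISABLE_TORCH_DEVICE_SET so a framework that has already pinned devices (e.g. Megatron) stays authoritative. Note that the flag is read once at import time, and os.environ.get returns the raw string, so any non-empty value, even "0", counts as enabled. A hypothetical usage sketch:

    # Illustration only: opt out of TRT-LLM's device pinning. The variable must
    # be set before the module is imported, and any set value (even "0") is
    # truthy, so set it only when you mean to enable it.
    import os

    os.environ["DISABLE_TORCH_DEVICE_SET"] = "1"

    import tensorrt_llm  # import only after the environment variable is set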
-diff --git a/tensorrt_llm/runtime/model_runner.py b/tensorrt_llm/runtime/model_runner.py
-index d2ba7edfa..e02310c3a 100644
---- a/tensorrt_llm/runtime/model_runner.py
-+++ b/tensorrt_llm/runtime/model_runner.py
-@@ -31,10 +31,10 @@ from ..builder import Engine, EngineConfig, get_engine_version
- from ..logger import logger
- from ..mapping import Mapping
- from ..quantization import QuantMode
--from .generation import (ChatGLMGenerationSession, GenerationSession,
--                         LogitsProcessor, LoraManager, ModelConfig,
--                         QWenForCausalLMGenerationSession, SamplingConfig,
--                         StoppingCriteria, to_word_list_format)
-+from .generation import (DISABLE_TORCH_DEVICE_SET, ChatGLMGenerationSession,
-+                         GenerationSession, LogitsProcessor, LoraManager,
-+                         ModelConfig, QWenForCausalLMGenerationSession,
-+                         SamplingConfig, StoppingCriteria, to_word_list_format)
- 
- 
- def get_engine_name(model: str, dtype: str, tp_size: int, pp_size: int,
-@@ -554,7 +554,8 @@ class ModelRunner(ModelRunnerMixin):
- 
-         if MpiComm.size() > runtime_mapping.gpus_per_node:
-             assert MpiComm.local_size() == runtime_mapping.gpus_per_node
--        torch.cuda.set_device(rank % runtime_mapping.gpus_per_node)
-+        if not DISABLE_TORCH_DEVICE_SET:
-+            torch.cuda.set_device(rank % runtime_mapping.gpus_per_node)
-         session = session_cls(model_config,
-                               engine_buffer,
-                               runtime_mapping,
-@@ -656,7 +657,8 @@ class ModelRunner(ModelRunnerMixin):
-             assert model_config.max_medusa_tokens > 0, \
-                 "medusa_choice is specified but model_config.max_medusa_tokens is 0."
- 
--        torch.cuda.set_device(rank % runtime_mapping.gpus_per_node)
-+        if not DISABLE_TORCH_DEVICE_SET:
-+            torch.cuda.set_device(rank % runtime_mapping.gpus_per_node)
-         session = session_cls(model_config,
-                               engine_buffer,
-                               runtime_mapping,
-@@ -840,12 +842,24 @@ class ModelRunner(ModelRunnerMixin):
-         batch_input_ids, input_lengths = self._prepare_inputs(
-             batch_input_ids, sampling_config.pad_id)
- 
--        if sampling_config.bad_words_list is not None:
--            sampling_config.bad_words_list = to_word_list_format(
--                sampling_config.bad_words_list)
--        if sampling_config.stop_words_list is not None:
--            sampling_config.stop_words_list = to_word_list_format(
--                sampling_config.stop_words_list)
-+        def maybe_convert_to_words_list_format(
-+            words_list: Optional[Union[list, np.ndarray, torch.Tensor]]
-+        ) -> Optional[np.ndarray]:
-+            if words_list is None or isinstance(words_list, np.ndarray):
-+                return words_list
-+            elif isinstance(words_list, torch.Tensor):
-+                return words_list.numpy()
-+            elif isinstance(words_list, list):
-+                return to_word_list_format(words_list)
-+            else:
-+                raise TypeError(
-+                    f"Unexpected words_list type={type(words_list)}. Only list, np.ndarray, and torch.Tensor are supported."
-+                )
-+
-+        sampling_config.bad_words_list = maybe_convert_to_words_list_format(
-+            sampling_config.bad_words_list)
-+        sampling_config.stop_words_list = maybe_convert_to_words_list_format(
-+            sampling_config.stop_words_list)
- 
-         if not self.kv_cache_type and sampling_config.max_new_tokens > 1:
-             raise RuntimeError(