From a7119b7fda07d30a5dd6ef2a9eb6350e6de6ea99 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Wed, 22 Jan 2025 06:35:11 +0000
Subject: [PATCH] chore: update dockerfile deps to 24.12 state

Signed-off-by: Terry Kong
---
 Dockerfile         |  12 +---
 setup/trtllm.patch | 175 ---------------------------------------------
 2 files changed, 3 insertions(+), 184 deletions(-)
 delete mode 100644 setup/trtllm.patch

diff --git a/Dockerfile b/Dockerfile
index 40ca71d96..13de668ec 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,13 +12,13 @@ ARG MAX_JOBS=8
 
 # Git refs for dependencies
 ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
-ARG PYTRITON_VERSION=0.5.10
+ARG PYTRITON_VERSION=0.5.13
 ARG NEMO_TAG=633cb602777bffefbe12066b0c915c87e7b469e9 # On: v2.1.0
 ARG MLM_TAG=d15cec53beb283e7127b7d594e1c46b8a0719b6d # On: core_r0.10.0
 ARG ALIGNER_COMMIT=main
-ARG TRTLLM_VERSION=v0.13.0
+ARG TRTLLM_VERSION=v0.15.0
 ARG PROTOBUF_VERSION=4.24.4
-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.10-py3
 
 FROM ${BASE_IMAGE} AS aligner-bump
 ARG ALIGNER_COMMIT
@@ -70,10 +70,6 @@ RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
     pip install -e .
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/
 
-# TODO: This pinning of pynvml is only needed while on TRTLLM v13 since pynvml>=11.5.0 but pynvml==12.0.0 contains a
-# breaking change. The last known working verison is 11.5.3
-RUN pip install pynvml==11.5.3
-
 # install TransformerEngine
 ARG MAX_JOBS
 ARG TE_TAG
@@ -136,8 +132,6 @@ COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
 RUN cd /opt/NeMo-Aligner && \
     pip install --no-deps -e .
 
-RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch
-
 # NOTE: Comment this layer out if it is not needed
 # NOTE: This section exists to allow cherry-picking PRs in cases where
 # we do not wish to simply update to the top-of-tree. Sometimes PRs
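The pynvml==11.5.3 pin dropped above was only needed while on TRT-LLM v0.13, whose pynvml>=11.5.0 requirement could resolve to the breaking pynvml==12.0.0 release; bumping TRTLLM_VERSION to v0.15.0 removes the need for it. As an illustrative smoke test (not part of this patch) of the module-level pynvml API the pin was protecting:

    # Hypothetical check, for illustration only; not part of the Dockerfile.
    # Exercises the classic module-level pynvml entry points that the
    # 11.5.3 pin kept available.
    import pynvml

    pynvml.nvmlInit()
    try:
        print("visible GPUs:", pynvml.nvmlDeviceGetCount())
    finally:
        pynvml.nvmlShutdown()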
diff --git a/setup/trtllm.patch b/setup/trtllm.patch
deleted file mode 100644
index 27dacae48..000000000
--- a/setup/trtllm.patch
+++ /dev/null
@@ -1,175 +0,0 @@
-diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py
-index 527f8ccfd..222c69fc6 100644
---- a/tensorrt_llm/builder.py
-+++ b/tensorrt_llm/builder.py
-@@ -660,10 +660,14 @@ class EngineConfig:
-     @classmethod
-     def from_json_file(cls, config_file):
-         with open(config_file) as f:
--            config = json.load(f)
--            return cls(PretrainedConfig.from_dict(config['pretrained_config']),
--                       BuildConfig.from_dict(config['build_config']),
--                       config['version'])
-+            return cls.from_json_str(f.read())
-+
-+    @classmethod
-+    def from_json_str(cls, config_str):
-+        config = json.loads(config_str)
-+        return cls(PretrainedConfig.from_dict(config['pretrained_config']),
-+                   BuildConfig.from_dict(config['build_config']),
-+                   config['version'])
- 
-     def to_dict(self):
-         build_config = self.build_config.to_dict()
-@@ -770,6 +774,15 @@ class Engine:
- 
-         return cls(config, engine_buffer, managed_weights)
- 
-+    @classmethod
-+    def from_buffer(cls,
-+                    engine_buffer: Union[trt.IHostMemory, bytes],
-+                    json_config_str: str,
-+                    rank: int = 0):
-+        config = EngineConfig.from_json_str(json_config_str)
-+        config.pretrained_config.set_rank(rank)
-+        return cls(config, engine_buffer)
-+
- 
- def get_engine_version(engine_dir: str) -> Union[None, str]:
-     engine_dir = Path(engine_dir)
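The builder.py hunks above add EngineConfig.from_json_str and Engine.from_buffer so an engine and its config can be deserialized entirely from memory rather than from an engine directory on disk. A hypothetical usage sketch (the file paths are illustrative placeholders, not names from the patch):

    # Illustration only: construct an Engine from in-memory artifacts via the
    # from_buffer classmethod added by the patch being deleted in this commit.
    from pathlib import Path

    from tensorrt_llm.builder import Engine

    engine_bytes = Path("engine_dir/rank0.engine").read_bytes()  # hypothetical path
    config_json = Path("engine_dir/config.json").read_text()     # hypothetical path
    engine = Engine.from_buffer(engine_buffer=engine_bytes,
                                json_config_str=config_json,
                                rank=0)  # selects this rank's pretrained config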
-diff --git a/tensorrt_llm/runtime/generation.py b/tensorrt_llm/runtime/generation.py
-index 983d458b8..af8eceb7f 100755
---- a/tensorrt_llm/runtime/generation.py
-+++ b/tensorrt_llm/runtime/generation.py
-@@ -15,6 +15,7 @@
- 
- import copy
- import math
-+import os
- import platform
- from collections import Counter
- from dataclasses import dataclass, field
-@@ -47,6 +48,10 @@ from ..quantization import QuantMode
- from .kv_cache_manager import GenerationSequence, KVCacheUpdater
- from .session import _scoped_stream
- 
-+# When variable is set, this will disable torch.cuda.set_device(...) calls
-+# Useful in situations where device is already assigned by another library, i.e., megatron.
-+DISABLE_TORCH_DEVICE_SET = os.environ.get("DISABLE_TORCH_DEVICE_SET", False)
-+
- 
- def decode_words_list(word_dict: List[List[str]],
-                       tokenizer=None,
-@@ -247,8 +252,11 @@ class _Runtime(object):
-     def __prepare(self, mapping: Mapping, engine_buffer):
-         self.runtime_rank = mapping.rank
-         local_rank = self.runtime_rank % mapping.gpus_per_node
--        torch.cuda.set_device(local_rank)
--        CUASSERT(cudart.cudaSetDevice(local_rank))
-+        if DISABLE_TORCH_DEVICE_SET:
-+            CUASSERT(cudart.cudaSetDevice(torch.cuda.current_device()))
-+        else:
-+            torch.cuda.set_device(local_rank)
-+            CUASSERT(cudart.cudaSetDevice(local_rank))
- 
-         self.runtime = trt.Runtime(logger.trt_logger)
-         self.engine = self.runtime.deserialize_cuda_engine(engine_buffer)
-@@ -535,8 +543,10 @@ class SamplingConfig:
-     sink_token_length: Optional[int] = field(default=None)
-     output_sequence_lengths: bool = field(default=False)
-     return_dict: bool = field(default=False)
--    stop_words_list: Optional[torch.Tensor] = field(default=None)
--    bad_words_list: Optional[torch.Tensor] = field(default=None)
-+    stop_words_list: Optional[Union[list, np.ndarray,
-+                                    torch.Tensor]] = field(default=None)
-+    bad_words_list: Optional[Union[list, np.ndarray,
-+                                   torch.Tensor]] = field(default=None)
- 
-     temperature: Union[float, torch.Tensor] = field(default=1.0)
-     top_k: Union[int, torch.Tensor] = field(default=1)
-@@ -698,9 +708,12 @@ class GenerationSession(object):
-         self._model_config = model_config
-         self.mapping = mapping
-         self.runtime = _Runtime(engine_buffer, mapping)
--        self.device = torch.device(
--            f'cuda:{self.runtime.runtime_rank % mapping.gpus_per_node}')
--        torch.cuda.set_device(self.device)
-+        if DISABLE_TORCH_DEVICE_SET:
-+            self.device = torch.device(f'cuda:{torch.cuda.current_device()}')
-+        else:
-+            self.device = torch.device(
-+                f'cuda:{self.runtime.runtime_rank % mapping.gpus_per_node}')
-+            torch.cuda.set_device(self.device)
-         # dynamic_decoder currently use torch's current stream, so must let TRT enqueue use same stream here
-         self.stream = stream
-         if self.stream is None:
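The generation.py hunks above gate every torch.cuda.set_device call behind DISABLE_TORCH_DEVICE_SET so a framework that has already pinned devices (e.g. Megatron) stays authoritative. Note that the flag is read once at import time, and os.environ.get returns the raw string, so any non-empty value, even "0", counts as enabled. A hypothetical usage sketch:

    # Illustration only: opt out of TRT-LLM's device pinning. The variable must
    # be set before the module is imported, and any set value (even "0") is
    # truthy, so set it only when you mean to enable it.
    import os

    os.environ["DISABLE_TORCH_DEVICE_SET"] = "1"

    import tensorrt_llm  # import only after the environment variable is set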
-diff --git a/tensorrt_llm/runtime/model_runner.py b/tensorrt_llm/runtime/model_runner.py
-index d2ba7edfa..e02310c3a 100644
---- a/tensorrt_llm/runtime/model_runner.py
-+++ b/tensorrt_llm/runtime/model_runner.py
-@@ -31,10 +31,10 @@ from ..builder import Engine, EngineConfig, get_engine_version
- from ..logger import logger
- from ..mapping import Mapping
- from ..quantization import QuantMode
--from .generation import (ChatGLMGenerationSession, GenerationSession,
--                         LogitsProcessor, LoraManager, ModelConfig,
--                         QWenForCausalLMGenerationSession, SamplingConfig,
--                         StoppingCriteria, to_word_list_format)
-+from .generation import (DISABLE_TORCH_DEVICE_SET, ChatGLMGenerationSession,
-+                         GenerationSession, LogitsProcessor, LoraManager,
-+                         ModelConfig, QWenForCausalLMGenerationSession,
-+                         SamplingConfig, StoppingCriteria, to_word_list_format)
- 
- 
- def get_engine_name(model: str, dtype: str, tp_size: int, pp_size: int,
-@@ -554,7 +554,8 @@ class ModelRunner(ModelRunnerMixin):
- 
-         if MpiComm.size() > runtime_mapping.gpus_per_node:
-             assert MpiComm.local_size() == runtime_mapping.gpus_per_node
--        torch.cuda.set_device(rank % runtime_mapping.gpus_per_node)
-+        if not DISABLE_TORCH_DEVICE_SET:
-+            torch.cuda.set_device(rank % runtime_mapping.gpus_per_node)
-         session = session_cls(model_config,
-                               engine_buffer,
-                               runtime_mapping,
-@@ -656,7 +657,8 @@ class ModelRunner(ModelRunnerMixin):
-             assert model_config.max_medusa_tokens > 0, \
-                 "medusa_choice is specified but model_config.max_medusa_tokens is 0."
- 
--        torch.cuda.set_device(rank % runtime_mapping.gpus_per_node)
-+        if not DISABLE_TORCH_DEVICE_SET:
-+            torch.cuda.set_device(rank % runtime_mapping.gpus_per_node)
-         session = session_cls(model_config,
-                               engine_buffer,
-                               runtime_mapping,
-@@ -840,12 +842,24 @@ class ModelRunner(ModelRunnerMixin):
-         batch_input_ids, input_lengths = self._prepare_inputs(
-             batch_input_ids, sampling_config.pad_id)
- 
--        if sampling_config.bad_words_list is not None:
--            sampling_config.bad_words_list = to_word_list_format(
--                sampling_config.bad_words_list)
--        if sampling_config.stop_words_list is not None:
--            sampling_config.stop_words_list = to_word_list_format(
--                sampling_config.stop_words_list)
-+        def maybe_convert_to_words_list_format(
-+            words_list: Optional[Union[list, np.ndarray, torch.Tensor]]
-+        ) -> Optional[np.ndarray]:
-+            if words_list is None or isinstance(words_list, np.ndarray):
-+                return words_list
-+            elif isinstance(words_list, torch.Tensor):
-+                return words_list.numpy()
-+            elif isinstance(words_list, list):
-+                return to_word_list_format(words_list)
-+            else:
-+                raise TypeError(
-+                    f"Unexpected words_list type={type(words_list)}. Only list, np.ndarray, and torch.Tensor are supported."
-+                )
-+
-+        sampling_config.bad_words_list = maybe_convert_to_words_list_format(
-+            sampling_config.bad_words_list)
-+        sampling_config.stop_words_list = maybe_convert_to_words_list_format(
-+            sampling_config.stop_words_list)
- 
-         if not self.kv_cache_type and sampling_config.max_new_tokens > 1:
-             raise RuntimeError(