Description
When trying to run Llama 2 70B training through the NeMo framework with FSDP turned on, this is the error I get:
Traceback (most recent call last):
  File "/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py", line 21, in <module>
    from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
  File "/opt/NeMo/nemo/collections/nlp/__init__.py", line 15, in <module>
    from nemo.collections.nlp import data, losses, models, modules
  File "/opt/NeMo/nemo/collections/nlp/data/__init__.py", line 42, in <module>
    from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import (
  File "/opt/NeMo/nemo/collections/nlp/data/zero_shot_intent_recognition/__init__.py", line 16, in <module>
    from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import (
  File "/opt/NeMo/nemo/collections/nlp/data/zero_shot_intent_recognition/zero_shot_intent_dataset.py", line 30, in <module>
    from nemo.collections.nlp.parts.utils_funcs import tensor2list
  File "/opt/NeMo/nemo/collections/nlp/parts/__init__.py", line 17, in <module>
    from nemo.collections.nlp.parts.utils_funcs import list2str, tensor2list
  File "/opt/NeMo/nemo/collections/nlp/parts/utils_funcs.py", line 37, in <module>
    from nemo.collections.nlp.modules.common.megatron.utils import erf_gelu
  File "/opt/NeMo/nemo/collections/nlp/modules/__init__.py", line 16, in <module>
    from nemo.collections.nlp.modules.common import (
  File "/opt/NeMo/nemo/collections/nlp/modules/common/__init__.py", line 36, in <module>
    from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer, get_tokenizer_list
  File "/opt/NeMo/nemo/collections/nlp/modules/common/tokenizer_utils.py", line 28, in <module>
    from nemo.collections.nlp.parts.nlp_overrides import HAVE_MEGATRON_CORE
  File "/opt/NeMo/nemo/collections/nlp/parts/nlp_overrides.py", line 37, in <module>
    from pytorch_lightning.plugins.precision import FSDPPrecision, MixedPrecisionPlugin
ImportError: cannot import name 'FSDPPrecision' from 'pytorch_lightning.plugins.precision' (/usr/local/lib/python3.10/dist-packages/pytorch_lightning/plugins/precision/__init__.py)
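
For context, here is a minimal diagnostic sketch (my own script, not NeMo or launcher code) that reports which precision classes the Lightning installed in the container actually exposes. The FSDP precision plugin has been renamed across pytorch_lightning releases, so the name NeMo imports here (FSDPPrecision) may simply not exist in the shipped version; the alternative names checked below are assumptions based on older Lightning releases.

# Diagnostic sketch: list which precision classes the installed pytorch_lightning provides.
# The alternative class names are guesses from older Lightning releases and may not exist.
import importlib

import pytorch_lightning as pl

print("pytorch_lightning version:", pl.__version__)

precision = importlib.import_module("pytorch_lightning.plugins.precision")
for name in ("FSDPPrecision", "FSDPMixedPrecisionPlugin", "MixedPrecisionPlugin"):
    status = "available" if hasattr(precision, name) else "missing"
    print(f"{name}: {status}")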
I ran this with the 24.01 container. The command I used is:
export HYDRA_FULL_ERROR=1; venv/bin/python3 launcher_scripts/main.py \
    training=llama/llama2_70b \
    stages=["training"] \
    numa_mapping.enable=True \
    container=/mtrsysgwork/liorpa/containers/nemo_24_01.sqsh \
    training.trainer.num_nodes=16 \
    training.model.global_batch_size=256 \
    training.model.virtual_pipeline_model_parallel_size=null \
    training.model.tensor_model_parallel_size=4 \
    training.model.pipeline_model_parallel_size=1 \
    training.model.micro_batch_size=2 \
    training.model.data.data_prefix=["1.0",'${data_dir}/my-gpt3_00_text_document'] \
    data_dir=/raid/dataset/the_pile/shard00 \
    launcher_scripts_path=$PWD/launcher_scripts \
    base_results_dir=/mtrsysgwork/liorpa/llama_results \
    training.model.data.index_mapping_dir=/mtrsysgwork/liorpa/llama_results \
    training.trainer.max_steps=100 \
    training.trainer.val_check_interval=100 \
    training.exp_manager.create_checkpoint_callback=false \
    training.trainer.enable_checkpointing=False \
    training.trainer.log_every_n_steps=1 \
    training.run.name=llama2_70b_release_24_01 \
    cluster.partition=ISR1-ALL \
    +training.model.fsdp=true \
    training.model.megatron_amp_O2=false \
    training.model.activations_checkpoint_num_layers=null \
    training.model.optim.name=fused_adam \
    cluster.gpus_per_node=8 \
    training.model.tokenizer.model=/mtrsysgwork/liorpa/llama_results/llama/tokenizer.model \
    training.model.gradient_accumulation_fusion=false \
    training.exp_manager.create_wandb_logger=false \
    ~training.model.optim.bucket_cap_mb \
    ~training.model.optim.overlap_grad_sync \
    ~training.model.optim.overlap_param_sync \
    ~training.model.optim.contiguous_grad_buffer
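
As a side note on what the dotted overrides expand to, this small OmegaConf sketch (my own illustration, not launcher code; it only covers a few of the plain overrides from the command above, since the +/~ prefixes are Hydra-specific) shows how they resolve into the nested training config:

# Illustration only: resolve a few of the dotted overrides into a nested config.
from omegaconf import OmegaConf

overrides = [
    "training.model.fsdp=true",
    "training.model.megatron_amp_O2=false",
    "training.model.tensor_model_parallel_size=4",
    "training.model.pipeline_model_parallel_size=1",
    "training.model.micro_batch_size=2",
]
cfg = OmegaConf.from_dotlist(overrides)
print(OmegaConf.to_yaml(cfg))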
Perhaps the ImportError could be fixed by this: NVIDIA-NeMo/NeMo#8689