Skip to content

Test a40x4 #23

@AfrosamuraiNo1

Description

@AfrosamuraiNo1

Пробовал запускать из разных мест, но финал такой. Помогите, вдруг сталкивались.

Makefile
# Root directory that holds setup.sh, the DeepLearningExamples and
# deeplearning-benchmark checkouts, and the data/ directory.
# Override on the command line: make <target> PATH_STORAGE=/some/path
PATH_STORAGE ?= "/home/ubuntu"

# NGC image (repository:tag); the docker run recipes prefix it with nvcr.io/nvidia/.
NAME_NGC := pytorch:22.10-py3

# Argument handed to run_prepare.sh by the create_data target.
NAME_DATASET := all

# Arguments handed to run_benchmark.sh by the benchmark target, in this order:
# config name, model selection, timeout.
# Alternative config name kept for reference: StorageTest_24GB_v1
NAME_CONFIG := 4xA40
NAME_MODEL := all
TIME_OUT := 3000

# Result directory, relative to deeplearning-benchmark/pytorch, whose *.txt
# files the benchmark target greps for "Training performance" lines.
PATH_OUTPUT := results/4_a40_test/PyTorch_tacotron2_fp32

# Pull the NGC image named by NAME_NGC, then run setup.sh from PATH_STORAGE,
# passing the image name as its only argument.
# The recipe is one shell command chain: every line but the last ends in "\"
# so that the cd still applies to the commands after it, and each recipe line
# starts with a hard tab as make requires.
.PHONY: install
install:
	docker pull nvcr.io/nvidia/${NAME_NGC} && \
	cd ${PATH_STORAGE} && \
	chmod +x setup.sh && \
	./setup.sh ${NAME_NGC}

# Run run_prepare.sh for NAME_DATASET inside the NGC container.
# Host directories under PATH_STORAGE are bind-mounted as
# /workspace/benchmark, /data and /scripts; the container first copies
# /scripts/* into /workspace and then starts the prepare script from there.
# The whole recipe is a single shell command joined with "\" continuations.
.PHONY: create_data
create_data:
	cd ${PATH_STORAGE}/deeplearning-benchmark/pytorch && \
	docker run --env http_proxy="http://00.00.00.00:3128" --env https_proxy="http://00.00.00.00:3128" --gpus all --rm -t --shm-size=128g \
		-v ${PATH_STORAGE}/DeepLearningExamples/PyTorch:/workspace/benchmark \
		-v ${PATH_STORAGE}/data:/data \
		-v ${PATH_STORAGE}/deeplearning-benchmark/pytorch/scripts:/scripts \
		nvcr.io/nvidia/${NAME_NGC} \
		/bin/bash -c "cp -r /scripts/* /workspace; ./run_prepare.sh ${NAME_DATASET}"

# Run run_benchmark.sh (NAME_CONFIG, NAME_MODEL, TIME_OUT) inside the NGC
# container, then print the "Training performance" lines found in the *.txt
# files under PATH_OUTPUT.
# Host directories under PATH_STORAGE are bind-mounted as
# /workspace/benchmark, /data, /scripts and /results.
.PHONY: benchmark
benchmark:
	@# Fail fast when the model sources are absent: docker creates a missing
	@# bind-mount source as an empty directory, after which every benchmark
	@# aborts inside the container with "No such file or directory".
	@[ -n "$$(ls -A ${PATH_STORAGE}/DeepLearningExamples/PyTorch 2>/dev/null)" ] || { \
		echo "error: ${PATH_STORAGE}/DeepLearningExamples/PyTorch is missing or empty; run 'make install' with the same PATH_STORAGE first" >&2; \
		exit 1; \
	}
	cd ${PATH_STORAGE}/deeplearning-benchmark/pytorch && \
	docker run --env http_proxy="http://00.00.00.00:3128" --env https_proxy="http://00.00.00.00:3128" \
		--rm --shm-size=128g \
		--gpus all \
		-v ${PATH_STORAGE}/DeepLearningExamples/PyTorch:/workspace/benchmark \
		-v ${PATH_STORAGE}/data:/data \
		-v ${PATH_STORAGE}/deeplearning-benchmark/pytorch/scripts:/scripts \
		-v ${PATH_STORAGE}/deeplearning-benchmark/pytorch/results:/results \
		nvcr.io/nvidia/${NAME_NGC} \
		/bin/bash -c "cp -r /scripts/* /workspace; ./run_benchmark.sh ${NAME_CONFIG} ${NAME_MODEL} ${TIME_OUT}" && \
	grep -r '^Training performance = ' ${PATH_STORAGE}/deeplearning-benchmark/pytorch/${PATH_OUTPUT}/*.txt

$sudo make benchmark PATH_STORAGE=${PATH_STORAGE}
cd /home/user/torch_test/deeplearning-benchmark/pytorch &&
docker run --env http_proxy="http://00.00.00.00:3128" --env https_proxy="http://00.00.00.00:3128"
--rm --shm-size=128g
--gpus all
-v /home/user/torch_test/DeepLearningExamples/PyTorch:/workspace/benchmark
-v /home/user/torch_test/data:/data
-v /home/user/torch_test/deeplearning-benchmark/pytorch/scripts:/scripts
-v /home/user/torch_test/deeplearning-benchmark/pytorch/results:/results
nvcr.io/nvidia/pytorch:22.10-py3
/bin/bash -c "cp -r /scripts/* /workspace; ./run_benchmark.sh 4xA40 all 3000" &&
grep -r '^Training performance = ' /home/user/torch_test/deeplearning-benchmark/pytorch/results/4_a40_test/PyTorch_tacotron2_fp32/*.txt

=============
== PyTorch ==

NVIDIA Release 22.10 (build 46164382)
PyTorch Version 1.13.0a0+d0d6b1f

Container image Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

Copyright (c) 2014-2022 Facebook Inc.
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
Copyright (c) 2015 Google Inc.
Copyright (c) 2015 Yangqing Jia
Copyright (c) 2013-2016 The Caffe contributors
All rights reserved.

Various files include modifications (c) NVIDIA CORPORATION & AFFILIATES. All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting termcolor
Downloading termcolor-2.4.0-py3-none-any.whl (7.7 kB)
Installing collected packages: termcolor
Successfully installed termcolor-2.4.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting git+https://github.com/NVIDIA/dllogger
Cloning https://github.com/NVIDIA/dllogger to /tmp/pip-req-build-g_sbaxsm
Running command git clone -q https://github.com/NVIDIA/dllogger /tmp/pip-req-build-g_sbaxsm
Resolved https://github.com/NVIDIA/dllogger to commit 0540a43971f4a8a16693a9de9de73c1072020769
Building wheels for collected packages: DLLogger
Building wheel for DLLogger (setup.py): started
Building wheel for DLLogger (setup.py): finished with status 'done'
Created wheel for DLLogger: filename=DLLogger-1.0.0-py3-none-any.whl size=5670 sha256=8ce665039162cf5467e52bc66ff2352bb6d24f3d1b945b489268fa8b7c5eace9
Stored in directory: /tmp/pip-ephem-wheel-cache-m1fv_kwf/wheels/ad/94/cf/8f3396cb8d62d532695ec557e193fada55cd366e14fd9a02be
Successfully built DLLogger
Installing collected packages: DLLogger
Successfully installed DLLogger-1.0.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
cp: cannot create regular file 'benchmark/LanguageModeling/BERT': No such file or directory
cp: cannot create regular file 'benchmark/SpeechSynthesis/Tacotron2': No such file or directory
./run_benchmark.sh: line 16: cd: benchmark/Detection/SSD: No such file or directory
/workspace /workspace
ERROR: Directory '.' is not installable. Neither 'setup.py' nor 'pyproject.toml' found.
/workspace
4xA40

PyTorch_ncf_FP32 started:
./benchmark_pytorch.sh: line 242: cd: examples/ncf: No such file or directory
/workspace /workspace


--data /data/ncf/cache/ml-20m --epochs 2 --batch_size 20000000 --opt_level O0


WARNING:torch.distributed.run:


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


/opt/conda/bin/python: can't open file 'ncf.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'ncf.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'ncf.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'ncf.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 336) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

ncf.py FAILED

Failures:
[1]:
time : 2024-03-21_17:03:11
host : fccafc2c856c
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 337)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-03-21_17:03:11
host : fccafc2c856c
rank : 2 (local_rank: 2)
exitcode : 2 (pid: 338)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-03-21_17:03:11
host : fccafc2c856c
rank : 3 (local_rank: 3)
exitcode : 2 (pid: 339)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2024-03-21_17:03:11
host : fccafc2c856c
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 336)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

^CPyTorch_ncf_FP32 ended.
/workspace
PyTorch_tacotron2_FP16 started:
./benchmark_pytorch.sh: line 242: cd: examples/tacotron2: No such file or directory
/workspace /workspace
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


--model-name Tacotron2 --output ./ --learning-rate 0.0 --epochs 2 --batch-size 148 --weight-decay 1e-6 --grad-clip-thresh 1.0 --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_2500_filelist.txt --dataset-path /data/tacotron2/LJSpeech-1.1 --cudnn-enabled --amp-run


/opt/conda/bin/python: No module named multiproc
PyTorch_tacotron2_FP16 ended.
/workspace
PyTorch_resnet50_AMP started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/resnet50v1.5: No such file or directory


/data/imagenet --arch resnet50 --amp --static-loss-scale 256 --epochs 2 --prof 100 --batch-size 928 --raport-file benchmark.json --print-freq 1 --training-only --data-backend syntetic


python: can't open file './multiproc.py': [Errno 2] No such file or directory
PyTorch_resnet50_AMP ended.
/workspace
PyTorch_bert_large_squad_FP32 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/bert: No such file or directory
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


/data/bert_large/bert_large_uncased.pt 2.0 18 0.0 fp32 4 1 /data/squad/v1.1 /data/bert_large/bert-large-uncased-vocab.txt . train /data/bert_large/bert_config.json 100


bash: scripts/run_squad.sh: No such file or directory
PyTorch_bert_large_squad_FP32 ended.
/workspace
PyTorch_ncf_FP16 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/ncf: No such file or directory


--data /data/ncf/cache/ml-20m --epochs 2 --batch_size 40000000 --opt_level O2


WARNING:torch.distributed.run:


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


/opt/conda/bin/python: can't open file 'ncf.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'ncf.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'ncf.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'ncf.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 509) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

ncf.py FAILED

Failures:
[1]:
time : 2024-03-21_17:03:41
host : fccafc2c856c
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 510)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-03-21_17:03:41
host : fccafc2c856c
rank : 2 (local_rank: 2)
exitcode : 2 (pid: 511)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-03-21_17:03:41
host : fccafc2c856c
rank : 3 (local_rank: 3)
exitcode : 2 (pid: 512)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2024-03-21_17:03:41
host : fccafc2c856c
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 509)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

PyTorch_ncf_FP16 ended.
/workspace
PyTorch_waveglow_FP16 started:
./benchmark_pytorch.sh: line 242: cd: examples/tacotron2: No such file or directory
/workspace /workspace
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


-o ./ --model-name WaveGlow --learning-rate 0.0 --epochs 2 --segment-length 8000 --batch-size 32 --weight-decay 0 --grad-clip-thresh 65504 --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_625_filelist.txt --dataset-path /data/tacotron2/LJSpeech-1.1 --cudnn-enabled --cudnn-benchmark --amp-run


/opt/conda/bin/python: No module named multiproc
PyTorch_waveglow_FP16 ended.
/workspace
PyTorch_transformerxllarge_FP32 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/transformer-xl/pytorch: No such file or directory
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


--data /data/transformer-xl/wikitext-103 --max_step 100 --batch_size 64 --dataset wt103 --n_layer 18 --d_model 1024 --n_head 16 --d_head 64 --d_inner 4096 --dropout 0.2 --dropatt 0.2 --optim adam --lr 0.0 --warmup_step 16000 --tgt_len 256 --mem_len 256 --eval_tgt_len 128 --eval_interval 5000 --roll --cuda


WARNING:torch.distributed.run:


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 633) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

train.py FAILED

Failures:
[1]:
time : 2024-03-21_17:04:01
host : fccafc2c856c
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 634)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-03-21_17:04:01
host : fccafc2c856c
rank : 2 (local_rank: 2)
exitcode : 2 (pid: 635)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-03-21_17:04:01
host : fccafc2c856c
rank : 3 (local_rank: 3)
exitcode : 2 (pid: 636)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2024-03-21_17:04:01
host : fccafc2c856c
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 633)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

PyTorch_transformerxllarge_FP32 ended.
/workspace
PyTorch_gnmt_FP32 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/gnmt: No such file or directory
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


--dataset-dir /data/gnmt/wmt16_de_en --train-batch-size 648 --val-batch-size 32 --test-batch-size 32 --math fp32 --epochs 2 --seed 2


WARNING:torch.distributed.run:


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 730) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

train.py FAILED

Failures:
[1]:
time : 2024-03-21_17:04:14
host : fccafc2c856c
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 731)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-03-21_17:04:14
host : fccafc2c856c
rank : 2 (local_rank: 2)
exitcode : 2 (pid: 732)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-03-21_17:04:14
host : fccafc2c856c
rank : 3 (local_rank: 3)
exitcode : 2 (pid: 733)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2024-03-21_17:04:14
host : fccafc2c856c
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 730)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

PyTorch_gnmt_FP32 ended.
/workspace
PyTorch_bert_large_squad_FP16 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/bert: No such file or directory
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


/data/bert_large/bert_large_uncased.pt 2.0 36 0.0 fp16 4 1 /data/squad/v1.1 /data/bert_large/bert-large-uncased-vocab.txt . train /data/bert_large/bert_config.json 200


bash: scripts/run_squad.sh: No such file or directory
PyTorch_bert_large_squad_FP16 ended.
/workspace
PyTorch_SSD_AMP started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/ssd: No such file or directory


--data /data/object_detection --batch-size 256 --benchmark-warmup 10 --benchmark-iterations 20 --amp --learning-rate 0


WARNING:torch.distributed.run:


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


/opt/conda/bin/python: can't open file 'main.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'main.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'main.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'main.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 853) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

main.py FAILED

Failures:
[1]:
time : 2024-03-21_17:04:32
host : fccafc2c856c
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 854)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-03-21_17:04:32
host : fccafc2c856c
rank : 2 (local_rank: 2)
exitcode : 2 (pid: 855)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-03-21_17:04:32
host : fccafc2c856c
rank : 3 (local_rank: 3)
exitcode : 2 (pid: 856)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2024-03-21_17:04:32
host : fccafc2c856c
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 853)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

PyTorch_SSD_AMP ended.
/workspace
PyTorch_transformerxlbase_FP16 started:
./benchmark_pytorch.sh: line 242: cd: examples/transformer-xl/pytorch: No such file or directory
/workspace /workspace
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


--data /data/transformer-xl/wikitext-103 --max_step 100 --batch_size 256 --dataset wt103 --n_layer 16 --d_model 512 --n_head 8 --d_head 64 --d_inner 2048 --dropout 0.1 --dropatt 0.0 --optim jitlamb --lr 0.0 --eta_min 0.001 --warmup_step 1000 --tgt_len 192 --mem_len 192 --eval_tgt_len 192 --log_interval 10 --eval_interval 5000 --roll --cuda --fp16


WARNING:torch.distributed.run:


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 950) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

train.py FAILED

Failures:
[1]:
time : 2024-03-21_17:04:45
host : fccafc2c856c
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 951)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-03-21_17:04:45
host : fccafc2c856c
rank : 2 (local_rank: 2)
exitcode : 2 (pid: 952)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-03-21_17:04:45
host : fccafc2c856c
rank : 3 (local_rank: 3)
exitcode : 2 (pid: 953)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2024-03-21_17:04:45
host : fccafc2c856c
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 950)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

PyTorch_transformerxlbase_FP16 ended.
/workspace
PyTorch_bert_base_squad_FP32 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/bert: No such file or directory
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


/data/bert_base/bert_base_uncased.pt 2.0 56 0.0 fp32 4 1 /data/squad/v1.1 /data/bert_base/bert-base-uncased-vocab.txt /results train /data/bert_base/bert_config.json 100


bash: scripts/run_squad.sh: No such file or directory
PyTorch_bert_base_squad_FP32 ended.
/workspace
PyTorch_transformerxllarge_FP16 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/transformer-xl/pytorch: No such file or directory
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


--data /data/transformer-xl/wikitext-103 --max_step 100 --batch_size 128 --dataset wt103 --n_layer 18 --d_model 1024 --n_head 16 --d_head 64 --d_inner 4096 --dropout 0.2 --dropatt 0.2 --optim adam --lr 0.0 --warmup_step 16000 --tgt_len 256 --mem_len 256 --eval_tgt_len 128 --eval_interval 5000 --cuda --fp16


WARNING:torch.distributed.run:


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 1077) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

train.py FAILED

Failures:
[1]:
time : 2024-03-21_17:05:05
host : fccafc2c856c
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 1078)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-03-21_17:05:05
host : fccafc2c856c
rank : 2 (local_rank: 2)
exitcode : 2 (pid: 1079)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-03-21_17:05:05
host : fccafc2c856c
rank : 3 (local_rank: 3)
exitcode : 2 (pid: 1080)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2024-03-21_17:05:05
host : fccafc2c856c
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 1077)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

PyTorch_transformerxllarge_FP16 ended.
/workspace
PyTorch_tacotron2_FP32 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/tacotron2: No such file or directory
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


--model-name Tacotron2 --output ./ --learning-rate 0.0 --epochs 2 --batch-size 136 --weight-decay 1e-6 --grad-clip-thresh 1.0 --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_1250_filelist.txt --dataset-path /data/tacotron2/LJSpeech-1.1 --cudnn-enabled


/opt/conda/bin/python: No module named multiproc
PyTorch_tacotron2_FP32 ended.
/workspace
PyTorch_waveglow_FP32 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/tacotron2: No such file or directory
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


-o ./ --model-name WaveGlow --learning-rate 0.0 --epochs 2 --segment-length 8000 --batch-size 26 --weight-decay 0 --grad-clip-thresh 65504 --log-file nvlog.json --training-files filelists/ljs_audio_text_train_subset_625_filelist.txt --dataset-path /data/tacotron2/LJSpeech-1.1 --cudnn-enabled --cudnn-benchmark


/opt/conda/bin/python: No module named multiproc
PyTorch_waveglow_FP32 ended.
/workspace
PyTorch_SSD_FP32 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/ssd: No such file or directory


--data /data/object_detection --batch-size 144 --benchmark-warmup 50 --benchmark-iterations 100 --learning-rate 0


WARNING:torch.distributed.run:


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


/opt/conda/bin/python: can't open file 'main.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'main.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'main.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'main.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 1224) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

main.py FAILED

Failures:
[1]:
time : 2024-03-21_17:05:30
host : fccafc2c856c
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 1225)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-03-21_17:05:30
host : fccafc2c856c
rank : 2 (local_rank: 2)
exitcode : 2 (pid: 1226)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-03-21_17:05:30
host : fccafc2c856c
rank : 3 (local_rank: 3)
exitcode : 2 (pid: 1227)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2024-03-21_17:05:30
host : fccafc2c856c
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 1224)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

PyTorch_SSD_FP32 ended.
/workspace
PyTorch_resnet50_FP32 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/resnet50v1.5: No such file or directory


/data/imagenet --arch resnet50 --epochs 2 --prof 100 --batch-size 496 --raport-file benchmark.json --print-freq 1 --training-only --data-backend syntetic


python: can't open file './multiproc.py': [Errno 2] No such file or directory
PyTorch_resnet50_FP32 ended.
/workspace
PyTorch_transformerxlbase_FP32 started:
./benchmark_pytorch.sh: line 242: cd: examples/transformer-xl/pytorch: No such file or directory
/workspace /workspace
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


--data /data/transformer-xl/wikitext-103 --max_step 100 --batch_size 128 --dataset wt103 --n_layer 16 --d_model 512 --n_head 8 --d_head 64 --d_inner 2048 --dropout 0.1 --dropatt 0.0 --optim jitlamb --lr 0.0 --eta_min 0.001 --warmup_step 1000 --tgt_len 192 --mem_len 192 --eval_tgt_len 192 --log_interval 10 --eval_interval 5000 --roll --cuda


WARNING:torch.distributed.run:


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 1344) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

train.py FAILED

Failures:
[1]:
time : 2024-03-21_17:05:48
host : fccafc2c856c
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 1345)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-03-21_17:05:48
host : fccafc2c856c
rank : 2 (local_rank: 2)
exitcode : 2 (pid: 1346)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-03-21_17:05:48
host : fccafc2c856c
rank : 3 (local_rank: 3)
exitcode : 2 (pid: 1347)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2024-03-21_17:05:48
host : fccafc2c856c
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 1344)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

PyTorch_transformerxlbase_FP32 ended.
/workspace
PyTorch_gnmt_FP16 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/gnmt: No such file or directory
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


--dataset-dir /data/gnmt/wmt16_de_en --train-batch-size 880 --val-batch-size 32 --test-batch-size 32 --math fp16 --epochs 2 --seed 2


WARNING:torch.distributed.run:


Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.


/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
/opt/conda/bin/python: can't open file 'train.py': [Errno 2] No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 1441) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.13.0a0+d0d6b1f', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

train.py FAILED

Failures:
[1]:
time : 2024-03-21_17:06:01
host : fccafc2c856c
rank : 1 (local_rank: 1)
exitcode : 2 (pid: 1442)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-03-21_17:06:01
host : fccafc2c856c
rank : 2 (local_rank: 2)
exitcode : 2 (pid: 1443)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-03-21_17:06:01
host : fccafc2c856c
rank : 3 (local_rank: 3)
exitcode : 2 (pid: 1444)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2024-03-21_17:06:01
host : fccafc2c856c
rank : 0 (local_rank: 0)
exitcode : 2 (pid: 1441)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

PyTorch_gnmt_FP16 ended.
/workspace
PyTorch_bert_base_squad_FP16 started:
/workspace /workspace
./benchmark_pytorch.sh: line 242: cd: examples/bert: No such file or directory
ERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'


/data/bert_base/bert_base_uncased.pt 2.0 108 0.0 fp16 4 1 /data/squad/v1.1 /data/bert_base/bert-base-uncased-vocab.txt . train /data/bert_base/bert_config.json 200


bash: scripts/run_squad.sh: No such file or directory
PyTorch_bert_base_squad_FP16 ended.
/workspace
Check results folder : /results/4xA40
['PyTorch_SSD_AMP', 'PyTorch_SSD_FP32', 'PyTorch_bert_base_squad_FP16', 'PyTorch_bert_base_squad_FP32', 'PyTorch_bert_large_squad_FP16', 'PyTorch_bert_large_squad_FP32', 'PyTorch_gnmt_FP16', 'PyTorch_gnmt_FP32', 'PyTorch_maskrcnn_FP16', 'PyTorch_maskrcnn_FP32', 'PyTorch_ncf_FP16', 'PyTorch_ncf_FP32', 'PyTorch_resnet50_AMP', 'PyTorch_resnet50_FP16', 'PyTorch_resnet50_FP32', 'PyTorch_tacotron2_FP16', 'PyTorch_tacotron2_FP32', 'PyTorch_transformerxlbase_FP16', 'PyTorch_transformerxlbase_FP32', 'PyTorch_transformerxllarge_FP16', 'PyTorch_transformerxllarge_FP32', 'PyTorch_waveglow_FP16', 'PyTorch_waveglow_FP32', 'summary.txt', 'sys_pytorch.txt']
PyTorch_SSD_AMP : sucessful
PyTorch_SSD_AMP : sucessful
PyTorch_SSD_AMP : sucessful
PyTorch_SSD_AMP : sucessful
PyTorch_SSD_AMP : sucessful
PyTorch_SSD_AMP : sucessful
PyTorch_SSD_AMP : sucessful
PyTorch_SSD_FP32 : sucessful
PyTorch_SSD_FP32 : sucessful
PyTorch_SSD_FP32 : sucessful
PyTorch_SSD_FP32 : sucessful
PyTorch_SSD_FP32 : sucessful
PyTorch_SSD_FP32 : sucessful
PyTorch_SSD_FP32 : sucessful
PyTorch_bert_base_squad_FP16 : sucessful
PyTorch_bert_base_squad_FP16 : sucessful
PyTorch_bert_base_squad_FP16 : sucessful
PyTorch_bert_base_squad_FP16 : sucessful
PyTorch_bert_base_squad_FP16 : sucessful
PyTorch_bert_base_squad_FP16 : sucessful
PyTorch_bert_base_squad_FP16 : sucessful
PyTorch_bert_base_squad_FP32 : sucessful
PyTorch_bert_base_squad_FP32 : sucessful
PyTorch_bert_base_squad_FP32 : sucessful
PyTorch_bert_base_squad_FP32 : sucessful
PyTorch_bert_base_squad_FP32 : sucessful
PyTorch_bert_base_squad_FP32 : sucessful
PyTorch_bert_base_squad_FP32 : sucessful
PyTorch_bert_large_squad_FP16 : sucessful
PyTorch_bert_large_squad_FP16 : sucessful
PyTorch_bert_large_squad_FP16 : sucessful
PyTorch_bert_large_squad_FP16 : sucessful
PyTorch_bert_large_squad_FP16 : sucessful
PyTorch_bert_large_squad_FP16 : sucessful
PyTorch_bert_large_squad_FP16 : sucessful
PyTorch_bert_large_squad_FP32 : sucessful
PyTorch_bert_large_squad_FP32 : sucessful
PyTorch_bert_large_squad_FP32 : sucessful
PyTorch_bert_large_squad_FP32 : sucessful
PyTorch_bert_large_squad_FP32 : sucessful
PyTorch_bert_large_squad_FP32 : sucessful
PyTorch_bert_large_squad_FP32 : sucessful
PyTorch_gnmt_FP16 : sucessful
PyTorch_gnmt_FP16 : sucessful
PyTorch_gnmt_FP16 : sucessful
PyTorch_gnmt_FP16 : sucessful
PyTorch_gnmt_FP16 : sucessful
PyTorch_gnmt_FP16 : sucessful
PyTorch_gnmt_FP16 : sucessful
PyTorch_gnmt_FP32 : sucessful
PyTorch_gnmt_FP32 : sucessful
PyTorch_gnmt_FP32 : sucessful
PyTorch_gnmt_FP32 : sucessful
PyTorch_gnmt_FP32 : sucessful
PyTorch_gnmt_FP32 : sucessful
PyTorch_gnmt_FP32 : sucessful
PyTorch_gnmt_FP32 : sucessful
PyTorch_maskrcnn_FP16 : sucessful
PyTorch_maskrcnn_FP32 : sucessful
PyTorch_ncf_FP16 : sucessful
PyTorch_ncf_FP16 : sucessful
PyTorch_ncf_FP16 : sucessful
PyTorch_ncf_FP16 : sucessful
PyTorch_ncf_FP16 : sucessful
PyTorch_ncf_FP16 : sucessful
PyTorch_ncf_FP16 : sucessful
PyTorch_ncf_FP32 : sucessful
PyTorch_ncf_FP32 : sucessful
PyTorch_ncf_FP32 : sucessful
PyTorch_ncf_FP32 : sucessful
PyTorch_ncf_FP32 : sucessful
PyTorch_ncf_FP32 : sucessful
PyTorch_ncf_FP32 : sucessful
PyTorch_resnet50_AMP : sucessful
PyTorch_resnet50_AMP : sucessful
PyTorch_resnet50_AMP : sucessful
PyTorch_resnet50_AMP : sucessful
PyTorch_resnet50_AMP : sucessful
PyTorch_resnet50_AMP : sucessful
PyTorch_resnet50_AMP : sucessful
PyTorch_resnet50_FP16 : sucessful
PyTorch_resnet50_FP32 : sucessful
PyTorch_resnet50_FP32 : sucessful
PyTorch_resnet50_FP32 : sucessful
PyTorch_resnet50_FP32 : sucessful
PyTorch_resnet50_FP32 : sucessful
PyTorch_resnet50_FP32 : sucessful
PyTorch_resnet50_FP32 : sucessful
PyTorch_tacotron2_FP16 : sucessful
PyTorch_tacotron2_FP16 : sucessful
PyTorch_tacotron2_FP16 : sucessful
PyTorch_tacotron2_FP16 : sucessful
PyTorch_tacotron2_FP16 : sucessful
PyTorch_tacotron2_FP16 : sucessful
PyTorch_tacotron2_FP16 : sucessful
PyTorch_tacotron2_FP16 : sucessful
PyTorch_tacotron2_FP32 : sucessful
PyTorch_tacotron2_FP32 : sucessful
PyTorch_tacotron2_FP32 : sucessful
PyTorch_tacotron2_FP32 : sucessful
PyTorch_tacotron2_FP32 : sucessful
PyTorch_tacotron2_FP32 : sucessful
PyTorch_tacotron2_FP32 : sucessful
PyTorch_tacotron2_FP32 : sucessful
PyTorch_transformerxlbase_FP16 : sucessful
PyTorch_transformerxlbase_FP16 : sucessful
PyTorch_transformerxlbase_FP16 : sucessful
PyTorch_transformerxlbase_FP16 : sucessful
PyTorch_transformerxlbase_FP16 : sucessful
PyTorch_transformerxlbase_FP16 : sucessful
PyTorch_transformerxlbase_FP16 : sucessful
PyTorch_transformerxlbase_FP32 : sucessful
PyTorch_transformerxlbase_FP32 : sucessful
PyTorch_transformerxlbase_FP32 : sucessful
PyTorch_transformerxlbase_FP32 : sucessful
PyTorch_transformerxlbase_FP32 : sucessful
PyTorch_transformerxlbase_FP32 : sucessful
PyTorch_transformerxlbase_FP32 : sucessful
PyTorch_transformerxllarge_FP16 : sucessful
PyTorch_transformerxllarge_FP16 : sucessful
PyTorch_transformerxllarge_FP16 : sucessful
PyTorch_transformerxllarge_FP16 : sucessful
PyTorch_transformerxllarge_FP16 : sucessful
PyTorch_transformerxllarge_FP16 : sucessful
PyTorch_transformerxllarge_FP16 : sucessful
PyTorch_transformerxllarge_FP32 : sucessful
PyTorch_transformerxllarge_FP32 : sucessful
PyTorch_transformerxllarge_FP32 : sucessful
PyTorch_transformerxllarge_FP32 : sucessful
PyTorch_transformerxllarge_FP32 : sucessful
PyTorch_transformerxllarge_FP32 : sucessful
PyTorch_transformerxllarge_FP32 : sucessful
PyTorch_transformerxllarge_FP32 : sucessful
PyTorch_waveglow_FP16 : sucessful
PyTorch_waveglow_FP16 : sucessful
PyTorch_waveglow_FP16 : sucessful
PyTorch_waveglow_FP16 : sucessful
PyTorch_waveglow_FP16 : sucessful
PyTorch_waveglow_FP16 : sucessful
PyTorch_waveglow_FP16 : sucessful
PyTorch_waveglow_FP32 : sucessful
PyTorch_waveglow_FP32 : sucessful
PyTorch_waveglow_FP32 : sucessful
PyTorch_waveglow_FP32 : sucessful
PyTorch_waveglow_FP32 : sucessful
PyTorch_waveglow_FP32 : sucessful
PyTorch_waveglow_FP32 : sucessful
make: *** [Makefile:30: benchmark] Interrupt

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions