-
Notifications
You must be signed in to change notification settings - Fork 4
Open
Labels
bugSomething isn't workingSomething isn't working
Description
Hi,
I wrote a minimal code to run GMM on 2 gpus but it doesn't work.
import torch
from torchgmm.bayes import GaussianMixture
def main():
torch.manual_seed(0)
X = torch.cat([
torch.randn(10000, 2) + torch.tensor([-5.0, 0.0]),
torch.randn(10000, 2) + torch.tensor([+5.0, 0.0]),
torch.randn(10000, 2) + torch.tensor([0.0, +5.0]),
])
n_gpus = torch.cuda.device_count()
gmm = GaussianMixture(
num_components=3,
covariance_type="diag",
trainer_params=dict(
accelerator="gpu",
devices=2, # 单机两卡;多机时把 num_nodes=2
num_nodes=1,
#strategy="ddp",
#use_distributed_sampler=False,
),
)
gmm.fit(X)
rank = 0
if torch.distributed.is_available() and torch.distributed.is_initialized():
rank = torch.distributed.get_rank()
if rank == 0:
print("converged:", bool(getattr(gmm, "converged_", False)))
print("iters:", int(getattr(gmm, "num_iter_", 0)))
print("avg NLL:", float(getattr(gmm, "nll_", float("nan"))))
if __name__ == "__main__":
main()
Here's the error:
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
[rank1]: Traceback (most recent call last):
[rank1]: File "/data/UTe2XTEC/Untitled Folder/minimal_gmm.py", line 41, in <module>
[rank1]: main()
[rank1]: File "/data/UTe2XTEC/Untitled Folder/minimal_gmm.py", line 29, in main
[rank1]: gmm.fit(X)
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/bayes/gmm/estimator.py", line 172, in fit
[rank1]: ).fit(data)
[rank1]: ^^^^^^^^^
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/clustering/kmeans/estimator.py", line 139, in fit
[rank1]: self.trainer(max_epochs=num_epochs, enable_progress_bar=False).fit(module, loader)
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 543, in fit
[rank1]: call._call_and_handle_interrupt(
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt
[rank1]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
[rank1]: return function(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 579, in _fit_impl
[rank1]: self._run(model, ckpt_path=ckpt_path)
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _run
[rank1]: results = self._run_stage()
[rank1]: ^^^^^^^^^^^^^^^^^
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 1030, in _run_stage
[rank1]: self.fit_loop.run()
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py", line 197, in run
[rank1]: self.setup_data()
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py", line 241, in setup_data
[rank1]: dl = _process_dataloader(trainer, trainer_fn, stage, dl)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 484, in _process_dataloader
[rank1]: dataloader = trainer._data_connector._prepare_dataloader(dataloader, shuffle=is_shuffled, mode=stage)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 190, in _prepare_dataloader
[rank1]: return _update_dataloader(dataloader, sampler, mode=mode)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 134, in _update_dataloader
[rank1]: dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, sampler, mode)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 193, in _get_dataloader_init_args_and_kwargs
[rank1]: dl_kwargs.update(_dataloader_init_kwargs_resolve_sampler(dataloader, sampler, mode))
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 283, in _dataloader_init_kwargs_resolve_sampler
[rank1]: batch_sampler = batch_sampler_cls(
[rank1]: ^^^^^^^^^^^^^^^^^^
[rank1]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/base/data/sampler.py", line 25, in __init__
[rank1]: assert isinstance(sampler, SequentialSampler), f"{self.__class__.__name__} only works with sequential samplers."
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: AssertionError: RangeBatchSampler only works with sequential samplers.
[rank0]: Traceback (most recent call last):
[rank0]: File "/data/UTe2XTEC/Untitled Folder/minimal_gmm.py", line 41, in <module>
[rank0]: main()
[rank0]: File "/data/UTe2XTEC/Untitled Folder/minimal_gmm.py", line 29, in main
[rank0]: gmm.fit(X)
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/bayes/gmm/estimator.py", line 172, in fit
[rank0]: ).fit(data)
[rank0]: ^^^^^^^^^
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/clustering/kmeans/estimator.py", line 139, in fit
[rank0]: self.trainer(max_epochs=num_epochs, enable_progress_bar=False).fit(module, loader)
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 543, in fit
[rank0]: call._call_and_handle_interrupt(
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt
[rank0]: return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
[rank0]: return function(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 579, in _fit_impl
[rank0]: self._run(model, ckpt_path=ckpt_path)
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _run
[rank0]: results = self._run_stage()
[rank0]: ^^^^^^^^^^^^^^^^^
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 1030, in _run_stage
[rank0]: self.fit_loop.run()
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py", line 197, in run
[rank0]: self.setup_data()
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py", line 241, in setup_data
[rank0]: dl = _process_dataloader(trainer, trainer_fn, stage, dl)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 484, in _process_dataloader
[rank0]: dataloader = trainer._data_connector._prepare_dataloader(dataloader, shuffle=is_shuffled, mode=stage)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 190, in _prepare_dataloader
[rank0]: return _update_dataloader(dataloader, sampler, mode=mode)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 134, in _update_dataloader
[rank0]: dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, sampler, mode)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 193, in _get_dataloader_init_args_and_kwargs
[rank0]: dl_kwargs.update(_dataloader_init_kwargs_resolve_sampler(dataloader, sampler, mode))
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 283, in _dataloader_init_kwargs_resolve_sampler
[rank0]: batch_sampler = batch_sampler_cls(
[rank0]: ^^^^^^^^^^^^^^^^^^
[rank0]: File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/base/data/sampler.py", line 25, in __init__
[rank0]: assert isinstance(sampler, SequentialSampler), f"{self.__class__.__name__} only works with sequential samplers."
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: AssertionError: RangeBatchSampler only works with sequential samplers.
Please let me know what's going wrong here, thanks!
Version information
No response
Metadata
Metadata
Assignees
Labels
bugSomething isn't workingSomething isn't working