Unable to run torchgmm.bayes.GaussianMixture using 2 GPUs #102

@YanjunLiu2

Description

Hi,

I wrote a minimal script to fit a GaussianMixture on 2 GPUs, but it doesn't work.

import torch
from torchgmm.bayes import GaussianMixture

def main():
    torch.manual_seed(0)
    X = torch.cat([
        torch.randn(10000, 2) + torch.tensor([-5.0, 0.0]),
        torch.randn(10000, 2) + torch.tensor([+5.0, 0.0]),
        torch.randn(10000, 2) + torch.tensor([0.0, +5.0]),
    ])

    n_gpus = torch.cuda.device_count()

    gmm = GaussianMixture(
        num_components=3,
        covariance_type="diag",
        trainer_params=dict(
            accelerator="gpu",
            devices=2,  # two GPUs on one machine; for multi-node runs, also set num_nodes=2
            num_nodes=1,
            # strategy="ddp",
            # use_distributed_sampler=False,
        ),
    )

    gmm.fit(X)
    # Under DDP, only rank 0 should print the fitted summary.
    rank = 0
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
    if rank == 0:
        print("converged:", bool(getattr(gmm, "converged_", False)))
        print("iters:", int(getattr(gmm, "num_iter_", 0)))
        print("avg NLL:", float(getattr(gmm, "nll_", float("nan"))))

if __name__ == "__main__":
    main()


Here's the error:

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
[rank1]: Traceback (most recent call last):
[rank1]:   File "/data/UTe2XTEC/Untitled Folder/minimal_gmm.py", line 41, in <module>
[rank1]:     main()
[rank1]:   File "/data/UTe2XTEC/Untitled Folder/minimal_gmm.py", line 29, in main
[rank1]:     gmm.fit(X)
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/bayes/gmm/estimator.py", line 172, in fit
[rank1]:     ).fit(data)
[rank1]:       ^^^^^^^^^
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/clustering/kmeans/estimator.py", line 139, in fit
[rank1]:     self.trainer(max_epochs=num_epochs, enable_progress_bar=False).fit(module, loader)
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 543, in fit
[rank1]:     call._call_and_handle_interrupt(
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt
[rank1]:     return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
[rank1]:     return function(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 579, in _fit_impl
[rank1]:     self._run(model, ckpt_path=ckpt_path)
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _run
[rank1]:     results = self._run_stage()
[rank1]:               ^^^^^^^^^^^^^^^^^
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 1030, in _run_stage
[rank1]:     self.fit_loop.run()
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py", line 197, in run
[rank1]:     self.setup_data()
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py", line 241, in setup_data
[rank1]:     dl = _process_dataloader(trainer, trainer_fn, stage, dl)
[rank1]:          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 484, in _process_dataloader
[rank1]:     dataloader = trainer._data_connector._prepare_dataloader(dataloader, shuffle=is_shuffled, mode=stage)
[rank1]:                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 190, in _prepare_dataloader
[rank1]:     return _update_dataloader(dataloader, sampler, mode=mode)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 134, in _update_dataloader
[rank1]:     dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, sampler, mode)
[rank1]:                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 193, in _get_dataloader_init_args_and_kwargs
[rank1]:     dl_kwargs.update(_dataloader_init_kwargs_resolve_sampler(dataloader, sampler, mode))
[rank1]:                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 283, in _dataloader_init_kwargs_resolve_sampler
[rank1]:     batch_sampler = batch_sampler_cls(
[rank1]:                     ^^^^^^^^^^^^^^^^^^
[rank1]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/base/data/sampler.py", line 25, in __init__
[rank1]:     assert isinstance(sampler, SequentialSampler), f"{self.__class__.__name__} only works with sequential samplers."
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: AssertionError: RangeBatchSampler only works with sequential samplers.
[rank0]: Traceback (most recent call last):
[rank0]:   File "/data/UTe2XTEC/Untitled Folder/minimal_gmm.py", line 41, in <module>
[rank0]:     main()
[rank0]:   File "/data/UTe2XTEC/Untitled Folder/minimal_gmm.py", line 29, in main
[rank0]:     gmm.fit(X)
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/bayes/gmm/estimator.py", line 172, in fit
[rank0]:     ).fit(data)
[rank0]:       ^^^^^^^^^
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/clustering/kmeans/estimator.py", line 139, in fit
[rank0]:     self.trainer(max_epochs=num_epochs, enable_progress_bar=False).fit(module, loader)
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 543, in fit
[rank0]:     call._call_and_handle_interrupt(
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt
[rank0]:     return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
[rank0]:     return function(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 579, in _fit_impl
[rank0]:     self._run(model, ckpt_path=ckpt_path)
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _run
[rank0]:     results = self._run_stage()
[rank0]:               ^^^^^^^^^^^^^^^^^
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/trainer.py", line 1030, in _run_stage
[rank0]:     self.fit_loop.run()
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py", line 197, in run
[rank0]:     self.setup_data()
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py", line 241, in setup_data
[rank0]:     dl = _process_dataloader(trainer, trainer_fn, stage, dl)
[rank0]:          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 484, in _process_dataloader
[rank0]:     dataloader = trainer._data_connector._prepare_dataloader(dataloader, shuffle=is_shuffled, mode=stage)
[rank0]:                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py", line 190, in _prepare_dataloader
[rank0]:     return _update_dataloader(dataloader, sampler, mode=mode)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 134, in _update_dataloader
[rank0]:     dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, sampler, mode)
[rank0]:                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 193, in _get_dataloader_init_args_and_kwargs
[rank0]:     dl_kwargs.update(_dataloader_init_kwargs_resolve_sampler(dataloader, sampler, mode))
[rank0]:                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py", line 283, in _dataloader_init_kwargs_resolve_sampler
[rank0]:     batch_sampler = batch_sampler_cls(
[rank0]:                     ^^^^^^^^^^^^^^^^^^
[rank0]:   File "/data/yanjun_data/anaconda3/envs/ICSD_client/lib/python3.12/site-packages/torchgmm/base/data/sampler.py", line 25, in __init__
[rank0]:     assert isinstance(sampler, SequentialSampler), f"{self.__class__.__name__} only works with sequential samplers."
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: AssertionError: RangeBatchSampler only works with sequential samplers.
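
From the traceback, it looks like Lightning's data connector replaces the loader's SequentialSampler with a DistributedSampler under DDP (in _prepare_dataloader / _update_dataloader), and torchgmm's RangeBatchSampler then fails its assertion because the sampler is no longer sequential. Is something like the following the intended way to run on two GPUs? This is only an untested sketch, assuming trainer_params is forwarded verbatim to pytorch_lightning.Trainer and that use_distributed_sampler=False (the flag I left commented out above) keeps Lightning from swapping the sampler:

import torch
from torchgmm.bayes import GaussianMixture

def fit_on_two_gpus(X: torch.Tensor) -> GaussianMixture:
    # Untested guess: ask Lightning not to inject a DistributedSampler, so
    # torchgmm's RangeBatchSampler keeps its SequentialSampler. Assumes
    # trainer_params is passed straight through to pytorch_lightning.Trainer.
    gmm = GaussianMixture(
        num_components=3,
        covariance_type="diag",
        trainer_params=dict(
            accelerator="gpu",
            devices=2,
            num_nodes=1,
            strategy="ddp",
            use_distributed_sampler=False,  # Lightning >= 2.0 Trainer flag
        ),
    )
    gmm.fit(X)
    return gmm

If I understand correctly, use_distributed_sampler is the Lightning 2.x name for the older replace_sampler_ddp option; if that is not the right knob here, is there a supported way to distribute the fit across both GPUs, or should I stick to a single device (devices=1)?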

Please let me know what's going wrong here, thanks!

Version information

No response

Labels: bug (Something isn't working)