diff --git a/openml/utils.py b/openml/utils.py
index 7e72e7aee..156a6257f 100644
--- a/openml/utils.py
+++ b/openml/utils.py
@@ -427,6 +427,18 @@ def safe_func(*args: P.args, **kwargs: P.kwargs) -> R:
         return func
 
 
+def get_cache_size() -> int:
+    """Calculate the total size of the OpenML cache directory.
+
+    Returns
+    -------
+    cache_size : int
+        Combined size in bytes of every file found recursively under the cache directory.
+    """
+    path = Path(config.get_cache_directory())
+    return sum(f.stat().st_size for f in path.rglob("*") if f.is_file())
+
+
 def _create_lockfiles_dir() -> Path:
     path = Path(config.get_cache_directory()) / "locks"
     # TODO(eddiebergman): Not sure why this is allowed to error and ignore???
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
index 35be84903..6e91cca1d 100644
--- a/tests/test_utils/test_utils.py
+++ b/tests/test_utils/test_utils.py
@@ -152,3 +152,27 @@ def test_correct_test_server_download_state():
     task = openml.tasks.get_task(119)
     dataset = task.get_dataset()
     assert len(dataset.features) == dataset.get_data()[0].shape[1]
+
+
+@unittest.mock.patch("openml.config.get_cache_directory")
+def test_get_cache_size(config_mock, tmp_path):
+    """Check the reported cache size before and after fetching a dataset.
+
+    The cache directory is redirected to a temporary directory (``tmp_path``)
+    by patching ``openml.config.get_cache_directory``. Two states are verified:
+    the empty cache and the cache after one dataset fetch.
+
+    Parameters
+    ----------
+    config_mock : unittest.mock.Mock
+        Mock replacing ``openml.config.get_cache_directory``; it returns ``tmp_path``.
+    tmp_path : pathlib.Path
+        A pytest-provided temporary directory used as an isolated cache location.
+    """
+    config_mock.return_value = tmp_path
+    assert openml.utils.get_cache_size() == 0
+
+    openml.datasets.get_dataset(dataset_id=3)
+    # NOTE(review): 2009 bytes depends on exactly which files the server returns
+    # for dataset 3 -- confirm this value stays stable across server updates.
+    assert openml.utils.get_cache_size() == 2009