@@ -63,16 +63,49 @@ def get_croissant_version(version: str | None) -> str | None:
6363 return version
6464
6565
def get_dataset_name(dataset: mlc.Dataset, language: str | None = None) -> str:
  """Returns dataset name of the given MLcroissant dataset.

  If the dataset URL starts with `_HUGGINGFACE_URL_PREFIX`, the name is the URL
  with that prefix removed and `language` is ignored. Otherwise the name comes
  from `dataset.metadata.name`, which may be a dict keyed by language.

  Args:
    dataset: The MLcroissant dataset.
    language: For datasets with multiple names in different languages, this
      argument specifies the language to use. If None, 'en' is used when
      present, otherwise the first language in the dict.

  Returns:
    The dataset name.

  Raises:
    ValueError: If the name is a dict and either it is empty while `language`
      is None, or `language` is given but is not one of its keys.
  """
  if (url := dataset.metadata.url) and url.startswith(_HUGGINGFACE_URL_PREFIX):
    return url.removeprefix(_HUGGINGFACE_URL_PREFIX)

  name = dataset.metadata.name
  if not isinstance(name, dict):
    # Not a per-language mapping; the cast only informs the type checker.
    return typing.cast(str, name)

  if language is not None:
    if language not in name:
      raise ValueError(
          f"Language {language} not found in dataset names {name}."
      )
    return name[language]

  # No language requested: try a heuristic language, e.g., 'en'.
  if "en" in name:
    return name["en"]
  # Otherwise, take the first language in the dict. Only `next` can raise
  # StopIteration, so the lookup itself stays outside the try body.
  try:
    first_lang = next(iter(name))
  except StopIteration as exc:
    raise ValueError("Dataset name dictionary is empty.") from exc
  return name[first_lang]
96+
97+
def get_tfds_dataset_name(
    dataset: mlc.Dataset, language: str | None = None
) -> str:
  """Returns the TFDS compatible name of the given MLcroissant dataset.

  The name is resolved with `get_dataset_name` and then passed through
  `conversion_utils.to_tfds_name`.

  Args:
    dataset: The MLcroissant dataset.
    language: For datasets whose name is given in several languages, the
      language to pick; forwarded unchanged to `get_dataset_name`.
  """
  return conversion_utils.to_tfds_name(
      get_dataset_name(dataset, language=language)
  )
77110
78111
0 commit comments