Skip to content

Commit ad18063

Browse files
committed
refactor: hf download
1 parent 1fc9b34 commit ad18063

File tree

2 files changed

+23
-19
lines changed

2 files changed

+23
-19
lines changed

bigcodebench/data/bigcodebench.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -27,9 +27,9 @@ def _ready_bigcodebench_path(mini=False, noextreme=False, version="default") ->
2727

2828
try:
2929
dataset = load_dataset(BIGCODEBENCH_HF, split=BIGCODEBENCH_VERSION)
30-
dataset.to_json(path)
30+
make_cache(url, dataset, path)
3131
except:
32-
make_cache(url, path)
32+
make_cache(url, None, path, gh=True)
3333

3434
return path
3535

bigcodebench/data/utils.py

Lines changed: 21 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -24,25 +24,29 @@ def get_dataset_metadata(name: str, version: str, mini: bool, noextreme: bool =
2424
return url, cache_path
2525

2626

27-
def make_cache(gzip_url, cache_path):
27+
def make_cache(gzip_url, hf_data, cache_path, gh=False):
2828
# Check if open eval file exists in CACHE_DIR
2929
if not os.path.exists(cache_path):
30-
# Install BigCodeBench dataset and parse as jsonl
31-
print(f"Downloading dataset from {gzip_url}")
32-
with tempdir.TempDir() as tmpdir:
33-
gz_path = os.path.join(tmpdir, f"data.jsonl.gz")
34-
wget.download(gzip_url, gz_path)
35-
36-
with gzip.open(gz_path, "rb") as f:
37-
data = f.read().decode("utf-8")
38-
39-
# create CACHE_DIR if not exists
40-
if not os.path.exists(CACHE_DIR):
41-
os.makedirs(CACHE_DIR)
42-
43-
# Write the original open eval file to CACHE_DIR
44-
with open(cache_path, "w") as f:
45-
f.write(data)
30+
31+
if gh:
32+
# Install BigCodeBench dataset and parse as jsonl
33+
print(f"Downloading dataset from {gzip_url}")
34+
with tempdir.TempDir() as tmpdir:
35+
gz_path = os.path.join(tmpdir, f"data.jsonl.gz")
36+
wget.download(gzip_url, gz_path)
37+
38+
with gzip.open(gz_path, "rb") as f:
39+
data = f.read().decode("utf-8")
40+
41+
# create CACHE_DIR if not exists
42+
if not os.path.exists(CACHE_DIR):
43+
os.makedirs(CACHE_DIR)
44+
45+
# Write the original open eval file to CACHE_DIR
46+
with open(cache_path, "w") as f:
47+
f.write(data)
48+
else:
49+
hf_data.to_json(cache_path)
4650

4751

4852
def write_jsonl(

0 commit comments

Comments
 (0)