@@ -104,10 +104,12 @@ pip install -U flash-attn
 To generate code samples from a model, you can use the following command:
 >
 ```bash
+# when greedy, there is no need for temperature and n_samples
 bigcodebench.generate \
   --model [model_name] \
-  --subset [complete|instruct] \
-  --greedy \
+  --split [complete|instruct] \
+  --subset [full|hard] \
+  [--greedy] \
   --bs [bs] \
   --temperature [temp] \
   --n_samples [n_samples] \
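For illustration, a fully spelled-out greedy run under the new `--split`/`--subset` flags might look like the sketch below; the model name is only a placeholder, and flags not shown in the hunk above are omitted.

```bash
# Illustrative sketch only: greedy decoding on the hard subset of the instruct split.
# "meta-llama/Meta-Llama-3-8B-Instruct" is a placeholder model name.
bigcodebench.generate \
  --model meta-llama/Meta-Llama-3-8B-Instruct \
  --split instruct \
  --subset hard \
  --greedy \
  --bs 1
```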
@@ -124,7 +126,8 @@ The generated code samples will be stored in a file named `[model_name]--bigcode
 # If you are using GPUs
 docker run --gpus '"device=$CUDA_VISIBLE_DEVICES"' -v $(pwd):/app -t bigcodebench/bigcodebench-generate:latest \
   --model [model_name] \
-  --subset [complete|instruct] \
+  --split [complete|instruct] \
+  --subset [full|hard] \
   [--greedy] \
   --bs [bs] \
   --temperature [temp] \
@@ -136,7 +139,8 @@ docker run --gpus '"device=$CUDA_VISIBLE_DEVICES"' -v $(pwd):/app -t bigcodebenc
 # ...Or if you are using CPUs
 docker run -v $(pwd):/app -t bigcodebench/bigcodebench-generate:latest \
   --model [model_name] \
-  --subset [complete|instruct] \
+  --split [complete|instruct] \
+  --subset [full|hard] \
   [--greedy] \
   --bs [bs] \
   --temperature [temp] \
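As a sketch, the GPU Docker invocation above with its placeholders filled in could read as follows (model name and sampling settings are made up for illustration); the CPU variant is the same command without the `--gpus` flag.

```bash
# Illustrative sketch only: sample 5 completions per task on the full complete split.
docker run --gpus '"device=$CUDA_VISIBLE_DEVICES"' -v $(pwd):/app -t bigcodebench/bigcodebench-generate:latest \
  --model bigcode/starcoder2-15b \
  --split complete \
  --subset full \
  --bs 5 \
  --temperature 0.8 \
  --n_samples 5
```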
@@ -233,10 +237,10 @@ You are strongly recommended to use a sandbox such as [docker](https://docs.dock
 # If you want to change the RAM address space limit (in MB, 128 GB by default): `--max-as-limit XXX`
 # If you want to change the RAM data segment limit (in MB, 4 GB by default): `--max-data-limit`
 # If you want to change the RAM stack limit (in MB, 4 MB by default): `--max-stack-limit`
-docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl
+docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl
 
 # If you only want to check the ground truths
-docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl --check-gt-only
+docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl --check-gt-only
 ```
 
 ...Or if you want to try it locally regardless of the risks ⚠️:
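For reference, a filled-in version of the Docker evaluation command might look like the sketch below; the raised address-space limit is an arbitrary example value.

```bash
# Illustrative sketch only: evaluate calibrated samples on the hard complete split,
# raising the RAM address space limit to 256 GB (262144 MB) as an example.
docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest \
  --split complete \
  --subset hard \
  --max-as-limit 262144 \
  --samples samples-sanitized-calibrated.jsonl
```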
@@ -251,12 +255,12 @@ Then, run the evaluation:
 
 ```bash
 # ...Or locally ⚠️
-bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl
+bigcodebench.evaluate --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl
 # ...If you really don't want to check the ground truths
-bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl --no-gt
+bigcodebench.evaluate --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl --no-gt
 
 # You are strongly recommended to use the following command to clean up the environment after evaluation:
-pids=$(ps -u $(id -u) -o pid,comm | grep '^ *[0-9]\+ bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi;
+pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi;
 rm -rf /tmp/*
 ```
 
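Putting the local path together, one possible end-to-end sketch (split and subset values chosen arbitrarily) is:

```bash
# Illustrative sketch only: local evaluation followed by the recommended cleanup.
bigcodebench.evaluate --split instruct --subset hard --samples samples-sanitized-calibrated.jsonl

# Kill any leftover bigcodebench processes owned by the current user, then clear /tmp.
pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n "$pids" ]; then echo $pids | xargs -r kill; fi;
rm -rf /tmp/*
```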