
Commit d1cb2fe

Merge branch 'main' of https://github.com/bigcode-project/bigcodebench into marianna
2 parents: 96aafc0 + 19ca466

6 files changed: +106 −807 lines

Docker/Evaluate.Dockerfile

Lines changed: 3 additions & 1 deletion
@@ -31,4 +31,6 @@ RUN chmod -R 777 /app
 
 USER bigcodebenchuser
 
-ENTRYPOINT ["python3", "-m", "bigcodebench.evaluate"]
+ENTRYPOINT ["python3", "-m", "bigcodebench.evaluate"]
+
+CMD ["sh", "-c", "pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi; rm -rf /tmp/*"]

README.md

Lines changed: 22 additions & 13 deletions
@@ -4,25 +4,30 @@
 </center>
 
 <p align="center">
+<a href="https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard"><img src="https://img.shields.io/badge/🤗&nbsp&nbsp%F0%9F%8F%86-leaderboard-%23ff8811"></a>
+<a href="https://huggingface.co/collections/bigcode/bigcodebench-666ed21a5039c618e608ab06"><img src="https://img.shields.io/badge/🤗-collection-pink"></a>
+<a href="https://bigcode-bench.github.io/"><img src="https://img.shields.io/badge/%F0%9F%8F%86-website-8A2BE2"></a>
+<a href="https://arxiv.org/abs/2406.15877"><img src="https://img.shields.io/badge/arXiv-2406.15877-b31b1b.svg"></a>
 <a href="https://pypi.org/project/bigcodebench/"><img src="https://img.shields.io/pypi/v/bigcodebench?color=g"></a>
+<a href="https://pepy.tech/project/bigcodebench"><img src="https://static.pepy.tech/badge/bigcodebench"></a>
+<a href="https://github.com/bigcodebench/bigcodebench/blob/master/LICENSE"><img src="https://img.shields.io/pypi/l/bigcodebench"></a>
 <a href="https://hub.docker.com/r/bigcodebench/bigcodebench-evaluate" title="Docker-Eval"><img src="https://img.shields.io/docker/image-size/bigcodebench/bigcodebench-evaluate"></a>
 <a href="https://hub.docker.com/r/bigcodebench/bigcodebench-generate" title="Docker-Gen"><img src="https://img.shields.io/docker/image-size/bigcodebench/bigcodebench-generate"></a>
-<a href="https://github.com/bigcodebench/bigcodebench/blob/master/LICENSE"><img src="https://img.shields.io/pypi/l/bigcodebench"></a>
 </p>
 
 <p align="center">
 <a href="#-about">🌸About</a> •
 <a href="#-quick-start">🔥Quick Start</a> •
-<a href="#-llm-generated-code">💻LLM code</a> •
-<a href="#-failure-inspection">🔍Failure inspection</a> •
+<a href="#-failure-inspection">🔍Failure Inspection</a> •
 <a href="#-full-script">🚀Full Script</a> •
 <a href="#-result-analysis">📊Result Analysis</a> •
-<a href="#-known-issues">🐞Known issues</a> •
+<a href="#-llm-generated-code">💻LLM-generated Code</a> •
+<a href="#-known-issues">🐞Known Issues</a> •
 <a href="#-citation">📜Citation</a> •
 <a href="#-acknowledgement">🙏Acknowledgement</a>
 </p>
 
-## About
+## 🌸 About
 
 ### BigCodeBench
 
@@ -249,6 +254,10 @@ Then, run the evaluation:
 bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl
 # ...If you really don't want to check the ground truths
 bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl --no-gt
+
+# You are strongly recommended to use the following command to clean up the environment after evaluation:
+pids=$(ps -u $(id -u) -o pid,comm | grep '^ *[0-9]\+ bigcodebench' | awk '{print $1}'); if [ -n "$pids" ]; then echo $pids | xargs -r kill; fi;
+rm -rf /tmp/*
 ```
 
 > [!Tip]
@@ -298,23 +307,23 @@ Here are some tips to speed up the evaluation:
 </div>
 </details>
 
-## Failure Inspection
+## 🔍 Failure Inspection
 
 You can inspect the failed samples by using the following command:
 
 ```bash
 bigcodebench.inspect --eval-results sample-sanitized-calibrated_eval_results.json --in-place
 ```
 
-## Full Script
+## 🚀 Full Script
 
 We provide a sample script to run the full pipeline:
 
 ```bash
 bash run.sh
 ```
 
-## Result Analysis
+## 📊 Result Analysis
 
 We provide a script to replicate the analysis like Elo Rating and Task Solve Rate, which helps you understand the performance of the models further.
 
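As a rough illustration of the Elo Rating mentioned in the Result Analysis section (not the implementation in `analysis/get_results.py`; the model names and outcomes below are made up), a pairwise Elo update over per-task wins could look like:

```python
from collections import defaultdict

def update_elo(rating_a, rating_b, score_a, k=32):
    """Standard Elo update: score_a is 1.0 if model A wins the task, 0.0 if it loses."""
    expected_a = 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400))
    delta = k * (score_a - expected_a)
    return rating_a + delta, rating_b - delta

# Made-up per-task pass/fail outcomes for two models (1 = pass, 0 = fail).
results = {
    "model_a": {"BigCodeBench/0": 1, "BigCodeBench/1": 0},
    "model_b": {"BigCodeBench/0": 0, "BigCodeBench/1": 0},
}

ratings = defaultdict(lambda: 1000.0)
for task_id in results["model_a"]:
    a, b = results["model_a"][task_id], results["model_b"][task_id]
    if a == b:  # both pass or both fail: no information for this pair, skip
        continue
    ratings["model_a"], ratings["model_b"] = update_elo(
        ratings["model_a"], ratings["model_b"], score_a=1.0 if a > b else 0.0
    )

print(dict(ratings))  # e.g. {'model_a': 1016.0, 'model_b': 984.0}
```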
@@ -331,7 +340,7 @@ python get_results.py
 We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard):
 * See the attachment of our [v0.1.5](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.1.5). We include both `sanitized_samples.zip` and `sanitized_samples_calibrated.zip` for your convenience.
 
-## Known Issues
+## 🐞 Known Issues
 
 - [ ] Due to the flakes in the evaluation, the execution results may vary slightly (~0.2%) between runs. We are working on improving the evaluation stability.
 
@@ -343,10 +352,10 @@ We share pre-generated code samples from LLMs we have [evaluated](https://huggin
 
 ```bibtex
 @article{zhuo2024bigcodebench,
-  title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions},
-  author={Terry Yue Zhuo and Minh Chien Vu and Jenny Chim and Han Hu and Wenhao Yu and Ratnadira Widyasari and Imam Nur Bani Yusuf and Haolan Zhan and Junda He and Indraneil Paul and Simon Brunner and Chen Gong and Thong Hoang and Armel Randy Zebaze and Xiaoheng Hong and Wen-Ding Li and Jean Kaddour and Ming Xu and Zhihan Zhang and Prateek Yadav and Naman Jain and Alex Gu and Zhoujun Cheng and Jiawei Liu and Qian Liu and Zijian Wang and David Lo and Binyuan Hui and Niklas Muennighoff and Daniel Fried and Xiaoning Du and Harm de Vries and Leandro Von Werra},
-  journal={arXiv preprint arXiv:2406.15877},
-  year={2024}
+  title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions},
+  author={Zhuo, Terry Yue and Vu, Minh Chien and Chim, Jenny and Hu, Han and Yu, Wenhao and Widyasari, Ratnadira and Yusuf, Imam Nur Bani and Zhan, Haolan and He, Junda and Paul, Indraneil and others},
+  journal={arXiv preprint arXiv:2406.15877},
+  year={2024}
 }
 ```

analysis/get_results.py

Lines changed: 13 additions & 6 deletions
@@ -144,6 +144,7 @@ def split_gen():
 
 def read_task_perf(task="complete"):
     model_results = dict()
+    result_files = []
     for model, info in model_info.items():
         if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]):
             continue
@@ -164,13 +165,14 @@ def read_task_perf(task="complete"):
         except:
             continue
 
+        result_files.append(file)
         with open(file, "r") as f:
             data = json.load(f)
         for task_id, perfs in data["eval"].items():
             status = 1 if perfs[0]["status"] == "pass" else 0
             task_perf[task_id] = status
         model_results[info["name"]] = task_perf
-    return model_results
+    return model_results, result_files
 
 
 def get_winner_df(data_dict, task, task_level=True, no_tie=True):
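For context on what `read_task_perf` expects: each per-model result file is a JSON document whose `"eval"` mapping goes from task ID to a list of attempts, and a task counts as solved when the first attempt's `status` is `"pass"`. A tiny made-up example of that shape (the contents are hypothetical):

```python
# Hypothetical contents of one per-model eval-results JSON file, in the shape
# read_task_perf consumes: "eval" maps task IDs to a list of attempts.
example = {
    "eval": {
        "BigCodeBench/0": [{"status": "pass"}],
        "BigCodeBench/1": [{"status": "fail"}],
    }
}

task_perf = {}
for task_id, perfs in example["eval"].items():
    # A task counts as solved when the first attempt passes.
    task_perf[task_id] = 1 if perfs[0]["status"] == "pass" else 0

print(task_perf)  # {'BigCodeBench/0': 1, 'BigCodeBench/1': 0}
```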
@@ -267,9 +269,6 @@ def get_solve_rate(data_dict, task="complete"):
         for task_id in range(1140):
             task_solve_count[f"BigCodeBench/{task_id}"].append(task_perf[f"BigCodeBench/{task_id}"])
     solve_rate = {task_id: round(np.mean(perfs) * 100, 1) for task_id, perfs in task_solve_count.items()}
-    with open(f"{task}_solve_rate.txt", "w") as f:
-        f.write(f"Number of unsolved tasks: {sum([1 for task_id, solve_rate in solve_rate.items() if solve_rate == 0])}\n")
-        f.write(f"Number of fully solved tasks: {sum([1 for task_id, solve_rate in solve_rate.items() if solve_rate == 100])}\n")
     return Dataset.from_dict({"task_id": list(solve_rate.keys()), "solve_rate": list(solve_rate.values())})
 
 
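The retained solve-rate line computes, per task, the percentage of evaluated models that pass it. A quick worked example with made-up outcomes:

```python
import numpy as np

# Hypothetical pass/fail outcomes for one task across four models (1 = pass, 0 = fail).
perfs = [1, 1, 1, 0]
print(round(np.mean(perfs) * 100, 1))  # 3 of 4 models pass -> 75.0
```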

@@ -313,8 +312,16 @@ def push_ds(ds, path, local=False):
 
 model_info = update_model_info(model_info)
 results = get_results()
-complete_data = read_task_perf("complete")
-instruct_data = read_task_perf("instruct")
+files = []
+complete_data, complete_files = read_task_perf("complete")
+instruct_data, instruct_files = read_task_perf("instruct")
+files.extend(complete_files)
+files.extend(instruct_files)
+shutil.rmtree("eval_results", ignore_errors=True)
+os.makedirs("eval_results", exist_ok=True)
+for file in files:
+    shutil.copy(file, "eval_results")
+
 complete_solve_rate = get_solve_rate(complete_data, task="complete")
 instruct_solve_rate = get_solve_rate(instruct_data, task="instruct")
 solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate})
