
Commit 8a4b402

fix: update v0.1.8
Update on v0.1.8
2 parents 5b67995 + a47680d commit 8a4b402

File tree: 6 files changed (+34 / -39 lines)


README.md

Lines changed: 15 additions & 10 deletions
````diff
@@ -104,18 +104,21 @@ pip install -U flash-attn
 To generate code samples from a model, you can use the following command:
 >
 ```bash
+# when greedy, there is no need for temperature and n_samples
 bigcodebench.generate \
     --model [model_name] \
-    --subset [complete|instruct] \
-    --greedy \
+    --split [complete|instruct] \
+    --subset [full|hard] \
+    [--greedy] \
     --bs [bs] \
     --temperature [temp] \
     --n_samples [n_samples] \
     --resume \
     --backend [vllm|hf|openai|mistral|anthropic|google] \
     --tp [gpu_number] \
     [--trust_remote_code] \
-    [--base_url [base_url]]
+    [--base_url [base_url]] \
+    [--tokenizer_name [tokenizer_name]]
 ```
 >
 The generated code samples will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples].jsonl`. Alternatively, you can use the following command to utilize our pre-built docker images for generating code samples:
@@ -124,7 +127,8 @@ The generated code samples will be stored in a file named `[model_name]--bigcode
 # If you are using GPUs
 docker run --gpus '"device=$CUDA_VISIBLE_DEVICES"' -v $(pwd):/app -t bigcodebench/bigcodebench-generate:latest \
     --model [model_name] \
-    --subset [complete|instruct] \
+    --split [complete|instruct] \
+    --subset [full|hard] \
     [--greedy] \
     --bs [bs] \
     --temperature [temp] \
@@ -136,7 +140,8 @@ docker run --gpus '"device=$CUDA_VISIBLE_DEVICES"' -v $(pwd):/app -t bigcodebenc
 # ...Or if you are using CPUs
 docker run -v $(pwd):/app -t bigcodebench/bigcodebench-generate:latest \
     --model [model_name] \
-    --subset [complete|instruct] \
+    --split [complete|instruct] \
+    --subset [full|hard] \
     [--greedy] \
     --bs [bs] \
     --temperature [temp] \
````
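A concrete invocation under the new flag layout might look like the following; the model name is only an illustrative placeholder, and every flag shown is taken from the command template above.

```bash
# Illustrative example only: the model name is a placeholder.
bigcodebench.generate \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --split instruct \
    --subset hard \
    --greedy \
    --resume \
    --backend vllm \
    --tp 1
# With --greedy, temperature, bs, and n_samples are forced to 0, 1, and 1,
# so they can be omitted (see the comment at the top of the template).
```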
````diff
@@ -233,10 +238,10 @@ You are strongly recommended to use a sandbox such as [docker](https://docs.dock
 # If you want to change the RAM address space limit (in MB, 128 GB by default): `--max-as-limit XXX`
 # If you want to change the RAM data segment limit (in MB, 4 GB by default): `--max-data-limit`
 # If you want to change the RAM stack limit (in MB, 4 MB by default): `--max-stack-limit`
-docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl
+docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl
 
 # If you only want to check the ground truths
-docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl --check-gt-only
+docker run -v $(pwd):/app bigcodebench/bigcodebench-evaluate:latest --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl --check-gt-only
 ```
 
 ...Or if you want to try it locally regardless of the risks ⚠️:
@@ -251,12 +256,12 @@ Then, run the evaluation:
 
 ```bash
 # ...Or locally ⚠️
-bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl
+bigcodebench.evaluate --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl
 # ...If you really don't want to check the ground truths
-bigcodebench.evaluate --subset [complete|instruct] --samples samples-sanitized-calibrated.jsonl --no-gt
+bigcodebench.evaluate --split [complete|instruct] --subset [full|hard] --samples samples-sanitized-calibrated.jsonl --no-gt
 
 # You are strongly recommended to use the following command to clean up the environment after evaluation:
-pids=$(ps -u $(id -u) -o pid,comm | grep '^ *[0-9]\+ bigcodebench' | awk '{print $1}'); if [ -n "$pids" ]; then echo $pids | xargs -r kill; fi;
+pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n "$pids" ]; then echo $pids | xargs -r kill; fi;
 rm -rf /tmp/*
 ```
````
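An illustrative local run of the updated evaluation flow, assuming the calibrated sample file is named `samples-sanitized-calibrated.jsonl` as in the commands above:

```bash
# Evaluate the hard subset of the complete split locally (illustrative values).
bigcodebench.evaluate --split complete --subset hard --samples samples-sanitized-calibrated.jsonl

# Then clean up stray bigcodebench processes and temporary files, as recommended above.
pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n "$pids" ]; then echo $pids | xargs -r kill; fi;
rm -rf /tmp/*
```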

analysis/utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -640,7 +640,7 @@
     },
     "deepseek-coder": {
         "name": "DeepSeek-Coder-V2-Instruct",
-        "link": "https://www.deepseek.com/",
+        "link": "https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Instruct",
         "prompted": True,
         "moe": True,
         "size": 236,
```

bigcodebench/generate.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -123,8 +123,7 @@ def main():
 
     args = parser.parse_args()
 
-    if args.greedy and (args.temperature != 0 or args.bs != 1 or args.n_samples != 1)\
-        or (args.temperature == 0 and args.n_samples == 1):
+    if args.greedy or (args.temperature == 0 and args.n_samples == 1):
         args.temperature = 0
         args.bs = 1
         args.n_samples = 1
```
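With the simplified condition, `--greedy` now forces the greedy settings unconditionally, and temperature 0 with a single sample still implies them. As a hedged illustration with a placeholder model name, these two invocations should resolve to the same configuration:

```bash
# Both commands end up with temperature=0, bs=1, n_samples=1 under the new logic.
bigcodebench.generate --model [model_name] --split complete --subset hard --backend vllm --greedy
bigcodebench.generate --model [model_name] --split complete --subset hard --backend vllm --temperature 0 --n_samples 1
```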

bigcodebench/model.py

Lines changed: 5 additions & 16 deletions
```diff
@@ -26,7 +26,6 @@
     warn("GoogleGenAI decoder will not work. Fix by `pip install google-generativeai`")
 
 import torch
-from stop_sequencer import StopSequencer
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 try:
@@ -137,7 +136,8 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None:
         self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, **kwargs)
         if self.tokenizer.chat_template is None:
             self.eos += extra_eos_for_direct_completion(dataset)
-        self.llm = LLM(model=name, max_model_len=2048, tokenizer=self.tokenizer_name, **kwargs)
+        self.llm = LLM(model=name, max_model_len=2048, **kwargs)
+        self.llm.set_tokenizer(tokenizer=self.tokenizer)
 
     def is_direct_completion(self) -> bool:
         return self.tokenizer.chat_template is None
@@ -190,11 +190,11 @@ def __init__(self, name: str, dataset: str, **kwargs):
         self.skip_special_tokens = True
 
         print(f"{kwargs = }", self.tokenizer_name)
-
         if self.tokenizer_name is None:
             self.tokenizer_name = self.name
-
+
         self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, **kwargs)
+
         if self.tokenizer.chat_template is None:
             self.eos += extra_eos_for_direct_completion(dataset)
 
@@ -220,18 +220,7 @@ def codegen(
             kwargs["top_p"] = 0.95
             kwargs["temperature"] = self.temperature
 
-        stop_sequencer = StopSequencer(
-            self.model,
-            model_type="causal",  # or seq2seq
-            tokenizer=self.tokenizer,
-        )
-
-        model = stop_sequencer.register_stop_texts(
-            stop_texts=self.eos,
-            input_length=input_tokens.size(-1),
-        )
-
-        outputs = model.generate(
+        outputs = self.model.generate(
             input_tokens,
             max_new_tokens=self.max_new_tokens,
             do_sample=do_sample,
```

run.sh

Lines changed: 12 additions & 9 deletions
```diff
@@ -5,7 +5,8 @@ BACKEND=openai
 TEMP=0
 N_SAMPLES=1
 NUM_GPU=1
-SUBSET=instruct
+SPLIT=complete
+SUBSET=hard
 if [[ $MODEL == *"/"* ]]; then
     ORG=$(echo $MODEL | cut -d'/' -f1)--
     BASE_MODEL=$(echo $MODEL | cut -d'/' -f2)
@@ -14,24 +15,26 @@ else
     BASE_MODEL=$MODEL
 fi
 
-FILE_HEADER=$ORG$BASE_MODEL--$DATASET-$SUBSET--$BACKEND-$TEMP-$N_SAMPLES
+if [ "$SUBSET" = "full" ]; then
+    FILE_HEADER="${ORG}${BASE_MODEL}--${DATASET}-${SPLIT}--${BACKEND}-${TEMP}-${N_SAMPLES}"
+else
+    FILE_HEADER="${ORG}${BASE_MODEL}--${DATASET}-${SUBSET}-${SPLIT}--${BACKEND}-${TEMP}-${N_SAMPLES}"
+fi
 
 echo $FILE_HEADER
 bigcodebench.generate \
-    --id_range 0 1 \
     --tp $NUM_GPU \
     --model $MODEL \
-    --bs $BS \
-    --temperature $TEMP \
-    --n_samples $N_SAMPLES \
     --resume \
+    --split $SPLIT \
     --subset $SUBSET \
-    --backend $BACKEND
+    --backend $BACKEND \
+    --greedy
 
 bigcodebench.sanitize --samples $FILE_HEADER.jsonl --calibrate
 
 # Check if the ground truth works on your machine
-bigcodebench.evaluate --subset $SUBSET --samples $FILE_HEADER-sanitized-calibrated.jsonl
+bigcodebench.evaluate --split $SPLIT --subset $SUBSET --samples $FILE_HEADER-sanitized-calibrated.jsonl
 
 # If the execution is slow:
-bigcodebench.evaluate --subset $SUBSET --samples $FILE_HEADER-sanitized-calibrated.jsonl --parallel 32
+bigcodebench.evaluate --split $SPLIT --subset $SUBSET --samples $FILE_HEADER-sanitized-calibrated.jsonl --parallel 32
```

setup.cfg

Lines changed: 0 additions & 1 deletion
```diff
@@ -37,7 +37,6 @@ generate =
     anthropic>=0.26.1
     google-generativeai>=0.5.4
     mistralai>=0.2.0
-    stop-sequencer>=1.2.3
     openai>=1.11.1
 
 [options.entry_points]
```
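With `stop-sequencer` removed from the `generate` extra, reinstalling that extra no longer pulls it in. A hedged example, assuming the distribution is published as `bigcodebench`:

```bash
# Reinstall the generate extra without the dropped stop-sequencer dependency.
pip install -U "bigcodebench[generate]"
```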
