Commit 4b7e3af

Merge branch 'main' into hard
2 parents: bbc070e + 5b67995

File tree: 3 files changed, +41 −4 lines

bigcodebench/evaluate.py

Lines changed: 26 additions & 0 deletions
@@ -281,6 +281,32 @@ def stucking_checker():
     if not os.path.isfile(result_path):
         with open(result_path, "w") as f:
             json.dump(results, f, indent=2)
+
+    pass_at_k_path = result_path.replace("_eval_results.json", "_pass_at_k.json")
+    pass_at_k["model"] = flags.samples.split("/")[-1].replace(".jsonl", "")
+    pass_at_k["subset"] = flags.subset
+
+    def save_pass_at_k():
+        with open(pass_at_k_path, "w") as f:
+            json.dump(pass_at_k, f, indent=2)
+
+    if os.path.isfile(pass_at_k_path):
+        saved_pass_at_k = json.load(open(pass_at_k_path, "r"))
+        # compare saved_pass_at_k with pass_at_k
+        for k in saved_pass_at_k.keys():
+            if pass_at_k[k] != saved_pass_at_k[k]:
+                cprint(f"Warning: {k} is different from the saved one", "yellow")
+
+        # ask user whether to save the pass@k
+        decision = ""
+        while decision.lower() not in ["y", "n"]:
+            print(f"Save pass@k to {pass_at_k_path}? [Y/N]")
+            decision = input()
+        if decision.lower() == "y":
+            save_pass_at_k()
+
+    else:
+        save_pass_at_k()
 
 
 def main():
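Taken together, this hunk persists the pass@k scores next to the evaluation results and, when a previous pass@k file already exists, warns about any changed values and asks before overwriting. Below is a minimal standalone sketch of that save-or-compare flow; the file name and the contents of pass_at_k are illustrative stand-ins, and .get() is used defensively where the diff indexes directly.

import json
import os

# Illustrative values; in evaluate.py these come from the evaluation run.
pass_at_k = {"model": "example-model", "subset": "full", "pass@1": 0.42}
pass_at_k_path = "example_pass_at_k.json"  # hypothetical path

def save_pass_at_k():
    with open(pass_at_k_path, "w") as f:
        json.dump(pass_at_k, f, indent=2)

if os.path.isfile(pass_at_k_path):
    with open(pass_at_k_path, "r") as f:
        saved_pass_at_k = json.load(f)
    # Warn about any key whose value drifted since the last saved run.
    for k in saved_pass_at_k:
        if pass_at_k.get(k) != saved_pass_at_k[k]:
            print(f"Warning: {k} is different from the saved one")
    # Ask before overwriting the previously saved scores.
    decision = ""
    while decision.lower() not in ["y", "n"]:
        decision = input(f"Save pass@k to {pass_at_k_path}? [Y/N] ")
    if decision.lower() == "y":
        save_pass_at_k()
else:
    save_pass_at_k()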

bigcodebench/generate.py

Lines changed: 6 additions & 3 deletions
@@ -36,7 +36,7 @@ def codegen(
 
     if model.is_direct_completion() and split == "instruct":
         raise Exception("Base model does not support direct completion for instruct tasks")
-
+
     # create save_path if it doesn't exist, e.g., a/b.jsonl
     dirname = os.path.dirname(save_path)
     if not os.path.exists(dirname) and dirname != "":
@@ -119,6 +119,8 @@ def main():
     parser.add_argument("--base_url", default=None, type=str)
     parser.add_argument("--tp", default=1, type=int)
     parser.add_argument("--trust_remote_code", action="store_true")
+    parser.add_argument("--tokenizer_name", default=None, type=str)
+
     args = parser.parse_args()
 
     if args.greedy and (args.temperature != 0 or args.bs != 1 or args.n_samples != 1)\
@@ -142,7 +144,8 @@
         temperature=args.temperature,
         base_url=args.base_url,
         tp=args.tp,
-        trust_remote_code=args.trust_remote_code
+        trust_remote_code=args.trust_remote_code,
+        tokenizer_name=args.tokenizer_name
     )
 
     extra = "-" + args.subset if args.subset != "full" else ""
@@ -160,7 +163,7 @@
         strip_newlines=args.strip_newlines,
         n_samples=args.n_samples,
         resume=args.resume,
-        id_range=args.id_range,
+        id_range=args.id_range
     )
 
 
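The net effect is a new optional --tokenizer_name flag forwarded into make_model (the added trailing comma on trust_remote_code and the dropped one on id_range are cosmetic). A small sketch of the flag's intended fallback semantics, with hypothetical model and tokenizer ids:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True, type=str)
parser.add_argument("--tokenizer_name", default=None, type=str)

# Hypothetical invocation: a fine-tuned checkpoint reusing its base tokenizer.
args = parser.parse_args(
    ["--model", "my-org/my-finetune", "--tokenizer_name", "my-org/base-model"]
)

# None means "fall back to the model name", mirroring bigcodebench/model.py below.
tokenizer_name = args.tokenizer_name or args.model
print(tokenizer_name)  # my-org/base-model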

bigcodebench/model.py

Lines changed: 9 additions & 1 deletion
@@ -91,6 +91,7 @@ def __init__(
         max_new_tokens: int = 1280,
         dtype: str = "bfloat16", # default
         trust_remote_code: bool = False,
+        tokenizer_name: str = None,
     ) -> None:
         print("Initializing a decoder model: {} ...".format(name))
         self.name = name
@@ -101,6 +102,7 @@ def __init__(
         self.max_new_tokens = max_new_tokens
         self.dtype = dtype
         self.trust_remote_code = trust_remote_code
+        self.tokenizer_name = tokenizer_name
 
     @abstractmethod
     def codegen(
@@ -185,7 +187,10 @@ def __init__(self, name: str, dataset: str, **kwargs):
         kwargs["torch_dtype"] = getattr(torch, self.dtype)
         self.skip_special_tokens = True
 
-        print(f"{kwargs = }")
+        print(f"{kwargs = }", self.tokenizer_name)
+
+        if self.tokenizer_name is None:
+            self.tokenizer_name = self.name
 
         self.tokenizer = AutoTokenizer.from_pretrained(name, legacy=False, **kwargs)
         if self.tokenizer.chat_template is None:
@@ -475,6 +480,7 @@ def make_model(
     tp=1,
     base_url=None,
     trust_remote_code=False,
+    tokenizer_name=None,
 ):
     if backend == "vllm":
         return GeneralVllmDecoder(
@@ -484,6 +490,7 @@
             dataset=dataset,
             tp=tp,
             trust_remote_code=trust_remote_code,
+            tokenizer_name=tokenizer_name,
         )
     elif backend == "hf":
         return GenenralHfTorchDecoder(
@@ -492,6 +499,7 @@
             temperature=temperature,
             dataset=dataset,
             trust_remote_code=trust_remote_code,
+            tokenizer_name=tokenizer_name,
         )
     elif backend == "openai":
         return OpenAIChatDecoder(
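These hunks thread tokenizer_name from make_model into the decoder constructors and default it to the model name when unset, so a checkpoint can be paired with a different tokenizer. Note that the displayed context line still loads from name; the switch to self.tokenizer_name presumably lands outside this hunk. A minimal sketch of the fallback pattern, assuming the transformers package and hypothetical model ids:

from transformers import AutoTokenizer

def load_tokenizer(name: str, tokenizer_name: str = None):
    # Same fallback as the diff: no override means "use the model's own tokenizer".
    if tokenizer_name is None:
        tokenizer_name = name
    return AutoTokenizer.from_pretrained(tokenizer_name, legacy=False)

# Hypothetical usage: a fine-tune that kept its base model's tokenizer.
# tok = load_tokenizer("my-org/my-finetune", tokenizer_name="my-org/base-model")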
