@@ -46,16 +46,16 @@ class CustomTrainingArgs(Seq2SeqTrainingArguments):
     )
     eval_strategy: str = "epoch"
     save_strategy: str = "epoch"
-    logging_steps: int = 5
+    logging_steps: int = 50
     learning_rate: float = 1e-4
     lr_scheduler_type: str = "linear"
     warmup_ratio: float = 0.05
-    num_train_epochs: int = 3
-    per_device_train_batch_size: int = 8
-    per_device_eval_batch_size: int = 16
+    num_train_epochs: int = 8
+    per_device_train_batch_size: int = 16
+    per_device_eval_batch_size: int = 32
     max_grad_norm: float = 0.5
     label_smoothing_factor: float = 0.1
-    weight_decay: float = 0.01
+    # weight_decay: float = 0.01
     generation_max_length: int = 384
     save_total_limit: int = 2
     fp16: bool = True
@@ -76,8 +76,8 @@ class CustomTrainingArgs(Seq2SeqTrainingArguments):
         default=False,
         metadata={"help": "If True, run the test-split evaluation after training."},
     )
-    use_flash_attention: bool = field(
-        default=True, metadata={"help": "Whether to enable Flash Attention v1."}
+    use_sdpa_attention: bool = field(
+        default=True, metadata={"help": "Enable SDPA for the memory-efficient attention kernel."}
     )


@@ -143,6 +143,8 @@ def main() -> None:

     # initialize base model and LoRA
     base_model = build_base_model()
+    if training_args.use_sdpa_attention:
+        base_model.config.attn_implementation = "sdpa"
     logger.info(
         f"Base model trainable params:\n{print_trainable_parameters(base_model)}"
     )
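
Note: the hunk above selects SDPA by mutating `base_model.config.attn_implementation` after the model is built. In recent `transformers` releases the same backend can also be requested at load time via the `attn_implementation` argument of `from_pretrained`. A minimal sketch, assuming `build_base_model()` wraps `AutoModelForSeq2SeqLM.from_pretrained`; the helper signature and model name below are illustrative, not taken from this repo:

```python
# Sketch only: helper signature and model name are illustrative assumptions.
from transformers import AutoModelForSeq2SeqLM

def build_base_model(model_name: str = "google/flan-t5-base"):
    # Request the PyTorch SDPA backend when the weights are loaded,
    # instead of flipping the config flag afterwards.
    return AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        attn_implementation="sdpa",
    )
```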
@@ -153,42 +155,7 @@ def main() -> None:
         f"LoRA model (peft_rank={training_args.peft_rank}, lora_alpha={training_args.lora_alpha}) trainable params:\n{print_trainable_parameters(lora_model)}"
     )

-    # from torch.utils.data import DataLoader
-
-    # data_collator = DataCollatorForSeq2Seq(
-    #     tok,
-    #     model=lora_model,
-    #     padding="longest",
-    #     label_pad_token_id=-100,
-    # )
-
-    # batch = next(iter(DataLoader(ds_tok["train"], batch_size=2, collate_fn=data_collator)))
-    # # 1) decode inputs normally
-    # print("INPUTS:")
-    # print(tok.batch_decode(batch["input_ids"], skip_special_tokens=True))
-
-    # # 2) map -100 → pad_token_id before decoding labels
-    # labels = batch["labels"].detach().cpu().numpy()
-    # labels = np.where(labels != -100, labels, tok.pad_token_id)
-
-    # print("LABELS:")
-    # print(tok.batch_decode(labels, skip_special_tokens=True))
-
-    # import sys
-    # sys.exit()
-
     # ---------- Train ----------
-    # toggle flash attention
-    if training_args.use_flash_attention:
-        logger.info("Using flash attention")
-        ctx = torch.backends.cuda.sdp_kernel(
-            enable_flash=True, enable_math=True, enable_mem_efficient=True
-        )
-    else:
-        logger.info("Skipping flash attention")
-        ctx = nullcontext()
-    # ctx = nullcontext()
-
     # data collator: dynamic padding per batch
     data_collator = DataCollatorForSeq2Seq(
         tok,
@@ -209,15 +176,13 @@ def main() -> None:
         compute_metrics=build_compute_metrics(tok),
     )

-    with ctx:
-        trainer.train()
+    trainer.train()

     # ---------- Save, Test or Push ----------
     # evaluate test
     if training_args.run_test:
         logger.info("Running final test-set evaluation...")
-        with ctx:
-            metrics = trainer.evaluate(ds_tok["test"])
+        metrics = trainer.evaluate(ds_tok["test"])
         logger.info(f"Test metrics:\n{metrics}")
     else:
         logger.info("Skipping test evaluation.")
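
The `torch.backends.cuda.sdp_kernel(...)` context manager deleted above is deprecated in recent PyTorch releases. If per-call backend selection is ever needed again, the replacement is `torch.nn.attention.sdpa_kernel`. A minimal sketch reusing the script's `trainer` object, shown only for reference and not part of this change:

```python
# Sketch only: the newer PyTorch (>= 2.3) API for constraining SDPA backends;
# torch.backends.cuda.sdp_kernel is deprecated in its favor.
from torch.nn.attention import SDPBackend, sdpa_kernel

with sdpa_kernel([SDPBackend.FLASH_ATTENTION, SDPBackend.EFFICIENT_ATTENTION]):
    trainer.train()
```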