Commit d69caeb

increased dataset size to 3500 and optimized for training on T4 GPU
1 parent: f0d433a

File tree

3 files changed: +16 -6 lines changed

data/subreddit_size_map.json
src/korea_travel_guide/data.py
src/korea_travel_guide/train.py

data/subreddit_size_map.json

Lines changed: 6 additions & 2 deletions

@@ -1,4 +1,8 @@
 {
-    "askscience": 500,
-    "AskHistorians": 500
+    "ExplainLikeImFive": 500,
+    "AskPhysics": 500,
+    "AskSocialScience": 500,
+    "AskDocs": 500,
+    "askscience": 1500,
+    "AskHistorians": 1500
 }
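
The updated map caps the four new subreddits at 500 posts each and raises askscience and AskHistorians to 1,500 each, matching the 3,500-example target in the commit message. A minimal sanity-check sketch (assuming the file is read as plain JSON; not code that exists in the repo):

import json

# A sketch: verify the per-subreddit caps sum to the 3,500 examples
# mentioned in the commit message.
with open("data/subreddit_size_map.json") as f:
    size_map = json.load(f)

print(sum(size_map.values()))  # expected: 3500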

src/korea_travel_guide/data.py

Lines changed: 8 additions & 3 deletions

@@ -192,22 +192,27 @@ def split_and_save(df, out_dir: Union[str, Path]):
 def tokenize_and_format(
     ds: DatasetDict,
     checkpoint: str = "facebook/bart-base",
-    max_input_length: int = 1024,
-    max_target_length: int = 1024,
+    max_input_length: int = 224,  # max 1024
+    max_target_length: int = 800,  # max 1024
 ) -> Tuple[DatasetDict, AutoTokenizer]:
     tok = AutoTokenizer.from_pretrained(checkpoint)
 
     def _preprocess_batch(examples):
         # tokenize inputs
+        tok.truncation_side = "right"
         model_inputs = tok(
-            examples["question"], max_length=max_input_length, truncation=True
+            examples["question"],
+            max_length=max_input_length,
+            truncation=True,
         )
         # tokenize targets in “target” mode
+        tok.truncation_side = "left"
         labels = tok(
             text_target=examples["answer"],
             max_length=max_target_length,
             truncation=True,
         )
+        tok.truncation_side = "right"  # reset for safety
 
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
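
With the tighter max_input_length, the truncation_side switch controls which end of an over-long text survives: questions are truncated on the right (keeping their beginning) and answers on the left (keeping their ending). A standalone sketch of that behavior with the same facebook/bart-base tokenizer (illustrative only, not taken from the repo):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/bart-base")
text = " ".join(f"w{i}" for i in range(50))  # deliberately over-long toy input

# "right" truncation (used for questions): drop the tail, keep the start.
tok.truncation_side = "right"
kept_start = tok(text, max_length=10, truncation=True)

# "left" truncation (used for answers): drop the head, keep the end.
tok.truncation_side = "left"
kept_end = tok(text, max_length=10, truncation=True)

print(tok.decode(kept_start["input_ids"]))  # begins with w0 w1 w2 ...
print(tok.decode(kept_end["input_ids"]))    # ends with ... w48 w49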

src/korea_travel_guide/train.py

Lines changed: 2 additions & 1 deletion

@@ -51,12 +51,12 @@ class CustomTrainingArgs(Seq2SeqTrainingArguments):
     lr_scheduler_type: str = "linear"
     warmup_ratio: float = 0.05
     num_train_epochs: int = 3
-
     per_device_train_batch_size: int = 8
     per_device_eval_batch_size: int = 16
     max_grad_norm: float = 0.5
     label_smoothing_factor: float = 0.1
     weight_decay: float = 0.01
+    generation_max_length: int = 384
     save_total_limit: int = 2
     fp16: bool = True
     predict_with_generate: bool = True

@@ -195,6 +195,7 @@ def main() -> None:
         model=lora_model,
         padding="longest",  # or "max_length"
         label_pad_token_id=-100,
+        pad_to_multiple_of=8,  # tensor-core friendly
     )
 
     # initialize trainer & train
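
Both additions target T4-friendly shapes: generation_max_length bounds eval-time generation, and pad_to_multiple_of=8 rounds each padded batch dimension up to a multiple of 8 so fp16 tensor cores are used efficiently. A rough sketch of the collator's effect (the repo passes its LoRA-wrapped model as model=lora_model; a plain BART model is used here only to keep the example self-contained):

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
)

tok = AutoTokenizer.from_pretrained("facebook/bart-base")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

collator = DataCollatorForSeq2Seq(
    tok,
    model=model,               # the repo uses the LoRA-wrapped model here
    padding="longest",
    label_pad_token_id=-100,
    pad_to_multiple_of=8,      # tensor-core friendly on fp16 / T4
)

features = [
    {"input_ids": tok("short question")["input_ids"],
     "labels": tok("a short answer")["input_ids"]},
    {"input_ids": tok("a noticeably longer question that needs more tokens")["input_ids"],
     "labels": tok("a noticeably longer answer that needs more tokens")["input_ids"]},
]

batch = collator(features)
print(batch["input_ids"].shape)  # sequence length is rounded up to a multiple of 8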
