From c8273461529282e375ae11c35866617061438687 Mon Sep 17 00:00:00 2001
From: Minh Duc Nguyen <37109868+mducducd@users.noreply.github.com>
Date: Thu, 4 Dec 2025 14:19:07 +0100
Subject: [PATCH 1/3] Enhance logging for patient data completeness

Updated logging to include total patients and final usable patients.
---
 src/stamp/modeling/data.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/stamp/modeling/data.py b/src/stamp/modeling/data.py
index f5d20fe..ae1eb9f 100755
--- a/src/stamp/modeling/data.py
+++ b/src/stamp/modeling/data.py
@@ -814,8 +814,9 @@ def filter_complete_patient_data_(
     }
 
     _logger.info(
-        f"Kept {len(patient_to_ground_truth)}/{len(patient_to_ground_truth)} \
-        patients with complete data ({len(patient_to_ground_truth) / len(patient_to_ground_truth):.1%})."
+        f"Total patients in clinical table: {total_clini}\n"
+        f"Patients appearing in slide table: {total_slides}\n"
+        f"Final usable patients (complete data): {final_patients}\n"
     )
     return patients
 

From 45bb2a2ccaf7a8a3d63a4858072a219d9361ec3e Mon Sep 17 00:00:00 2001
From: Minh Duc Nguyen <37109868+mducducd@users.noreply.github.com>
Date: Thu, 4 Dec 2025 14:21:06 +0100
Subject: [PATCH 2/3] Refactor survival status handling and cleanup code

---
 src/stamp/modeling/data.py | 97 ++++++++------------------------------
 1 file changed, 20 insertions(+), 77 deletions(-)

diff --git a/src/stamp/modeling/data.py b/src/stamp/modeling/data.py
index ae1eb9f..c9859ee 100755
--- a/src/stamp/modeling/data.py
+++ b/src/stamp/modeling/data.py
@@ -281,8 +281,9 @@ def create_dataloader(
             for p in patient_data:
                 t, e = (p.ground_truth or "nan nan").split(" ", 1)
                 times.append(float(t) if t.lower() != "nan" else np.nan)
-                events.append(_parse_survival_status(e))
-
+                events.append(
+                    1.0 if e.lower() in {"dead", "event", "1", "Yes", "yes"} else 0.0
+                )
             labels = torch.tensor(np.column_stack([times, events]), dtype=torch.float32)
         else:
             raise ValueError(f"Unsupported task: {task}")
@@ -446,9 +447,12 @@ def __getitem__(
         coords_um = []
         for bag_file in self.bags[index]:
             with h5py.File(bag_file, "r") as h5:
-                feats.append(
-                    torch.from_numpy(h5["feats"][:])  # pyright: ignore[reportIndexIssue]
-                )
+                if "feats" in h5:
+                    arr = h5["feats"][:]  # pyright: ignore[reportIndexIssue] # original STAMP files
+                else:
+                    arr = h5["patch_embeddings"][:]  # type: ignore # your Kronos files
+
+                feats.append(torch.from_numpy(arr))
                 coords_um.append(torch.from_numpy(get_coords(h5).coords_um))
 
         feats = torch.concat(feats).float()
@@ -667,26 +671,7 @@ def patient_to_survival_from_clini_table_(
 
     # normalize values
     clini_df[time_label] = clini_df[time_label].replace(
-        [
-            "NA",
-            "NaN",
-            "nan",
-            "None",
-            "none",
-            "N/A",
-            "n/a",
-            "NULL",
-            "null",
-            "",
-            " ",
-            "?",
-            "-",
-            "--",
-            "#N/A",
-            "#NA",
-            "=#VALUE!",
-        ],
-        np.nan,
+        ["NA", "NaN", "nan", "", "=#VALUE!"], np.nan
     )
     clini_df[status_label] = clini_df[status_label].str.strip().str.lower()
 
@@ -704,10 +689,13 @@ def patient_to_survival_from_clini_table_(
             continue
 
         # Encode status: keep both dead (event=1) and alive (event=0)
-        status = _parse_survival_status(status_str)
-
-        # Encode back to "alive"/"dead" like before
-        # status = "dead" if status_val == 1 else "alive"
+        if status_str in {"dead", "event", "1"}:
+            status = "dead"
+        elif status_str in {"alive", "censored", "0"}:
+            status = "alive"
+        else:
+            # skip unknown status
+            continue
 
         patient_to_ground_truth[pid] = f"{time_str} {status}"
 
@@ -812,6 +800,9 @@ def filter_complete_patient_data_(
             }
         )
     }
+    total_clini = len(patient_to_ground_truth)
+    total_slides = len(patient_to_slides)
+    final_patients = len(patients)
 
     _logger.info(
         f"Total patients in clinical table: {total_clini}\n"
@@ -865,51 +856,3 @@ def get_stride(coords: Float[Tensor, "tile 2"]) -> float:
         ),
     )
     return stride
-
-
-def _parse_survival_status(value) -> int | None:
-    """
-    Parse a survival status value (string, numeric, or None) into a binary indicator.
-    Currently assume no None inputs.
-    Returns:
-        1 -> event/dead
-        0 -> censored/alive
-        None -> missing (None, NaN, '')
-
-    Raises:
-        ValueError if the input is non-missing but unrecognized.
-
-    Examples:
-        'dead', '1', 'event', 'yes'  -> 1
-        'alive', '0', 'censored', 'no' -> 0
-        None, NaN, '' -> None
-    """
-
-    # Handle missing inputs gracefully
-    # if value is None:
-    #     return 0  # treat empty/missing as censored
-    # if isinstance(value, float) and math.isnan(value):
-    #     return 0  # treat empty/missing as censored
-
-    s = str(value).strip().lower()
-    # if s in {"", "nan", "none"}:
-    #     return 0  # treat empty/missing as censored
-
-    # Known mappings
-    positives = {"1", "event", "dead", "deceased", "yes", "y", "True", "true"}
-    negatives = {"0", "alive", "censored", "no", "false"}
-
-    if s in positives:
-        return 1
-    elif s in negatives:
-        return 0
-
-    # Try numeric fallback
-    try:
-        f = float(s)
-        return 1 if f > 0 else 0
-    except ValueError:
-        raise ValueError(
-            f"Unrecognized survival status: '{value}'. "
-            f"Expected one of {sorted(positives | negatives)} or a numeric value."
-        )

From a2847ae95454e5a82a20438bbd78baa0ca6d66f5 Mon Sep 17 00:00:00 2001
From: Minh Duc Nguyen <37109868+mducducd@users.noreply.github.com>
Date: Thu, 4 Dec 2025 14:29:41 +0100
Subject: [PATCH 3/3] Refactor and data cleaning

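Restore `_parse_survival_status` and the extended missing-value list, and drop the `patch_embeddings` fallback added in the previous patch.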
---
 src/stamp/modeling/data.py | 95 +++++++++++++++++++++++++++++++-------
 1 file changed, 78 insertions(+), 17 deletions(-)

diff --git a/src/stamp/modeling/data.py b/src/stamp/modeling/data.py
index c9859ee..74eb812 100755
--- a/src/stamp/modeling/data.py
+++ b/src/stamp/modeling/data.py
@@ -281,9 +281,8 @@ def create_dataloader(
             for p in patient_data:
                 t, e = (p.ground_truth or "nan nan").split(" ", 1)
                 times.append(float(t) if t.lower() != "nan" else np.nan)
-                events.append(
-                    1.0 if e.lower() in {"dead", "event", "1", "Yes", "yes"} else 0.0
-                )
+                events.append(_parse_survival_status(e))
+
             labels = torch.tensor(np.column_stack([times, events]), dtype=torch.float32)
         else:
             raise ValueError(f"Unsupported task: {task}")
@@ -447,12 +446,9 @@ def __getitem__(
         coords_um = []
         for bag_file in self.bags[index]:
             with h5py.File(bag_file, "r") as h5:
-                if "feats" in h5:
-                    arr = h5["feats"][:]  # pyright: ignore[reportIndexIssue] # original STAMP files
-                else:
-                    arr = h5["patch_embeddings"][:]  # type: ignore # your Kronos files
-
-                feats.append(torch.from_numpy(arr))
+                feats.append(
+                    torch.from_numpy(h5["feats"][:])  # pyright: ignore[reportIndexIssue]
+                )
                 coords_um.append(torch.from_numpy(get_coords(h5).coords_um))
 
         feats = torch.concat(feats).float()
@@ -671,7 +667,26 @@ def patient_to_survival_from_clini_table_(
 
     # normalize values
     clini_df[time_label] = clini_df[time_label].replace(
-        ["NA", "NaN", "nan", "", "=#VALUE!"], np.nan
+        [
+            "NA",
+            "NaN",
+            "nan",
+            "None",
+            "none",
+            "N/A",
+            "n/a",
+            "NULL",
+            "null",
+            "",
+            " ",
+            "?",
+            "-",
+            "--",
+            "#N/A",
+            "#NA",
+            "=#VALUE!",
+        ],
+        np.nan,
     )
     clini_df[status_label] = clini_df[status_label].str.strip().str.lower()
 
@@ -689,13 +704,10 @@ def patient_to_survival_from_clini_table_(
             continue
 
         # Encode status: keep both dead (event=1) and alive (event=0)
-        if status_str in {"dead", "event", "1"}:
-            status = "dead"
-        elif status_str in {"alive", "censored", "0"}:
-            status = "alive"
-        else:
-            # skip unknown status
-            continue
+        status = _parse_survival_status(status_str)
+
+        # The numeric status (1/0) is written as-is; the old "alive"/"dead" encoding is disabled:
+        # status = "dead" if status_val == 1 else "alive"
 
         patient_to_ground_truth[pid] = f"{time_str} {status}"
 
@@ -800,6 +812,7 @@ def filter_complete_patient_data_(
             }
         )
     }
+
     total_clini = len(patient_to_ground_truth)
     total_slides = len(patient_to_slides)
     final_patients = len(patients)
@@ -856,3 +869,51 @@ def get_stride(coords: Float[Tensor, "tile 2"]) -> float:
         ),
     )
     return stride
+
+
+def _parse_survival_status(value) -> int | None:
+    """
+    Parse a survival status value (string or numeric) into a binary indicator.
+    Missing values are not special-cased: the input is always passed through `str()`.
+    Returns:
+        1 -> event/dead, or any numeric string greater than 0
+        0 -> censored/alive, or any other numeric string (including 'nan')
+        None is never returned, despite the `int | None` annotation.
+
+    Raises:
+        ValueError if the text is neither a known label nor parseable by `float()`.
+
+    Examples:
+        'dead', '1', 'event', 'yes'  -> 1
+        'alive', '0', 'censored', 'no' -> 0
+        NaN -> 0; None, '' -> ValueError
+    """
+
+    # Handle missing inputs gracefully
+    # if value is None:
+    #     return 0  # treat empty/missing as censored
+    # if isinstance(value, float) and math.isnan(value):
+    #     return 0  # treat empty/missing as censored
+
+    s = str(value).strip().lower()
+    # if s in {"", "nan", "none"}:
+    #     return 0  # treat empty/missing as censored
+
+    # Known mappings
+    positives = {"1", "event", "dead", "deceased", "yes", "y", "True", "true"}
+    negatives = {"0", "alive", "censored", "no", "false"}
+
+    if s in positives:
+        return 1
+    elif s in negatives:
+        return 0
+
+    # Try numeric fallback
+    try:
+        f = float(s)
+        return 1 if f > 0 else 0
+    except ValueError:
+        raise ValueError(
+            f"Unrecognized survival status: '{value}'. "
+            f"Expected one of {sorted(positives | negatives)} or a numeric value."
+        )