From c8273461529282e375ae11c35866617061438687 Mon Sep 17 00:00:00 2001 From: Minh Duc Nguyen <37109868+mducducd@users.noreply.github.com> Date: Thu, 4 Dec 2025 14:19:07 +0100 Subject: [PATCH 1/3] Enhance logging for patient data completeness Updated logging to include total patients and final usable patients. --- src/stamp/modeling/data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/stamp/modeling/data.py b/src/stamp/modeling/data.py index f5d20fe..ae1eb9f 100755 --- a/src/stamp/modeling/data.py +++ b/src/stamp/modeling/data.py @@ -814,8 +814,9 @@ def filter_complete_patient_data_( } _logger.info( - f"Kept {len(patient_to_ground_truth)}/{len(patient_to_ground_truth)} \ - patients with complete data ({len(patient_to_ground_truth) / len(patient_to_ground_truth):.1%})." + f"Total patients in clinical table: {total_clini}\n" + f"Patients appearing in slide table: {total_slides}\n" + f"Final usable patients (complete data): {final_patients}\n" ) return patients From 45bb2a2ccaf7a8a3d63a4858072a219d9361ec3e Mon Sep 17 00:00:00 2001 From: Minh Duc Nguyen <37109868+mducducd@users.noreply.github.com> Date: Thu, 4 Dec 2025 14:21:06 +0100 Subject: [PATCH 2/3] Refactor survival status handling and cleanup code --- src/stamp/modeling/data.py | 97 ++++++++------------------------------ 1 file changed, 20 insertions(+), 77 deletions(-) diff --git a/src/stamp/modeling/data.py b/src/stamp/modeling/data.py index ae1eb9f..c9859ee 100755 --- a/src/stamp/modeling/data.py +++ b/src/stamp/modeling/data.py @@ -281,8 +281,9 @@ def create_dataloader( for p in patient_data: t, e = (p.ground_truth or "nan nan").split(" ", 1) times.append(float(t) if t.lower() != "nan" else np.nan) - events.append(_parse_survival_status(e)) - + events.append( + 1.0 if e.lower() in {"dead", "event", "1", "Yes", "yes"} else 0.0 + ) labels = torch.tensor(np.column_stack([times, events]), dtype=torch.float32) else: raise ValueError(f"Unsupported task: {task}") @@ -446,9 +447,12 @@ def __getitem__( coords_um = [] for bag_file in self.bags[index]: with h5py.File(bag_file, "r") as h5: - feats.append( - torch.from_numpy(h5["feats"][:]) # pyright: ignore[reportIndexIssue] - ) + if "feats" in h5: + arr = h5["feats"][:] # pyright: ignore[reportIndexIssue] # original STAMP files + else: + arr = h5["patch_embeddings"][:] # type: ignore # your Kronos files + + feats.append(torch.from_numpy(arr)) coords_um.append(torch.from_numpy(get_coords(h5).coords_um)) feats = torch.concat(feats).float() @@ -667,26 +671,7 @@ def patient_to_survival_from_clini_table_( # normalize values clini_df[time_label] = clini_df[time_label].replace( - [ - "NA", - "NaN", - "nan", - "None", - "none", - "N/A", - "n/a", - "NULL", - "null", - "", - " ", - "?", - "-", - "--", - "#N/A", - "#NA", - "=#VALUE!", - ], - np.nan, + ["NA", "NaN", "nan", "", "=#VALUE!"], np.nan ) clini_df[status_label] = clini_df[status_label].str.strip().str.lower() @@ -704,10 +689,13 @@ def patient_to_survival_from_clini_table_( continue # Encode status: keep both dead (event=1) and alive (event=0) - status = _parse_survival_status(status_str) - - # Encode back to "alive"/"dead" like before - # status = "dead" if status_val == 1 else "alive" + if status_str in {"dead", "event", "1"}: + status = "dead" + elif status_str in {"alive", "censored", "0"}: + status = "alive" + else: + # skip unknown status + continue patient_to_ground_truth[pid] = f"{time_str} {status}" @@ -812,6 +800,9 @@ def filter_complete_patient_data_( } ) } + total_clini = len(patient_to_ground_truth) + total_slides = len(patient_to_slides) + final_patients = len(patients) _logger.info( f"Total patients in clinical table: {total_clini}\n" @@ -865,51 +856,3 @@ def get_stride(coords: Float[Tensor, "tile 2"]) -> float: ), ) return stride - - -def _parse_survival_status(value) -> int | None: - """ - Parse a survival status value (string, numeric, or None) into a binary indicator. - Currently assume no None inputs. - Returns: - 1 -> event/dead - 0 -> censored/alive - None -> missing (None, NaN, '') - - Raises: - ValueError if the input is non-missing but unrecognized. - - Examples: - 'dead', '1', 'event', 'yes' -> 1 - 'alive', '0', 'censored', 'no' -> 0 - None, NaN, '' -> None - """ - - # Handle missing inputs gracefully - # if value is None: - # return 0 # treat empty/missing as censored - # if isinstance(value, float) and math.isnan(value): - # return 0 # treat empty/missing as censored - - s = str(value).strip().lower() - # if s in {"", "nan", "none"}: - # return 0 # treat empty/missing as censored - - # Known mappings - positives = {"1", "event", "dead", "deceased", "yes", "y", "True", "true"} - negatives = {"0", "alive", "censored", "no", "false"} - - if s in positives: - return 1 - elif s in negatives: - return 0 - - # Try numeric fallback - try: - f = float(s) - return 1 if f > 0 else 0 - except ValueError: - raise ValueError( - f"Unrecognized survival status: '{value}'. " - f"Expected one of {sorted(positives | negatives)} or a numeric value." - ) From a2847ae95454e5a82a20438bbd78baa0ca6d66f5 Mon Sep 17 00:00:00 2001 From: Minh Duc Nguyen <37109868+mducducd@users.noreply.github.com> Date: Thu, 4 Dec 2025 14:29:41 +0100 Subject: [PATCH 3/3] Refactor and data cleaning Refactor status handling --- src/stamp/modeling/data.py | 95 +++++++++++++++++++++++++++++++------- 1 file changed, 78 insertions(+), 17 deletions(-) diff --git a/src/stamp/modeling/data.py b/src/stamp/modeling/data.py index c9859ee..74eb812 100755 --- a/src/stamp/modeling/data.py +++ b/src/stamp/modeling/data.py @@ -281,9 +281,8 @@ def create_dataloader( for p in patient_data: t, e = (p.ground_truth or "nan nan").split(" ", 1) times.append(float(t) if t.lower() != "nan" else np.nan) - events.append( - 1.0 if e.lower() in {"dead", "event", "1", "Yes", "yes"} else 0.0 - ) + events.append(_parse_survival_status(e)) + labels = torch.tensor(np.column_stack([times, events]), dtype=torch.float32) else: raise ValueError(f"Unsupported task: {task}") @@ -447,12 +446,9 @@ def __getitem__( coords_um = [] for bag_file in self.bags[index]: with h5py.File(bag_file, "r") as h5: - if "feats" in h5: - arr = h5["feats"][:] # pyright: ignore[reportIndexIssue] # original STAMP files - else: - arr = h5["patch_embeddings"][:] # type: ignore # your Kronos files - - feats.append(torch.from_numpy(arr)) + feats.append( + torch.from_numpy(h5["feats"][:]) # pyright: ignore[reportIndexIssue] + ) coords_um.append(torch.from_numpy(get_coords(h5).coords_um)) feats = torch.concat(feats).float() @@ -671,7 +667,26 @@ def patient_to_survival_from_clini_table_( # normalize values clini_df[time_label] = clini_df[time_label].replace( - ["NA", "NaN", "nan", "", "=#VALUE!"], np.nan + [ + "NA", + "NaN", + "nan", + "None", + "none", + "N/A", + "n/a", + "NULL", + "null", + "", + " ", + "?", + "-", + "--", + "#N/A", + "#NA", + "=#VALUE!", + ], + np.nan, ) clini_df[status_label] = clini_df[status_label].str.strip().str.lower() @@ -689,13 +704,10 @@ def patient_to_survival_from_clini_table_( continue # Encode status: keep both dead (event=1) and alive (event=0) - if status_str in {"dead", "event", "1"}: - status = "dead" - elif status_str in {"alive", "censored", "0"}: - status = "alive" - else: - # skip unknown status - continue + status = _parse_survival_status(status_str) + + # Encode back to "alive"/"dead" like before + # status = "dead" if status_val == 1 else "alive" patient_to_ground_truth[pid] = f"{time_str} {status}" @@ -800,6 +812,7 @@ def filter_complete_patient_data_( } ) } + total_clini = len(patient_to_ground_truth) total_slides = len(patient_to_slides) final_patients = len(patients) @@ -856,3 +869,51 @@ def get_stride(coords: Float[Tensor, "tile 2"]) -> float: ), ) return stride + + +def _parse_survival_status(value) -> int | None: + """ + Parse a survival status value (string, numeric, or None) into a binary indicator. + Currently assume no None inputs. + Returns: + 1 -> event/dead + 0 -> censored/alive + None -> missing (None, NaN, '') + + Raises: + ValueError if the input is non-missing but unrecognized. + + Examples: + 'dead', '1', 'event', 'yes' -> 1 + 'alive', '0', 'censored', 'no' -> 0 + None, NaN, '' -> None + """ + + # Handle missing inputs gracefully + # if value is None: + # return 0 # treat empty/missing as censored + # if isinstance(value, float) and math.isnan(value): + # return 0 # treat empty/missing as censored + + s = str(value).strip().lower() + # if s in {"", "nan", "none"}: + # return 0 # treat empty/missing as censored + + # Known mappings + positives = {"1", "event", "dead", "deceased", "yes", "y", "True", "true"} + negatives = {"0", "alive", "censored", "no", "false"} + + if s in positives: + return 1 + elif s in negatives: + return 0 + + # Try numeric fallback + try: + f = float(s) + return 1 if f > 0 else 0 + except ValueError: + raise ValueError( + f"Unrecognized survival status: '{value}'. " + f"Expected one of {sorted(positives | negatives)} or a numeric value." + )