From 1dc3f1369a91b3978c201445e7aba5d13bfb541c Mon Sep 17 00:00:00 2001
From: Sarah Huang <huangy0204@gmail.com>
Date: Fri, 28 Jun 2024 14:23:48 -0700
Subject: [PATCH] minor adjustment for load_transcripts function

---
 SEACells/genescores.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/SEACells/genescores.py b/SEACells/genescores.py
index 3ab455a..1298959 100644
--- a/SEACells/genescores.py
+++ b/SEACells/genescores.py
@@ -120,10 +120,20 @@ def _pyranges_to_strings(peaks):
 
 
 def load_transcripts(path_to_gtf):
-    """Load transcripts from GTF File. `chr` is preprended to each entry."""
+    """Load transcripts from GTF File. `chr` is prepended to each entry if not already present, and ensured to be lowercase if already present."""
     gtf = pr.read_gtf(path_to_gtf)
-    gtf.Chromosome = "chr" + gtf.Chromosome.astype(str)
+
+    # Vectorize chromosome formatting
+    # Use vectorized str methods to handle the 'chr' prefix and case conversion
+    gtf.Chromosome = "chr" + gtf.Chromosome.str.upper().str.replace("^CHR", "", regex=True, case=False)
+
+    # Check if 'transcript' is present in the 'Feature' column
+    if "transcript" not in gtf.Feature.unique():
+        raise ValueError("'transcript' not found in the 'Feature' column.")
+    
+    # Filter for transcripts only once and early
     transcripts = gtf[gtf.Feature == "transcript"]
+
     return transcripts
 
 
@@ -140,6 +150,7 @@ def _peaks_correlations_per_gene(
     # Gene transcript - use the longest transcript
     gene_transcripts = transcripts[transcripts.gene_name == gene]
     if len(gene_transcripts) == 0:
+        print(f"No transcripts found for gene: {gene}")
         return 0
     longest_transcript = gene_transcripts[
         np.arange(len(gene_transcripts))