From 1dc3f1369a91b3978c201445e7aba5d13bfb541c Mon Sep 17 00:00:00 2001 From: Sarah Huang Date: Fri, 28 Jun 2024 14:23:48 -0700 Subject: [PATCH] minor adjustment for load_transcripts function --- SEACells/genescores.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/SEACells/genescores.py b/SEACells/genescores.py index 3ab455a..1298959 100644 --- a/SEACells/genescores.py +++ b/SEACells/genescores.py @@ -120,10 +120,20 @@ def _pyranges_to_strings(peaks): def load_transcripts(path_to_gtf): - """Load transcripts from GTF File. `chr` is preprended to each entry.""" + """Load transcripts from GTF File. `chr` is prepended to each entry if not already present, and ensured to be lowercase if already present.""" gtf = pr.read_gtf(path_to_gtf) - gtf.Chromosome = "chr" + gtf.Chromosome.astype(str) + + # Vectorize chromosome formatting + # Use vectorized str methods to handle the 'chr' prefix and case conversion + gtf.Chromosome = "chr" + gtf.Chromosome.str.upper().str.replace("^CHR", "", regex=True, case=False) + + # Check if 'transcript' is present in the 'Feature' column + if "transcript" not in gtf.Feature.unique(): + raise ValueError("'transcript' not found in the 'Feature' column.") + + # Filter for transcripts only once and early transcripts = gtf[gtf.Feature == "transcript"] + return transcripts @@ -140,6 +150,7 @@ def _peaks_correlations_per_gene( # Gene transcript - use the longest transcript gene_transcripts = transcripts[transcripts.gene_name == gene] if len(gene_transcripts) == 0: + print(f"No transcripts found for gene: {gene}") return 0 longest_transcript = gene_transcripts[ np.arange(len(gene_transcripts))