From f3ff68c358b279bed23b7a9fe790a81f50499317 Mon Sep 17 00:00:00 2001
From: Dylan Taylor <dtaylo95@jhu.edu>
Date: Tue, 26 Sep 2023 14:29:52 -0400
Subject: [PATCH 1/3] Modified scripts to be agnostic to input variantID format

---
 src/afcn.py   |  6 +++---
 src/calc.pyx  | 18 +++++++++---------
 src/parse.pyx | 44 ++++++++++++++++++++++++++++----------------
 3 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/src/afcn.py b/src/afcn.py
index affe54b..1840e9d 100755
--- a/src/afcn.py
+++ b/src/afcn.py
@@ -168,11 +168,11 @@ def main():
 
         thisgene_expressions = expression_matrix[expression_matrix.index.isin([gene])]
         #variants in this gene
-        thisgene_variants = eqtl_matrix[eqtl_matrix.gene_id_clean == gene].variant_id
+        thisgene_variants = eqtl_matrix[eqtl_matrix.gene_id_clean == gene].variant_id_clean
         #haplotypes for these variants for all inds
         thisgene_haplotypes = haplotype_matrix[haplotype_matrix.index.isin(list(thisgene_variants))]
         #the eqtl entries for this gene
-        thisgene_eqtls = eqtl_matrix[(eqtl_matrix.gene_id_clean == gene) & (eqtl_matrix.variant_id.isin(list(thisgene_variants)))]    
+        thisgene_eqtls = eqtl_matrix[(eqtl_matrix.gene_id_clean == gene) & (eqtl_matrix.variant_id_clean.isin(list(thisgene_variants)))]    
         #save in dictionary
         inputs[gene] = [thisgene_expressions, thisgene_eqtls, thisgene_haplotypes, cov_dataf_full, args]
 
@@ -192,7 +192,7 @@ def main():
     out_df = pd.merge(eqtl_dataframe_copy, final_df, how='left',on=original_columns )
 
     #now drop the gene_id_clean column
-    out_df = out_df.drop(columns=["gene_id_clean"])
+    out_df = out_df.drop(columns=["gene_id_clean", "variant_id_clean"])
     
     ###############################################################################
     #
diff --git a/src/calc.pyx b/src/calc.pyx
index 53bfcb2..eadde08 100644
--- a/src/calc.pyx
+++ b/src/calc.pyx
@@ -312,17 +312,17 @@ def nocovar_least_squares(h1, h2, thisgene_expressions, eqtl_dataf, sa, c0, args
         if parameter in variant_list:
             
        
-            eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
-            eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr
+            eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
+            eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr
        
             if ci_unsuccessful == 0:
        
                 #get +-95% intervals
                 minus_95 = ci[parameter][1][1]
-                eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = minus_95
+                eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = minus_95
        
                 plus_95 = ci[parameter][5][1]
-                eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = plus_95
+                eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = plus_95
        
        
         else: #its C0
@@ -344,7 +344,7 @@ def nocovar_least_squares(h1, h2, thisgene_expressions, eqtl_dataf, sa, c0, args
     for lowq_eqtl in zero_list.keys():
  
         # No need for confidence intervals
-        eqtl_dataf.log2_aFC[( lowq_eqtl == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = 0
+        eqtl_dataf.log2_aFC[( lowq_eqtl == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = 0
 
     return eqtl_dataf
 
@@ -403,17 +403,17 @@ def least_squares(h1, h2, thisgene_expressions, eqtl_dataf, covar, args):
         #if parameter is an s value, variant_order excludes variants with nan values
         if parameter in variant_list:
 
-            eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
-            eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr
+            eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
+            eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr
 
             if ci_unsuccessful == 0:
 
                 #get +-95% intervals
                 minus_95 = ci[parameter][1][1]
-                eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = minus_95
+                eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = minus_95
 
                 plus_95 = ci[parameter][5][1]
-                eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = plus_95
+                eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = plus_95
 
 
         else: #its C0 or one of the peer factors
diff --git a/src/parse.pyx b/src/parse.pyx
index 5d7c678..e3693bb 100644
--- a/src/parse.pyx
+++ b/src/parse.pyx
@@ -33,13 +33,20 @@ def read_eqtls(eqtl_filename):
     #open the eqtl file
     eqtl_file = pd.read_csv(eqtl_filename, sep='\t')
     
-    #drop gene versions
-    eqtl_file['gene_id_clean'] = eqtl_file.gene_id.str.split('.').str[0]
+    # #drop gene versions
+    # eqtl_file['gene_id_clean'] = eqtl_file.gene_id.str.split('.').str[0]
     
-    if is_variant_id_format_valid(eqtl_file) == False:
+    # Clean up gene and variant IDs
+    gene_map = {gene_id: f'g{i}' for i, gene_id in enumerate(eqtl_file.gene_id.unique())}
+    eqtl_file['gene_id_clean'] = eqtl_file.gene_id.map(gene_map)
 
-        raise Exception('''Variant IDs must not begin with a digit, 
-                reformat your vcf and EQTL matrix''')
+    variant_map = {variant_id: f'v{i}' for i, variant_id in enumerate(eqtl_file.variant_id.unique())}
+    eqtl_file['variant_id_clean'] = eqtl_file.variant_id.map(variant_map)
+
+    # if is_variant_id_format_valid(eqtl_file) == False:
+
+    #     raise Exception('''Variant IDs must not begin with a digit, 
+    #             reformat your vcf and EQTL matrix''')
  
 
     return eqtl_file
@@ -49,7 +56,7 @@ def read_expressions(expressions_filename, eqtl_file, args):
 
     '''Used to read the relevant expressions into a pandas dataframe'''
     
-    chosen_genes = list(eqtl_file.gene_id_clean)
+    gene_map = eqtl_file.loc[:, ['gene_id', 'gene_id_clean']].set_index('gene_id')['gene_id_clean'].to_dict()
     expr_df = []
     expr_df_cols = []
     #open the expression file, get the expressions for the genes we have eqtls for
@@ -75,9 +82,9 @@ def read_expressions(expressions_filename, eqtl_file, args):
 
                 #try splitting the gene by version number, in 
                 #case our user forgot
-                thisgene = expr_data[0].split(".")[0]
+                thisgene = expr_data[0]
                 
-                if thisgene in chosen_genes:
+                if thisgene in gene_map:
                     expr_df.append(expr_data)
 
     expr_dataframe = pd.DataFrame(expr_df, columns=expr_df_cols)
@@ -96,7 +103,7 @@ def read_expressions(expressions_filename, eqtl_file, args):
 
  
     #now do the gene version correction once and for all if needed
-    expr_dataframe['Name'] = expr_dataframe['Name'].str.split(".").str[0]
+    expr_dataframe['Name'] = expr_dataframe['Name'].map(gene_map)
     
     return expr_dataframe
 
@@ -127,7 +134,7 @@ def read_haplotypes(vcfName, eqtl_file):
 
     '''Reads in the variants we have eqtls and expressions(genes) for'''
     
-    chosen_variants = eqtl_file.variant_id.unique()
+    variant_map = eqtl_file.loc[:, ['variant_id', 'variant_id_clean']].set_index('variant_id')['variant_id_clean'].to_dict()
 
     #match these to the vcf file
     vcf_df = []
@@ -140,10 +147,11 @@ def read_haplotypes(vcfName, eqtl_file):
     #write the header first
     vcf_df_cols = get_vcf_header(vcfName)
 
-    for variant in chosen_variants:
+    for variant in variant_map:
 
         #get coordinates
-        [chrom, pos] = variant.split("_")[:2]
+        chrom = eqtl_file.loc[eqtl_file['variant_id']==variant, ['variant_chr']].iloc[0,0]
+        pos = eqtl_file.loc[eqtl_file['variant_id']==variant, ['variant_pos']].iloc[0,0]
 
         #query vcf using tabix
         try:
@@ -151,8 +159,9 @@ def read_haplotypes(vcfName, eqtl_file):
             records = tabix_haplotypes.fetch(chrom, int(pos) - 1, int(pos))
 
             for record in records:
-
-                vcf_df.append(record.split('\t'))
+                record_info = record.split('\t')
+                if record_info[2] == variant:
+                    vcf_df.append(record_info)
 
 
         except:
@@ -162,10 +171,13 @@ def read_haplotypes(vcfName, eqtl_file):
 
     vcf_dataframe = pd.DataFrame(vcf_df, columns=vcf_df_cols).drop_duplicates()
 
+    # Use clean variant ids
+    vcf_dataframe['ID'] = vcf_dataframe.ID.map(variant_map)
+
     #do a quick check and raise an exception if no eqtls were found
     if vcf_dataframe.empty:
 
-        raise Exception("No mathing EQTLs were found in VCF file - check variant ID formatting")
+        raise Exception("No matching EQTLs were found in VCF file - check variant ID formatting")
 
     
     return vcf_dataframe
@@ -184,7 +196,7 @@ def drop_novcf_variants(eqtl_dataframe, vcf_dataframe):
     
     '''Drop variants that we couldnt get from the vcf file from the eqtl matrix'''
 
-    corrected_eqtl_dataframe = eqtl_dataframe[eqtl_dataframe.variant_id.isin(vcf_dataframe.ID)]
+    corrected_eqtl_dataframe = eqtl_dataframe[eqtl_dataframe.variant_id_clean.isin(vcf_dataframe.ID)]
     
     return corrected_eqtl_dataframe
     

From 07eab649e9c90742c77047f746ed73032dfacd52 Mon Sep 17 00:00:00 2001
From: Dylan Taylor <54147660+dtaylo95@users.noreply.github.com>
Date: Wed, 15 Nov 2023 12:03:39 -0500
Subject: [PATCH 2/3] Tab-separated output

---
 src/afcn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/afcn.py b/src/afcn.py
index 1840e9d..e8c8a43 100755
--- a/src/afcn.py
+++ b/src/afcn.py
@@ -207,7 +207,7 @@ def main():
 
     #Save results
     print("Writing results to file: " + str(args.output)) 
-    out_df.to_csv(args.output, index=None)
+    out_df.to_csv(args.output, index=None, sep='\t')
     
 
      

From cca1cea3b6f3b46c82fd1b833b5c618b1076e8be Mon Sep 17 00:00:00 2001
From: Dylan Taylor <dtaylo95@jhu.edu>
Date: Wed, 14 Feb 2024 16:05:13 -0500
Subject: [PATCH 3/3] Removed  function in parse.pyx 1) because it is no longer
 used with updated formatting behavior and 2) was causing a Cython type error
 on compile

---
 src/parse.pyx | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/src/parse.pyx b/src/parse.pyx
index e3693bb..dc702da 100644
--- a/src/parse.pyx
+++ b/src/parse.pyx
@@ -8,23 +8,6 @@ import warnings
 import sys
 warnings.filterwarnings("ignore")
 
-def is_variant_id_format_valid(eqtl_file):
-
-    '''Checks if the variant ids in the eqtl matrix begin with a valid character'''
-
-    #get the first character of all rows
-    variant_ids = eqtl_file['variant_id'].tolist()
-    firstchar_isdigit = [str(var_id)[0].isalpha() for var_id in variant_ids]
-
-    #check if any of these characters is a digit
-    if True in firstchar_isdigit:
-
-        return True
-
-    else:
-
-        return False
-
 
 def read_eqtls(eqtl_filename):