PejLab · dtaylo95 · Sep 26, 2023 · Nov 15, 2023 · Feb 14, 2024
diff --git a/src/afcn.py b/src/afcn.py
@@ -168,11 +168,11 @@ def main():
 
         thisgene_expressions = expression_matrix[expression_matrix.index.isin([gene])]
         #variants in this gene
-        thisgene_variants = eqtl_matrix[eqtl_matrix.gene_id_clean == gene].variant_id
+        thisgene_variants = eqtl_matrix[eqtl_matrix.gene_id_clean == gene].variant_id_clean
         #haplotypes for these variants for all inds
         thisgene_haplotypes = haplotype_matrix[haplotype_matrix.index.isin(list(thisgene_variants))]
         #the eqtl entries for this gene
-        thisgene_eqtls = eqtl_matrix[(eqtl_matrix.gene_id_clean == gene) & (eqtl_matrix.variant_id.isin(list(thisgene_variants)))]    
+        thisgene_eqtls = eqtl_matrix[(eqtl_matrix.gene_id_clean == gene) & (eqtl_matrix.variant_id_clean.isin(list(thisgene_variants)))]    
         #save in dictionary
         inputs[gene] = [thisgene_expressions, thisgene_eqtls, thisgene_haplotypes, cov_dataf_full, args]
 
@@ -192,7 +192,7 @@ def main():
     out_df = pd.merge(eqtl_dataframe_copy, final_df, how='left',on=original_columns )
 
     #now drop the gene_id_clean column
-    out_df = out_df.drop(columns=["gene_id_clean"])
+    out_df = out_df.drop(columns=["gene_id_clean", "variant_id_clean"])
 
     ###############################################################################
     #
@@ -207,7 +207,7 @@ def main():
 
     #Save results
     print("Writing results to file: " + str(args.output)) 
-    out_df.to_csv(args.output, index=None)
+    out_df.to_csv(args.output, index=None, sep='\t')
 
 
 

diff --git a/src/calc.pyx b/src/calc.pyx
@@ -312,17 +312,17 @@ def nocovar_least_squares(h1, h2, thisgene_expressions, eqtl_dataf, sa, c0, args
         if parameter in variant_list:
 
 
-            eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
-            eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr
+            eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
+            eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr
 
             if ci_unsuccessful == 0:
 
                 #get +-95% intervals
                 minus_95 = ci[parameter][1][1]
-                eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = minus_95
+                eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = minus_95
 
                 plus_95 = ci[parameter][5][1]
-                eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = plus_95
+                eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = plus_95
 
 
         else: #its C0
@@ -344,7 +344,7 @@ def nocovar_least_squares(h1, h2, thisgene_expressions, eqtl_dataf, sa, c0, args
     for lowq_eqtl in zero_list.keys():
 
         # No need for confidence intervals
-        eqtl_dataf.log2_aFC[( lowq_eqtl == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = 0
+        eqtl_dataf.log2_aFC[( lowq_eqtl == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = 0
 
     return eqtl_dataf
 
@@ -403,17 +403,17 @@ def least_squares(h1, h2, thisgene_expressions, eqtl_dataf, covar, args):
         #if parameter is an s value, variant_order excludes variants with nan values
         if parameter in variant_list:
 
-            eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
-            eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr
+            eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
+            eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr
 
             if ci_unsuccessful == 0:
 
                 #get +-95% intervals
                 minus_95 = ci[parameter][1][1]
-                eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = minus_95
+                eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = minus_95
 
                 plus_95 = ci[parameter][5][1]
-                eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = plus_95
+                eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = plus_95
 
 
         else: #its C0 or one of the peer factors

diff --git a/src/parse.pyx b/src/parse.pyx
@@ -8,23 +8,6 @@ import warnings
 import sys
 warnings.filterwarnings("ignore")
 
-def is_variant_id_format_valid(eqtl_file):
-
-    '''Checks if the variant ids in the eqtl matrix begin with a valid character'''
-
-    #get the first character of all rows
-    variant_ids = eqtl_file['variant_id'].tolist()
-    firstchar_isdigit = [str(var_id)[0].isalpha() for var_id in variant_ids]
-
-    #check if any of these characters is a digit
-    if True in firstchar_isdigit:
-
-        return True
-
-    else:
-
-        return False
-
 
 def read_eqtls(eqtl_filename):
 
@@ -33,13 +16,20 @@ def read_eqtls(eqtl_filename):
     #open the eqtl file
     eqtl_file = pd.read_csv(eqtl_filename, sep='\t')
 
-    #drop gene versions
-    eqtl_file['gene_id_clean'] = eqtl_file.gene_id.str.split('.').str[0]
+    # #drop gene versions
+    # eqtl_file['gene_id_clean'] = eqtl_file.gene_id.str.split('.').str[0]
 
-    if is_variant_id_format_valid(eqtl_file) == False:
+    # Clean up gene and variant IDs
+    gene_map = {gene_id: f'g{i}' for i, gene_id in enumerate(eqtl_file.gene_id.unique())}
+    eqtl_file['gene_id_clean'] = eqtl_file.gene_id.map(gene_map)
+
+    variant_map = {variant_id: f'v{i}' for i, variant_id in enumerate(eqtl_file.variant_id.unique())}
+    eqtl_file['variant_id_clean'] = eqtl_file.variant_id.map(variant_map)
+
+    # if is_variant_id_format_valid(eqtl_file) == False:
 
-        raise Exception('''Variant IDs must not begin with a digit, 
-                reformat your vcf and EQTL matrix''')
+    #     raise Exception('''Variant IDs must not begin with a digit, 
+    #             reformat your vcf and EQTL matrix''')
 
 
     return eqtl_file
@@ -49,7 +39,7 @@ def read_expressions(expressions_filename, eqtl_file, args):
 
     '''Used to read the relevant expressions into a pandas dataframe'''
 
-    chosen_genes = list(eqtl_file.gene_id_clean)
+    gene_map = eqtl_file.loc[:, ['gene_id', 'gene_id_clean']].set_index('gene_id')['gene_id_clean'].to_dict()
     expr_df = []
     expr_df_cols = []
     #open the expression file, get the expressions for the genes we have eqtls for
@@ -75,9 +65,9 @@ def read_expressions(expressions_filename, eqtl_file, args):
 
                 #try splitting the gene by version number, in 
                 #case our user forgot
-                thisgene = expr_data[0].split(".")[0]
+                thisgene = expr_data[0]
 
-                if thisgene in chosen_genes:
+                if thisgene in gene_map:
                     expr_df.append(expr_data)
 
     expr_dataframe = pd.DataFrame(expr_df, columns=expr_df_cols)
@@ -96,7 +86,7 @@ def read_expressions(expressions_filename, eqtl_file, args):
 
 
     #now do the gene version correction once and for all if needed
-    expr_dataframe['Name'] = expr_dataframe['Name'].str.split(".").str[0]
+    expr_dataframe['Name'] = expr_dataframe['Name'].map(gene_map)
 
     return expr_dataframe
 
@@ -127,7 +117,7 @@ def read_haplotypes(vcfName, eqtl_file):
 
     '''Reads in the variants we have eqtls and expressions(genes) for'''
 
-    chosen_variants = eqtl_file.variant_id.unique()
+    variant_map = eqtl_file.loc[:, ['variant_id', 'variant_id_clean']].set_index('variant_id')['variant_id_clean'].to_dict()
 
     #match these to the vcf file
     vcf_df = []
@@ -140,19 +130,21 @@ def read_haplotypes(vcfName, eqtl_file):
     #write the header first
     vcf_df_cols = get_vcf_header(vcfName)
 
-    for variant in chosen_variants:
+    for variant in variant_map:
 
         #get coordinates
-        [chrom, pos] = variant.split("_")[:2]
+        chrom = eqtl_file.loc[eqtl_file['variant_id']==variant, ['variant_chr']].iloc[0,0]
+        pos = eqtl_file.loc[eqtl_file['variant_id']==variant, ['variant_pos']].iloc[0,0]
 
         #query vcf using tabix
         try:
 
             records = tabix_haplotypes.fetch(chrom, int(pos) - 1, int(pos))
 
             for record in records:
-
-                vcf_df.append(record.split('\t'))
+                record_info = record.split('\t')
+                if record_info[2] == variant:
+                    vcf_df.append(record_info)
 
 
         except:
@@ -162,10 +154,13 @@ def read_haplotypes(vcfName, eqtl_file):
 
     vcf_dataframe = pd.DataFrame(vcf_df, columns=vcf_df_cols).drop_duplicates()
 
+    # Use clean variant ids
+    vcf_dataframe['ID'] = vcf_dataframe.ID.map(variant_map)
+
     #do a quick check and raise an exception if no eqtls were found
     if vcf_dataframe.empty:
 
-        raise Exception("No mathing EQTLs were found in VCF file - check variant ID formatting")
+        raise Exception("No matching EQTLs were found in VCF file - check variant ID formatting")
 
 
     return vcf_dataframe
@@ -184,7 +179,7 @@ def drop_novcf_variants(eqtl_dataframe, vcf_dataframe):
 
     '''Drop variants that we couldnt get from the vcf file from the eqtl matrix'''
 
-    corrected_eqtl_dataframe = eqtl_dataframe[eqtl_dataframe.variant_id.isin(vcf_dataframe.ID)]
+    corrected_eqtl_dataframe = eqtl_dataframe[eqtl_dataframe.variant_id_clean.isin(vcf_dataframe.ID)]
 
     return corrected_eqtl_dataframe