From f3ff68c358b279bed23b7a9fe790a81f50499317 Mon Sep 17 00:00:00 2001 From: Dylan Taylor Date: Tue, 26 Sep 2023 14:29:52 -0400 Subject: [PATCH 1/3] Modified scripts to be agnostic to input variantID format --- src/afcn.py | 6 +++--- src/calc.pyx | 18 +++++++++--------- src/parse.pyx | 44 ++++++++++++++++++++++++++++---------------- 3 files changed, 40 insertions(+), 28 deletions(-) diff --git a/src/afcn.py b/src/afcn.py index affe54b..1840e9d 100755 --- a/src/afcn.py +++ b/src/afcn.py @@ -168,11 +168,11 @@ def main(): thisgene_expressions = expression_matrix[expression_matrix.index.isin([gene])] #variants in this gene - thisgene_variants = eqtl_matrix[eqtl_matrix.gene_id_clean == gene].variant_id + thisgene_variants = eqtl_matrix[eqtl_matrix.gene_id_clean == gene].variant_id_clean #haplotypes for these variants for all inds thisgene_haplotypes = haplotype_matrix[haplotype_matrix.index.isin(list(thisgene_variants))] #the eqtl entries for this gene - thisgene_eqtls = eqtl_matrix[(eqtl_matrix.gene_id_clean == gene) & (eqtl_matrix.variant_id.isin(list(thisgene_variants)))] + thisgene_eqtls = eqtl_matrix[(eqtl_matrix.gene_id_clean == gene) & (eqtl_matrix.variant_id_clean.isin(list(thisgene_variants)))] #save in dictionary inputs[gene] = [thisgene_expressions, thisgene_eqtls, thisgene_haplotypes, cov_dataf_full, args] @@ -192,7 +192,7 @@ def main(): out_df = pd.merge(eqtl_dataframe_copy, final_df, how='left',on=original_columns ) #now drop the gene_id_clean column - out_df = out_df.drop(columns=["gene_id_clean"]) + out_df = out_df.drop(columns=["gene_id_clean", "variant_id_clean"]) ############################################################################### # diff --git a/src/calc.pyx b/src/calc.pyx index 53bfcb2..eadde08 100644 --- a/src/calc.pyx +++ b/src/calc.pyx @@ -312,17 +312,17 @@ def nocovar_least_squares(h1, h2, thisgene_expressions, eqtl_dataf, sa, c0, args if parameter in variant_list: - eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value - eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr + eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value + eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr if ci_unsuccessful == 0: #get +-95% intervals minus_95 = ci[parameter][1][1] - eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = minus_95 + eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = minus_95 plus_95 = ci[parameter][5][1] - eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = plus_95 + eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = plus_95 else: #its C0 @@ -344,7 +344,7 @@ def nocovar_least_squares(h1, h2, thisgene_expressions, eqtl_dataf, sa, c0, args for lowq_eqtl in zero_list.keys(): # No need for confidence intervals - eqtl_dataf.log2_aFC[( lowq_eqtl == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = 0 + eqtl_dataf.log2_aFC[( lowq_eqtl == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = 0 return eqtl_dataf @@ -403,17 +403,17 @@ def least_squares(h1, h2, thisgene_expressions, eqtl_dataf, covar, args): #if parameter is an s value, variant_order excludes variants with nan values if parameter in variant_list: - eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value - eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr + eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value + eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr if ci_unsuccessful == 0: #get +-95% intervals minus_95 = ci[parameter][1][1] - eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = minus_95 + eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = minus_95 plus_95 = ci[parameter][5][1] - eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = plus_95 + eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = plus_95 else: #its C0 or one of the peer factors diff --git a/src/parse.pyx b/src/parse.pyx index 5d7c678..e3693bb 100644 --- a/src/parse.pyx +++ b/src/parse.pyx @@ -33,13 +33,20 @@ def read_eqtls(eqtl_filename): #open the eqtl file eqtl_file = pd.read_csv(eqtl_filename, sep='\t') - #drop gene versions - eqtl_file['gene_id_clean'] = eqtl_file.gene_id.str.split('.').str[0] + # #drop gene versions + # eqtl_file['gene_id_clean'] = eqtl_file.gene_id.str.split('.').str[0] - if is_variant_id_format_valid(eqtl_file) == False: + # Clean up gene and variant IDs + gene_map = {gene_id: f'g{i}' for i, gene_id in enumerate(eqtl_file.gene_id.unique())} + eqtl_file['gene_id_clean'] = eqtl_file.gene_id.map(gene_map) - raise Exception('''Variant IDs must not begin with a digit, - reformat your vcf and EQTL matrix''') + variant_map = {variant_id: f'v{i}' for i, variant_id in enumerate(eqtl_file.variant_id.unique())} + eqtl_file['variant_id_clean'] = eqtl_file.variant_id.map(variant_map) + + # if is_variant_id_format_valid(eqtl_file) == False: + + # raise Exception('''Variant IDs must not begin with a digit, + # reformat your vcf and EQTL matrix''') return eqtl_file @@ -49,7 +56,7 @@ def read_expressions(expressions_filename, eqtl_file, args): '''Used to read the relevant expressions into a pandas dataframe''' - chosen_genes = list(eqtl_file.gene_id_clean) + gene_map = eqtl_file.loc[:, ['gene_id', 'gene_id_clean']].set_index('gene_id')['gene_id_clean'].to_dict() expr_df = [] expr_df_cols = [] #open the expression file, get the expressions for the genes we have eqtls for @@ -75,9 +82,9 @@ def read_expressions(expressions_filename, eqtl_file, args): #try splitting the gene by version number, in #case our user forgot - thisgene = expr_data[0].split(".")[0] + thisgene = expr_data[0] - if thisgene in chosen_genes: + if thisgene in gene_map: expr_df.append(expr_data) expr_dataframe = pd.DataFrame(expr_df, columns=expr_df_cols) @@ -96,7 +103,7 @@ def read_expressions(expressions_filename, eqtl_file, args): #now do the gene version correction once and for all if needed - expr_dataframe['Name'] = expr_dataframe['Name'].str.split(".").str[0] + expr_dataframe['Name'] = expr_dataframe['Name'].map(gene_map) return expr_dataframe @@ -127,7 +134,7 @@ def read_haplotypes(vcfName, eqtl_file): '''Reads in the variants we have eqtls and expressions(genes) for''' - chosen_variants = eqtl_file.variant_id.unique() + variant_map = eqtl_file.loc[:, ['variant_id', 'variant_id_clean']].set_index('variant_id')['variant_id_clean'].to_dict() #match these to the vcf file vcf_df = [] @@ -140,10 +147,11 @@ def read_haplotypes(vcfName, eqtl_file): #write the header first vcf_df_cols = get_vcf_header(vcfName) - for variant in chosen_variants: + for variant in variant_map: #get coordinates - [chrom, pos] = variant.split("_")[:2] + chrom = eqtl_file.loc[eqtl_file['variant_id']==variant, ['variant_chr']].iloc[0,0] + pos = eqtl_file.loc[eqtl_file['variant_id']==variant, ['variant_pos']].iloc[0,0] #query vcf using tabix try: @@ -151,8 +159,9 @@ def read_haplotypes(vcfName, eqtl_file): records = tabix_haplotypes.fetch(chrom, int(pos) - 1, int(pos)) for record in records: - - vcf_df.append(record.split('\t')) + record_info = record.split('\t') + if record_info[2] == variant: + vcf_df.append(record_info) except: @@ -162,10 +171,13 @@ def read_haplotypes(vcfName, eqtl_file): vcf_dataframe = pd.DataFrame(vcf_df, columns=vcf_df_cols).drop_duplicates() + # Use clean variant ids + vcf_dataframe['ID'] = vcf_dataframe.ID.map(variant_map) + #do a quick check and raise an exception if no eqtls were found if vcf_dataframe.empty: - raise Exception("No mathing EQTLs were found in VCF file - check variant ID formatting") + raise Exception("No matching EQTLs were found in VCF file - check variant ID formatting") return vcf_dataframe @@ -184,7 +196,7 @@ def drop_novcf_variants(eqtl_dataframe, vcf_dataframe): '''Drop variants that we couldnt get from the vcf file from the eqtl matrix''' - corrected_eqtl_dataframe = eqtl_dataframe[eqtl_dataframe.variant_id.isin(vcf_dataframe.ID)] + corrected_eqtl_dataframe = eqtl_dataframe[eqtl_dataframe.variant_id_clean.isin(vcf_dataframe.ID)] return corrected_eqtl_dataframe From 07eab649e9c90742c77047f746ed73032dfacd52 Mon Sep 17 00:00:00 2001 From: Dylan Taylor <54147660+dtaylo95@users.noreply.github.com> Date: Wed, 15 Nov 2023 12:03:39 -0500 Subject: [PATCH 2/3] Tab-separated output --- src/afcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/afcn.py b/src/afcn.py index 1840e9d..e8c8a43 100755 --- a/src/afcn.py +++ b/src/afcn.py @@ -207,7 +207,7 @@ def main(): #Save results print("Writing results to file: " + str(args.output)) - out_df.to_csv(args.output, index=None) + out_df.to_csv(args.output, index=None, sep='\t') From cca1cea3b6f3b46c82fd1b833b5c618b1076e8be Mon Sep 17 00:00:00 2001 From: Dylan Taylor Date: Wed, 14 Feb 2024 16:05:13 -0500 Subject: [PATCH 3/3] Removed function in parse.pyx 1) because it is no longer used with updated formatting behavior and 2) was causing a Cython type error on compile --- src/parse.pyx | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/parse.pyx b/src/parse.pyx index e3693bb..dc702da 100644 --- a/src/parse.pyx +++ b/src/parse.pyx @@ -8,23 +8,6 @@ import warnings import sys warnings.filterwarnings("ignore") -def is_variant_id_format_valid(eqtl_file): - - '''Checks if the variant ids in the eqtl matrix begin with a valid character''' - - #get the first character of all rows - variant_ids = eqtl_file['variant_id'].tolist() - firstchar_isdigit = [str(var_id)[0].isalpha() for var_id in variant_ids] - - #check if any of these characters is a digit - if True in firstchar_isdigit: - - return True - - else: - - return False - def read_eqtls(eqtl_filename):