Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/afcn.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,11 +168,11 @@ def main():

thisgene_expressions = expression_matrix[expression_matrix.index.isin([gene])]
#variants in this gene
thisgene_variants = eqtl_matrix[eqtl_matrix.gene_id_clean == gene].variant_id
thisgene_variants = eqtl_matrix[eqtl_matrix.gene_id_clean == gene].variant_id_clean
#haplotypes for these variants for all inds
thisgene_haplotypes = haplotype_matrix[haplotype_matrix.index.isin(list(thisgene_variants))]
#the eqtl entries for this gene
thisgene_eqtls = eqtl_matrix[(eqtl_matrix.gene_id_clean == gene) & (eqtl_matrix.variant_id.isin(list(thisgene_variants)))]
thisgene_eqtls = eqtl_matrix[(eqtl_matrix.gene_id_clean == gene) & (eqtl_matrix.variant_id_clean.isin(list(thisgene_variants)))]
#save in dictionary
inputs[gene] = [thisgene_expressions, thisgene_eqtls, thisgene_haplotypes, cov_dataf_full, args]

Expand All @@ -192,7 +192,7 @@ def main():
out_df = pd.merge(eqtl_dataframe_copy, final_df, how='left',on=original_columns )

#now drop the gene_id_clean column
out_df = out_df.drop(columns=["gene_id_clean"])
out_df = out_df.drop(columns=["gene_id_clean", "variant_id_clean"])

###############################################################################
#
Expand All @@ -207,7 +207,7 @@ def main():

#Save results
print("Writing results to file: " + str(args.output))
out_df.to_csv(args.output, index=None)
out_df.to_csv(args.output, index=None, sep='\t')



Expand Down
18 changes: 9 additions & 9 deletions src/calc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -312,17 +312,17 @@ def nocovar_least_squares(h1, h2, thisgene_expressions, eqtl_dataf, sa, c0, args
if parameter in variant_list:


eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr
eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr

if ci_unsuccessful == 0:

#get +-95% intervals
minus_95 = ci[parameter][1][1]
eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = minus_95
eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = minus_95

plus_95 = ci[parameter][5][1]
eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = plus_95
eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = plus_95


else: #its C0
Expand All @@ -344,7 +344,7 @@ def nocovar_least_squares(h1, h2, thisgene_expressions, eqtl_dataf, sa, c0, args
for lowq_eqtl in zero_list.keys():

# No need for confidence intervals
eqtl_dataf.log2_aFC[( lowq_eqtl == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = 0
eqtl_dataf.log2_aFC[( lowq_eqtl == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = 0

return eqtl_dataf

Expand Down Expand Up @@ -403,17 +403,17 @@ def least_squares(h1, h2, thisgene_expressions, eqtl_dataf, covar, args):
#if parameter is an s value, variant_order excludes variants with nan values
if parameter in variant_list:

eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr
eqtl_dataf.log2_aFC[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].value
eqtl_dataf.log2_aFC_error[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = result.params[parameter].stderr

if ci_unsuccessful == 0:

#get +-95% intervals
minus_95 = ci[parameter][1][1]
eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = minus_95
eqtl_dataf.log2_aFC_min_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = minus_95

plus_95 = ci[parameter][5][1]
eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id) & (eqtl_dataf.gene_id_clean == gene)] = plus_95
eqtl_dataf.log2_aFC_plus_95_interv[(parameter == eqtl_dataf.variant_id_clean) & (eqtl_dataf.gene_id_clean == gene)] = plus_95


else: #its C0 or one of the peer factors
Expand Down
61 changes: 28 additions & 33 deletions src/parse.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,6 @@ import warnings
import sys
warnings.filterwarnings("ignore")

def is_variant_id_format_valid(eqtl_file):

'''Checks if the variant ids in the eqtl matrix begin with a valid character'''

#get the first character of all rows
variant_ids = eqtl_file['variant_id'].tolist()
firstchar_isdigit = [str(var_id)[0].isalpha() for var_id in variant_ids]

#check if any of these characters is a digit
if True in firstchar_isdigit:

return True

else:

return False


def read_eqtls(eqtl_filename):

Expand All @@ -33,13 +16,20 @@ def read_eqtls(eqtl_filename):
#open the eqtl file
eqtl_file = pd.read_csv(eqtl_filename, sep='\t')

#drop gene versions
eqtl_file['gene_id_clean'] = eqtl_file.gene_id.str.split('.').str[0]
# #drop gene versions
# eqtl_file['gene_id_clean'] = eqtl_file.gene_id.str.split('.').str[0]

if is_variant_id_format_valid(eqtl_file) == False:
# Clean up gene and variant IDs
gene_map = {gene_id: f'g{i}' for i, gene_id in enumerate(eqtl_file.gene_id.unique())}
eqtl_file['gene_id_clean'] = eqtl_file.gene_id.map(gene_map)

variant_map = {variant_id: f'v{i}' for i, variant_id in enumerate(eqtl_file.variant_id.unique())}
eqtl_file['variant_id_clean'] = eqtl_file.variant_id.map(variant_map)

# if is_variant_id_format_valid(eqtl_file) == False:

raise Exception('''Variant IDs must not begin with a digit,
reformat your vcf and EQTL matrix''')
# raise Exception('''Variant IDs must not begin with a digit,
# reformat your vcf and EQTL matrix''')


return eqtl_file
Expand All @@ -49,7 +39,7 @@ def read_expressions(expressions_filename, eqtl_file, args):

'''Used to read the relevant expressions into a pandas dataframe'''

chosen_genes = list(eqtl_file.gene_id_clean)
gene_map = eqtl_file.loc[:, ['gene_id', 'gene_id_clean']].set_index('gene_id')['gene_id_clean'].to_dict()
expr_df = []
expr_df_cols = []
#open the expression file, get the expressions for the genes we have eqtls for
Expand All @@ -75,9 +65,9 @@ def read_expressions(expressions_filename, eqtl_file, args):

#try splitting the gene by version number, in
#case our user forgot
thisgene = expr_data[0].split(".")[0]
thisgene = expr_data[0]

if thisgene in chosen_genes:
if thisgene in gene_map:
expr_df.append(expr_data)

expr_dataframe = pd.DataFrame(expr_df, columns=expr_df_cols)
Expand All @@ -96,7 +86,7 @@ def read_expressions(expressions_filename, eqtl_file, args):


#now do the gene version correction once and for all if needed
expr_dataframe['Name'] = expr_dataframe['Name'].str.split(".").str[0]
expr_dataframe['Name'] = expr_dataframe['Name'].map(gene_map)

return expr_dataframe

Expand Down Expand Up @@ -127,7 +117,7 @@ def read_haplotypes(vcfName, eqtl_file):

'''Reads in the variants we have eqtls and expressions(genes) for'''

chosen_variants = eqtl_file.variant_id.unique()
variant_map = eqtl_file.loc[:, ['variant_id', 'variant_id_clean']].set_index('variant_id')['variant_id_clean'].to_dict()

#match these to the vcf file
vcf_df = []
Expand All @@ -140,19 +130,21 @@ def read_haplotypes(vcfName, eqtl_file):
#write the header first
vcf_df_cols = get_vcf_header(vcfName)

for variant in chosen_variants:
for variant in variant_map:

#get coordinates
[chrom, pos] = variant.split("_")[:2]
chrom = eqtl_file.loc[eqtl_file['variant_id']==variant, ['variant_chr']].iloc[0,0]
pos = eqtl_file.loc[eqtl_file['variant_id']==variant, ['variant_pos']].iloc[0,0]

#query vcf using tabix
try:

records = tabix_haplotypes.fetch(chrom, int(pos) - 1, int(pos))

for record in records:

vcf_df.append(record.split('\t'))
record_info = record.split('\t')
if record_info[2] == variant:
vcf_df.append(record_info)


except:
Expand All @@ -162,10 +154,13 @@ def read_haplotypes(vcfName, eqtl_file):

vcf_dataframe = pd.DataFrame(vcf_df, columns=vcf_df_cols).drop_duplicates()

# Use clean variant ids
vcf_dataframe['ID'] = vcf_dataframe.ID.map(variant_map)

#do a quick check and raise an exception if no eqtls were found
if vcf_dataframe.empty:

raise Exception("No mathing EQTLs were found in VCF file - check variant ID formatting")
raise Exception("No matching EQTLs were found in VCF file - check variant ID formatting")


return vcf_dataframe
Expand All @@ -184,7 +179,7 @@ def drop_novcf_variants(eqtl_dataframe, vcf_dataframe):

'''Drop variants that we couldnt get from the vcf file from the eqtl matrix'''

corrected_eqtl_dataframe = eqtl_dataframe[eqtl_dataframe.variant_id.isin(vcf_dataframe.ID)]
corrected_eqtl_dataframe = eqtl_dataframe[eqtl_dataframe.variant_id_clean.isin(vcf_dataframe.ID)]

return corrected_eqtl_dataframe

Expand Down