diff --git a/README.md b/README.md index 8091bc7..b5669d9 100644 --- a/README.md +++ b/README.md @@ -42,11 +42,11 @@ MungeCredibleSet: Promoters: the absolute path to the file containing promoter annotations. E.g.,{path}/V2G/resources/RefSeqCurated.170308.bed.CollapsedGeneBounds.TSS500bp.bed" UbiquitouslyExpressedGenes: the absolute path to the bed file containing ubiquitously expressed genes. E.g., {path}/V2G/resources/UbiquitouslyExpressedGenes.txt -ABCPRED: the absolute path to the ABC results for variant overlapping. E.g.{path}/V2G/resources/size_sorted_CombinedPredictions.AvgHiC.ABC0.015.minus150.txt.gz" +ABCPRED: the absolute path to the ABC results for variant overlapping. Please include all relevant cell types and stimulation conditions. E.g.{path}/V2G/resources/size_sorted_CombinedPredictions.AvgHiC.ABC0.015.minus150.txt.gz" ALLPEAKS: the absolute path to the directory containing [cell type-specific peaks](https://mitra.stanford.edu/engreitz/public/SchnitzlerKang2023/EnhancerList.minus150). -CelltypeTable: the absolute path to the table specifying cell type name patterns for cell type groups. The table is for catagorizing the cell types in ABCPRD and ALLPEAKS. E.g. {path}/V2G/resources/grouped_celltype_table.txt" +CelltypeTable: the absolute path to the table specifying cell type name patterns for cell type groups. Please include names for all target cell types in this table. The table is for catagorizing the cell types in ABCPRD and ALLPEAKS. E.g. {path}/V2G/resources/grouped_celltype_table.txt" LipidBloodAssociationTable: the absolute path to the table containing variants associated with lipid level regulation. E.g., {path}/V2G/resources/lipid.level.csv" diff --git a/resources/ABC_overlap.header b/resources/ABC_overlap.header index 0288217..d4f3a31 100644 --- a/resources/ABC_overlap.header +++ b/resources/ABC_overlap.header @@ -1 +1 @@ -chr start end variant PosteriorProb Disease CredibleSet chr start end name class activity_base TargetGene TargetGeneTSS TargetGeneExpression TargetGenePromoterActivityQuantile TargetGeneIsExpressed distance isSelfPromoter powerlaw_contact powerlaw_contact_reference hic_contact hic_contact_pl_scaled hic_pseudocount hic_contact_pl_scaled_adj ABC.Score.Numerator ABC.Score powerlaw.Score.Numerator powerlaw.Score CellType +chr start end name class TargetGene TargetGeneEnsemblID TargetGeneTSS isSelfPromoter CellType distanceToTSS.Featuren ormalizedDNase_prom.Feature 3DContact.Feature ABC.Score.Feature numCandidateEnhGene.Feature numTSSEnhGene.Feature sumNearbyEnhancers.Feature ubiquitousExpressedGene.Feature Score diff --git a/resources/grouped_celltype_table.txt b/resources/grouped_celltype_table.txt index 30046a9..e5dc8a7 100644 --- a/resources/grouped_celltype_table.txt +++ b/resources/grouped_celltype_table.txt @@ -4,7 +4,7 @@ Telo|telo|HUVEC|EPC|EC|endocardium|endothelial|ahy926|Endothelial|Endocardial|en vcm_fetal|acm_fetal|Day30_Cardiomyocytes|Day15_cardiomyocytes|cardiac_muscle_cell-ENCODE CM smc|SMC|coronary_artery_smooth_muscle_cell-Miller2016 SMC fib Fibroblasts -hepatocyte|HepG2|liver Hepatocytes +hepatocyte|HepG2|liver|HuH-7|HuH-7.5|hepatic Hepatocytes Adipocyte|adipose Adipose ventricle|F6|F8|F19|coronary_artery-ENCODE HeartOrArteryTissue H1_BMP4_Derived_Mesendoderm_Cultured_Cells-Roadmap|hESC|telo|Telo|SMC|Day5_cardiacprogenitors|Day30_Cardiomyocytes|Day2_mesoderm|Day15_cardiomyocytes|Day0_ipsc|coronary_artery_smooth_muscle_cell-Miller2016 CellModels diff --git a/snakemake/workflow/Snakefile b/snakemake/workflow/Snakefile index ea25939..9e2fa00 100644 --- a/snakemake/workflow/Snakefile +++ b/snakemake/workflow/Snakefile @@ -44,9 +44,16 @@ def getZeroIndexed(wildcards): def getExcludeVariants(wildcards): return(str(variant_table.loc[wildcards.trait, "ExcludeVariants"])) + +## Function to extract whether PIP information is included +def hasPIP(wildcards): + # test if variant_table column PIP is NA in various form + return(str(variant_table.loc[wildcards.trait, "PIPCol"] != "NA" and variant_table.loc[wildcards.trait, "PIPCol"] != "None")) + + ########################################################## rule CreateCredibleSets: -""" Reformat variant list and create a summary table of each credible set, including information such as whether the variants in the credible sets overlap a promoter, are coding variants or are splicing variants. """ + """ Reformat variant list and create a summary table of each credible set, including information such as whether the variants in the credible sets overlap a promoter, are coding variants or are splicing variants. """ input: fine_mapped_variants=getVariants, params: @@ -91,7 +98,7 @@ rule CreateCredibleSets: """ rule create_locus_window: -""" Create a +/- 2 million base pairs window centerted at the lead variant for each credible set. """ + """ Create a +/- 2 million base pairs window centerted at the lead variant for each credible set. """ input: variant_list=os.path.join(config["OutDir"], "{trait}", "variant.list.txt"), cs_list=os.path.join(config["OutDir"], "{trait}", "all.cs.txt") @@ -114,7 +121,7 @@ rule create_locus_window: """ rule intersect_window_w_genes: -""" Intersect the locus windows with gene promoters. """ + """ Intersect the locus windows with gene promoters. """ input: locus_window_table=os.path.join(config["OutDir"], "{trait}","intermediate_files", "{trait}_gwas_table_base.bed"), params: @@ -135,7 +142,7 @@ rule intersect_window_w_genes: """ rule merge_locus_window_w_genes: -""" Merge the genes overlapping locus window with their corresponding credible set information. Filter genes at each credible set to keep at least two genes on each side of the lead variants (4 in total); and include genes > +/- 500 kb away from the lead variants if there are < 2 genes on each side of the variant within the +/- 500kb window until there are at least 2 genes on each side of the lead variant. """ + """ Merge the genes overlapping locus window with their corresponding credible set information. Filter genes at each credible set to keep at least two genes on each side of the lead variants (4 in total); and include genes > +/- 500 kb away from the lead variants if there are < 2 genes on each side of the variant within the +/- 500kb window until there are at least 2 genes on each side of the lead variant. """ input: locus_window_table=os.path.join(config["OutDir"], "{trait}","intermediate_files", "{trait}_gwas_table_base.txt"), gene_overlap_window=os.path.join(config["OutDir"], "{trait}","intermediate_files", "{trait}_gwas_table_base.sorted.gene_overlap.bed"), @@ -161,7 +168,7 @@ rule merge_locus_window_w_genes: """ rule intersectWithABC: -""" Intersect variants with ABC enhancers""" + """ Intersect variants with ABC enhancers""" input: variants=os.path.join(config["OutDir"], "{trait}", "variant.list.txt") output: @@ -181,12 +188,12 @@ rule intersectWithABC: sed 1d | \ bedtools sort -i stdin -g {params.hg19sizes} > {output.sorted_variants} - cat {params.header} > {output.ABC_overlap} + {{ printf 'chr\tstart\tend\tvariant\tPosteriorProb\tTrait\tCredibleSet\t'; zcat {params.abcPred} | head -1; }} > {output.ABC_overlap} zcat {params.abcPred} | sed 1d | bedtools sort -faidx {params.hg19sizes} -i stdin |bedtools intersect -sorted -wa -wb -a {output.sorted_variants} -b stdin -g {params.hg19sizes} >> {output.ABC_overlap} """ rule intersectWithPeaks: -""" Intersect variants with cell-type specific peaks.""" + """ Intersect variants with cell-type specific peaks.""" input: sorted_variants=os.path.join(config["OutDir"], "{trait}", "intermediate_files", "Variants.sorted.bed"), output: @@ -213,7 +220,7 @@ rule intersectWithPeaks: """ rule intersectWithCelltypes: -""" Create summary tables of ABC enhancer-overlapping variants.""" + """ Create summary tables of ABC enhancer-overlapping variants.""" input: ABCOverlap=os.path.join(config["OutDir"], "{trait}", "ABCOverlapFull.tsv"), PeakOverlapFull=os.path.join(config["OutDir"], "{trait}", "PeaksOverlapFull.tsv"), @@ -249,7 +256,7 @@ rule intersectWithCelltypes: """ rule groupABCByCelltypes: -""" Summarize ABC enhancer-overlapping variants by cell-type group.""" + """ Summarize ABC enhancer-overlapping variants by cell-type group.""" input: ABC_overlap_ranked=os.path.join(config["OutDir"], "{trait}", "ABCOverlapFull.ranked.tsv") output: @@ -260,7 +267,8 @@ rule groupABCByCelltypes: grouped_celltype_table=config["CelltypeTable"], source=getSource, codeDir=config["CodeDir"], - PIP=config["groupABCByCelltypes"]["PIP"] + PIP=config["groupABCByCelltypes"]["PIP"], + hasPIP=hasPIP, resources: mem_gb=16, runtime_hr=3 @@ -273,11 +281,12 @@ rule groupABCByCelltypes: --ranked_ABC_table {input.ABC_overlap_ranked} \ --source {params.source} \ --PIP {params.PIP} \ + --hasPIP {params.hasPIP} \ --helperFunctions {params.helper_functions} """ rule groupPeakOverlap: -""" Summarize peak-overlapping variants by cell-type group.""" + """ Summarize peak-overlapping variants by cell-type group.""" input: CredibleSet_peak_overlap_summary=os.path.join(config["OutDir"], "{trait}", "CredibleSetPeakOverlapSummary.tsv"), PeaksOverlapFull_Peaks=os.path.join(config["OutDir"],"{trait}", "PeaksOverlapFull.Peaks.tsv"), @@ -290,7 +299,8 @@ rule groupPeakOverlap: grouped_celltype_tables=config["CelltypeTable"], source=getSource, codeDir=config["CodeDir"], - PIP=config["groupPeakOverlap"]["PIP"] + PIP=config["groupPeakOverlap"]["PIP"], + hasPIP=hasPIP, resources: mem_gb=16, runtime_hr=3 @@ -304,10 +314,11 @@ rule groupPeakOverlap: --cellGroup {wildcards.cellGroup} \ --source {params.source} \ --PIP {params.PIP} \ + --hasPIP {params.hasPIP} \ --helperFunctions {params.helper_functions} """ rule createABCtable: -"""Create an information table summarizing the locations of the intermediate files for rule combinedOutputTables. """ + """Create an information table summarizing the locations of the intermediate files for rule combinedOutputTables. """ input: credibleset_peakinfo=expand(os.path.join(config["OutDir"],"{{trait}}", "CredibleSetPeakOverlapSummary.with{cellGroup}peakinfo.tsv"),cellGroup=CellGroupsOfInterest), credibleset_peakinfo_cellgroup_only=expand(os.path.join(config["OutDir"],"{{trait}}", "CredibleSetPeakOverlapSummary.withpeakinfo.only{cellGroup}.tsv"),cellGroup=CellGroupsOfInterest), @@ -328,7 +339,7 @@ rule createABCtable: rule combineOutputTables: -""" Combine intermediate tables to create the final credible set to gene table.""" + """ Combine intermediate tables to create the final credible set to gene table.""" input: base_df=os.path.join(config["OutDir"], "{trait}","intermediate_files", "{trait}_gwas_table_base_genes.txt"), ABC_overlapping_df=os.path.join(config["OutDir"],"{trait}", "peak_ABC_overlapping_table.tsv"), @@ -358,7 +369,7 @@ rule combineOutputTables: """ rule getVariantInfo: -""" combineOutputTables creates a table with credible set level information. getVariantInfo details the variants that contribute to the credible set-gene linking.""" + """ combineOutputTables creates a table with credible set level information. getVariantInfo details the variants that contribute to the credible set-gene linking.""" input: combined_table=os.path.join(config["OutDir"],"{trait}", "{trait}_cell2gene.txt"), variant_list=os.path.join(config["OutDir"], "{trait}", "variant.list.txt"), diff --git a/snakemake/workflow/scripts/CreateCS.R b/snakemake/workflow/scripts/CreateCS.R index 7675b1d..5d7a53f 100644 --- a/snakemake/workflow/scripts/CreateCS.R +++ b/snakemake/workflow/scripts/CreateCS.R @@ -20,9 +20,11 @@ option.list <- list( make_option("--excludeVariants", type="character", help="Variant to exclude") ) opt <- parse_args(OptionParser(option_list=option.list)) + + source(opt$helperFunctions) ############################################################################## -variants=read.table(opt$fineMappedVariants, header=T, stringsAsFactors=F) +variants=read.table(opt$fineMappedVariants, header=T, stringsAsFactors=F, fill=T) ############################################################################## ## Load common data genes <- readBed(opt$genes) diff --git a/snakemake/workflow/scripts/celltype.overlap.R b/snakemake/workflow/scripts/celltype.overlap.R index 1d5f078..c602fc7 100755 --- a/snakemake/workflow/scripts/celltype.overlap.R +++ b/snakemake/workflow/scripts/celltype.overlap.R @@ -15,9 +15,10 @@ option.list <- list( ) opt <- parse_args(OptionParser(option_list=option.list)) + source(opt$helperFunctions) setwd(opt$outDir) -abc <- read.delim(opt$ABCOverlap) +abc <- read.delim(opt$ABCOverlap) if (opt$removeNonCoding){ abc <- abc %>% filter(!greplany(c("^LINC","-AS","^MIR","RNU","^LOC"),TargetGene)) }else{ @@ -26,7 +27,8 @@ if (opt$removeNonCoding){ variants <- read.delim(opt$variantList) all.cs <- read.delim(opt$csList) -geneRanks <- abc %>% group_by(CredibleSet,TargetGene) %>% summarise(MaxABC=max(ABC.Score)) %>% as.data.frame() +ABC.Score.column <- if("ABC.Score" %in% colnames(abc)) "ABC.Score" else "Score" +geneRanks <- abc %>% group_by(CredibleSet,TargetGene) %>% summarise(MaxABC=max(get(ABC.Score.column))) %>% as.data.frame() geneRanks <- geneRanks %>% group_by(CredibleSet) %>% mutate(GeneRank=dense_rank(-MaxABC)) %>% as.data.frame() abc.ranked <- merge(abc, geneRanks) write.table(abc.ranked, file="ABCOverlapFull.ranked.tsv",row.names=F, col.names=T, sep='\t', quote=F) @@ -44,7 +46,7 @@ tmp <- abc.ranked %>% group_by(variant,CredibleSet) %>% summarise( AllCellTypes=paste0(CellType, collapse=','), TargetGenes=paste0(unique(TargetGene), collapse=','), - TopGene=unique(TargetGene[which.max(ABC.Score)])) %>% + TopGene=unique(TargetGene[which.max(get(ABC.Score.column))])) %>% merge(all.cs) %>% arrange(CredibleSet) %>% unique() %>% as.data.frame() write.table(tmp, file="ABCVariantOverlapSummary.tsv",row.names=F, col.names=T, sep='\t', quote=F) diff --git a/snakemake/workflow/scripts/combine_tables.R b/snakemake/workflow/scripts/combine_tables.R index 0664666..f77ece3 100755 --- a/snakemake/workflow/scripts/combine_tables.R +++ b/snakemake/workflow/scripts/combine_tables.R @@ -58,9 +58,10 @@ for (row in 1:nrow(lipid.blood.association.core )){ if (length(traits[which(traits %in% blood.keys)])> 0){ blood.snps <- c(blood.snps, snp) } - } + CS.base.df <- CS.base.df %>% mutate(LipidLevelsAssociated=ifelse(LeadVariant %in% lipid.snps, "TRUE", "FALSE")) %>% mutate(BloodPressureAssociated=ifelse(LeadVariant %in% blood.snps, "TRUE", "FALSE")) + ############################################################# counter=0 for (cellgroup in cellgroups){ @@ -68,6 +69,7 @@ for (cellgroup in cellgroups){ #Peaks peaks_df <- read.table(peak_ABC_overlapping_table[cellgroup,"Peaks"],header=TRUE, sep = '\t', stringsAsFactors=FALSE) only_intersects_with_cellgroup_peaks_df <- read.table(peak_ABC_overlapping_table[cellgroup,"OnlyPeaks"],header=TRUE, sep = '\t', stringsAsFactors=FALSE) + #Remove duplications caused by multiple variants in a credible sets. if (("CellType" %in% colnames(peaks_df)) && (paste0("OnlyIntersectsWith", cellgroup, "Peak") %in% colnames(only_intersects_with_cellgroup_peaks_df))){ peaks.core <- peaks_df[c(cellgroup,"LeadVariant","PeakChr", "PeakStart","PeakEnd", "CellType")] %>% distinct() %>% rename(!!paste0("CellTypeOftheClosest",cellgroup,"Peak"):=CellType) %>% rename(!!paste0("IntersectsWith",cellgroup,"Peak") := !!cellgroup) @@ -76,14 +78,15 @@ for (cellgroup in cellgroups){ CS.base.df.peaks <- CS.base.df.peaks %>% rowwise() %>% mutate(!!sym(paste0("DistanceToNearest",cellgroup, "PeakWithVariant")):=!!sym(paste0(cellgroup, "_PeakCenter"))-as.numeric(TSS_position)) %>% mutate(!!sym(paste0("absDistanceToNearest",cellgroup,"PeakWithVariant")):=abs(!!sym(paste0("DistanceToNearest", cellgroup, "PeakWithVariant")))) # Pick the gene with the shortest distance to nearest gene with cellgroup peak # get the gene with the minimum distance - CS.base.df.peaks <- CS.base.df.peaks %>% group_by(Source,LeadVariant, gene) %>% filter(!!sym(paste0("absDistanceToNearest",cellgroup,"PeakWithVariant")) == min(!!sym(paste0("absDistanceToNearest",cellgroup,"PeakWithVariant")), na.rm=TRUE) |all(is.na(!!sym(paste0("absDistanceToNearest",cellgroup,"PeakWithVariant"))))) %>% ungroup() + CS.base.df.peaks <- CS.base.df.peaks %>% group_by(Source,LeadVariant, gene) %>% filter(!!sym(paste0("absDistanceToNearest",cellgroup,"PeakWithVariant")) == min(!!sym(paste0("absDistanceToNearest",cellgroup,"PeakWithVariant")), na.rm=TRUE) |all(is.na(!!sym(paste0("absDistanceToNearest",cellgroup,"PeakWithVariant"))))) %>% ungroup() ## absDistanceToNearestHepatocytesPeakWithVariant column with NA entries are removed CS.base.df.peaks <- CS.base.df.peaks %>% group_by(Source,LeadVariant) %>% mutate(!!sym(paste0("RankOfDistanceTo", cellgroup, "PeakWithVariant")):=dense_rank(!!sym(paste0("absDistanceToNearest",cellgroup,"PeakWithVariant")))) %>% ungroup() only_intersects_with_cellgroup_peaks.core <- only_intersects_with_cellgroup_peaks_df %>% select(c(LeadVariant,!!sym(paste0("OnlyIntersectsWith", cellgroup,"Peak")))) %>% filter(!is.na(!!sym(paste0("OnlyIntersectsWith", cellgroup,"Peak")))) CS.base.df <- left_join(CS.base.df.peaks,only_intersects_with_cellgroup_peaks.core, by=c("LeadVariant"="LeadVariant")) %>% relocate(!!sym(paste0(cellgroup, "_PeakCenter")), .after=!!sym(paste0("CellTypeOftheClosest",cellgroup,"Peak"))) %>% relocate(!!sym(paste0("OnlyIntersectsWith", cellgroup,"Peak")),.after=!!sym(paste0("IntersectsWith",cellgroup,"Peak"))) } - #ABC + + ##ABC ABC_df <- read.table(peak_ABC_overlapping_table[cellgroup,"ABC"],header=TRUE, sep = '\t', stringsAsFactors=FALSE) if (nrow(ABC_df)!=0){ ABC.df.core <- ABC_df %>% select(c(TargetGene, MaxABC, CellTypesInCredibleSet,LeadVariant, !!sym(paste0("MaxABC.",cellgroup,".only")), !!sym(paste0(cellgroup,"InCredibleSet")))) %>% rename(CellTypesWithEPInteractions=CellTypesInCredibleSet) %>% rename(!!paste0(cellgroup,"CellTypesWithEPInteractions"):= !!paste0(cellgroup,"InCredibleSet")) @@ -94,7 +97,9 @@ for (cellgroup in cellgroups){ } CS.base.df <- left_join(CS.base.df, ABC.df.core, by=c("gene" = "TargetGene", "LeadVariant" = "LeadVariant")) CS.base.df <- CS.base.df %>% select(-c(PeakChr,PeakStart,PeakEnd)) - CS.base.df <- CS.base.df %>% mutate(!!sym(paste0(cellgroup, "CellTypesWithEPInteractions")):=ifelse(!!sym(paste0(cellgroup,"CellTypesWithEPInteractions"))=="",NA,!!sym(paste0(cellgroup,"CellTypesWithEPInteractions")))) %>% mutate(!!sym(paste0("MaxABC.",cellgroup,".only")):=ifelse(!!sym(paste0("MaxABC.",cellgroup,".only"))==0,NA,!!sym(paste0("MaxABC.",cellgroup,".only")))) + CS.base.df <- CS.base.df %>% + mutate(!!sym(paste0(cellgroup, "CellTypesWithEPInteractions")):=ifelse(!!sym(paste0(cellgroup,"CellTypesWithEPInteractions"))=="",NA,!!sym(paste0(cellgroup,"CellTypesWithEPInteractions")))) %>% + mutate(!!sym(paste0("MaxABC.",cellgroup,".only")):=ifelse(!!sym(paste0("MaxABC.",cellgroup,".only"))==0,NA,!!sym(paste0("MaxABC.",cellgroup,".only")))) } } @@ -109,14 +114,18 @@ if ("MaxABC" %in% colnames(final.df)){ #Remove duplicates in CellTypes with EC Interactions final.df <- final.df %>% distinct() %>% group_by(LeadVariant,gene,Source) %>% mutate(CellTypesWithEPInteractions = as.character(paste0(unique(CellTypesWithEPInteractions),collapse="|"))) %>% mutate(CellTypesWithEPInteractions = as.character(paste0(unique(unlist(strsplit(CellTypesWithEPInteractions, split="|",fixed = TRUE))),collapse = "|"))) %>% ungroup() } -#For each cell group: + +##For each cell group: for (cellgroup in cellgroups){ - if ((paste0("MaxABC.",cellgroup,".only") %in% colnames(final.df)) && (paste0(cellgroup,"CellTypesWithEPInteractions") %in% colnames(final.df)) && (paste0("MaxABC.",cellgroup,".only") %in% colnames(final.df))){ + if ((paste0("MaxABC.",cellgroup,".only") %in% colnames(final.df)) && (paste0(cellgroup,"CellTypesWithEPInteractions") %in% colnames(final.df))){ #sometimes a gene maybe associated with multiple ABC scores because the same credible sets from different sources contain different number of variants. the following steps are to collapse these entries into one entry. #this way it keeps the largest ABC score; The scores will be reranked at the end. final.df <- final.df %>% distinct() %>% group_by(LeadVariant,gene,Source) %>% mutate(!!sym(paste0("MaxABC.",cellgroup,".only")):=ifelse(all(is.na(!!sym(paste0("MaxABC.",cellgroup,".only")))),NA,max(!!sym(paste0("MaxABC.",cellgroup,".only")), na.rm=TRUE))) %>% ungroup() #Again to collapse the cells with EP interactions. - final.df <- final.df %>% distinct() %>% group_by(LeadVariant,gene,Source) %>% mutate(!!sym(paste0(cellgroup,"CellTypesWithEPInteractions")):= as.character(paste0(unique(!!sym(paste0(cellgroup,"CellTypesWithEPInteractions"))),collapse="|"))) %>% mutate(!!sym(paste0(cellgroup,"CellTypesWithEPInteractions")) := as.character(paste0(unique(unlist(strsplit(!!sym(paste0(cellgroup,"CellTypesWithEPInteractions")), split="|",fixed = TRUE))),collapse = "|"))) %>% ungroup() + final.df <- final.df %>% distinct() %>% group_by(LeadVariant,gene,Source) %>% + mutate(!!sym(paste0(cellgroup,"CellTypesWithEPInteractions")):= as.character(paste0(unique(!!sym(paste0(cellgroup,"CellTypesWithEPInteractions"))),collapse="|"))) %>% + mutate(!!sym(paste0(cellgroup,"CellTypesWithEPInteractions")) := as.character(paste0(unique(unlist(strsplit(!!sym(paste0(cellgroup,"CellTypesWithEPInteractions")), split="|",fixed = TRUE))),collapse = "|"))) %>% + ungroup() #For each credible set, re-rank per gene max abc scores. final.df <- final.df %>% distinct() %>% group_by(LeadVariant,Source) %>% mutate(!!sym(paste0("MaxABC.Rank.",cellgroup,"Only")):=dense_rank(-!!sym(paste0("MaxABC.",cellgroup,".only")))) %>% ungroup() %>% relocate(!!sym(paste0("MaxABC.Rank.",cellgroup,"Only")), .after=!!sym(paste0("MaxABC.",cellgroup,".only"))) } diff --git a/snakemake/workflow/scripts/create_locus_window.R b/snakemake/workflow/scripts/create_locus_window.R index b73f2f8..0d407eb 100644 --- a/snakemake/workflow/scripts/create_locus_window.R +++ b/snakemake/workflow/scripts/create_locus_window.R @@ -10,6 +10,8 @@ option.list <- list( make_option("--trait", type="character") ) opt <- parse_args(OptionParser(option_list=option.list)) + + ############################################ all.cs.df <- read.table(opt$cs, sep = '\t', header=TRUE,stringsAsFactors=FALSE) variants.df <- read.table(opt$variants, sep = '\t', header=TRUE,stringsAsFactors=FALSE) %>% select(CredibleSet, LeadVariant, LeadVariantPos) %>% distinct() diff --git a/snakemake/workflow/scripts/get_credible_set_variants_info.R b/snakemake/workflow/scripts/get_credible_set_variants_info.R index 44aefe5..c5c89f5 100644 --- a/snakemake/workflow/scripts/get_credible_set_variants_info.R +++ b/snakemake/workflow/scripts/get_credible_set_variants_info.R @@ -15,11 +15,16 @@ option.list <- list( opt <- parse_args(OptionParser(option_list=option.list)) + cs2gene_df <- read.table(opt$CS2Gene, sep="\t", header=TRUE,stringsAsFactors=FALSE) -variant_table <- read.table(opt$variant, sep="\t", header=TRUE, stringsAsFactors=FALSE) +variant_table <- read.table(opt$variantList, sep="\t", header=TRUE, stringsAsFactors=FALSE) cellgroups=as.list(strsplit(opt$Cellgroups, ",")[[1]]) peak_overlap_df <- read.table(opt$peakOverlap, sep="\t", header=TRUE, stringsAsFactors=FALSE) %>% select(variant,CredibleSet,CellType) -ABC_overlap_df <- read.table(opt$ABCOverlap, sep="\t", header=TRUE, stringsAsFactors=FALSE) %>% select(variant,CredibleSet,CellType, ABC.Score, TargetGene) +ABC_overlap_df <- read.table(opt$ABCOverlap, sep="\t", header=TRUE, stringsAsFactors=FALSE) +ABC.Score.column <- if("ABC.Score" %in% colnames(ABC_overlap_df)) "ABC.Score" else "Score" +ABC_overlap_df <- ABC_overlap_df%>% select(variant,CredibleSet,CellType, all_of(ABC.Score.column), TargetGene) +colnames(ABC_overlap_df) <- c("variant", "CredibleSet", "CellType", "ABC.Score", "TargetGene") + ############################################################# #filter variants for (coding|splice) coding_df <- variant_table %>% filter(Coding) %>% select(variant,CredibleSet,Coding, CodingVariantGene) %>% mutate(codingVariant=ifelse(Coding, TRUE, FALSE)) %>% mutate_if(is.factor, as.character) @@ -37,14 +42,13 @@ for (cellgroup in cellgroups){ ############################################################# #Variant in ABC -df_ABC_interactions <- cs2gene_df %>% select(CredibleSet,gene, CellTypesWithEPInteractions) %>% separate_rows(CellTypesWithEPInteractions, sep="\\|", convert = TRUE) %>% as.data.frame() +df_ABC_interactions <- cs2gene_df %>% select(CredibleSet,gene, CellTypesWithEPInteractions) %>% separate_rows(CellTypesWithEPInteractions, sep="\\|", convert = TRUE) %>% as.data.frame() df_ABC_interactions <- merge(df_ABC_interactions, ABC_overlap_df, by.x=c("CredibleSet","CellTypesWithEPInteractions","gene"), by.y=c("CredibleSet", "CellType", "TargetGene")) %>% group_by(CredibleSet,gene) %>% mutate(ABC.rank = dense_rank(-ABC.Score)) %>% ungroup() %>% filter(ABC.rank==1) %>% select(-ABC.rank) %>% as.data.frame() %>% mutate(MaxABCVariant=TRUE)%>% group_by(CredibleSet) %>% mutate(MaxABC.Rank=dense_rank(-ABC.Score)) %>% rename(MaxABC.Score=ABC.Score) %>% ungroup() %>% as.data.frame() %>% mutate_if(is.factor, as.character) - variant_ABC_df <- cs2gene_df %>% select(CredibleSet,gene) for (cellgroup in cellgroups){ celltype_col=paste0(cellgroup, "CellTypesWithEPInteractions") - celltype_specific_df_ABC_interactions <- cs2gene_df %>% select(CredibleSet, gene, !!sym(celltype_col)) %>% separate_rows(., !!sym(celltype_col), sep="\\|", convert = TRUE) + celltype_specific_df_ABC_interactions <- cs2gene_df %>% select(CredibleSet, gene, !!sym(celltype_col)) %>% separate_rows(., !!sym(celltype_col), sep="\\|", convert = TRUE) celltype_specific_df_ABC_interactions <- merge(celltype_specific_df_ABC_interactions,ABC_overlap_df,by.x=c("CredibleSet",celltype_col, "gene"), by.y=c("CredibleSet", "CellType", "TargetGene")) %>% group_by(CredibleSet, gene) %>% mutate(ABC.rank = dense_rank(-ABC.Score)) %>% ungroup() %>% filter(ABC.rank==1) %>% rename(!!paste0(cellgroup, ".ABC.Score"):=ABC.Score) %>% select(-ABC.rank) %>% as.data.frame()%>% mutate(!!sym(paste0("MacABC",cellgroup,"Variant")):=TRUE) variant_ABC_df <- variant_ABC_df %>% left_join(celltype_specific_df_ABC_interactions) %>% group_by(CredibleSet) %>% mutate(!!sym(paste0("MaxABC.Rank.", cellgroup, "Only")):=dense_rank(-!!sym(paste0(cellgroup, ".ABC.Score")))) %>% rename(!!paste0("MaxABC.Score.", cellgroup):=!!paste0(cellgroup, ".ABC.Score")) %>% ungroup() %>% as.data.frame() %>% mutate_if(is.factor, as.character) } diff --git a/snakemake/workflow/scripts/group.ABC.result.R b/snakemake/workflow/scripts/group.ABC.result.R index 5758162..0a341d1 100755 --- a/snakemake/workflow/scripts/group.ABC.result.R +++ b/snakemake/workflow/scripts/group.ABC.result.R @@ -8,18 +8,22 @@ option.list <- list( make_option("--cellGroup", type="character", help="CellCategory_to_group"), make_option("--ranked_ABC_table", type="character", help="Path to ranked ABC table"), make_option("--source", type="character", help="source if it's different from the one in credible set"), - make_option("--PIP", type="numeric", help="The PIP threshold to filter the variants"), + make_option("--PIP", type="numeric", help="The PIP threshold to filter the variants"), + make_option("--hasPIP", type="logical"), make_option("--helperFunctions", type="numeric") ) opt <- parse_args(OptionParser(option_list=option.list)) + + setwd(opt$outDir) source(opt$helperFunctions) cellTags_df <- read.table(opt$grouped_celltype_table, header=TRUE, stringsAsFactors=F) ABC.results <- read.table(opt$ranked_ABC_table, header=TRUE,stringsAsFactors=FALSE) -if (!any(is.na(ABC.results$PosteriorProb))) { +if (opt$hasPIP && !any(is.na(ABC.results$PosteriorProb))) { ABC.results <- ABC.results %>% filter(PosteriorProb >= opt$PIP) } +ABC.Score.column <- if("ABC.Score" %in% colnames(ABC.results)) "ABC.Score" else "Score" ABC.grouped <- ABC.results %>% group_by(CredibleSet,TargetGene) %>% mutate(CellTypesInCredibleSet = as.character(paste0(sort(unique(CellType)), collapse = "|"))) %>% ungroup() @@ -33,13 +37,17 @@ cell_group_max_ABC <- paste0("MaxABC.",opt$cellGroup, ".only") celltypes_in_cell_group <- paste0(opt$cellGroup, "InCredibleSet") # group by credible set, target gene, cell type group; calculate the largest ABC score and concatenate all the cell types -ABC.grouped.cellgroup <- ABC.grouped.cellgroup %>% group_by(CredibleSet,TargetGene,!!sym(opt$cellGroup)) %>% mutate(!!sym(cell_group_max_ABC):=max(ABC.Score)) %>% mutate(!!sym(celltypes_in_cell_group):=as.character(paste0(sort(unique(CellType)), collapse="|"))) %>% ungroup() +ABC.grouped.cellgroup <- ABC.grouped.cellgroup %>% group_by(CredibleSet,TargetGene,!!sym(opt$cellGroup)) %>% mutate(!!sym(cell_group_max_ABC):=max(get(ABC.Score.column))) %>% mutate(!!sym(celltypes_in_cell_group):=as.character(paste0(sort(unique(CellType)), collapse="|"))) %>% ungroup() # group by credible set and target gene,calculate the max cell type-specific ABC. -ABC.grouped.cellgroup <- ABC.grouped.cellgroup %>% group_by(CredibleSet, TargetGene) %>% mutate(!!sym(cell_group_max_ABC):=ifelse(!!sym(opt$cellGroup)==0, 0, !!sym(cell_group_max_ABC))) %>% mutate(!!sym(cell_group_max_ABC):=max(!!sym(cell_group_max_ABC))) %>% ungroup() +ABC.grouped.cellgroup <- ABC.grouped.cellgroup %>% group_by(CredibleSet, TargetGene) %>% mutate(!!sym(cell_group_max_ABC):=ifelse(!!sym(opt$cellGroup)==0, NA, !!sym(cell_group_max_ABC))) %>% mutate(!!sym(cell_group_max_ABC):=max(!!sym(cell_group_max_ABC))) %>% ungroup() #Cell group specific cell types in the cell -ABC.grouped.cellgroup <- ABC.grouped.cellgroup %>% group_by(CredibleSet, TargetGene) %>% mutate(!!sym(celltypes_in_cell_group):=ifelse(!!sym(opt$cellGroup) ==0, NA, !!sym(celltypes_in_cell_group))) %>% mutate(!!sym(celltypes_in_cell_group):=max(!!sym(celltypes_in_cell_group),na.rm = TRUE))%>% ungroup() +ABC.grouped.cellgroup <- ABC.grouped.cellgroup %>% + group_by(CredibleSet, TargetGene) %>% + mutate(!!sym(celltypes_in_cell_group):=ifelse(!!sym(opt$cellGroup) ==0, 0, !!sym(celltypes_in_cell_group))) %>% + mutate(!!sym(celltypes_in_cell_group):=max(!!sym(celltypes_in_cell_group),na.rm = TRUE)) %>% + ungroup() cell_group_max_ABC_rank <- paste0("MaxABC.Rank.",opt$cellGroup, ".only") ABC.grouped.cellgroup <- ABC.grouped.cellgroup %>% group_by(CredibleSet) %>% mutate(!!sym(cell_group_max_ABC_rank):=dense_rank(-!!sym(cell_group_max_ABC))) %>% ungroup() diff --git a/snakemake/workflow/scripts/group.peakoverlap.result.R b/snakemake/workflow/scripts/group.peakoverlap.result.R index 940b909..6f535cc 100755 --- a/snakemake/workflow/scripts/group.peakoverlap.result.R +++ b/snakemake/workflow/scripts/group.peakoverlap.result.R @@ -11,16 +11,23 @@ option.list <- list( make_option("--cellGroup", type="character"), make_option("--source", type="character"), make_option("--PIP", type="numeric"), + make_option("--hasPIP", type="logical"), make_option("--helperFunctions", type="character") ) opt <- parse_args(OptionParser(option_list=option.list)) + + +print("group.peakoverlap.result.R optparse list:") +print(opt) + setwd(opt$outDir) source(opt$helperFunctions) -cs_peak_overlap_summary <- read.table(opt$cs_peak_overlap_summary, header=TRUE, stringsAsFactors=FALSE) +cs_peak_overlap_summary <- read.table(opt$cs_peak_overlap_summary, header=TRUE, stringsAsFactors=FALSE) ## original code by Rosa Ma +## cs_peak_overlap_summary <- read.table(opt$cs_peak_overlap_summary, header=TRUE, stringsAsFactors=FALSE, fill=T) cellTags_df <- read.table(opt$grouped_celltype_table, header=TRUE, stringsAsFactors=F) peak_info <- read.table(opt$PeaksOverlapFull_Peaks, header=TRUE, stringsAsFactors=FALSE) -if (!any(is.na(peak_info$PIP))) { +if (opt$hasPIP && !any(is.na(peak_info$PIP))) { ## if PIP information is included and not NA peak_info <- peak_info %>% filter(PosteriorProb>=opt$PIP) } @@ -40,8 +47,9 @@ if (opt$cellGroup %in% colnames(cs_peak_overlap_summary)){ peak_info_EC_select <- peak_info %>% select(c("PeakChr","PeakStart","PeakEnd", "CredibleSet", "CellType","PosteriorProb")) %>% filter(greplany(celltype_tags, CellType)) combined.credibleset.peak <- left_join(cs_peak_overlap_summary, peak_info_EC_select, by=c("CredibleSet"="CredibleSet")) %>% distinct(.keep_all=TRUE) - - write.table(combined.credibleset.peak, quote=FALSE, sep="\t", paste0("CredibleSetPeakOverlapSummary.with", opt$cellGroup, "peakinfo.tsv")) + print("group.peakoverlap.result.R, combined.credibleset.peak CellType column table summary") + print(combined.credibleset.peak %>% pull(CellType) %>% table) + write.table(combined.credibleset.peak, file=paste0("CredibleSetPeakOverlapSummary.with", opt$cellGroup, "peakinfo.tsv"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) peak_info <- peak_info %>% select(c("PeakChr","PeakStart","PeakEnd", "CredibleSet", "CellType","PosteriorProb"))%>% mutate(isNotCelltype=ifelse(greplany(celltype_tags, CellType), 0, 1)) celltype_specific_colnames <- paste0("OnlyIntersectsWith", opt$cellGroup, "Peak") @@ -49,12 +57,14 @@ if (opt$cellGroup %in% colnames(cs_peak_overlap_summary)){ peak_info <- peak_info %>% group_by(CredibleSet) %>% mutate(ContainsOtherCellTypes=ifelse(sum(isNotCelltype), TRUE, FALSE)) %>% ungroup() %>% filter(!ContainsOtherCellTypes) %>% as.data.frame()%>% select(CredibleSet, ContainsOtherCellTypes) %>% dplyr::rename(!!sym(celltype_specific_colnames):=ContainsOtherCellTypes) %>% mutate(!!sym(celltype_specific_colnames):="TRUE") combined.celltype.credibleset.peak <- left_join(cs_peak_overlap_summary, peak_info, by=c("CredibleSet"="CredibleSet")) %>% distinct(.keep_all=TRUE) + print("group.peakoverlap.result.R, combined.celltype.credibleset.peak dataframe OnlyIntersectsWithHepatocytesPeak column table summary:") + print(combined.celltype.credibleset.peak %>% pull(OnlyIntersectsWithHepatocytesPeak) %>% table) - write.table(combined.celltype.credibleset.peak, quote=FALSE, sep="\t", paste0("CredibleSetPeakOverlapSummary.withpeakinfo.only",opt$cellGroup,".tsv")) + write.table(combined.celltype.credibleset.peak, file=paste0("CredibleSetPeakOverlapSummary.withpeakinfo.only",opt$cellGroup,".tsv"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) } else { combined.credibleset.peak <- data.frame(matrix(ncol=1, nrow=0)) - write.table(combined.credibleset.peak, quote=FALSE, sep="\t", paste0("CredibleSetPeakOverlapSummary.with", opt$cellGroup, "peakinfo.tsv")) + write.table(combined.credibleset.peak, file=paste0("CredibleSetPeakOverlapSummary.with", opt$cellGroup, "peakinfo.tsv"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) combined.celltype.credibleset.peak <- data.frame(matrix(ncol=1, nrow=0)) - write.table(combined.celltype.credibleset.peak, quote=FALSE, sep="\t", paste0("CredibleSetPeakOverlapSummary.withpeakinfo.only",opt$cellGroup,".tsv")) + write.table(combined.celltype.credibleset.peak, file=paste0("CredibleSetPeakOverlapSummary.withpeakinfo.only",opt$cellGroup,".tsv"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) } diff --git a/snakemake/workflow/scripts/helper_functions.R b/snakemake/workflow/scripts/helper_functions.R index 50a423c..9ad9e56 100644 --- a/snakemake/workflow/scripts/helper_functions.R +++ b/snakemake/workflow/scripts/helper_functions.R @@ -110,7 +110,7 @@ annotateVariants <- function(df, variant.gr, promoters.gr, include.names=F) { if (is.na(df$PromoterVariantGene[queryHits(ixn)[i]])) df$PromoterVariantGene[queryHits(ixn)[i]] <- gene else - df$PromoterVariantGene[queryHits(ixn)[i]] <- paste0(unique(c(gene, strsplit(df$PromoterVariantGene[queryHits(ixn)[i]],";"))), collapse=';') + df$PromoterVariantGene[queryHits(ixn)[i]] <- paste0(unique(c(gene, strsplit(df$PromoterVariantGene[queryHits(ixn)[i]],";") %>% unlist())), collapse=';') } } }