From 24bb992b42966d2b919c59726fe34a53d7fab4c7 Mon Sep 17 00:00:00 2001
From: j-lawson
Date: Wed, 13 Jan 2021 21:41:25 -0500
Subject: [PATCH 01/38] Starting code for COCOA scoring via matrix methods

---
 R/COCOA.R | 172 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 89 insertions(+), 83 deletions(-)

diff --git a/R/COCOA.R b/R/COCOA.R
index 363eb30..efedccb 100755
--- a/R/COCOA.R
+++ b/R/COCOA.R
@@ -108,6 +108,7 @@ if (getRversion() >= "2.15.1") {
 #' is "proportionWeightedMean". This vector should contain the proportion of
 #' each regionSet region that is overlapped by a signalCoord region. The
 #' order of pOlap should be the same as the overlaps in rsOL.
+#' @param rsOLMat Matrix. A region set overlap matrix as built by olToMat():
+#' one row per signalCoord region, one column per region set, with a 1
+#' where the region set overlaps that signalCoord region. Only used when
+#' signalCoordType is "multiBase" and scoringMetric is "simpleMean".
 #' @template returnCovInfo
 #' @template checkInput
 
@@ -150,6 +151,7 @@ aggregateSignal <- function(signal,
                             verbose = FALSE,
                             absVal=TRUE,
                             rsOL=NULL, pOlap=NULL,
+                            rsOLMat=NULL,
                             returnCovInfo=TRUE,
                             .checkInput=TRUE) {
 
@@ -249,10 +251,13 @@
         }
     }
+    
     ################### finished checking inputs #########################
 
     numOfRegions <- length(regionSet)
     totalCpGs <- nrow(signal)
 
+    
+    #### UPDATE: only do this once, in the outermost function possible #####
     # extreme positive or negative values both give important information
     # take absolute value or not
     if (absVal) {
@@ -278,7 +283,8 @@
     } else {
         loadingDT <- signal[, signalCol, drop=FALSE]
     }
-    
+    ######## UPDATE: can the above code be done only once, like the input checking?
+    ########
 
 
    ###########################################################################
@@ -287,18 +293,44 @@
    # scoring
 
    # would rounding speed up aggregation?, potentially make a sparse matrix
    # if a lot of entries became 0
 
-    # works for both singleBase and multiBase
+    # works for both singleBase and multiBase (UPDATE: still true for
+    # regionOLMean, but the matrix scoring branch is multiBase only)
    if (scoringMetric == "simpleMean") {
-        loadAgMain <- as.data.table(regionOLMean(signalDT = loadingDT,
-                                                 signalGR = signalCoord,
-                                                 regionSet = regionSet,
-                                                 calcCols= signalCol,
-                                                 metric = "mean",
-                                                 rsOL = rsOL,
-                                                 returnCovInfo=returnCovInfo))
-        results <- .formatResults(loadAgMain, scoringMetric = scoringMetric,
-                                  regionSet=regionSet, signalCol = signalCol,
-                                  returnCovInfo=returnCovInfo)
+        if ((signalCoordType == "multiBase") && !is.null(rsOLMat)) {
+
+            # score all region sets at once: each column of rsOLMat marks
+            # the signalCoord regions that one region set overlaps, so
+            # t(rsOLMat) %*% loadingMat gives per-region-set column sums of
+            # the signal (already absolute when absVal=TRUE); dividing by
+            # the overlap counts turns those sums into simple means
+            loadingMat <- as.matrix(loadingDT)
+            rsScoresMatrix <- t(rsOLMat) %*% loadingMat
+            covCount <- colSums(rsOLMat)
+            results <- as.data.frame(apply(rsScoresMatrix,
+                                           MARGIN = 2,
+                                           FUN = function(x) x / covCount))
+            results$signalCoverage <- covCount
+            # rsOLMat alone does not give the regionSetCoverage,
+            # totalRegionNumber, or meanRegionSize
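+
+            # A minimal sketch of what this branch computes, on toy inputs
+            # (names here are illustrative, not package API): with 3
+            # signalCoord regions and 2 region sets,
+            #   rsOLMat    <- matrix(c(1, 0, 1,
+            #                          0, 1, 1), nrow = 3) # 3 regions x 2 sets
+            #   loadingMat <- matrix(1:6, nrow = 3)        # 2 signalCols
+            #   t(rsOLMat) %*% loadingMat  # set 1 sums: 4, 10; set 2: 5, 11
+            #   colSums(rsOLMat)           # overlap counts: 2, 2
+            # dividing sums by counts gives set 1 scores (2, 5), the means of
+            # rows 1 and 3, which should match scoring that set on its own
+            # with regionOLMean()
+
+            # possible placeholder columns for consistency with .formatResults():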
+ # results[, signalCoverage := 0] + # results[, regionSetCoverage := 0] + # results[, totalRegionNumber := numOfRegions] + # results[, meanRegionSize := round(mean(width(regionSet)), 1)] + + + } else { + loadAgMain <- as.data.table(regionOLMean(signalDT = loadingDT, + signalGR = signalCoord, + regionSet = regionSet, + calcCols= signalCol, + metric = "mean", + rsOL = rsOL, + returnCovInfo=returnCovInfo)) + results <- .formatResults(loadAgMain, scoringMetric = scoringMetric, + regionSet=regionSet, signalCol = signalCol, + returnCovInfo=returnCovInfo) + } + } else if (scoringMetric == "simpleMedian") { # scoring singleBase and multiBase both with this function for # simpleMedian @@ -358,77 +390,7 @@ aggregateSignal <- function(signal, returnCovInfo=returnCovInfo) } - # } else if (scoringMetric == "meanDiff") { - # # if (is.null(pcLoadAv)) { - # # # calculate (should already be absolute) - # # pcLoadAv <- apply(X = loadingDT[, signalCol, with=FALSE], - # # MARGIN = 2, FUN = mean) - # # } - # loadMetrics <- signalOLMetrics(dataDT=loadingDT, regionSet=regionSet, - # signalCol = signalCol, - # metrics=c("mean", "sd"), - # alsoNonOLMet=TRUE) - # if (is.null(loadMetrics)) { - # results <- as.data.table(t(rep(NA, length(signalCol)))) - # setnames(results, signalCol) - # results[, signalCoverage := 0] - # results[, regionSetCoverage := 0] - # results[, totalRegionNumber := numOfRegions] - # results[, meanRegionSize := round(mean(width(regionSet)), 1)] - # } else { - # # calculate mean difference - # # pooled standard deviation - # sdPool <- sqrt((loadMetrics$sd_OL^2 + loadMetrics$sd_nonOL^2) / 2) - # - # # mean difference - # # error if signalCoverage > (1/2) * totalCpGs - # meanDiff <- (loadMetrics$mean_OL - loadMetrics$mean_nonOL) / - # (sdPool * sqrt((1 / loadMetrics$signalCoverage) - (1 / (totalCpGs - loadMetrics$signalCoverage)))) - # results <- as.data.table(t(meanDiff)) - # colnames(results) <- loadMetrics$testCol - # - # # add information about degree of overlap - # results <- cbind(results, loadMetrics[1, .SD, .SDcols = c("signalCoverage", "regionSetCoverage", "totalRegionNumber", "meanRegionSize")]) - # } - # - # } else if (scoringMetric == "rankSum") { - # - # # if (wilcox.conf.int) { - # # # returns confidence interval - # # wRes <- rsWilcox(dataDT = loadingDT, regionSet=regionSet, - # # signalCol = signalCol, - # # conf.int = wilcox.conf.int) - # # - # # if (is.null(wRes)) { - # # results <- as.data.table(t(rep(NA, length(signalCol) * 2))) - # # setnames(results, paste0(rep(signalCol, each=2), - # # c("_low", "_high"))) - # # results[, signalCoverage := 0] - # # results[, regionSetCoverage := 0] - # # results[, totalRegionNumber := numOfRegions] - # # results[, meanRegionSize := round(mean(width(regionSet)), 1)] - # # } else { - # # results <- as.data.table(wRes) - # # } - # # - # # } else { - # # returns p value - # # one sided test since I took the absolute value of the loadings - # wRes <- rsWilcox(dataDT = loadingDT, regionSet=regionSet, - # signalCol = signalCol, alternative="greater") - # - # if (is.null(wRes)) { - # results <- as.data.table(t(rep(NA, length(signalCol)))) - # setnames(results, signalCol) - # results[, signalCoverage := 0] - # results[, regionSetCoverage := 0] - # results[, totalRegionNumber := numOfRegions] - # results[, meanRegionSize := round(mean(width(regionSet)), 1)] - # } else { - # results <- as.data.table(wRes) - # } - # #} - + } else { # signalCoordType == "multiBase" # for ATAC-seq @@ -2257,5 +2219,49 @@ regionOLMean <- function(signalDT, signalGR, 
                          regionSet,
 
     return(signalAve)
 }
 
+##########################################################################
+# matrix scoring
+# 1. Make a region set overlap matrix. The dimensions are nrow = number of
+# features in the epigenetic data segmentation (e.g. ATAC consensus peaks),
+# ncol = number of region sets. It has a 1 for data regions that are
+# overlapped by a given region set and a 0 for data regions that do not
+# overlap the region set. This will produce an unweighted mean.
+# 2. Multiply the transpose of the region set matrix by the
+# loading/correlation matrix.
+# 3. Divide by the total number of covered regions for that region set to
+# get the mean. This is the region set score.
+# @template signalCoord
+# @template GRList
+# @return Returns a matrix where each column corresponds to one region set
+# and each row corresponds to one data region
+olToMat <- function(signalCoord, GRList) {
+    # calculate overlaps only once
+    # the region set must be the subject to fit with the scoring functions
+    olList <- lapply(X = GRList,
+                     FUN = function(x) findOverlaps(query = signalCoord,
+                                                    subject = x))
+
+    # each column is a region set
+    rsMat <- matrix(0, nrow = length(signalCoord), ncol = length(GRList))
+    for (i in seq_along(GRList)) {
+        rsMat[unique(queryHits(olList[[i]])), i] <- 1
+    }
+
+    colnames(rsMat) <- names(GRList)
+    # coverage info that rsMat cannot supply could also be computed here:
+    # totalRegionNumber = sapply(X = GRList, length)
+    # meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x))))
+    return(rsMat)
+}
+
+# interactive scratch code, kept commented out so the package still builds
+# (assumes signalCoord, GRList, and a `loadings` matrix exist in the session):
+# rsMat <- olToMat(signalCoord = signalCoord, GRList = GRList)
+# dim(rsMat)
+# loadings <- abs(loadings[, c("PC1", "PC2")])
+# dim(loadings)
+# rsScoresMatrix <- t(rsMat) %*% loadings
+# covCount <- colSums(rsMat)
+# rsScoresMatrix <- apply(rsScoresMatrix, MARGIN = 2, FUN = function(x) x / covCount)
+# View(rsScoresMatrix[order(rsScoresMatrix[, "PC1"], decreasing = TRUE), ])
+# View(rsScoresMatrix[order(rsScoresMatrix[, "PC2"], decreasing = TRUE), ])

From a1f6682a51b9bf9241ff7bbfd29b57afba277151 Mon Sep 17 00:00:00 2001
From: j-lawson
Date: Fri, 22 Jan 2021 22:59:01 -0500
Subject: [PATCH 02/38] Update matrix scoring

---
 R/COCOA.R | 4574 +++++++++++++++++++++++++++--------------------------
 1 file changed, 2307 insertions(+), 2267 deletions(-)

diff --git a/R/COCOA.R b/R/COCOA.R
index efedccb..dcb6c1d 100755
--- a/R/COCOA.R
+++ b/R/COCOA.R
@@ -1,2267 +1,2307 @@
-# PACKAGE DOCUMENTATION
-#' Coordinate Covariation Analysis (COCOA)
-#'
-#'
-#' COCOA is a method for understanding epigenetic variation among samples.
-#' COCOA can be used with epigenetic data that includes
-#' genomic coordinates and an epigenetic signal,
-#' such as DNA methylation and chromatin accessibility
-#' data.
-#' To describe the method on a high level, COCOA quantifies
-#' inter-sample variation with either a supervised or unsupervised
-#' technique then uses a database of "region sets" to
-#' annotate the variation among samples. A region set is a set of
-#' genomic regions that share a biological annotation,
-#' for instance transcription factor (TF) binding regions,
-#' histone modification regions, or open chromatin regions.
-#' COCOA can identify region sets that are associated with
-#' epigenetic variation between samples and
-#' increase understanding of variation in your data.
-#
-# In contrast to some other common techniques, COCOA offers both
-# supervised (known groups/phenotype) and unsupervised (no known groups/
-# phenotype) analyses. Also, COCOA focuses on continuous variation
-# between samples instead of having cutoffs.
Because of this, COCOA can -# be used as a complementary method alongside "differential" methods -# that find discrete differences between groups of samples and -# it can also be used in situations where there are no groups. -# COCOA can identify biologically meaningful -# sources of variation between samples and -#' -#' @docType package -#' @name COCOA -#' @author John Lawson -#' @author Nathan Sheffield -#' -#' @references \url{http://github.com/databio} -#' @importFrom ggplot2 ggplot aes facet_wrap geom_boxplot geom_jitter geom_line -#' theme_classic xlab ylab geom_hline ylim scale_color_discrete -#' scale_x_discrete scale_fill_brewer scale_color_manual -#' scale_color_brewer theme element_line element_text geom_point -#' @importFrom ComplexHeatmap Heatmap draw -#' @import BiocGenerics S4Vectors IRanges GenomicRanges simpleCache -#' @importFrom data.table ":=" setDT data.table setkey fread setnames -#' setcolorder rbindlist setattr setorder copy is.data.table -#' setorderv as.data.table -#' @importFrom Biobase sampleNames -#' @importFrom stats lm coefficients poly wilcox.test ecdf pgamma p.adjust -#' @importFrom methods is -#' @importFrom MIRA binRegion -#' @importFrom tidyr gather -#' @importFrom grid grid.newpage grid.grabExpr grid.draw popViewport -#' pushViewport unit viewport -#' @importFrom grDevices dev.off -#' @importFrom methods hasArg -#' @importFrom fitdistrplus fitdist -#' @importFrom simpleCache simpleCache -NULL - -# @importFrom ppcor pcor.test - - -# now package lists GenomicRanges in "Depends" instead of "Imports" in -# DESCRIPTION, still import package with @import though -# @importFrom GenomicRanges GRanges GRangesList elementMetadata strand -# seqnames granges - -# Because of some issues, -# (see here: http://stackoverflow.com/questions/9439256/) -# I have to register stuff used in data.table as non-standard evaluation, -# in order to pass some R CMD check NOTES. -if (getRversion() >= "2.15.1") { - utils::globalVariables(c( - ".", "..calcCols", "bin", "binID", "chr", "id", "colsToAnnotate", - "coordinateDT", - "coverage", "Group", "pOlap", "regionGroupID", "regionID", "theme", - "meanRegionSize", "regionSetCoverage", "rowIndex", "rsIndex", - "rsRegionID", "totalRegionNumber", "signalCoverage", ".SD", - "sumProportionOverlap")) -} - -######################################################################### - - -#' Score a region set using feature contribution scores -#' -#' First, this function identifies which epigenetic features -#' overlap the region set. -#' Then the region set is scored using the feature contribution scores -#' (`signal` input) -#' according to the `scoringMetric` parameter. -#' -#' @template signal -#' @template signalCoord -#' @template signalCoordType -#' @templateVar refGenomeWarning TRUE -#' @template regionSet -#' @template signalCol -#' @template scoringMetric -#' @template verbose -# Useful when using -# aggregateSignal with 'apply' to do many region sets at a time. -# @param wilcox.conf.int logical. Only applies when using "rankSum" scoring -# method. returns a 95% confidence interval from the Wilcoxon rank sum test -# instead of p value. -#' @template absVal -#' @template rsOL -#' @param pOlap Numeric vector. Only used if rsOL is given and scoringMetric -#' is "proportionWeightedMean". This vector should contain the proportion of -#' each regionSet region that is overlapped by a signalCoord region. The -#' order of pOlap should be the same as the overlaps in rsOL. 
-#' @param rsOLMat Matrix. A region set overlap matrix as built by olToMat():
-#' one row per signalCoord region, one column per region set, with a 1
-#' where the region set overlaps that signalCoord region. Only used when
-#' signalCoordType is "multiBase" and scoringMetric is "simpleMean".
-#' @template returnCovInfo
-#' @template checkInput
-
-#' @return A data.frame with one row and the following
-#' columns: one column for each item of signalCol with names given
-#' by signalCol. These columns have scores for the region set for each signalCol.
-#' Other columns: signalCoverage (formerly cytosine_coverage) which
-#' has number of epigenetic features that overlapped at all with regionSet,
-#' regionSetCoverage which has number of regions from regionSet
-#' that overlapped any of the epigenetic features,
-#' totalRegionNumber that has
-#' number of regions in regionSet, meanRegionSize that has average
-#' size in base pairs of regions in regionSet, the average is based on
-#' all regions in regionSet and not just ones that overlap.
-#' For "multiBase" data, if the "proportionWeightedMean" scoring metric
-#' is used, then the output will also have a "sumProportionOverlap" column.
-#' During this scoring method, the proportion overlap between each signalCoord
-#' region and overlapping regionSet region is calculated. This column is
-#' the sum of all those proportion overlaps and is another way to quantify
-#' coverage of regionSet in addition to regionSetCoverage.
-#'
-#' @examples
-#' data("brcaATACCoord1")
-#' data("brcaATACData1")
-#' data("esr1_chr1")
-#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation
-#' rsScores <- aggregateSignal(signal=featureContributionScores,
-#'                             signalCoord=brcaATACCoord1,
-#'                             regionSet=esr1_chr1,
-#'                             signalCol=c("PC1", "PC2"),
-#'                             scoringMetric="default")
-#' @export
-
-aggregateSignal <- function(signal,
-                            signalCoord,
-                            regionSet,
-                            signalCol = c("PC1", "PC2"),
-                            signalCoordType = "default",
-                            scoringMetric = "default",
-                            verbose = FALSE,
-                            absVal=TRUE,
-                            rsOL=NULL, pOlap=NULL,
-                            rsOLMat=NULL,
-                            returnCovInfo=TRUE,
-                            .checkInput=TRUE) {
-
-    ################### checking inputs #################################
-
-    # if it was already checked outside this function, don't need to re-check
-    if (.checkInput) {
-        ########## check that inputs are the correct class
-        checkConvertInputClasses(signal=signal,
-                                 signalCoord=signalCoord,
-                                 regionSet=regionSet,
-                                 signalCol=signalCol,
-                                 rsOL=rsOL)
-
-        ########## check that dimensions of inputs are consistent
-        # length of signal coord = nrow of signal
-        if (length(signalCoord) != nrow(signal)) {
-            stop(cleanws("The number of coordinates in
-                 signalCoord (length(signalCoord)) does not equal the number of
-                 rows in `signal`"))
-        }
-
-        ######### check that appropriate columns are present
-        # signalCol are column names of signal
-        if (!all(signalCol %in% colnames(signal))) {
-            missingCols = signalCol[!(signalCol %in% colnames(signal))]
-            stop(cleanws(paste0("Some signalCol are not
-                 columns of signal: ", missingCols)))
-        }
-
-        ######## check that scoringMetric is appropriate
-
-        if (!(scoringMetric %in% getScoringMethods("both"))) {
-            stop(cleanws("scoringMetric was not recognized.
-                 Check spelling and available options."))
-        }
-
-        ###### check that signalCoordType is appropriate
-        if (!(signalCoordType %in% c("default", "singleBase", "multiBase"))) {
-            stop(cleanws("signalCoordType not recognized.
-                 Check spelling/capitalization."))
-        }
-
-        #######
-        # what happens if there are NAs or Inf in `signal`?
-        # any NAs that overlap the regionSet will cause the score to be NA
-        if (is(signal, "data.table")) {
-            naRows = apply(X = signal[, signalCol, with=FALSE, drop=FALSE],
-                           MARGIN = 1, FUN = function(x) any(is.na(x)))
-        } else {
-            naRows = apply(X = signal[, signalCol, drop=FALSE],
-                           MARGIN = 1, FUN = function(x) any(is.na(x)))
-        }
-
-        if (any(naRows)) {
-            signal <- signal[!naRows, ]
-            signalCoord <- signalCoord[!naRows]
-            warning("Removing rows with NA from `signal`")
-        }
-
-        #################################################################
-
-        # detect signalCoordType
-        if (signalCoordType == "default") {
-
-            # when signalCoord is a GRanges object
-            if (any(start(signalCoord) != end(signalCoord))) {
-                signalCoordType <- "multiBase"
-            } else {
-                signalCoordType <- "singleBase"
-            }
-        }
-
-        # if "default" scoring method is given, choose based on signalCoordType
-        if (scoringMetric == "default") {
-            if (signalCoordType == "singleBase") {
-                scoringMetric <- "regionMean"
-            } else if (signalCoordType == "multiBase") {
-                scoringMetric <- "proportionWeightedMean"
-            } else {
-                stop(cleanws("signalCoordType not recognized.
-                     Check spelling/capitalization."))
-            }
-        }
-
-        # make sure that scoringMetric is consistent with signalCoordType
-        if (signalCoordType == "singleBase") {
-            if (!(scoringMetric %in% getScoringMethods("singleBase"))) {
-                stop("The scoringMetric you selected is not available for
-                     this data's signalCoordType")
-            }
-        } else if (signalCoordType == "multiBase") {
-            if (!(scoringMetric %in% getScoringMethods("multiBase"))) {
-                stop("The scoringMetric you selected is not available for
-                     this data's signalCoordType")
-            }
-        }
-
-    }
-    ################### finished checking inputs #########################
-
-    numOfRegions <- length(regionSet)
-    totalCpGs <- nrow(signal)
-
-
-    #### UPDATE: only do this once, in the outermost function possible #####
-    # extreme positive or negative values both give important information
-    # take absolute value or not
-    if (absVal) {
-        signal <- abs(signal) # required for later code
-    }
-
-    # XX copies unnecessarily: reformat into data.table with chromosome location and weight
-
-    # make sure `signal` is the correct type for further steps
-    # (proportionWeightedMean method requires a matrix)
-    if (!is(signal, "data.table") && (scoringMetric != "proportionWeightedMean")) {
-        signal <- as.data.table(signal)
-    } else if (!is(signal, "matrix") && (scoringMetric == "proportionWeightedMean")) {
-        signal <- as.matrix(signal)
-    }
-
-    # restricting to signalCol so unnecessary computations
-    # are not done
-    if (is(signal, "data.table")) {
-        loadingDT <- signal[, signalCol, with=FALSE]
-        # # naming does not work if only using one PC so add this line for that case
-        # setnames(loadingDT, signalCol)
-    } else {
-        loadingDT <- signal[, signalCol, drop=FALSE]
-    }
-    ######## UPDATE: can the above code be done only once, like the input checking?
-    ########
-
-
-    ###########################################################################
-    # scoring
-
-    # would rounding speed up aggregation?, potentially make a sparse matrix
-    # if a lot of entries became 0
-
-    # works for both singleBase and multiBase (UPDATE: still true for
-    # regionOLMean, but the matrix scoring branch is multiBase only)
-    if (scoringMetric == "simpleMean") {
-        if ((signalCoordType == "multiBase") && !is.null(rsOLMat)) {
-
-            # score all region sets at once: each column of rsOLMat marks
-            # the signalCoord regions that one region set overlaps, so
-            # t(rsOLMat) %*% loadingMat gives per-region-set column sums of
-            # the signal (already absolute when absVal=TRUE); dividing by
-            # the overlap counts turns those sums into simple means
-            loadingMat <- as.matrix(loadingDT)
-            rsScoresMatrix <- t(rsOLMat) %*% loadingMat
-            covCount <- colSums(rsOLMat)
-            results <- as.data.frame(apply(rsScoresMatrix,
-                                           MARGIN = 2,
-                                           FUN = function(x) x / covCount))
-            results$signalCoverage <- covCount
-            # rsOLMat alone does not give the regionSetCoverage,
-            # totalRegionNumber, or meanRegionSize
-
-            # possible placeholder columns for consistency with .formatResults():
-            # results[, signalCoverage := 0]
-            # results[, regionSetCoverage := 0]
-            # results[, totalRegionNumber := numOfRegions]
-            # results[, meanRegionSize := round(mean(width(regionSet)), 1)]
-
-
-        } else {
-            loadAgMain <- as.data.table(regionOLMean(signalDT = loadingDT,
-                                                     signalGR = signalCoord,
-                                                     regionSet = regionSet,
-                                                     calcCols= signalCol,
-                                                     metric = "mean",
-                                                     rsOL = rsOL,
-                                                     returnCovInfo=returnCovInfo))
-            results <- .formatResults(loadAgMain, scoringMetric = scoringMetric,
-                                      regionSet=regionSet, signalCol = signalCol,
-                                      returnCovInfo=returnCovInfo)
-        }
-
-    } else if (scoringMetric == "simpleMedian") {
-        # scoring singleBase and multiBase both with this function for
-        # simpleMedian
-        loadAgMain <- as.data.table(regionOLMean(signalDT = loadingDT,
-                                                 signalGR = signalCoord,
-                                                 regionSet = regionSet,
-                                                 calcCols= signalCol,
-                                                 metric = "median",
-                                                 rsOL=rsOL,
-                                                 returnCovInfo = returnCovInfo))
-
-        results <- .formatResults(loadAgMain, scoringMetric = scoringMetric,
-                                  regionSet=regionSet, signalCol = signalCol,
-                                  returnCovInfo=returnCovInfo)
-
-
-    } else if (signalCoordType == "singleBase") {
-        # do the actual aggregation
-        if (scoringMetric == "regionMean") {
-
-            # specify aggregation operation
-            # will be done separately for each PC specified
-            aggrCommand <- buildJ(signalCol,
-                                  rep("mean", length(signalCol)))
-            # previously used BSAggregate from RGenomeUtils but now using local,
-            # modified copy
-            loadAgMain <- BSAggregate(BSDT = loadingDT,
-                                      regionsGRL = regionSet,
-                                      BSCoord = signalCoord,
-                                      jExpr = aggrCommand,
-                                      byRegionGroup = TRUE,
-                                      splitFactor = NULL,
-                                      returnOLInfo = returnCovInfo,
-                                      rsOL=rsOL)
-
-            results <- .formatResults(loadAgMain,
-                                      scoringMetric = scoringMetric,
-                                      regionSet=regionSet, signalCol = signalCol,
-                                      returnCovInfo=returnCovInfo)
-
-        } else if (scoringMetric == "regionMedian") {
-
-            aggrCommand <- buildJ(signalCol,
-                                  rep("median", length(signalCol)))
-            loadAgMain <- BSAggregate(BSDT = loadingDT,
-                                      regionsGRL = regionSet,
-                                      BSCoord = signalCoord,
-                                      jExpr = aggrCommand,
-                                      byRegionGroup = TRUE,
-                                      splitFactor = NULL,
-                                      returnOLInfo = returnCovInfo,
-                                      rsOL=rsOL)
-
-            results <- .formatResults(loadAgMain,
-                                      scoringMetric = scoringMetric,
-                                      regionSet=regionSet, signalCol = signalCol,
-                                      returnCovInfo=returnCovInfo)
-
-        }
-
-    } else {
-        # signalCoordType == "multiBase"
-        # for ATAC-seq
-        if (scoringMetric == "proportionWeightedMean") {
-            loadAgMain <- regionOLWeightedMean(signalMat = loadingDT,
-                                               signalGR = signalCoord,
-                                               regionSet = regionSet,
-                                               calcCols= signalCol,
-                                               rsOL = rsOL,
-                                               pOlap = pOlap,
-                                               returnCovInfo=returnCovInfo)
-            results <- .formatResults(as.data.table(loadAgMain),
-                                      scoringMetric = scoringMetric,
-                                      regionSet=regionSet, signalCol = signalCol,
returnCovInfo=returnCovInfo) - - - } - } - - # signalOLMetrics() # make sure it works with no overlap - if (verbose) { - message("|") - } - - return(as.data.frame(results)) -} - - -# format results of scoring functions in aggregateSignal() -.formatResults <- function(loadAgMain, scoringMetric, - regionSet, signalCol, returnCovInfo=TRUE) { - - numOfRegions <- length(regionSet) - - # if no cytosines from loadings were included in regionSet, result is NA - if (is.null(loadAgMain)) { - results <- as.data.table(t(rep(NA, length(signalCol)))) - setnames(results, signalCol) - if (returnCovInfo) { - results[, signalCoverage := 0] - results[, regionSetCoverage := 0] - results[, totalRegionNumber := numOfRegions] - results[, meanRegionSize := round(mean(width(regionSet)), 1)] - - # this column is only added by this scoring method - if (scoringMetric == "proportionWeightedMean") { - results[, sumProportionOverlap := 0] - } - } - } else { - # regionMean, regionMedian, simpleMean for region data - results <- loadAgMain[, .SD, .SDcols = signalCol] - - if (returnCovInfo) { - results[, signalCoverage := loadAgMain[, .SD, .SDcols = "signalCoverage"]] - results[, regionSetCoverage := loadAgMain[, .SD, .SDcols = "regionSetCoverage"]] - results[, totalRegionNumber := numOfRegions] - results[, meanRegionSize := round(mean(width(regionSet)), 1)] - ################################ - # proportionWeightedMean - # this column is only added by this scoring method - if (scoringMetric == "proportionWeightedMean") { - results[, sumProportionOverlap := loadAgMain[, .SD, .SDcols = "sumProportionOverlap"]] - } - } - - ############################## - # simple mean for base pair data - } - return(results) -} - - -#' Score many region sets -#' -#' This function will give each region set a score for each target variable -#' given by `signalCol` based on -#' the `scoringMetric` parameter. Based on these scores, you can determine -#' which region sets out of a region set database (given by `GRList`) -#' are most associated with the target variables. See the vignette "Introduction -#' to Coordinate Covariation Analysis" for help interpreting your -#' results. -#' -#' @template signal -#' @template signalCoord -#' @template signalCoordType -#' @template GRList -#' @template signalCol -#' @template scoringMetric -#' @template verbose -# @param wilcox.conf.int logical. Only applies when using "rankSum" scoring -# method. returns a 95% confidence interval from the Wilcoxon rank sum test -# instead of p value. -#' @template absVal -#' @template olList -#' @template pOlapList -#' @template returnCovInfo -#' @return Data.frame of results, one row for each region set. -#' It has the following columns: -#' one column for each item of signalCol with names given -#' by signalCol. These columns have scores for the region set for each signalCol. -#' Other columns: signalCoverage (formerly cytosine_coverage) which -#' has number of epigenetic features that overlapped at all with regionSet, -#' regionSetCoverage which has number of regions from regionSet -#' that overlapped any of the epigenetic features, -#' totalRegionNumber that has -#' number of regions in regionSet, meanRegionSize that has average -#' size in base pairs of regions in regionSet, the average is based on -#' all regions in regionSet and not just ones that overlap. -#' For "multiBase" data, if the "proportionWeightedMean" scoring metric -#' is used, then the output will also have a "sumProportionOverlap" column. 
-#' During this scoring method, the proportion overlap between each signalCoord -#' region and overlapping regionSet region is calculated. This column is -#' the sum of all those proportion overlaps and is another way to quantify -#' coverage of regionSet in addition to regionSetCoverage. -#' -#' -#' @examples -#' data("brcaATACCoord1") -#' data("brcaATACData1") -#' data("esr1_chr1") -#' data("nrf1_chr1") -#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation -#' GRList <- GRangesList(esr1_chr1, nrf1_chr1) -#' rsScores <- aggregateSignalGRList(signal=featureContributionScores, -#' signalCoord=brcaATACCoord1, -#' GRList= GRList, -#' signalCol=c("PC1", "PC2"), -#' scoringMetric="default") -#' -#' @export - -aggregateSignalGRList <- function(signal, - signalCoord, - GRList, - signalCol = c("PC1", "PC2"), - signalCoordType = "default", - scoringMetric = "default", - verbose = TRUE, - absVal=TRUE, olList=NULL, - pOlapList=NULL, returnCovInfo=TRUE) { - - ################### checking inputs ################################# - - ########## check that inputs are the correct class - checkConvertInputClasses(signal=signal, - signalCoord=signalCoord, - regionSet=NULL, - signalCol = signalCol, - GRList=GRList, - olList=olList) - - ########## check that dimensions of inputs are consistent - # length of signal coord = nrow of signal - if (length(signalCoord) != nrow(signal)) { - stop(cleanws("The number of coordinates in - signalCoord (length(signalCoord)) does not equal the number of - rows in `signal`")) - } - - ######### check that appropriate columns are present - # signalCol are column names of signal - if (!all(signalCol %in% colnames(signal))) { - missingCols = signalCol[!(signalCol %in% colnames(signal))] - stop(cleanws(paste0("Some signalCol are not - columns of signal: ", missingCols))) - } - - ######## check that scoringMetric is appropriate - - if (!(scoringMetric %in% getScoringMethods("both"))) { - stop(cleanws("scoringMetric was not recognized. - Check spelling and available options.")) - } - - ###### check that signalCoordType is appropriate - if (!(signalCoordType %in% c("default", "singleBase", "multiBase"))) { - stop(cleanws("signalCoordType not recognized. - Check spelling/capitalization.")) - } - - ####### - # what happens if there are NAs or Inf in `signal`? - # any NAs that overlap the regionSet will cause the score to be NA - if (is(signal, "data.table")) { - naRows = apply(X = signal[, signalCol, with=FALSE, drop=FALSE], - MARGIN = 1, FUN = function(x) any(is.na(x))) - } else { - naRows = apply(X = signal[, signalCol, drop=FALSE], - MARGIN = 1, FUN = function(x) any(is.na(x))) - } - - if (any(naRows)) { - signal <- signal[!naRows, ] - signalCoord <- signalCoord[!naRows] - warning("Removing rows with NA from `signal`") - } - - ################################################################# - - # detect signalCoordType - if (signalCoordType == "default") { - # when signalCoord is a GRanges object - if (any(start(signalCoord) != end(signalCoord))) { - signalCoordType <- "multiBase" - } else { - signalCoordType <- "singleBase" - } - } - - # if "default" scoring method is given, choose based on signalCoordType - if (scoringMetric == "default") { - if (signalCoordType == "singleBase") { - scoringMetric <- "regionMean" - } else if (signalCoordType == "multiBase") { - scoringMetric <- "proportionWeightedMean" - } else { - stop(cleanws("signalCoordType not recognized. 
- Check spelling/capitalization.")) - } - } - - # convert object class outside aggregateSignal to extra prevent copying - # (one scoring method needs `signal` as a matrix though) - if (!is(signal, "data.table") && (scoringMetric != "proportionWeightedMean")) { - signal <- as.data.table(signal) - } else if (!is(signal, "matrix") && (scoringMetric == "proportionWeightedMean")) { - signal <- as.matrix(signal) - } - - - # take absolute value outside aggregateSignal to prevent extra copying - if (absVal) { - if (is(signal, "data.table")) { - signal[, signalCol] <- abs(signal[, signalCol, with=FALSE]) - } else { - signal[, signalCol] <- abs(signal[, signalCol]) - } - absVal <- FALSE - } - - if (is.null(olList)) { - # apply over the list of region sets - resultsList <- lapplyAlias(GRList, - function(x) aggregateSignal( - signal = signal, - signalCoord = signalCoord, - signalCoordType = signalCoordType, - regionSet = x, - signalCol = signalCol, - scoringMetric = scoringMetric, - verbose = verbose, - absVal = absVal, pOlap=NULL, - returnCovInfo=returnCovInfo)) - # resultsList <- mapply(FUN = function(x) aggregateSignal( - # signal = signal, - # signalCoord = signalCoord, - # signalCoordType = signalCoordType, - # regionSet = x, - # signalCol = signalCol, - # scoringMetric = scoringMetric, - # verbose = verbose, - # absVal = absVal, pOlap=y, - # returnCovInfo=returnCovInfo), x=GRList, y=pOlapList) - #wilcox.conf.int = wilcox.conf.int, - } else { - # # apply over the list of region sets - # resultsList <- lapplyAlias(olList, - # function(x) aggregateSignal( - # signal = signal, - # signalCoord = signalCoord, - # signalCoordType = signalCoordType, - # regionSet = NULL, - # signalCol = signalCol, - # scoringMetric = scoringMetric, - # verbose = verbose, - # absVal = absVal, - # rsOL=x, pOlap=pOlapList, returnCovInfo=returnCovInfo)) - # #wilcox.conf.int = wilcox.conf.int, - if (is.null(pOlapList)) { - # mapply does not work if pOlapList is null and other argument is not null - resultsList <- lapplyAlias(olList, FUN = function(x) aggregateSignal( - signal = signal, - signalCoord = signalCoord, - signalCoordType = signalCoordType, - regionSet = NULL, - signalCol = signalCol, - scoringMetric = scoringMetric, - verbose = verbose, - absVal = absVal, - rsOL=x, - returnCovInfo=returnCovInfo)) - } else { - resultsList <- mapply(FUN = function(x, y) aggregateSignal( - signal = signal, - signalCoord = signalCoord, - signalCoordType = signalCoordType, - regionSet = NULL, - signalCol = signalCol, - scoringMetric = scoringMetric, - verbose = verbose, - absVal = absVal, - rsOL=x, pOlap=y, - returnCovInfo=returnCovInfo), - x=olList, y=pOlapList) - } - - - } - - resultsDT <- do.call(rbind, resultsList) - - # # add names if they are present - # if (!is.null(names(GRList))) { - # # resultsDT[, rsName := names(GRList)] - # row.names(resultsDT) <- names(GRList) - # } - - # rsName <- row.names(resultsDT) - resultsDF <- as.data.frame(resultsDT) #, row.names = rsName) - - return(resultsDF) -} - -# @param dataMat columns of dataMat should be samples/patients, rows should be genomic signal -# (each row corresponds to one genomic coordinate/range) -# @param featureMat Rows should be samples, columns should be "features" -# (whatever you want to get correlation with: eg PC scores), -# all columns in featureMat will be used (subset when passing to function -# in order to not use all columns) -# @param centerDataMat logical object. Should rows in dataMat be centered based on -# their means? 
(subtracting row mean from each row) -# @param centerFeatureMat logical. Should columns in featureMat be centered based -# on their means? (subtract column mean from each column) -# @param testType character object. Can be "cor" (Pearson correlation), -# "spearmanCor (Spearman correlation) -# "pcor" (partial correlation), "cov" (covariance (Pearson)), -# @param covariate -# -# If a row in dataMat has 0 stand. deviation, correlation will be set to 0 -# instead of NA as would be done by cor() -# -# @return returns a matrix where rows are the genomic signal (eg a CpG or region) and -# columns are the columns of featureMat -# @examples dataMat = matrix(rnorm(50), 5, 10) -# featureMat = matrix(rnorm(20), 10, 2) -createCorFeatureMat <- function(dataMat, featureMat, - centerDataMat=TRUE, centerFeatureMat = TRUE, - testType="cor", covariate=NULL) { - - featureMat <- as.matrix(featureMat) - featureNames <- colnames(featureMat) - nFeatures <- ncol(featureMat) - nDataDims <- nrow(dataMat) - - if (centerDataMat) { - cpgMeans <- rowMeans(dataMat, na.rm = TRUE) - # centering before calculating correlation - dataMat <- apply(X = dataMat, MARGIN = 2, function(x) x - cpgMeans) - - } - - if (centerFeatureMat) { - featureMeans <- colMeans(featureMat, na.rm = TRUE) - # centering before calculating correlation(also, t() converts to matrix) - featureMat <- t(apply(X = t(featureMat), MARGIN = 2, function(x) x - featureMeans)) - if (dim(featureMat)[1] == 1) { - featureMat <- t(featureMat) - } - } - - # avoid this copy and/or delay transpose until after calculating correlation? - dataMat <- as.data.frame(t(dataMat)) - - - if (testType == "cor") { - # create feature correlation matrix with PCs (rows: features/CpGs, columns:PCs) - # how much do features correlate with each PC? - - # put epigenetic data first in cor() - featurePCCor <- cor(dataMat, featureMat, use="pairwise.complete.obs", method="pearson") - - } else if (testType == "spearmanCor") { - # xtfrm(x) ranking - featurePCCor <- cor(dataMat, featureMat, use="pairwise.complete.obs", method="spearman") - - # } else if (testType == "pcor") { - # # partial correlation (account for covariates), ppcor package - # - # featurePCCor <- apply(X = featureMat, MARGIN = 2, function(y) apply(X = dataMat, 2, - # FUN = function(x) pcor.test(x = x, y=y, - # z=covariate, - # method="pearson")$estimate)) - # - } else if (testType == "cov") { - featurePCCor <- cov(dataMat, featureMat, use="pairwise.complete.obs") - - } else { - stop("invalid testType") - } - - - # if standard deviation of the data was zero, NA will be produced - # set to 0 because no standard deviation means no correlation with attribute of interest - featurePCCor[is.na(featurePCCor)] <- 0 - colnames(featurePCCor) <- featureNames - - return(featurePCCor) - # corLoadRatio <- signal[, signalCol] / featurePCCor - # hist(corLoadRatio[, "PC10"]) -} - - -#' Create a "meta-region" profile -#' -#' This profile can show enrichment -#' of genomic signals with high feature contribution scores -#' in the region set but not in the -#' surrounding genome, suggesting that variation is linked specifically -#' to that region set. -#' -#' All regions in a given region set -#' are combined into a single aggregate profile. Regions in `regionSet` -#' should be -#' expanded on each side to include a wider area of the genome around -#' the regions of interest (see example and vignettes). -#' To make the profile, first we optionally take -#' the absolute value of `signal` (`absVal` parameter). 
-#' Then each expanded regionSet region is -#' split into `binNum` bins. The corresponding -#' bins from each region -#' (e.g. all bin1's, all bin2's, etc.) are grouped. -#' All overlapping values from `signal` are -#' aggregated in each bin group according to the `aggrMethod` parameter to -#' get a meta-region profile. Since DNA strand information is not considered, -#' the profile is averaged symmetrically around the center. -#' A peak in the middle of this profile suggests -#' that variability is specific to the region set of interest and is -#' not a product of the surrounding genome. A region set can still be -#' significant even if it does not have a peak. For example, some -#' histone modification region sets may be in large genomic blocks -#' and not show a peak, despite having variation across samples. -#' -#' @template signal -#' @template signalCoord -#' @template regionSet -#' @template signalCol -#' @template signalCoordType -#' @param binNum Number of bins to split each region into when -#' making the aggregate profile. More bins will -#' give a higher resolution but perhaps more noisy profile. -#' @template verbose -#' @templateVar usesAggrMethod TRUE -#' @template scoringMetric -#' @template absVal -#' @return A data.frame with the binned meta-region profile, -#' one row per bin. columns: binID and one column for each target variable -#' in signalCol. The function will return NULL if there -#' is no overlap between signalCoord and any of the bin groups that come -#' from regionSet (e.g. none of the bin1's overlapped signalCoord, -#' NULL returned). -#' -#' @examples -#' data("brcaATACCoord1") -#' data("brcaATACData1") -#' data("esr1_chr1") -#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation -#' esr1_chr1_expanded <- resize(esr1_chr1, 12000, fix="center") -#' mrProfile <- getMetaRegionProfile(signal=featureContributionScores, -#' signalCoord=brcaATACCoord1, -#' regionSet=esr1_chr1_expanded, -#' signalCol=c("PC1", "PC2"), -#' binNum=21) -#' @export - -getMetaRegionProfile <- function(signal, signalCoord, regionSet, - signalCol = c("PC1", "PC2"), - signalCoordType = "default", - binNum = 21, - verbose=TRUE, - aggrMethod = "default", absVal=TRUE) { - - ################### checking inputs ################################# - - ########## check that inputs are the correct class, convert - checkConvertInputClasses(signal=signal, - signalCoord=signalCoord, - regionSet=regionSet, - signalCol = signalCol) - - ########## check that dimensions of inputs are consistent - # length of signal coord = nrow of signal - if (length(signalCoord) != nrow(signal)) { - stop(cleanws("The number of coordinates in - signalCoord (length(signalCoord)) does not equal the number of - rows in `signal`")) - } - - ######### check that appropriate columns are present - # signalCol are column names of `signal` - if (!all(signalCol %in% colnames(signal))) { - missingCols = signalCol[!(signalCol %in% colnames(signal))] - stop(cleanws(paste0("Some signalCol are not - columns of signal: ", missingCols))) - } - - - ######## check that aggregation method is appropriate - - if (!(aggrMethod %in% getScoringMethods("metaRegionProfile"))) { - stop(cleanws("scoringMetric was not recognized. - Check spelling and available options.")) - } - - ###### check that signalCoordType is appropriate - if (!(signalCoordType %in% c("default", "singleBase", "multiBase"))) { - stop(cleanws("signalCoordType not recognized. 
- Check spelling/capitalization.")) - } - - ####### - # what happens if there are NAs or Inf in `signal`? - - ################################################################# - - # detect signalCoordType - if (signalCoordType == "default") { - # when signalCoord is a GRanges object - if (any(start(signalCoord) != end(signalCoord))) { - signalCoordType <- "multiBase" - } else { - signalCoordType <- "singleBase" - } - } - - # if "default" aggregation method is given, choose based on signalCoordType - if (aggrMethod == "default") { - if (signalCoordType == "singleBase") { - aggrMethod <- "regionMean" - } else if (signalCoordType == "multiBase") { - aggrMethod <- "proportionWeightedMean" - } else { - stop(cleanws("signalCoordType not recognized. - Check spelling/capitalization.")) - } - } - - - - - - ################################################################## - # take absolute value or not - if (absVal) { - loadingDT <- as.data.table(abs(signal)) - } else { - loadingDT <- as.data.table(signal) - } - - GRDT <- grToDt(regionSet) - - loadProf <- BSBinAggregate(BSDT = loadingDT, - rangeDT = GRDT, - binCount = binNum, - BSCoord = signalCoord, - byRegionGroup = TRUE, - splitFactor = NULL, - signalCol = signalCol, - aggrMethod = aggrMethod) - - # if loadProf is NULL, return NULL from function, otherwise make symmetrical - # it will be NULL when there was no overlap between data and any of the bins - if (!is.null(loadProf)) { - loadProf <- makeSymmetric(loadProf) - loadProf[, regionGroupID := seq_len(binNum)][] - setnames(loadProf, old = "regionGroupID", new="binID") - loadProf <- as.data.frame(loadProf) - } else { - warning("Insufficient overlap between regionSet and signalCoord") - } - - return(loadProf) -} - -makeSymmetric <- function(prof) { - symProf <- apply(prof, 2, .makeSymmetric) - symProf <- as.data.table(symProf) - return(symProf) -} - -.makeSymmetric <- function(vec) { - symVec <- (vec + rev(vec)) / 2 - return(symVec) -} - -# Produced originally for binning Ewing RRBS data across various region sets -# -# @param BSDT A data.table. For COCOA, a data.table of loading values -# with the PCs to be annotated. One column for the signal of each -# target variable -# and also has columns with the coordinates for the epigenetic features: -# chr (chromosome) and start column (also possibly end) -# @param rangeDT A data.table with the sets of regions to be binned, -# with columns named start, end -# @param binCount Number of bins across the region -# @param BSCoord GRanges. Coordinates for BSDT. If NULL, then "chr" and "start" -# columns must be in BSDT. -# @param byRegionGroup Pass along to binCount (see ?binCount) -# @template signalCol -# @param verbose A "logical" object. Whether progress -# of the function should be shown, one -# bar indicates the region set is completed. -# useful when using BSBinAggregate with 'apply' to do many -# region sets at a time. -# @param aggrMethod see ?getMetaRegionProfile() -BSBinAggregate <- function(BSDT, rangeDT, binCount, - BSCoord=NULL, - byRegionGroup = TRUE, - splitFactor = NULL, - signalCol, - verbose = FALSE, - aggrMethod) { - if (!is(rangeDT, "data.table")) { - stop("rangeDT must be a data.table") - } - - if (is.null(BSCoord)) { - BSCoord <- BSdtToGRanges(list(BSDT)) - } - - - seqnamesColName <- "seqnames" # default column name - if (! "seqnames" %in% colnames(rangeDT)) { - if ("chr" %in% colnames(rangeDT)) { - # message("seqnames column name set to: chr") - seqnamesColName <- "chr" - } else { - # Got neither. 
- stop("rangeDT must have a seqnames column") - } - } - - # message("Binning...") - binnedDT <- rangeDT[, MIRA::binRegion(start, end, - binCount, get(seqnamesColName))] - # output is a list of GRanges objects, does not play well with vapply - # one GRanges object for each bin, containing a segment of each original rangeDT region - binnedGR <- sapply(split(binnedDT, binnedDT$binID), dtToGr) - # message("Aggregating...") - - if (aggrMethod == "proportionWeightedMean") { - - binMeansList <- lapply(X = binnedGR, - FUN = function(x) regionOLWeightedMean(signalMat = BSDT, - # signalGR = dtToGr(BSDT[, .(chr, start, end)]), - signalGR = BSCoord, - regionSet = x, - calcCols = signalCol)) - - # any bins that had no overlap with data will be NULL - # if any bins had no data, return NULL - if (any(vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE))) { - return(NULL) - } - - # rbindlist of all NULL's will still return an (empty) data.table - # otherwise any NULL items will just be skipped and other rows will - # be concatenated - binnedBSDT <- rbindlist(binMeansList) - regionGroupID = 1:length(binMeansList) - - # regionGroupID = regionGroupID[!vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE)] - binnedBSDT[, regionGroupID := regionGroupID] - - } else if (aggrMethod == "simpleMean") { - - binMeansList <- lapply(X = binnedGR, - FUN = function(x) regionOLMean(signalDT = BSDT, - # signalGR = dtToGr(BSDT[, .(chr, start, end)]), - signalGR = BSCoord, - regionSet = x, - calcCols = signalCol)) - # any bins that had no overlap with data will be NULL - # if any bins had no data, return NULL - if (any(vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE))) { - return(NULL) - } - - binnedBSDT <- rbindlist(binMeansList) - regionGroupID = 1:length(binMeansList) - - - # regionGroupID = regionGroupID[!vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE)] - binnedBSDT[, regionGroupID := regionGroupID] - - } else if (aggrMethod == "regionMean") { # aggrMethod == "regionMean" - - # what is output if a region set has no overlap? - binnedBSDT <- BSAggregate(BSDT, - regionsGRL=GRangesList(binnedGR), - BSCoord = BSCoord, - jExpr=buildJ(signalCol, - rep("mean", length(signalCol))), - byRegionGroup = byRegionGroup, - splitFactor = splitFactor) - - # if any bins had no data - if (nrow(binnedBSDT) < binCount) { - return(NULL) - } - } else if (aggrMethod == "regionMedian") { - # what is output if a region set has no overlap? - binnedBSDT <- BSAggregate(BSDT, - regionsGRL=GRangesList(binnedGR), - BSCoord = BSCoord, - jExpr=buildJ(signalCol, - rep("median", length(signalCol))), - byRegionGroup = byRegionGroup, - splitFactor = splitFactor) - - # if any bins had no data - if (nrow(binnedBSDT) < binCount) { - return(NULL) - } - } - # RGenomeUtils::BSAggregate - - # # If we aren't aggregating by bin, then don't restrict to min reads! - # if (byRegionGroup) { - # binnedBSDT <- binnedBSDT[readCount > minReads,] - # } - if (verbose) { - message("|") - } - - return(binnedBSDT) -} - -# modification of BSAggregate to just return mean per region -# -# @template signal -# @template signalCoord -# @template regionSet -# Must be from the same reference genome -# as the coordinates for the actual data/samples (signalCoord). -# @template signalCol -# @param returnQuantile "logical" object. If FALSE, return region averages. 
If TRUE, -# for each region, return the quantile of that region's average value -# based on the distribution of individual genomic signal/feature values -# @template absVal -# @return a data.table with region coordinates and average loading -# values for each region. Has columns chr, start, end, and a column for each -# target variable in signalCol. -# Regions are not in order along the rows of the data.table. -# -# @example averagePerRegion(BSDT = BSDT, regionsGRL, -# jCommand = MIRA:::buildJ(cols = "methylProp", "mean")) -# Devel note: I could add a column for how many cytosines are in each region - -averagePerRegion <- function(signal, - signalCoord, - regionSet, - signalCol = c("PC1", "PC2"), - returnQuantile = FALSE, - absVal=TRUE) { - - ################### checking inputs ################################# - - ########## check that inputs are the correct class and converts - checkConvertInputClasses(signal=signal, - signalCoord=signalCoord, - regionSet=regionSet, - signalCol = signalCol) - - ########## check that dimensions of inputs are consistent - # length of signal coord = nrow of signal - if (length(signalCoord) != nrow(signal)) { - stop(cleanws("The number of coordinates in - signalCoord (length(signalCoord)) does not equal the number of - rows in `signal`")) - } - - ######### check that appropriate columns are present - # signalCol are column names of signal - if (!all(signalCol %in% colnames(signal))) { - missingCols = signalCol[!(signalCol %in% colnames(signal))] - stop(cleanws(paste0("Some signalCol are not - columns of signal: ", missingCols))) - } - - ####### - # what happens if there are NAs or Inf in `signal`? - # any NAs that overlap the regionSet will cause the score to be NA - if (is(signal, "data.table")) { - naRows = apply(X = signal[, signalCol, with=FALSE, drop=FALSE], - MARGIN = 1, FUN = function(x) any(is.na(x))) - } else { - naRows = apply(X = signal[, signalCol, drop=FALSE], - MARGIN = 1, FUN = function(x) any(is.na(x))) - } - - if (any(naRows)) { - signal <- signal[!naRows, ] - signalCoord <- signalCoord[!naRows] - warning("Removing rows with NA from `signal`") - } - - ################################################################# - - ################################################################# - - # determine whether coordinates are single base or a range - # if (!("end" %in% colnames(coordinateDT))) { - # dataCoordType <- "singleBase" - # } else { - if (any(start(signalCoord) != end(signalCoord))) { - dataCoordType <- "multiBase" - } else { - dataCoordType <- "singleBase" - } - # } - - # take absolute value or not - if (absVal) { - signalDT <- as.data.table(abs(signal)) - } else { - signalDT <- as.data.table(signal) - } - - # use different function for single base data and for region data - if (dataCoordType == "singleBase") { - # linking coordinates to loading values, has columns chr start, signalCol - BSDT <- signalDT[, .SD, .SDcols = signalCol] - jExpr <- buildJ(signalCol, rep("mean", length(signalCol))) - - avPerRegion <- BSAggregate(BSDT = BSDT, - regionsGRL = regionSet, - BSCoord = signalCoord, - excludeGR = NULL, - regionsGRL.length = NULL, - splitFactor = NULL, - keepCols = NULL, - sumCols = NULL, - jExpr = jExpr, - byRegionGroup = FALSE, - keep.na = FALSE, - returnSD = FALSE, - returnOLInfo = FALSE, - meanPerRegion = TRUE, - returnQuantile = returnQuantile) - - } else if (dataCoordType == "multiBase") { - - avPerRegion <- weightedAvePerRegion(signalDT = signalDT, - signalCoord=signalCoord, - regionSet=regionSet, - calcCols = signalCol, - 
returnQuantile = returnQuantile) - - } else { - stop("dataCoordType not recognized.") - } - - return(avPerRegion) - -} - -# signalDT -# average by region for region-based data (eg ATAC-seq) -# @return data.table. chr, start, end columns. One column for each calcCols -# that has the average value in each region for that col -weightedAvePerRegion <- function(signalDT, - signalCoord, - regionSet, - calcCols = c("PC1", "PC2"), - returnQuantile = FALSE) { - - if (is(signalCoord, "data.frame")) { - signalCoord <- dtToGr(signalCoord) - } - - hits <- findOverlaps(query = signalCoord, subject = regionSet) - # if no overlap, return NULL - if (length(hits) == 0) { - return(NULL) - } - - olap <- pintersect(signalCoord[queryHits(hits)], - regionSet[subjectHits(hits)]) - polap <- width(olap) / width(regionSet[subjectHits(hits)]) - - # get total proportion overlap per region - # aggregate polap by region - pOlapDT <- data.table(signalDT[queryHits(hits), calcCols, with=FALSE], - rsRegionID = subjectHits(hits), - pOlap = polap) - pOlapByRegionDT <- pOlapDT[, .(regionPOlap = sum(pOlap)), by=rsRegionID] - - # specify aggregation operation - # will be done separately for each PC specified - aggrCommand <- paste("list(", paste(paste0(calcCols, "=", "sum", "(", - calcCols, " * pOlap)"), collapse = ", "), ")") - weightedSumByRegionDT <- pOlapDT[, eval(parse(text=aggrCommand)), by=rsRegionID] - - # weightedSumByRegionDT and pOlapByRegionDT should be in the same order - regionInd <- weightedSumByRegionDT$rsRegionID - olCoord <- grToDt(regionSet)[regionInd, .(chr, start, end)] - - # divide by total proportion overlap to get mean value - jCommand <- paste("list(", paste(paste0(calcCols, "=", calcCols, " / regionPOlap"), collapse = ", "), ")") - meanPerRegion <- cbind(pOlapByRegionDT, weightedSumByRegionDT) - meanPerRegion <- meanPerRegion[, eval(parse(text=jCommand))] - meanPerRegion <- cbind(olCoord, meanPerRegion) - - if (returnQuantile) { - for (i in seq_along(calcCols)) { - # perhaps this could be more efficient with mapply - # ecdf example: ecdf(distributionData)(getPercentileOfThis) - meanPerRegion[, c(calcCols[i]) := ecdf(signalDT[[calcCols[i]]])(meanPerRegion[[calcCols[i]]])] - # I tried set() to improve performance but it took about the same time - } - } - - # meanPerRegion <- pOlapDT[, .(regionMean = sum(score * (pOlap/sum(pOlap))), by=rsRegionID] - - return(meanPerRegion) -} - - - - -#' Get regions that are most associated with target variable -#' -#' Get a GRanges with top regions from the region set based on -#' average feature contribution scores -#' for the regions or the quantile of the region's average -#' feature contribution score based on the -#' distribution of all feature contribution scores for the target variable. -#' Returns average feature contribution score or quantile as GRanges metadata. -#' -#' @template signal -#' @template signalCoord -#' @template regionSet -#' @template signalCol -#' @param cutoff Numeric. Only regions with at least this value will be -#' returned (either above this average `signal` value or above this quantile -#' if returnQuantile=TRUE). -#' @param returnQuantile Logical. If FALSE, return region averages. If TRUE, -#' for each region, return the quantile of that region's average value -#' based on the distribution of individual feature values in `signal` for -#' that `signalCol`. -#' @return A GRanges object with region coordinates for regions with -#' scores/quantiles above "cutoff" for any target variable in signalCol. 
-#' The scores/quantiles -#' for signalCol are given as metadata in the GRanges. - -# Are regions in order along the rows of the data.table? -# -#' @examples -#' data("brcaATACCoord1") -#' data("brcaATACData1") -#' data("esr1_chr1") -#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation -#' topRegions <- getTopRegions(signal=featureContributionScores, -#' signalCoord=brcaATACCoord1, -#' regionSet=esr1_chr1, -#' returnQuantile = TRUE) -#' @export - -getTopRegions <- function(signal, - signalCoord, - regionSet, - signalCol = c("PC1", "PC2"), - cutoff = 0.8, - returnQuantile=TRUE) { - - - - - regionLoadDT <- averagePerRegion(signal=signal, - signalCoord=signalCoord, regionSet=regionSet, - signalCol = signalCol, - returnQuantile = returnQuantile)[] - - keepInd <- regionLoadDT[, signalCol, with=FALSE] >= cutoff - - # keep region if it is above cutoff in any of the PCs in signalCol - keepInd <- apply(X = keepInd, MARGIN = 1, FUN = any) - - highGR <- dtToGr(regionLoadDT[keepInd, ]) - - values(highGR) <- as.data.frame(regionLoadDT[keepInd, signalCol, with=FALSE]) - - return(highGR) -} - - -# different scoring metrics -# remniscent of LOLA: -# support (number of regions, for us regions that have at least 1 cytosine and can be scored) -# mean loading value (or could do ratio of peak to surrounding regions) -# signal to noise ratio, how big is peak compared to noise of surrounding area -# with SNR, even a small peak could have a high SNR - -# @param loadingProf -# loadingProfileSNR = function() { -# # magnitude of the peak divided by standard deviation of -# # noise (signal in surrounding areas) -# } - - -# support is just number of regions from each input region set that overlap at all with -# the cytosines that have loading values -# findOverlaps(regionSet, dtToGr(coordinateDT)) - -########################################################################### - - -# BSaggregate -- Aggregate a BSDT across regions or region groups, -# for multiple samples at a time. -# This function is as BScombineByRegion, but can handle not only multiple -# samples in BSDT, but also simultaneously multiple region sets by passing -# a regionsGRL (GRangesList object). -# you can use jExpr to do other functions. - -# Given a bisulfite data table as input, with an identifier column for -# different samples; plus a GRanges objects with regions to aggregate. -# -# @param BSDT The bisulfite data.table (output from one of the parsing -# functions for methylation calls) that you wish to aggregate. It can -# be a combined table, with individual samples identified by column passed -# to splitFactor. -# @param regionsGRL Regions across which you want to aggregate. Should be -# from a single region set. eg GRangesList(regionSet) -# @param BSCoord GRanges. Coordinates for BSDT. If NULL, then "chr" and "start" -# columns must be in BSDT. -# @param excludeGR A GRanges object with regions you want to -# exclude from the aggregation function. These regions will be eliminated -# from the input table and not counted. -# @param jExpr You can pass a custom command in the j slot to data.table -# specifying which columns to aggregate, and which functions to use. You -# can use buildJ() to build a jExpr argument easily. -# @param byRegionGroup You can aggregate by regionID or by regionGroupID; -# this reflects the regionsGRL that you pass; by default, BSAggregate will -# aggregate each region individually -- scores will then be contiguous, and -# the output is 1 row per region. 
-# Turn on this flag to aggregate across all region groups, making the result -# uncontiguous, and resulting in 1 row per *region group*. -# @param returnSD Whether the standard deviation of the columns of interest -# should be returned. Standard deviation for rows that overlap with region set -# and also for rows that do not overlap with region set. Not currently functional. -# @param returnOLInfo if true, include number of overlapping cpgs and number -# of overlapping regions in the result -# @param meanPerRegion Will return the mean value in each region in a data.table -# instead of averaging all regions into a single value, returnOLInfo does -# nothing if meanPerRegion=TRUE (function exits before that code, which -# expects a different data structure) -# @param returnQuantile Only used if meanPerRegion=TRUE, instead of mean -# return the quantile/percentile of the mean of each region -# in relation to the distribution of original values in BSDT -# @template rsOL -# -BSAggregate <- function(BSDT, regionsGRL, BSCoord=NULL, excludeGR=NULL, - regionsGRL.length = NULL, splitFactor=NULL, - keepCols=NULL, sumCols=NULL, - jExpr=NULL, byRegionGroup=FALSE, - keep.na=FALSE, returnSD=FALSE, - returnOLInfo=FALSE, meanPerRegion=FALSE, - returnQuantile=FALSE, rsOL=NULL) { - - # Assert that regionsGRL is a GRL. - # If regionsGRL is given as a GRanges, we convert to GRL - if(is(regionsGRL, "GRanges")) { - regionsGRL <- GRangesList(regionsGRL) - } else if (!is(regionsGRL, "GRangesList") && is.null(rsOL)) { - stop("regionsGRL is not a GRanges or GRangesList object") - } - - # will cause error if BSDT is only a data.frame - if (is(BSDT, "data.frame") & !is(BSDT, "data.table")) { - BSDT <- as.data.table(BSDT) - } - if (!is(BSDT, "data.table")) { - stop("BSDT must be a data.table") - } - - if(! is.null(excludeGR)) { - BSDT <- BSFilter(BSDT, minReads=0, excludeGR) - } - - if (returnQuantile) { - # keep all data so quantiles can be calculated - # later code will change BSDT - origBSDT <- data.table::copy(BSDT) - } - # TODO: BSdtToGRanges needs to include end coordinate!!! - if (is.null(BSCoord)) { - bsgr <- BSdtToGRanges(list(BSDT)) - } else { - bsgr <- BSCoord - } - - - additionalColNames <- setdiff(colnames(BSDT), - c("chr","start", "end", - "hitCount","readCount", splitFactor)) - - # specify that "character" outcome is expected from mode by - # supplying "a" as last vapply argument (any character object would work) - colModes <- vapply(BSDT, mode, "a") - if (is.null(sumCols)) { - sumCols <- setdiff(colnames(BSDT),c("chr", "start", "end", - "strand", splitFactor, keepCols)) - # Restrict to numeric columns. - sumCols <- intersect(sumCols, - names(colModes[which(colModes == "numeric")])) - } - - if (is.null(rsOL)) { - # It's required to do a findoverlaps on each region individually, - # Not on a GRL, because of the way overlaps with GRLs work. So, - # we must convert the GRL to a GR, but we must keep track of which - # regions came from which group. - regionsGR <- unlist(regionsGRL) - - if(is.null(regionsGRL.length)) { - # if (length(regionsGRL) > 100) { - # message("BSAggregate: Calculating sizes. 
You can speed this up by supplying a regionsGRL.length vector...", appendLF=FALSE) - # } - # specify that output should be numeric with vapply - # (any number would work instead of 1) - regionsGRL.length <- vapply(regionsGRL, length, 1) - # message("Done counting regionsGRL lengths.") - } - - # Build a table to keep track of which regions belong to which group - # BIOC note: sapply returns a list where each item is of different length - # therefore, I'm not using vapply - - # one regionID per region of "regionsGR", withinGroupID: for when regionsGRL - # has multiple GRanges, Gives the index of each region within its corresponding - # GRanges object. regionGroupID: for when regionsGRL has multiple GRanges, - # indicates which regionsGRL GRanges object the region has come from ( - # index of the GRanges object in regionsGRL) - region2group <- data.table( - regionID=seq_along(regionsGR), - chr=as.vector(seqnames(regionsGR)), - start=as.vector(start(regionsGR)), - end=as.vector(end(regionsGR)), - # withinGroupID= as.vector(unlist(sapply(regionsGRL.length, seq))), - regionGroupID=rep(seq_along(regionsGRL), regionsGRL.length)) # repeats each number regionGRL.length times - setkey(region2group, regionID) - - #TODO: if overlap method is single, but "end" is present, find center - # and set that to start! - # if (all(start(bsgr[[1]]) != end(bsgr[[length(unique(queryHits(hits)))1]]))) { - if (all(start(bsgr) != end(bsgr))) { - stop("BSDT start and end coordinates are not the same. Choose a different aggrMethod.") - } else { - # fo <- findOverlaps(query = bsgr[[1]], subject = regionsGR) - fo <- findOverlaps(query = bsgr, subject = regionsGR) - } - - if (length(subjectHits(fo)) < 1) { - warning("Insufficient overlap between signalCoord and the region set.") - return(NULL) - } - - } else { - # Build a table to keep track of which regions belong to which group - # BIOC note: sapply returns a list where each item is of different length - # therefore, I'm not using vapply - - # region2group <- data.table( - # regionID=seq_len(regionsGR), - # withinGroupID= as.vector(unlist(sapply(regionsGRL.length, seq))), - # regionGroupID=rep(seq_along(regionsGRL), regionsGRL.length)) - - if (length(subjectHits(rsOL)) < 1) { - warning("Insufficient overlap between signalCoord and the region set.") - return(NULL) - } - - # only works for when one region set is given in regionsGRL (ie does - # not work for metaregion profiles) - region2group <- data.table( - regionID=seq_len(max(subjectHits(rsOL))), - regionGroupID=rep(1, max(subjectHits(rsOL)))) # assumes only 1 region set in regionsGRL - setkey(region2group, regionID) - - fo <- rsOL - } - - - ### use info from findOverlaps to see how many individual - # cytosines (or input regions) and how many regions (in region sets) - # overlap with one another - if (returnOLInfo) { - signalCoverage <- length(unique(queryHits(fo))) - regionSetCoverage <- length(unique(subjectHits(fo))) - } - - # if("end" %in% colnames(BSDT)){ - # setkey(BSDT, chr, start, end) - # } else{ - # setkey(BSDT, chr, start) - # } - - # Gut check: - # stopifnot(all(elementMetadata(bsgr[[1]])$readCount == BSDT$readCount)) - - # message("Setting regionIDs...") - BSDT <- BSDT[queryHits(fo),] #restrict the table to CpGs (or input region) in any region set. - - BSDT[,regionID:=subjectHits(fo)] #record which region they overlapped. 
- #BSDT[queryHits(fo),regionID:=subjectHits(fo)] - #if (!keep.na) { - # BSDT = BSDT[queryHits(fo),] - #} - - if (is.null(jExpr)) { - cols=c(sumCols, keepCols) - funcs <- c(rep("sum", length(sumCols)), rep("unique", length(keepCols))) - jExpr <- buildJ(cols, funcs) - } - # message("jExpr: ", jExpr) - - # Define aggregation column. aggregate by region or by region group? - if (byRegionGroup) { - agCol <- "regionGroupID" - } else { - agCol <- "regionID" # Default - } - - # Build the by string - if (is.null(splitFactor)) { - byString <- paste0("list(regionID)") - } else { - byString <- paste0("list(", paste("regionID", - paste0(splitFactor, ""), - collapse=", ", sep=", "), ")") - } - - # Now actually do the aggregate: (aggregate within each region) - # message("Combining...") - bsCombined <- BSDT[,eval(parse(text=jExpr)), by=eval(parse(text=byString))] - setkey(bsCombined, regionID) - - if (meanPerRegion) { - setkey(region2group, regionID) - - avPerRegion <- merge(bsCombined, region2group) - avPerRegion[, c("regionID", - # "withinGroupID", - "regionGroupID") := NULL] - - if (returnQuantile) { - - for (i in seq_along(sumCols)) { - # perhaps this could be more efficient with mapply - # ecdf example: ecdf(distributionData)(getPercentileOfThis) - avPerRegion[, c(sumCols[i]) := ecdf(origBSDT[[sumCols[i]]])(avPerRegion[[sumCols[i]]])] - # I tried set() to improve performance but it took about the same time - } - - } - return(avPerRegion) - } - # Now aggregate across groups. - # I do this in 2 steps to avoid assigning regions to groups, - # which takes awhile. I think this preserves memory and is faster. - - # Define aggregation column. aggregate by region or by region group? - # byRegionGroup=TRUE means to aggregate within each individiual GRanges from - # regions.GRL - if (byRegionGroup) { - # must set allow=TRUE here in case there are multiple IDs (splitCol) - bsCombined[region2group, regionGroupID:=regionGroupID, allow=TRUE] - if (! is.null(splitFactor) ) { - byStringGroup <- paste0("list(", paste("regionGroupID", - paste0(splitFactor, - collapse=", "), - sep=", "), ")") - } else { - byStringGroup <- "list(regionGroupID)" - } - # aggregate by regionGroupID (region set/GRanges in regionsGRL) - bsCombined=bsCombined[,eval(parse(text=jExpr)), - by=eval(parse(text=byStringGroup))] - if (returnOLInfo) { - bsCombined[, signalCoverage := signalCoverage] - bsCombined[, regionSetCoverage := regionSetCoverage] - } - return(bsCombined) - } else { - e <- region2group[bsCombined,] - setkey(e, regionID) - return(e) - } - # WARNING: There are now 2^2 ways to aggregate, sum vs mean - # at each level: across regions, then across region sets. This - # doesn't give you a choice at this point. -} - - - -# Given a BSDT (bisulfite data.table), remove any entries that overlap -# regions given in the excludeGR argument and/or filter out sites -# that have lower than a minimum number of reads. -# @param BSDT Bisulfite data.table to filter -# @param minReads Require at least this level of coverage at a cpg. -# @param excludeGR GRanges object with regions to filter. -# -# @return The BSDT with appropriate regions removed. -BSFilter <- function(BSDT, minReads = 10, excludeGR = NULL) { - # First, filter for minimum reads. - if (minReads > 0) { - BSDT <- BSDT[coverage >= minReads, ] - } - if (NROW(BSDT) == 0) { return(data.table(NULL)) } - # Now, filter entries overlapping a region in excludeGR. 
- if (!is.null(excludeGR)) { - gr <- dtToGr(BSDT) - fo <- findOverlaps(gr, excludeGR) - qh <- unique(queryHits(fo)) - length(qh) - nrow(BSDT) - BSDT <- BSDT[-qh, ] - } - return(BSDT) -} - - -#' Get indices for top scored region sets -#' -#' For each target variable, get index of original region sets -#' but ordered by rsScores -#' ranking for each target variable. -#' The original index refers to that region set's position -#' in the `GRList` param given to `aggregateSignalGRList` which is -#' also that region set's -#' row index in the COCOA output. The first number in a given column -#' of this function's output will be the -#' original index of the region set ranked first for that target variable. -#' The second row for a -#' column will be the original index of the region set that ranked second -#' for that target variable, etc. You can use this function to make it easier -#' when you want to select the top region sets for further analysis or -#' just for sorting the results. Region set scores are sorted in decreasing -#' or increasing order according to the `decreasing` parameter. -#' -#' @template rsScores -#' @templateVar isRSRankingIndex TRUE -#' @template signalCol -#' @param decreasing Logical. Whether to sort rsScores in decreasing -#' or increasing order. -#' @param newColName Character. The names of the columns of the output data.frame. -#' The order should correspond to the order of the -#' input columns given by signalCol. -#' @return A data.frame with one column for each `signalCol`. -#' Column names are given by `signalCol` or `newColName` (if used). -#' Each column has been -#' sorted by score for region sets for that target variable -#' (order given by `decreasing` -#' param). -#' Original indices for region sets that were used to create rsScores -#' are given. Region sets with a score of NA are counted as having the -#' lowest scores and indices for these region sets will be at the bottom of the -#' returned data.frame (na.last=TRUE in sorting) -#' @examples data("rsScores") -#' rsRankInd = rsRankingIndex(rsScores=rsScores, -#' signalCol=c("PC1", "PC2")) -#' # region sets sorted by score for PC1 -#' rsScores[rsRankInd$PC1, ] -#' # region sets sorted by score for PC2 -#' rsScores[rsRankInd$PC2, ] -#' -#' @export -#' -rsRankingIndex <- function(rsScores, signalCol, - decreasing=TRUE, newColName = signalCol) { - - if (!(is(rsScores, "data.frame") || is(rsScores, "matrix"))) { - stop("rsScores should be a data.frame. 
Check object class.") - } - rsScores <- as.data.table(rsScores) - - if (!(is(signalCol, "character") | is(signalCol, "list"))) { - stop("signalCol should be a character vector or list of character vectors.") - } - if (is(signalCol, "list")) { - if (!all(vapply(X = signalCol, FUN = class, FUN.VALUE = "a") %in% "character")) { - stop("Items of signalCol should be character vectors.") - } - if (!length(unique(vapply(signalCol, FUN = length, FUN.VALUE = 2)))) { - stop("Items of signalCol should be the same length as each other.") - } - if (!all(unlist(signalCol) %in% colnames(rsScores))) { - stop("Some column names in signalCol are not present in rsScores.") - } - - # the first item in the list is taken for newColName - if (is(newColName, "list")) { - newColName <- signalCol[[1]] - } - - } else { # signalCol is character - if (!all(signalCol %in% colnames(rsScores))) { - stop("Some column names in signalCol are not present in rsScores.") - } - } - - - dtOrder <- rep(-99L, length(decreasing)) - # how to sort scores - # -1 for decreasing order of scores - dtOrder[decreasing] <- -1L - # +1 for increasing order of scores - dtOrder[!decreasing] <- 1L - - - dtOrder <- rep(-99L, length(decreasing)) - # how to sort scores - # -1 for decreasing order of scores - dtOrder[decreasing] <- -1L - # +1 for increasing order of scores - dtOrder[!decreasing] <- 1L - - # so by references changes will not be a problem - rsScores <- copy(rsScores) - rsScores[, rsIndex := seq_len(nrow(rsScores))] - - if (is(signalCol, "list")) { - if (length(newColName) != length(signalCol[[1]])) { - stop("newColName is not the same length as columns given in signalCol.") - } - - rsEnSortedInd <- subset(rsScores, select= signalCol[[1]]) - setnames(rsEnSortedInd, newColName) - - colNameMat <- do.call(rbind, signalCol) - - # then scores by each PC and make a column with the original index for sorted region sets - # this object will be used to pull out region sets that were top hits for each PC - for (i in seq_along(signalCol[[1]])) { - theseOrderCols <- colNameMat[, i] - - setorderv(rsScores, cols = theseOrderCols, order=dtOrder, na.last=TRUE) - - rsEnSortedInd[, newColName[i] := rsScores[, rsIndex]] - } - } else if (is(signalCol, "character")) { - - if (length(newColName) != length(signalCol)) { - stop("newColName is not the same length as columns given in signalCol.") - } - - signalCol <- signalCol[signalCol %in% colnames(rsScores)] - - rsEnSortedInd <- subset(rsScores, select= signalCol) - setnames(rsEnSortedInd, newColName) - - # then scores by each PC and make a column with the original index for sorted region sets - # this object will be used to pull out region sets that were top hits for each PC - for (i in seq_along(signalCol)) { - - - setorderv(rsScores, cols = signalCol[i], order=dtOrder, na.last=TRUE) - - rsEnSortedInd[, newColName[i] := rsScores[, rsIndex]] - } - - } else { - stop("signalCol should be a character vector or list of character vectors.") - } - - # reset order - # setorderv(rsScores, cols = "rsIndex", order=1L) - return(as.data.frame(rsEnSortedInd)) -} - -#################### Metric functions ######################################## -# scores, metrics, or statistical tests - -# Instead of averaging within regions first as BSAggregate does, -# this function does a simple average and standard deviation -# for all CpGs that overlap -# with regions of a region set, also does average and -# standard deviation for non overlapping CpGs. Created to -# get metrics of loading values for each PC. 
-# -# Faster if given total average for each column of interest -# -# @param dataDT a data.table with -# columns to get metrics of eg (PC1, PC2). All columns -# will be considered -# columns to get the metrics from so no unnecessary columns should be -# included. -# @param dataGR GRanges. Coordinates for dataDT. -# @template regionSet -# Metrics will be calculated on -# only coordinates within this region set (and optionally separately -# on those outside this region set with alsoNonOLMet parameter) -# @param signalCol the columns to calculate the metrics on. The -# metrics will be calculated on each one of these columns separately. -# @param metrics character vector with the name of a function or functions -# to calculate on selected cytosines. Function should only require one -# input which should be the values of the cytosines. -# @param alsoNonOLMet also include same metrics -# for non overlapping CpGs (still also returns metrics for overlapping CpGs) -# param columnMeans Not functional/is deprecated. The idea was to use -# the mean of the column to speed up calculations (eg when calculating -# mean of overlapping CpGs, use that info and column mean to get -# mean for non overlapping CpGs without manually calculating it) -# -signalOLMetrics <- function(dataDT, - dataGR, - regionSet, - signalCol = colnames(dataDT)[!(colnames(dataDT) %in% c("chr", "start", "end"))], - metrics=c("mean", "sd"), - alsoNonOLMet=TRUE, rsOL=NULL) { - - # convert DT to GR for finding overlaps - # dataGR <- BSdtToGRanges(list(dataDT))[[1]] - - if (is.null(rsOL)) { - OL <- findOverlaps(query = dataGR, subject = regionSet) - # region set info - totalRegionNumber <- length(regionSet) - meanRegionSize <- round(mean(width(regionSet)), 1) - } else { - OL <- rsOL - } - - # if no overlap, exit - if (length(OL) == 0) { - return(NULL) - } - - # get indices for overlapping and non overlapping CpGs - olCpG <- queryHits(OL) - - # get info on degree of overlap - # number of CpGs that overlap - signalCoverage <- length(unique(olCpG)) - # number of regions that overlap - regionSetCoverage <- length(unique(subjectHits(OL))) - - # gets metrics for all columns except chr, start, end - jExpr <- buildJ(cols=rep(signalCol, each=length(metrics)), - funcs=rep(metrics, length(signalCol)), - newColNames = paste0(rep(signalCol, - each=length(metrics)), - "_", metrics)) - - # getting the metrics - olMetrics <- as.data.frame(dataDT[olCpG, eval(parse(text=jExpr))]) - - # calculate average of nonOLCpGs based on columnMean if given - # if (!is.null()) - # - # formatting so there is one row per PC/testCol - # output is a matrix with ncol = length(metrics) - # for vapply, FUN.VALUE should have length equal to a single output of FUN - olResults <- vapply(X = metrics, - FUN = function(x) as.numeric(olMetrics[, grepl(pattern = x, colnames(olMetrics))]), - as.numeric(seq_along(signalCol))) - olResults <- as.data.table(olResults) - setnames(olResults, old = colnames(olResults), new = paste0(colnames(olResults), "_OL")) - - if (alsoNonOLMet) { - nonOLCpG <- (seq_len(nrow(dataDT)))[-olCpG] - # if no OL for region set, don't calculate for non region set - # TODO make conditional on region set having any overlap - nonOLMetrics <- as.data.frame(dataDT[nonOLCpG, eval(parse(text=jExpr))]) - - nonOLResults <- vapply(X = metrics, - FUN = function(x) as.numeric(nonOLMetrics[, grepl(pattern = x, colnames(nonOLMetrics))]), - as.numeric(seq_along(signalCol))) - nonOLResults <- as.data.table(nonOLResults) - setnames(nonOLResults, old = colnames(nonOLResults), new 
= paste0(colnames(nonOLResults), "_nonOL")) - - if (is.null(rsOL)) { - metricDT <- cbind(data.table(testCol=signalCol), - olResults, - nonOLResults, - data.table(signalCoverage, - regionSetCoverage, - totalRegionNumber, - meanRegionSize)) - } else { - metricDT <- cbind(data.table(testCol=signalCol), - olResults, - nonOLResults, - data.table(signalCoverage, - regionSetCoverage)) - } - } else { - if (is.null(rsOL)) { - metricDT <- cbind(data.table(testCol=signalCol), - olResults, - data.table(signalCoverage, - regionSetCoverage, - totalRegionNumber, - meanRegionSize)) - } else { - metricDT <- cbind(data.table(testCol=signalCol), - olResults, - data.table(signalCoverage, - regionSetCoverage)) - } - - } - return(metricDT) -} - - -# Wilcoxon rank sum test for a region set -# @param dataDT a data.table with chr, start, end columns as well -# as columns to get metrics of eg (PC1, PC2). All columns -# except chr, start, and end will be considered -# columns to get the metrics from so no unnecessary columns should be -# included. -# @template regionSet -# @param signalCol the columns of interest. You will do ranksum test separately -# on each of these columns (given test only uses info in one column) -# @param ... Additional parameters of wilcox.test function. See ?wilcox.test. -# For instance specify alternative hypothesis: alternative = "greater". -# @return A vector with a p value for each column other than chr, start or end. - -# @examples data("brcaLoadings1") -# data("brcaMCoord1") -# data("nrf1_chr1") -# dataDT = as.data.table(cbind(brcaMCoord1, brcaLoadings1)) -# rsWilcox(dataDT = dataDT, regionSet = nrf1_chr1, conf.int=TRUE) - -rsWilcox <- function(dataDT, - regionSet, - signalCol = colnames(dataDT)[!(colnames(dataDT) %in% c("chr", "start", "end"))], - conf.int=FALSE, - ...) { - - # region set info - totalRegionNumber <- length(regionSet) - meanRegionSize <- round(mean(width(regionSet)), 1) - - - # convert DT to GR for finding overlaps - dataGR <- BSdtToGRanges(list(dataDT))[[1]] - - OL <- findOverlaps(query = regionSet, subject = dataGR) - - # if no overlap, exit - if (length(OL) == 0) { - return(NULL) - } - - # get indices for overlapping and non overlapping CpGs - olCpG <- unique(subjectHits(OL)) - nonOLCpG <- (seq_len(nrow(dataDT)))[-olCpG] - - # get info on degree of overlap - # number of CpGs that overlap - signalCoverage <- length(unique(olCpG)) - # number of regions that overlap - regionSetCoverage <- length(unique(queryHits(OL))) - - - - # each confidence interval has length of 2: [low, high] - - if (conf.int) { - # calculate Wilcoxon rank sum test for each column - # additional parameters given with ... - # confIntervals will be [low1, high1, low2, high2, etc.] - confIntervals <- as.numeric(vapply(X = signalCol, FUN = function(x) wilcox.test(x = as.numeric(as.matrix(dataDT[olCpG, x, with=FALSE])), - y = as.numeric(as.matrix(dataDT[nonOLCpG, x, with=FALSE])), - conf.int = conf.int, ...)$conf.int, c(1, 1))) - - names(confIntervals) <- paste0(rep(signalCol, each=2), c("_low", "_high")) - wRes <- data.frame(t(confIntervals), - signalCoverage, - regionSetCoverage, - totalRegionNumber, - meanRegionSize) - - } else { - # calculate Wilcoxon rank sum test for each column - # additional parameters given with ... 
- pVals <- vapply(X = signalCol, FUN = function(x) wilcox.test(x = as.numeric(as.matrix(dataDT[olCpG, x, with=FALSE])), - y = as.numeric(as.matrix(dataDT[nonOLCpG, x, with=FALSE])), ...)$p.value, 1) - wRes <- data.frame(t(pVals), - signalCoverage, - regionSetCoverage, - totalRegionNumber, - meanRegionSize) - } - - return(wRes) -} - - -# I optimized upstream so that a matrix would be given to this function -# if this function is rewritten and no longer requires a matrix input, -# then in order to prevent unnecessary object copying, -# rewrite upstream code that converts signalDT to matrix class - -# @param signalMat Data to be aggregated (e.g. raw data: ATAC-seq, -# region based DNA methylation or loading values) -# @param signalGR GRanges object with coordinates for signalMat -# @template regionSet -# The region set to score. -# @param calcCols character object. Column names. A weighted sum will be done -# for each of these columns (columns should be numeric). -# @template rsOL -# @param pOlap see "?aggregateSignal" -# @template returnCovInfo -# @value Returns data.frame with columns 'calcCols', signalCoverage col has -# number of signalGR regions that overlapped with any regionSet regions, -# regionSetCoverage has the sum of all proportion overlaps of regions from -# signalGR with regionSet (regionSet region is denominator) -# containing weighted mean for each col. -# Returns NULL if there is no overlap between signalGR and regionSet - -regionOLWeightedMean <- function(signalMat, signalGR, - regionSet, calcCols, rsOL=NULL, - pOlap=NULL, returnCovInfo=TRUE) { - - if (!is(signalMat, "matrix")) { - signalMat <- as.matrix(signalMat) - } - - if (is.null(rsOL)) { - hits <- findOverlaps(query = signalGR, subject = regionSet) - } else { - hits <- rsOL - - } - - # if no overlap, return NULL - if (length(hits) == 0) { - return(NULL) - } - - if (is.null(pOlap)) { - olap <- pintersect(signalGR[queryHits(hits)], - regionSet[subjectHits(hits)]) - pOlap <- width(olap) / width(regionSet[subjectHits(hits)]) - } - - - # some rows may be duplicated if a signalMat region overlapped multiple - # regions from signalGR but that is ok - # signalMat <- signalMat[queryHits(hits), ] # done in next step to prevent extra copying - - # weight the signalMat values by the proportion overlap (weighted average) - weightedSum <- t(pOlap) %*% signalMat[queryHits(hits), calcCols] - - # weighted average - denom <- sum(pOlap) - weightedAve <- as.data.frame(weightedSum / denom) - colnames(weightedAve) <- calcCols - - # add columns for coverage info - if (returnCovInfo) { - weightedAve$signalCoverage = length(unique(queryHits(hits))) - weightedAve$regionSetCoverage = length(unique(subjectHits(hits))) - weightedAve$sumProportionOverlap = denom - } - - return(weightedAve) -} - - -# @param signalDT Data to be aggregated (e.g. raw data: ATAC-seq, -# region based DNA methylation or loading values) -# @param signalGR GRanges object with coordinates for signalDT -# @template regionSet -# The region set to score. -# @param calcCols character object. Column names. A mean will be calculated for -# each of these columns (columns should be numeric). -# @param metric character. 
"mean" or "median" -# @template rsOL -# @template returnCovInfo -# @value Returns data.frame with columns 'calcCols', signalCoverage col has -# number of signalGR regions that overlapped with any regionSet regions, -# regionSetCoverage has the number of regions from -# signalGR that overlapped with regionSet -# Returns NULL if there is no overlap between signalGR and regionSet - -regionOLMean <- function(signalDT, signalGR, regionSet, - calcCols, metric="mean", rsOL=NULL, returnCovInfo=TRUE) { - - if (is.null(rsOL)) { - hits <- findOverlaps(query = signalGR, subject = regionSet) - } else { - hits <- rsOL - } - # if no overlap, return NULL - if (length(hits) == 0) { - return(NULL) - } - - # some rows may be duplicated if a signalDT region overlapped multiple - # regions from signalGR but that is ok - signalDT <- signalDT[queryHits(hits), ] - - if (metric == "mean") { - # mean of the overlapping signalDT values - signalAve <- as.data.frame(t(colMeans(signalDT[,..calcCols]))) - } else if (metric == "median") { - # median of the overlapping signalDT values - signalAve <- as.data.frame(t(apply(X = signalDT[,..calcCols], 2, median))) - - } else { - stop("Error in regionOLMean function. Invalid metric specified.") - } - - if (returnCovInfo) { - # add columns for coverage info - signalAve$signalCoverage <- length(unique(queryHits(hits))) - signalAve$regionSetCoverage <- length(unique(subjectHits(hits))) - } - - return(signalAve) -} - -########################################################################## -# matrix scoring -# 1. make a region set matrix. The dimensions are nrows=nfeatures of -# epigenetic data segmentation (e.g. ATAC consensus peaks), ncol=nregionsets. -# It has a 1 for data regions that are overlapped by a given region set -# and a zero for data regions that do not overlap the region set -# This will produce an unweighted mean. -# 2. Multiply the region set matrix times the loading/correlation matrix. -# 3. Divide by total covered regions for that region set to get the mean. This -# is the region set score. - -# @template signalCoord -# @template GRList -# @value Returns a matrix where each column corresponds to one region set -# and rows are data regions -olToMat = function(signalCoord, GRList) { - # calculate overlaps only once - # region set must be subject to fit with scoring functions - olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, - subject = x)) - - # each column is a region set - rsMat = matrix(data = rep(0, length(GRList) * length(signalCoord)), - nrow=length(signalCoord)) - for (i in seq_along(GRList)) { - rsMat[unique(queryHits(olList[[i]])), i] = 1 - } - - colnames(rsMat) = names(GRList) - return(rsMat) - # totalRegionNumber = sapply(X = GRList, length) - # meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x)))) -} -rsMat = olToMat() - -dim(rsMat) -loadings=loadings[, c("PC1", "PC2")] -dim(loadings) - -loadings = abs(loadings) -rsScoresMatrix = t(rsMat) %*% loadings -covCount = colSums(rsMat) -rsScoresMatrix = apply(rsScoresMatrix, MARGIN = 2, FUN = function(x) x/covCount) -View(rsScoresMatrix[order(rsScoresMatrix[,"PC1"], decreasing = T),]) -View(rsScoresMatrix[order(rsScoresMatrix[,"PC2"], decreasing = T),]) - +# PACKAGE DOCUMENTATION +#' Coordinate Covariation Analysis (COCOA) +#' +#' +#' COCOA is a method for understanding epigenetic variation among samples. 
+#' COCOA can be used with epigenetic data that includes +#' genomic coordinates and an epigenetic signal, +#' such as DNA methylation and chromatin accessibility +#' data. +#' To describe the method on a high level, COCOA quantifies +#' inter-sample variation with either a supervised or unsupervised +#' technique then uses a database of "region sets" to +#' annotate the variation among samples. A region set is a set of +#' genomic regions that share a biological annotation, +#' for instance transcription factor (TF) binding regions, +#' histone modification regions, or open chromatin regions. +#' COCOA can identify region sets that are associated with +#' epigenetic variation between samples and +#' increase understanding of variation in your data. +# +# In contrast to some other common techniques, COCOA offers both +# supervised (known groups/phenotype) and unsupervised (no known groups/ +# phenotype) analyses. Also, COCOA focuses on continuous variation +# between samples instead of having cutoffs. Because of this, COCOA can +# be used as a complementary method alongside "differential" methods +# that find discrete differences between groups of samples and +# it can also be used in situations where there are no groups. +# COCOA can identify biologically meaningful +# sources of variation between samples and +#' +#' @docType package +#' @name COCOA +#' @author John Lawson +#' @author Nathan Sheffield +#' +#' @references \url{http://github.com/databio} +#' @importFrom ggplot2 ggplot aes facet_wrap geom_boxplot geom_jitter geom_line +#' theme_classic xlab ylab geom_hline ylim scale_color_discrete +#' scale_x_discrete scale_fill_brewer scale_color_manual +#' scale_color_brewer theme element_line element_text geom_point +#' @importFrom ComplexHeatmap Heatmap draw +#' @import BiocGenerics S4Vectors IRanges GenomicRanges simpleCache +#' @importFrom data.table ":=" setDT data.table setkey fread setnames +#' setcolorder rbindlist setattr setorder copy is.data.table +#' setorderv as.data.table +#' @importFrom Biobase sampleNames +#' @importFrom stats lm coefficients poly wilcox.test ecdf pgamma p.adjust +#' @importFrom methods is +#' @importFrom MIRA binRegion +#' @importFrom tidyr gather +#' @importFrom grid grid.newpage grid.grabExpr grid.draw popViewport +#' pushViewport unit viewport +#' @importFrom grDevices dev.off +#' @importFrom methods hasArg +#' @importFrom fitdistrplus fitdist +#' @importFrom simpleCache simpleCache +NULL + +# @importFrom ppcor pcor.test + + +# now package lists GenomicRanges in "Depends" instead of "Imports" in +# DESCRIPTION, still import package with @import though +# @importFrom GenomicRanges GRanges GRangesList elementMetadata strand +# seqnames granges + +# Because of some issues, +# (see here: http://stackoverflow.com/questions/9439256/) +# I have to register stuff used in data.table as non-standard evaluation, +# in order to pass some R CMD check NOTES. +if (getRversion() >= "2.15.1") { + utils::globalVariables(c( + ".", "..calcCols", "bin", "binID", "chr", "id", "colsToAnnotate", + "coordinateDT", + "coverage", "Group", "pOlap", "regionGroupID", "regionID", "theme", + "meanRegionSize", "regionSetCoverage", "rowIndex", "rsIndex", + "rsRegionID", "totalRegionNumber", "signalCoverage", ".SD", + "sumProportionOverlap")) +} + +######################################################################### + + +#' Score a region set using feature contribution scores +#' +#' First, this function identifies which epigenetic features +#' overlap the region set. 
+#' Then the region set is scored using the feature contribution scores
+#' (`signal` input)
+#' according to the `scoringMetric` parameter.
+#'
+#' @template signal
+#' @template signalCoord
+#' @template signalCoordType
+#' @templateVar refGenomeWarning TRUE
+#' @template regionSet
+#' @template signalCol
+#' @template scoringMetric
+#' @template verbose
+# Useful when using
+# aggregateSignal with 'apply' to do many region sets at a time.
+# @param wilcox.conf.int logical. Only applies when using "rankSum" scoring
+# method. Returns a 95% confidence interval from the Wilcoxon rank sum test
+# instead of a p value.
+#' @template absVal
+#' @template rsOL
+#' @param pOlap Numeric vector. Only used if rsOL is given and scoringMetric
+#' is "proportionWeightedMean". This vector should contain the proportion of
+#' each regionSet region that is overlapped by a signalCoord region. The
+#' order of pOlap should be the same as the overlaps in rsOL.
+#' @param rsOLMat Matrix. A precomputed region set overlap matrix (see
+#' olToMat): one row per signalCoord region, one column per region set, with
+#' a 1 where the signalCoord region overlaps that region set and 0 otherwise.
+#' Only used for "multiBase" data; this interface is work in progress.
+#' @template returnCovInfo
+#' @template checkInput

+#' @return A data.frame with one row and the following
+#' columns: one column for each item of signalCol with names given
+#' by signalCol. These columns have scores for the region set for each signalCol.
+#' Other columns: signalCoverage (formerly cytosine_coverage) which
+#' has the number of epigenetic features that overlapped at all with regionSet,
+#' regionSetCoverage which has the number of regions from regionSet
+#' that overlapped any of the epigenetic features,
+#' totalRegionNumber which has the
+#' number of regions in regionSet, and meanRegionSize which has the average
+#' size in base pairs of regions in regionSet; the average is based on
+#' all regions in regionSet and not just ones that overlap.
+#' For "multiBase" data, if the "proportionWeightedMean" scoring metric
+#' is used, then the output will also have a "sumProportionOverlap" column.
+#' During this scoring method, the proportion overlap between each signalCoord
+#' region and overlapping regionSet region is calculated. This column is
+#' the sum of all those proportion overlaps and is another way to quantify
+#' coverage of regionSet in addition to regionSetCoverage. 
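+#' For "multiBase" data, a precomputed region set overlap matrix can be
+#' supplied as rsOLMat (see olToMat) so the score is computed by matrix
+#' multiplication; this path is work in progress and currently returns
+#' signalCoverage but not the other coverage columns.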
+#' +#' @examples +#' data("brcaATACCoord1") +#' data("brcaATACData1") +#' data("esr1_chr1") +#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation +#' rsScores <- aggregateSignal(signal=featureContributionScores, +#' signalCoord=brcaATACCoord1, +#' regionSet=esr1_chr1, +#' signalCol=c("PC1", "PC2"), +#' scoringMetric="default") +#' @export + +aggregateSignal <- function(signal, + signalCoord, + regionSet, + signalCol = c("PC1", "PC2"), + signalCoordType = "default", + scoringMetric = "default", + verbose = FALSE, + absVal=TRUE, + rsOL=NULL, pOlap=NULL, + rsOLMat=NULL, + returnCovInfo=TRUE, + .checkInput=TRUE) { + + ################### checking inputs ################################# + + # if it was already checked outside this function, don't need to re-check + if (.checkInput) { + ########## check that inputs are the correct class + checkConvertInputClasses(signal=signal, + signalCoord=signalCoord, + regionSet=regionSet, + signalCol=signalCol, + rsOL=rsOL) + + ########## check that dimensions of inputs are consistent + # length of signal coord = nrow of signal + if (length(signalCoord) != nrow(signal)) { + stop(cleanws("The number of coordinates in + signalCoord (length(signalCoord)) does not equal the number of + rows in `signal`")) + } + + ######### check that appropriate columns are present + # signalCol are column names of signal + if (!all(signalCol %in% colnames(signal))) { + missingCols = signalCol[!(signalCol %in% colnames(signal))] + stop(cleanws(paste0("Some signalCol are not + columns of signal: ", missingCols))) + } + + ######## check that scoringMetric is appropriate + + if (!(scoringMetric %in% getScoringMethods("both"))) { + stop(cleanws("scoringMetric was not recognized. + Check spelling and available options.")) + } + + ###### check that signalCoordType is appropriate + if (!(signalCoordType %in% c("default", "singleBase", "multiBase"))) { + stop(cleanws("signalCoordType not recognized. + Check spelling/capitalization.")) + } + + ####### + # what happens if there are NAs or Inf in `signal`? + # any NAs that overlap the regionSet will cause the score to be NA + if (is(signal, "data.table")) { + naRows = apply(X = signal[, signalCol, with=FALSE, drop=FALSE], + MARGIN = 1, FUN = function(x) any(is.na(x))) + } else { + naRows = apply(X = signal[, signalCol, drop=FALSE], + MARGIN = 1, FUN = function(x) any(is.na(x))) + } + + if (any(naRows)) { + signal <- signal[!naRows, ] + signalCoord <- signalCoord[!naRows] + warning("Removing rows with NA from `signal`") + } + + ################################################################# + + # detect signalCoordType + if (signalCoordType == "default") { + + # when signalCoord is a GRanges object + if (any(start(signalCoord) != end(signalCoord))) { + signalCoordType <- "multiBase" + } else { + signalCoordType <- "singleBase" + } + } + + # if "default" scoring method is given, choose based on signalCoordType + if (scoringMetric == "default") { + if (signalCoordType == "singleBase") { + scoringMetric <- "regionMean" + } else if (signalCoordType == "multiBase") { + scoringMetric <- "proportionWeightedMean" + } else { + stop(cleanws("signalCoordType not recognized. 
+                        Check spelling/capitalization."))
+        }
+    }
+    
+    # make sure that scoringMetric is consistent with signalCoordType
+    if (signalCoordType == "singleBase") {
+        if (!(scoringMetric %in% getScoringMethods("singleBase"))) {
+            stop("The scoringMetric you selected is not available for
+                 this data's signalCoordType")
+        }
+    } else if (signalCoordType == "multiBase") {
+        if (!(scoringMetric %in% getScoringMethods("multiBase"))) {
+            stop("The scoringMetric you selected is not available for
+                 this data's signalCoordType")
+        }
+    }
+    
+    }
+    ################### finished checking inputs #########################
+    
+    numOfRegions <- length(regionSet)
+    totalCpGs <- nrow(signal)
+    
+    
+    #### UPDATE: only do this once, in outermost function possible #####
+    # extreme positive or negative values both give important information
+    # take absolute value or not
+    if (absVal) {
+        signal <- abs(signal) # required for later code
+    }
+    
+    # XX copies unnecessarily: reformat into data.table with chromosome location and weight
+    
+    # make sure `signal` is the correct type for further steps
+    # (proportionWeightedMean method requires a matrix)
+    if (!is(signal, "data.table") && (scoringMetric != "proportionWeightedMean")) {
+        signal <- as.data.table(signal)
+    } else if (!is(signal, "matrix") && (scoringMetric == "proportionWeightedMean")) {
+        signal <- as.matrix(signal)
+    }
+    
+    # restricting to signalCol so unnecessary computations
+    # are not done
+    if (is(signal, "data.table")) {
+        loadingDT <- signal[, signalCol, with=FALSE]
+        # # naming does not work if only using one PC so add this line for that case
+        # setnames(loadingDT, signalCol)
+    } else {
+        loadingDT <- signal[, signalCol, drop=FALSE]
+    }
+    ######## UPDATE: can above code be done only once, like input checking?
+    ########
+    
+    
+    ###########################################################################
+    # scoring
+    
+    # would rounding speed up aggregation? It could potentially make a
+    # sparse matrix if a lot of entries became 0
+    
+    # the scoring metrics that support matrix scoring
+    # (WIP: scores via the precomputed overlap matrix when one is supplied,
+    # otherwise falls through to the per-region-set methods below)
+    if (!is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", 
+                                                   "regionMean", 
+                                                   "proportionWeightedMean"))) {
+        
+        # sum of signal values per region set: one row per region set,
+        # one column per signalCol
+        rsScoresMatrix <- t(rsOLMat) %*% as.matrix(loadingDT)
+        # normalize by the number of covered data regions to get the mean
+        covCount <- colSums(rsOLMat)
+        results <- as.data.frame(apply(rsScoresMatrix, 
+                                       MARGIN = 2, 
+                                       FUN = function(x) x/covCount))
+        results$signalCoverage <- covCount
+        # rsOLMat does not give the regionSetCoverage, totalRegionNumber
+        # or meanRegionSize
+        return(results)
+        
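+        # Toy illustration of the matrix step above (hypothetical values):
+        # with three data regions and two region sets,
+        # rsOLMat = rbind(c(1, 0), c(1, 1), c(0, 1)) and one signalCol with
+        # values c(0.2, 0.4, 0.6), t(rsOLMat) %*% values gives the per-set
+        # sums c(0.6, 1.0) and colSums(rsOLMat) = c(2, 2), so the mean
+        # scores are c(0.3, 0.5).
+        
+        # add these columns? 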
+ # results[, signalCoverage := 0] + # results[, regionSetCoverage := 0] + # results[, totalRegionNumber := numOfRegions] + # results[, meanRegionSize := round(mean(width(regionSet)), 1)] + + + + + } + + + # works for both singleBase and multiBase (UPDATE: did, but not matrix scoring) + if (scoringMetric == "simpleMean") { + + loadAgMain <- as.data.table(regionOLMean(signalDT = loadingDT, + signalGR = signalCoord, + regionSet = regionSet, + calcCols= signalCol, + metric = "mean", + rsOL = rsOL, + returnCovInfo=returnCovInfo)) + results <- .formatResults(loadAgMain, scoringMetric = scoringMetric, + regionSet=regionSet, signalCol = signalCol, + returnCovInfo=returnCovInfo) + + } else if (scoringMetric == "simpleMedian") { + # scoring singleBase and multiBase both with this function for + # simpleMedian + loadAgMain <- as.data.table(regionOLMean(signalDT = loadingDT, + signalGR = signalCoord, + regionSet = regionSet, + calcCols= signalCol, + metric = "median", + rsOL=rsOL, + returnCovInfo = returnCovInfo)) + + results <- .formatResults(loadAgMain, scoringMetric = scoringMetric, + regionSet=regionSet, signalCol = signalCol, + returnCovInfo=returnCovInfo) + + + } else if (signalCoordType == "singleBase") { + # do the actual aggregation + if (scoringMetric == "regionMean") { + + # specify aggregation operation + # will be done separately for each PC specified + aggrCommand <- buildJ(signalCol, + rep("mean", length(signalCol))) + # previously used BSAggregate from RGenomeUtils but now using local, + # modified copy + loadAgMain <- BSAggregate(BSDT = loadingDT, + regionsGRL = regionSet, + BSCoord = signalCoord, + jExpr = aggrCommand, + byRegionGroup = TRUE, + splitFactor = NULL, + returnOLInfo = returnCovInfo, + rsOL=rsOL) + + results <- .formatResults(loadAgMain, + scoringMetric = scoringMetric, + regionSet=regionSet, signalCol = signalCol, + returnCovInfo=returnCovInfo) + + } else if (scoringMetric == "regionMedian") { + + aggrCommand <- buildJ(signalCol, + rep("median", length(signalCol))) + loadAgMain <- BSAggregate(BSDT = loadingDT, + regionsGRL = regionSet, + BSCoord = signalCoord, + jExpr = aggrCommand, + byRegionGroup = TRUE, + splitFactor = NULL, + returnOLInfo = returnCovInfo, + rsOL=rsOL) + + results <- .formatResults(loadAgMain, + scoringMetric = scoringMetric, + regionSet=regionSet, signalCol = signalCol, + returnCovInfo=returnCovInfo) + + } + + } else { + # signalCoordType == "multiBase" + # for ATAC-seq + if (scoringMetric == "proportionWeightedMean") { + loadAgMain <- regionOLWeightedMean(signalMat = loadingDT, + signalGR = signalCoord, + regionSet = regionSet, + calcCols= signalCol, + rsOL = rsOL, + pOlap = pOlap, + returnCovInfo=returnCovInfo) + results <- .formatResults(as.data.table(loadAgMain), + scoringMetric = scoringMetric, + regionSet=regionSet, signalCol = signalCol, + returnCovInfo=returnCovInfo) + + + } + } + + # signalOLMetrics() # make sure it works with no overlap + if (verbose) { + message("|") + } + + return(as.data.frame(results)) +} + + +# format results of scoring functions in aggregateSignal() +.formatResults <- function(loadAgMain, scoringMetric, + regionSet, signalCol, returnCovInfo=TRUE) { + + numOfRegions <- length(regionSet) + + # if no cytosines from loadings were included in regionSet, result is NA + if (is.null(loadAgMain)) { + results <- as.data.table(t(rep(NA, length(signalCol)))) + setnames(results, signalCol) + if (returnCovInfo) { + results[, signalCoverage := 0] + results[, regionSetCoverage := 0] + results[, totalRegionNumber := numOfRegions] + 
results[, meanRegionSize := round(mean(width(regionSet)), 1)]
+            
+            # this column is only added by this scoring method
+            if (scoringMetric == "proportionWeightedMean") {
+                results[, sumProportionOverlap := 0]
+            }
+        }
+    } else {
+        # regionMean, regionMedian, simpleMean for region data
+        results <- loadAgMain[, .SD, .SDcols = signalCol]
+        
+        if (returnCovInfo) {
+            results[, signalCoverage := loadAgMain[, .SD, .SDcols = "signalCoverage"]]
+            results[, regionSetCoverage := loadAgMain[, .SD, .SDcols = "regionSetCoverage"]]
+            results[, totalRegionNumber := numOfRegions]
+            results[, meanRegionSize := round(mean(width(regionSet)), 1)]
+            ################################
+            # proportionWeightedMean
+            # this column is only added by this scoring method
+            if (scoringMetric == "proportionWeightedMean") {
+                results[, sumProportionOverlap := loadAgMain[, .SD, .SDcols = "sumProportionOverlap"]]
+            }
+        }
+        
+        ##############################
+        # simple mean for base pair data
+    }
+    return(results)
+}
+
+
+#' Score many region sets
+#'
+#' This function will give each region set a score for each target variable
+#' given by `signalCol` based on
+#' the `scoringMetric` parameter. Based on these scores, you can determine
+#' which region sets out of a region set database (given by `GRList`)
+#' are most associated with the target variables. See the vignette "Introduction
+#' to Coordinate Covariation Analysis" for help interpreting your
+#' results.
+#'
+#' @template signal
+#' @template signalCoord
+#' @template signalCoordType
+#' @template GRList
+#' @template signalCol
+#' @template scoringMetric
+#' @template verbose
+# @param wilcox.conf.int logical. Only applies when using "rankSum" scoring
+# method. Returns a 95% confidence interval from the Wilcoxon rank sum test
+# instead of a p value.
+#' @template absVal
+#' @template olList
+#' @template pOlapList
+#' @param rsOLMat Matrix. A precomputed region set overlap matrix, as built
+#' by olToMat: one row per signalCoord region, one column per region set in
+#' GRList. If NULL and the data are "multiBase", it is computed from GRList
+#' (work in progress).
+#' @template returnCovInfo
+#' @return Data.frame of results, one row for each region set.
+#' It has the following columns:
+#' one column for each item of signalCol with names given
+#' by signalCol. These columns have scores for the region set for each signalCol.
+#' Other columns: signalCoverage (formerly cytosine_coverage) which
+#' has the number of epigenetic features that overlapped at all with regionSet,
+#' regionSetCoverage which has the number of regions from regionSet
+#' that overlapped any of the epigenetic features,
+#' totalRegionNumber which has the
+#' number of regions in regionSet, and meanRegionSize which has the average
+#' size in base pairs of regions in regionSet; the average is based on
+#' all regions in regionSet and not just ones that overlap.
+#' For "multiBase" data, if the "proportionWeightedMean" scoring metric
+#' is used, then the output will also have a "sumProportionOverlap" column.
+#' During this scoring method, the proportion overlap between each signalCoord
+#' region and overlapping regionSet region is calculated. This column is
+#' the sum of all those proportion overlaps and is another way to quantify
+#' coverage of regionSet in addition to regionSetCoverage. 
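+#' For "multiBase" data, region set overlaps can be precomputed once with
+#' olToMat and passed as rsOLMat, scoring all region sets with a single
+#' matrix multiplication (work in progress; a commented usage sketch
+#' follows this function).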
+#' +#' +#' @examples +#' data("brcaATACCoord1") +#' data("brcaATACData1") +#' data("esr1_chr1") +#' data("nrf1_chr1") +#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation +#' GRList <- GRangesList(esr1_chr1, nrf1_chr1) +#' rsScores <- aggregateSignalGRList(signal=featureContributionScores, +#' signalCoord=brcaATACCoord1, +#' GRList= GRList, +#' signalCol=c("PC1", "PC2"), +#' scoringMetric="default") +#' +#' @export + +aggregateSignalGRList <- function(signal, + signalCoord, + GRList, + signalCol = c("PC1", "PC2"), + signalCoordType = "default", + scoringMetric = "default", + verbose = TRUE, + absVal=TRUE, olList=NULL, + pOlapList=NULL, rsOLMat=NULL, + returnCovInfo=TRUE) { + + ################### checking inputs ################################# + + ########## check that inputs are the correct class + checkConvertInputClasses(signal=signal, + signalCoord=signalCoord, + regionSet=NULL, + signalCol = signalCol, + GRList=GRList, + olList=olList) + + ########## check that dimensions of inputs are consistent + # length of signal coord = nrow of signal + if (length(signalCoord) != nrow(signal)) { + stop(cleanws("The number of coordinates in + signalCoord (length(signalCoord)) does not equal the number of + rows in `signal`")) + } + + ######### check that appropriate columns are present + # signalCol are column names of signal + if (!all(signalCol %in% colnames(signal))) { + missingCols = signalCol[!(signalCol %in% colnames(signal))] + stop(cleanws(paste0("Some signalCol are not + columns of signal: ", missingCols))) + } + + ######## check that scoringMetric is appropriate + + if (!(scoringMetric %in% getScoringMethods("both"))) { + stop(cleanws("scoringMetric was not recognized. + Check spelling and available options.")) + } + + ###### check that signalCoordType is appropriate + if (!(signalCoordType %in% c("default", "singleBase", "multiBase"))) { + stop(cleanws("signalCoordType not recognized. + Check spelling/capitalization.")) + } + + ####### + # what happens if there are NAs or Inf in `signal`? + # any NAs that overlap the regionSet will cause the score to be NA + if (is(signal, "data.table")) { + naRows = apply(X = signal[, signalCol, with=FALSE, drop=FALSE], + MARGIN = 1, FUN = function(x) any(is.na(x))) + } else { + naRows = apply(X = signal[, signalCol, drop=FALSE], + MARGIN = 1, FUN = function(x) any(is.na(x))) + } + + if (any(naRows)) { + signal <- signal[!naRows, ] + signalCoord <- signalCoord[!naRows] + warning("Removing rows with NA from `signal`") + } + + ################################################################# + + # detect signalCoordType + if (signalCoordType == "default") { + # when signalCoord is a GRanges object + if (any(start(signalCoord) != end(signalCoord))) { + signalCoordType <- "multiBase" + } else { + signalCoordType <- "singleBase" + } + } + + # if "default" scoring method is given, choose based on signalCoordType + if (scoringMetric == "default") { + if (signalCoordType == "singleBase") { + scoringMetric <- "regionMean" + } else if (signalCoordType == "multiBase") { + scoringMetric <- "proportionWeightedMean" + } else { + stop(cleanws("signalCoordType not recognized. 
+                        Check spelling/capitalization."))
+        }
+    }
+    
+    # create region set overlap matrix (only used for multiBase)
+    # WIP: computed here but not yet passed through to aggregateSignal
+    if (is.null(rsOLMat) && (signalCoordType == "multiBase")) {
+        rsOLMat <- olToMat(signalCoord = signalCoord, GRList = GRList)
+    }
+    
+    
+    # convert object class outside aggregateSignal to prevent extra copying
+    # (one scoring method needs `signal` as a matrix though)
+    if (!is(signal, "data.table") && (scoringMetric != "proportionWeightedMean")) {
+        signal <- as.data.table(signal)
+    } else if (!is(signal, "matrix") && (scoringMetric == "proportionWeightedMean")) {
+        signal <- as.matrix(signal)
+    }
+    
+    
+    # take absolute value outside aggregateSignal to prevent extra copying
+    if (absVal) {
+        if (is(signal, "data.table")) {
+            signal[, signalCol] <- abs(signal[, signalCol, with=FALSE])
+        } else {
+            signal[, signalCol] <- abs(signal[, signalCol])
+        }
+        absVal <- FALSE
+    }
+    
+    if (is.null(olList)) {
+        # apply over the list of region sets
+        resultsList <- lapplyAlias(GRList,
+                                   function(x) aggregateSignal(
+                                       signal = signal,
+                                       signalCoord = signalCoord,
+                                       signalCoordType = signalCoordType,
+                                       regionSet = x,
+                                       signalCol = signalCol,
+                                       scoringMetric = scoringMetric,
+                                       verbose = verbose,
+                                       absVal = absVal, pOlap=NULL,
+                                       returnCovInfo=returnCovInfo))
+        # resultsList <- mapply(FUN = function(x) aggregateSignal(
+        #     signal = signal,
+        #     signalCoord = signalCoord,
+        #     signalCoordType = signalCoordType,
+        #     regionSet = x,
+        #     signalCol = signalCol,
+        #     scoringMetric = scoringMetric,
+        #     verbose = verbose,
+        #     absVal = absVal, pOlap=y,
+        #     returnCovInfo=returnCovInfo), x=GRList, y=pOlapList)
+        #wilcox.conf.int = wilcox.conf.int,
+    } else {
+        # # apply over the list of region sets
+        # resultsList <- lapplyAlias(olList,
+        #                            function(x) aggregateSignal(
+        #                                signal = signal,
+        #                                signalCoord = signalCoord,
+        #                                signalCoordType = signalCoordType,
+        #                                regionSet = NULL,
+        #                                signalCol = signalCol,
+        #                                scoringMetric = scoringMetric,
+        #                                verbose = verbose,
+        #                                absVal = absVal,
+        #                                rsOL=x, pOlap=pOlapList, returnCovInfo=returnCovInfo))
+        # #wilcox.conf.int = wilcox.conf.int,
+        if (is.null(pOlapList)) {
+            # mapply does not work if pOlapList is NULL and the other
+            # argument is not NULL
+            resultsList <- lapplyAlias(olList, FUN = function(x) aggregateSignal(
+                signal = signal,
+                signalCoord = signalCoord,
+                signalCoordType = signalCoordType,
+                regionSet = NULL,
+                signalCol = signalCol,
+                scoringMetric = scoringMetric,
+                verbose = verbose,
+                absVal = absVal,
+                rsOL=x,
+                returnCovInfo=returnCovInfo))
+        } else {
+            # SIMPLIFY=FALSE keeps a list of data.frames for rbind below
+            resultsList <- mapply(FUN = function(x, y) aggregateSignal(
+                signal = signal,
+                signalCoord = signalCoord,
+                signalCoordType = signalCoordType,
+                regionSet = NULL,
+                signalCol = signalCol,
+                scoringMetric = scoringMetric,
+                verbose = verbose,
+                absVal = absVal,
+                rsOL=x, pOlap=y,
+                returnCovInfo=returnCovInfo),
+                x=olList, y=pOlapList, SIMPLIFY = FALSE)
+        }
+        
+        
+    }
+    
+    resultsDT <- do.call(rbind, resultsList)
+    
+    # # add names if they are present
+    # if (!is.null(names(GRList))) {
+    #     # resultsDT[, rsName := names(GRList)]
+    #     row.names(resultsDT) <- names(GRList)
+    # }
+    
+    # rsName <- row.names(resultsDT)
+    resultsDF <- as.data.frame(resultsDT) #, row.names = rsName)
+    
+    return(resultsDF)
+}
+
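+# A minimal usage sketch of the matrix scoring path (not run; the rsOLMat
+# wiring above is work in progress, so this illustrates the intended idea
+# rather than a finished interface). olToMat() builds the overlap matrix
+# once so every region set is scored with one matrix multiplication:
+# olMat <- olToMat(signalCoord = brcaATACCoord1,
+#                  GRList = GRangesList(esr1_chr1, nrf1_chr1))
+# fcs <- abs(prcomp(t(brcaATACData1))$rotation[, c("PC1", "PC2")])
+# rsScores <- (t(olMat) %*% fcs) / colSums(olMat) # mean score per region set
+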
+# @param dataMat columns of dataMat should be samples/patients, rows should be genomic signal
+# (each row corresponds to one genomic coordinate/range)
+# @param featureMat Rows should be samples, columns should be "features"
+# (whatever you want to get correlation with: eg PC scores).
+# All columns in featureMat will be used, so subset featureMat before
+# passing it to this function if you do not want to use all columns.
+# @param centerDataMat logical object. Should rows in dataMat be centered based on
+# their means? (subtracting row mean from each row)
+# @param centerFeatureMat logical. Should columns in featureMat be centered based
+# on their means? (subtract column mean from each column)
+# @param testType character object. Can be "cor" (Pearson correlation),
+# "spearmanCor" (Spearman correlation),
+# "pcor" (partial correlation, currently commented out), or
+# "cov" (Pearson covariance).
+# @param covariate Only needed for the commented-out "pcor" option, where it
+# would be passed to pcor.test as the z argument. Not currently used.
+#
+# If a row in dataMat has 0 standard deviation, correlation will be set to 0
+# instead of NA as would be done by cor()
+#
+# @return returns a matrix where rows are the genomic signal (eg a CpG or region) and
+# columns are the columns of featureMat
+# @examples dataMat = matrix(rnorm(50), 5, 10)
+# featureMat = matrix(rnorm(20), 10, 2)
+createCorFeatureMat <- function(dataMat, featureMat, 
+                                centerDataMat=TRUE, centerFeatureMat = TRUE, 
+                                testType="cor", covariate=NULL) {
+    
+    featureMat <- as.matrix(featureMat)
+    featureNames <- colnames(featureMat)
+    nFeatures <- ncol(featureMat)
+    nDataDims <- nrow(dataMat)
+    
+    if (centerDataMat) {
+        cpgMeans <- rowMeans(dataMat, na.rm = TRUE)
+        # centering before calculating correlation
+        dataMat <- apply(X = dataMat, MARGIN = 2, function(x) x - cpgMeans)
+        
+    }
+    
+    if (centerFeatureMat) {
+        featureMeans <- colMeans(featureMat, na.rm = TRUE)
+        # centering before calculating correlation (also, t() converts to matrix)
+        featureMat <- t(apply(X = t(featureMat), MARGIN = 2, function(x) x - featureMeans))
+        if (dim(featureMat)[1] == 1) {
+            featureMat <- t(featureMat)
+        }
+    }
+    
+    # avoid this copy and/or delay transpose until after calculating correlation?
+    dataMat <- as.data.frame(t(dataMat))
+    
+    
+    if (testType == "cor") {
+        # create feature correlation matrix with PCs (rows: features/CpGs, columns: PCs)
+        # how much do features correlate with each PC?
+        
+        # put epigenetic data first in cor()
+        featurePCCor <- cor(dataMat, featureMat, use="pairwise.complete.obs", method="pearson")
+        
+    } else if (testType == "spearmanCor") {
+        # xtfrm(x) ranking
+        featurePCCor <- cor(dataMat, featureMat, use="pairwise.complete.obs", method="spearman")
+        
+        # } else if (testType == "pcor") {
+        #     # partial correlation (account for covariates), ppcor package
+        #     
+        #     featurePCCor <- apply(X = featureMat, MARGIN = 2, function(y) apply(X = dataMat, 2, 
+        #                           FUN = function(x) pcor.test(x = x, y=y, 
+        #                                                       z=covariate, 
+        #                                                       method="pearson")$estimate))
+        #     
+    } else if (testType == "cov") {
+        featurePCCor <- cov(dataMat, featureMat, use="pairwise.complete.obs")
+        
+    } else {
+        stop("invalid testType")
+    }
+    
+    
+    # if standard deviation of the data was zero, NA will be produced
+    # set to 0 because no standard deviation means no correlation with attribute of interest
+    featurePCCor[is.na(featurePCCor)] <- 0
+    colnames(featurePCCor) <- featureNames
+    
+    return(featurePCCor)
+    # corLoadRatio <- signal[, signalCol] / featurePCCor
+    # hist(corLoadRatio[, "PC10"])
+}
+
+
+#' Create a "meta-region" profile
+#'
+#' This profile can show enrichment
+#' of genomic signals with high feature contribution scores
+#' in the region set but not in the
+#' surrounding genome, suggesting that variation is linked specifically
+#' to that region set.
+#'
+#' All regions in a given region set
+#' are combined into a single aggregate profile. Regions in `regionSet`
+#' should be
+#' expanded on each side to include a wider area of the genome around
+#' the regions of interest (see example and vignettes). 
+#' To make the profile, first we optionally take +#' the absolute value of `signal` (`absVal` parameter). +#' Then each expanded regionSet region is +#' split into `binNum` bins. The corresponding +#' bins from each region +#' (e.g. all bin1's, all bin2's, etc.) are grouped. +#' All overlapping values from `signal` are +#' aggregated in each bin group according to the `aggrMethod` parameter to +#' get a meta-region profile. Since DNA strand information is not considered, +#' the profile is averaged symmetrically around the center. +#' A peak in the middle of this profile suggests +#' that variability is specific to the region set of interest and is +#' not a product of the surrounding genome. A region set can still be +#' significant even if it does not have a peak. For example, some +#' histone modification region sets may be in large genomic blocks +#' and not show a peak, despite having variation across samples. +#' +#' @template signal +#' @template signalCoord +#' @template regionSet +#' @template signalCol +#' @template signalCoordType +#' @param binNum Number of bins to split each region into when +#' making the aggregate profile. More bins will +#' give a higher resolution but perhaps more noisy profile. +#' @template verbose +#' @templateVar usesAggrMethod TRUE +#' @template scoringMetric +#' @template absVal +#' @return A data.frame with the binned meta-region profile, +#' one row per bin. columns: binID and one column for each target variable +#' in signalCol. The function will return NULL if there +#' is no overlap between signalCoord and any of the bin groups that come +#' from regionSet (e.g. none of the bin1's overlapped signalCoord, +#' NULL returned). +#' +#' @examples +#' data("brcaATACCoord1") +#' data("brcaATACData1") +#' data("esr1_chr1") +#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation +#' esr1_chr1_expanded <- resize(esr1_chr1, 12000, fix="center") +#' mrProfile <- getMetaRegionProfile(signal=featureContributionScores, +#' signalCoord=brcaATACCoord1, +#' regionSet=esr1_chr1_expanded, +#' signalCol=c("PC1", "PC2"), +#' binNum=21) +#' @export + +getMetaRegionProfile <- function(signal, signalCoord, regionSet, + signalCol = c("PC1", "PC2"), + signalCoordType = "default", + binNum = 21, + verbose=TRUE, + aggrMethod = "default", absVal=TRUE) { + + ################### checking inputs ################################# + + ########## check that inputs are the correct class, convert + checkConvertInputClasses(signal=signal, + signalCoord=signalCoord, + regionSet=regionSet, + signalCol = signalCol) + + ########## check that dimensions of inputs are consistent + # length of signal coord = nrow of signal + if (length(signalCoord) != nrow(signal)) { + stop(cleanws("The number of coordinates in + signalCoord (length(signalCoord)) does not equal the number of + rows in `signal`")) + } + + ######### check that appropriate columns are present + # signalCol are column names of `signal` + if (!all(signalCol %in% colnames(signal))) { + missingCols = signalCol[!(signalCol %in% colnames(signal))] + stop(cleanws(paste0("Some signalCol are not + columns of signal: ", missingCols))) + } + + + ######## check that aggregation method is appropriate + + if (!(aggrMethod %in% getScoringMethods("metaRegionProfile"))) { + stop(cleanws("scoringMetric was not recognized. 
+ Check spelling and available options.")) + } + + ###### check that signalCoordType is appropriate + if (!(signalCoordType %in% c("default", "singleBase", "multiBase"))) { + stop(cleanws("signalCoordType not recognized. + Check spelling/capitalization.")) + } + + ####### + # what happens if there are NAs or Inf in `signal`? + + ################################################################# + + # detect signalCoordType + if (signalCoordType == "default") { + # when signalCoord is a GRanges object + if (any(start(signalCoord) != end(signalCoord))) { + signalCoordType <- "multiBase" + } else { + signalCoordType <- "singleBase" + } + } + + # if "default" aggregation method is given, choose based on signalCoordType + if (aggrMethod == "default") { + if (signalCoordType == "singleBase") { + aggrMethod <- "regionMean" + } else if (signalCoordType == "multiBase") { + aggrMethod <- "proportionWeightedMean" + } else { + stop(cleanws("signalCoordType not recognized. + Check spelling/capitalization.")) + } + } + + + + + + ################################################################## + # take absolute value or not + if (absVal) { + loadingDT <- as.data.table(abs(signal)) + } else { + loadingDT <- as.data.table(signal) + } + + GRDT <- grToDt(regionSet) + + loadProf <- BSBinAggregate(BSDT = loadingDT, + rangeDT = GRDT, + binCount = binNum, + BSCoord = signalCoord, + byRegionGroup = TRUE, + splitFactor = NULL, + signalCol = signalCol, + aggrMethod = aggrMethod) + + # if loadProf is NULL, return NULL from function, otherwise make symmetrical + # it will be NULL when there was no overlap between data and any of the bins + if (!is.null(loadProf)) { + loadProf <- makeSymmetric(loadProf) + loadProf[, regionGroupID := seq_len(binNum)][] + setnames(loadProf, old = "regionGroupID", new="binID") + loadProf <- as.data.frame(loadProf) + } else { + warning("Insufficient overlap between regionSet and signalCoord") + } + + return(loadProf) +} + +makeSymmetric <- function(prof) { + symProf <- apply(prof, 2, .makeSymmetric) + symProf <- as.data.table(symProf) + return(symProf) +} + +.makeSymmetric <- function(vec) { + symVec <- (vec + rev(vec)) / 2 + return(symVec) +} + +# Produced originally for binning Ewing RRBS data across various region sets +# +# @param BSDT A data.table. For COCOA, a data.table of loading values +# with the PCs to be annotated. One column for the signal of each +# target variable +# and also has columns with the coordinates for the epigenetic features: +# chr (chromosome) and start column (also possibly end) +# @param rangeDT A data.table with the sets of regions to be binned, +# with columns named start, end +# @param binCount Number of bins across the region +# @param BSCoord GRanges. Coordinates for BSDT. If NULL, then "chr" and "start" +# columns must be in BSDT. +# @param byRegionGroup Pass along to binCount (see ?binCount) +# @template signalCol +# @param verbose A "logical" object. Whether progress +# of the function should be shown, one +# bar indicates the region set is completed. +# useful when using BSBinAggregate with 'apply' to do many +# region sets at a time. 
+# @param aggrMethod see ?getMetaRegionProfile() +BSBinAggregate <- function(BSDT, rangeDT, binCount, + BSCoord=NULL, + byRegionGroup = TRUE, + splitFactor = NULL, + signalCol, + verbose = FALSE, + aggrMethod) { + if (!is(rangeDT, "data.table")) { + stop("rangeDT must be a data.table") + } + + if (is.null(BSCoord)) { + BSCoord <- BSdtToGRanges(list(BSDT)) + } + + + seqnamesColName <- "seqnames" # default column name + if (! "seqnames" %in% colnames(rangeDT)) { + if ("chr" %in% colnames(rangeDT)) { + # message("seqnames column name set to: chr") + seqnamesColName <- "chr" + } else { + # Got neither. + stop("rangeDT must have a seqnames column") + } + } + + # message("Binning...") + binnedDT <- rangeDT[, MIRA::binRegion(start, end, + binCount, get(seqnamesColName))] + # output is a list of GRanges objects, does not play well with vapply + # one GRanges object for each bin, containing a segment of each original rangeDT region + binnedGR <- sapply(split(binnedDT, binnedDT$binID), dtToGr) + # message("Aggregating...") + + if (aggrMethod == "proportionWeightedMean") { + + binMeansList <- lapply(X = binnedGR, + FUN = function(x) regionOLWeightedMean(signalMat = BSDT, + # signalGR = dtToGr(BSDT[, .(chr, start, end)]), + signalGR = BSCoord, + regionSet = x, + calcCols = signalCol)) + + # any bins that had no overlap with data will be NULL + # if any bins had no data, return NULL + if (any(vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE))) { + return(NULL) + } + + # rbindlist of all NULL's will still return an (empty) data.table + # otherwise any NULL items will just be skipped and other rows will + # be concatenated + binnedBSDT <- rbindlist(binMeansList) + regionGroupID = 1:length(binMeansList) + + # regionGroupID = regionGroupID[!vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE)] + binnedBSDT[, regionGroupID := regionGroupID] + + } else if (aggrMethod == "simpleMean") { + + binMeansList <- lapply(X = binnedGR, + FUN = function(x) regionOLMean(signalDT = BSDT, + # signalGR = dtToGr(BSDT[, .(chr, start, end)]), + signalGR = BSCoord, + regionSet = x, + calcCols = signalCol)) + # any bins that had no overlap with data will be NULL + # if any bins had no data, return NULL + if (any(vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE))) { + return(NULL) + } + + binnedBSDT <- rbindlist(binMeansList) + regionGroupID = 1:length(binMeansList) + + + # regionGroupID = regionGroupID[!vapply(X = binMeansList, FUN = is.null, FUN.VALUE = TRUE)] + binnedBSDT[, regionGroupID := regionGroupID] + + } else if (aggrMethod == "regionMean") { # aggrMethod == "regionMean" + + # what is output if a region set has no overlap? + binnedBSDT <- BSAggregate(BSDT, + regionsGRL=GRangesList(binnedGR), + BSCoord = BSCoord, + jExpr=buildJ(signalCol, + rep("mean", length(signalCol))), + byRegionGroup = byRegionGroup, + splitFactor = splitFactor) + + # if any bins had no data + if (nrow(binnedBSDT) < binCount) { + return(NULL) + } + } else if (aggrMethod == "regionMedian") { + # what is output if a region set has no overlap? + binnedBSDT <- BSAggregate(BSDT, + regionsGRL=GRangesList(binnedGR), + BSCoord = BSCoord, + jExpr=buildJ(signalCol, + rep("median", length(signalCol))), + byRegionGroup = byRegionGroup, + splitFactor = splitFactor) + + # if any bins had no data + if (nrow(binnedBSDT) < binCount) { + return(NULL) + } + } + # RGenomeUtils::BSAggregate + + # # If we aren't aggregating by bin, then don't restrict to min reads! 
+ # if (byRegionGroup) { + # binnedBSDT <- binnedBSDT[readCount > minReads,] + # } + if (verbose) { + message("|") + } + + return(binnedBSDT) +} + +# modification of BSAggregate to just return mean per region +# +# @template signal +# @template signalCoord +# @template regionSet +# Must be from the same reference genome +# as the coordinates for the actual data/samples (signalCoord). +# @template signalCol +# @param returnQuantile "logical" object. If FALSE, return region averages. If TRUE, +# for each region, return the quantile of that region's average value +# based on the distribution of individual genomic signal/feature values +# @template absVal +# @return a data.table with region coordinates and average loading +# values for each region. Has columns chr, start, end, and a column for each +# target variable in signalCol. +# Regions are not in order along the rows of the data.table. +# +# @example averagePerRegion(BSDT = BSDT, regionsGRL, +# jCommand = MIRA:::buildJ(cols = "methylProp", "mean")) +# Devel note: I could add a column for how many cytosines are in each region + +averagePerRegion <- function(signal, + signalCoord, + regionSet, + signalCol = c("PC1", "PC2"), + returnQuantile = FALSE, + absVal=TRUE) { + + ################### checking inputs ################################# + + ########## check that inputs are the correct class and converts + checkConvertInputClasses(signal=signal, + signalCoord=signalCoord, + regionSet=regionSet, + signalCol = signalCol) + + ########## check that dimensions of inputs are consistent + # length of signal coord = nrow of signal + if (length(signalCoord) != nrow(signal)) { + stop(cleanws("The number of coordinates in + signalCoord (length(signalCoord)) does not equal the number of + rows in `signal`")) + } + + ######### check that appropriate columns are present + # signalCol are column names of signal + if (!all(signalCol %in% colnames(signal))) { + missingCols = signalCol[!(signalCol %in% colnames(signal))] + stop(cleanws(paste0("Some signalCol are not + columns of signal: ", missingCols))) + } + + ####### + # what happens if there are NAs or Inf in `signal`? 
+ # any NAs that overlap the regionSet will cause the score to be NA + if (is(signal, "data.table")) { + naRows = apply(X = signal[, signalCol, with=FALSE, drop=FALSE], + MARGIN = 1, FUN = function(x) any(is.na(x))) + } else { + naRows = apply(X = signal[, signalCol, drop=FALSE], + MARGIN = 1, FUN = function(x) any(is.na(x))) + } + + if (any(naRows)) { + signal <- signal[!naRows, ] + signalCoord <- signalCoord[!naRows] + warning("Removing rows with NA from `signal`") + } + + ################################################################# + + ################################################################# + + # determine whether coordinates are single base or a range + # if (!("end" %in% colnames(coordinateDT))) { + # dataCoordType <- "singleBase" + # } else { + if (any(start(signalCoord) != end(signalCoord))) { + dataCoordType <- "multiBase" + } else { + dataCoordType <- "singleBase" + } + # } + + # take absolute value or not + if (absVal) { + signalDT <- as.data.table(abs(signal)) + } else { + signalDT <- as.data.table(signal) + } + + # use different function for single base data and for region data + if (dataCoordType == "singleBase") { + # linking coordinates to loading values, has columns chr start, signalCol + BSDT <- signalDT[, .SD, .SDcols = signalCol] + jExpr <- buildJ(signalCol, rep("mean", length(signalCol))) + + avPerRegion <- BSAggregate(BSDT = BSDT, + regionsGRL = regionSet, + BSCoord = signalCoord, + excludeGR = NULL, + regionsGRL.length = NULL, + splitFactor = NULL, + keepCols = NULL, + sumCols = NULL, + jExpr = jExpr, + byRegionGroup = FALSE, + keep.na = FALSE, + returnSD = FALSE, + returnOLInfo = FALSE, + meanPerRegion = TRUE, + returnQuantile = returnQuantile) + + } else if (dataCoordType == "multiBase") { + + avPerRegion <- weightedAvePerRegion(signalDT = signalDT, + signalCoord=signalCoord, + regionSet=regionSet, + calcCols = signalCol, + returnQuantile = returnQuantile) + + } else { + stop("dataCoordType not recognized.") + } + + return(avPerRegion) + +} + +# signalDT +# average by region for region-based data (eg ATAC-seq) +# @return data.table. chr, start, end columns. 
One column for each calcCols +# that has the average value in each region for that col +weightedAvePerRegion <- function(signalDT, + signalCoord, + regionSet, + calcCols = c("PC1", "PC2"), + returnQuantile = FALSE) { + + if (is(signalCoord, "data.frame")) { + signalCoord <- dtToGr(signalCoord) + } + + hits <- findOverlaps(query = signalCoord, subject = regionSet) + # if no overlap, return NULL + if (length(hits) == 0) { + return(NULL) + } + + olap <- pintersect(signalCoord[queryHits(hits)], + regionSet[subjectHits(hits)]) + polap <- width(olap) / width(regionSet[subjectHits(hits)]) + + # get total proportion overlap per region + # aggregate polap by region + pOlapDT <- data.table(signalDT[queryHits(hits), calcCols, with=FALSE], + rsRegionID = subjectHits(hits), + pOlap = polap) + pOlapByRegionDT <- pOlapDT[, .(regionPOlap = sum(pOlap)), by=rsRegionID] + + # specify aggregation operation + # will be done separately for each PC specified + aggrCommand <- paste("list(", paste(paste0(calcCols, "=", "sum", "(", + calcCols, " * pOlap)"), collapse = ", "), ")") + weightedSumByRegionDT <- pOlapDT[, eval(parse(text=aggrCommand)), by=rsRegionID] + + # weightedSumByRegionDT and pOlapByRegionDT should be in the same order + regionInd <- weightedSumByRegionDT$rsRegionID + olCoord <- grToDt(regionSet)[regionInd, .(chr, start, end)] + + # divide by total proportion overlap to get mean value + jCommand <- paste("list(", paste(paste0(calcCols, "=", calcCols, " / regionPOlap"), collapse = ", "), ")") + meanPerRegion <- cbind(pOlapByRegionDT, weightedSumByRegionDT) + meanPerRegion <- meanPerRegion[, eval(parse(text=jCommand))] + meanPerRegion <- cbind(olCoord, meanPerRegion) + + if (returnQuantile) { + for (i in seq_along(calcCols)) { + # perhaps this could be more efficient with mapply + # ecdf example: ecdf(distributionData)(getPercentileOfThis) + meanPerRegion[, c(calcCols[i]) := ecdf(signalDT[[calcCols[i]]])(meanPerRegion[[calcCols[i]]])] + # I tried set() to improve performance but it took about the same time + } + } + + # meanPerRegion <- pOlapDT[, .(regionMean = sum(score * (pOlap/sum(pOlap))), by=rsRegionID] + + return(meanPerRegion) +} + + + + +#' Get regions that are most associated with target variable +#' +#' Get a GRanges with top regions from the region set based on +#' average feature contribution scores +#' for the regions or the quantile of the region's average +#' feature contribution score based on the +#' distribution of all feature contribution scores for the target variable. +#' Returns average feature contribution score or quantile as GRanges metadata. +#' +#' @template signal +#' @template signalCoord +#' @template regionSet +#' @template signalCol +#' @param cutoff Numeric. Only regions with at least this value will be +#' returned (either above this average `signal` value or above this quantile +#' if returnQuantile=TRUE). +#' @param returnQuantile Logical. If FALSE, return region averages. If TRUE, +#' for each region, return the quantile of that region's average value +#' based on the distribution of individual feature values in `signal` for +#' that `signalCol`. +#' @return A GRanges object with region coordinates for regions with +#' scores/quantiles above "cutoff" for any target variable in signalCol. +#' The scores/quantiles +#' for signalCol are given as metadata in the GRanges. + +# Are regions in order along the rows of the data.table? 
+# +#' @examples +#' data("brcaATACCoord1") +#' data("brcaATACData1") +#' data("esr1_chr1") +#' featureContributionScores <- prcomp(t(brcaATACData1))$rotation +#' topRegions <- getTopRegions(signal=featureContributionScores, +#' signalCoord=brcaATACCoord1, +#' regionSet=esr1_chr1, +#' returnQuantile = TRUE) +#' @export + +getTopRegions <- function(signal, + signalCoord, + regionSet, + signalCol = c("PC1", "PC2"), + cutoff = 0.8, + returnQuantile=TRUE) { + + + + + regionLoadDT <- averagePerRegion(signal=signal, + signalCoord=signalCoord, regionSet=regionSet, + signalCol = signalCol, + returnQuantile = returnQuantile)[] + + keepInd <- regionLoadDT[, signalCol, with=FALSE] >= cutoff + + # keep region if it is above cutoff in any of the PCs in signalCol + keepInd <- apply(X = keepInd, MARGIN = 1, FUN = any) + + highGR <- dtToGr(regionLoadDT[keepInd, ]) + + values(highGR) <- as.data.frame(regionLoadDT[keepInd, signalCol, with=FALSE]) + + return(highGR) +} + + +# different scoring metrics +# remniscent of LOLA: +# support (number of regions, for us regions that have at least 1 cytosine and can be scored) +# mean loading value (or could do ratio of peak to surrounding regions) +# signal to noise ratio, how big is peak compared to noise of surrounding area +# with SNR, even a small peak could have a high SNR + +# @param loadingProf +# loadingProfileSNR = function() { +# # magnitude of the peak divided by standard deviation of +# # noise (signal in surrounding areas) +# } + + +# support is just number of regions from each input region set that overlap at all with +# the cytosines that have loading values +# findOverlaps(regionSet, dtToGr(coordinateDT)) + +########################################################################### + + +# BSaggregate -- Aggregate a BSDT across regions or region groups, +# for multiple samples at a time. +# This function is as BScombineByRegion, but can handle not only multiple +# samples in BSDT, but also simultaneously multiple region sets by passing +# a regionsGRL (GRangesList object). +# you can use jExpr to do other functions. + +# Given a bisulfite data table as input, with an identifier column for +# different samples; plus a GRanges objects with regions to aggregate. +# +# @param BSDT The bisulfite data.table (output from one of the parsing +# functions for methylation calls) that you wish to aggregate. It can +# be a combined table, with individual samples identified by column passed +# to splitFactor. +# @param regionsGRL Regions across which you want to aggregate. Should be +# from a single region set. eg GRangesList(regionSet) +# @param BSCoord GRanges. Coordinates for BSDT. If NULL, then "chr" and "start" +# columns must be in BSDT. +# @param excludeGR A GRanges object with regions you want to +# exclude from the aggregation function. These regions will be eliminated +# from the input table and not counted. +# @param jExpr You can pass a custom command in the j slot to data.table +# specifying which columns to aggregate, and which functions to use. You +# can use buildJ() to build a jExpr argument easily. +# @param byRegionGroup You can aggregate by regionID or by regionGroupID; +# this reflects the regionsGRL that you pass; by default, BSAggregate will +# aggregate each region individually -- scores will then be contiguous, and +# the output is 1 row per region. +# Turn on this flag to aggregate across all region groups, making the result +# uncontiguous, and resulting in 1 row per *region group*. 
+# @param returnSD Whether the standard deviation of the columns of interest +# should be returned. Standard deviation for rows that overlap with region set +# and also for rows that do not overlap with region set. Not currently functional. +# @param returnOLInfo if true, include number of overlapping cpgs and number +# of overlapping regions in the result +# @param meanPerRegion Will return the mean value in each region in a data.table +# instead of averaging all regions into a single value, returnOLInfo does +# nothing if meanPerRegion=TRUE (function exits before that code, which +# expects a different data structure) +# @param returnQuantile Only used if meanPerRegion=TRUE, instead of mean +# return the quantile/percentile of the mean of each region +# in relation to the distribution of original values in BSDT +# @template rsOL +# +BSAggregate <- function(BSDT, regionsGRL, BSCoord=NULL, excludeGR=NULL, + regionsGRL.length = NULL, splitFactor=NULL, + keepCols=NULL, sumCols=NULL, + jExpr=NULL, byRegionGroup=FALSE, + keep.na=FALSE, returnSD=FALSE, + returnOLInfo=FALSE, meanPerRegion=FALSE, + returnQuantile=FALSE, rsOL=NULL) { + + # Assert that regionsGRL is a GRL. + # If regionsGRL is given as a GRanges, we convert to GRL + if(is(regionsGRL, "GRanges")) { + regionsGRL <- GRangesList(regionsGRL) + } else if (!is(regionsGRL, "GRangesList") && is.null(rsOL)) { + stop("regionsGRL is not a GRanges or GRangesList object") + } + + # will cause error if BSDT is only a data.frame + if (is(BSDT, "data.frame") & !is(BSDT, "data.table")) { + BSDT <- as.data.table(BSDT) + } + if (!is(BSDT, "data.table")) { + stop("BSDT must be a data.table") + } + + if(! is.null(excludeGR)) { + BSDT <- BSFilter(BSDT, minReads=0, excludeGR) + } + + if (returnQuantile) { + # keep all data so quantiles can be calculated + # later code will change BSDT + origBSDT <- data.table::copy(BSDT) + } + # TODO: BSdtToGRanges needs to include end coordinate!!! + if (is.null(BSCoord)) { + bsgr <- BSdtToGRanges(list(BSDT)) + } else { + bsgr <- BSCoord + } + + + additionalColNames <- setdiff(colnames(BSDT), + c("chr","start", "end", + "hitCount","readCount", splitFactor)) + + # specify that "character" outcome is expected from mode by + # supplying "a" as last vapply argument (any character object would work) + colModes <- vapply(BSDT, mode, "a") + if (is.null(sumCols)) { + sumCols <- setdiff(colnames(BSDT),c("chr", "start", "end", + "strand", splitFactor, keepCols)) + # Restrict to numeric columns. + sumCols <- intersect(sumCols, + names(colModes[which(colModes == "numeric")])) + } + + if (is.null(rsOL)) { + # It's required to do a findoverlaps on each region individually, + # Not on a GRL, because of the way overlaps with GRLs work. So, + # we must convert the GRL to a GR, but we must keep track of which + # regions came from which group. + regionsGR <- unlist(regionsGRL) + + if(is.null(regionsGRL.length)) { + # if (length(regionsGRL) > 100) { + # message("BSAggregate: Calculating sizes. 
You can speed this up by supplying a regionsGRL.length vector...", appendLF=FALSE) + # } + # specify that output should be numeric with vapply + # (any number would work instead of 1) + regionsGRL.length <- vapply(regionsGRL, length, 1) + # message("Done counting regionsGRL lengths.") + } + + # Build a table to keep track of which regions belong to which group + # BIOC note: sapply returns a list where each item is of different length + # therefore, I'm not using vapply + + # one regionID per region of "regionsGR", withinGroupID: for when regionsGRL + # has multiple GRanges, Gives the index of each region within its corresponding + # GRanges object. regionGroupID: for when regionsGRL has multiple GRanges, + # indicates which regionsGRL GRanges object the region has come from ( + # index of the GRanges object in regionsGRL) + region2group <- data.table( + regionID=seq_along(regionsGR), + chr=as.vector(seqnames(regionsGR)), + start=as.vector(start(regionsGR)), + end=as.vector(end(regionsGR)), + # withinGroupID= as.vector(unlist(sapply(regionsGRL.length, seq))), + regionGroupID=rep(seq_along(regionsGRL), regionsGRL.length)) # repeats each number regionGRL.length times + setkey(region2group, regionID) + + #TODO: if overlap method is single, but "end" is present, find center + # and set that to start! + # if (all(start(bsgr[[1]]) != end(bsgr[[length(unique(queryHits(hits)))1]]))) { + if (all(start(bsgr) != end(bsgr))) { + stop("BSDT start and end coordinates are not the same. Choose a different aggrMethod.") + } else { + # fo <- findOverlaps(query = bsgr[[1]], subject = regionsGR) + fo <- findOverlaps(query = bsgr, subject = regionsGR) + } + + if (length(subjectHits(fo)) < 1) { + warning("Insufficient overlap between signalCoord and the region set.") + return(NULL) + } + + } else { + # Build a table to keep track of which regions belong to which group + # BIOC note: sapply returns a list where each item is of different length + # therefore, I'm not using vapply + + # region2group <- data.table( + # regionID=seq_len(regionsGR), + # withinGroupID= as.vector(unlist(sapply(regionsGRL.length, seq))), + # regionGroupID=rep(seq_along(regionsGRL), regionsGRL.length)) + + if (length(subjectHits(rsOL)) < 1) { + warning("Insufficient overlap between signalCoord and the region set.") + return(NULL) + } + + # only works for when one region set is given in regionsGRL (ie does + # not work for metaregion profiles) + region2group <- data.table( + regionID=seq_len(max(subjectHits(rsOL))), + regionGroupID=rep(1, max(subjectHits(rsOL)))) # assumes only 1 region set in regionsGRL + setkey(region2group, regionID) + + fo <- rsOL + } + + + ### use info from findOverlaps to see how many individual + # cytosines (or input regions) and how many regions (in region sets) + # overlap with one another + if (returnOLInfo) { + signalCoverage <- length(unique(queryHits(fo))) + regionSetCoverage <- length(unique(subjectHits(fo))) + } + + # if("end" %in% colnames(BSDT)){ + # setkey(BSDT, chr, start, end) + # } else{ + # setkey(BSDT, chr, start) + # } + + # Gut check: + # stopifnot(all(elementMetadata(bsgr[[1]])$readCount == BSDT$readCount)) + + # message("Setting regionIDs...") + BSDT <- BSDT[queryHits(fo),] #restrict the table to CpGs (or input region) in any region set. + + BSDT[,regionID:=subjectHits(fo)] #record which region they overlapped. 
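+    # A toy illustration of the two steps above (hypothetical hits, not from
+    # real data): if fo had queryHits = c(1, 2, 5) and subjectHits = c(1, 1, 2),
+    # BSDT would keep signal rows 1, 2, and 5, and regionID would become
+    # c(1, 1, 2), so rows 1 and 2 are later aggregated together into region 1
+    # by the j-expression below.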
+ #BSDT[queryHits(fo),regionID:=subjectHits(fo)] + #if (!keep.na) { + # BSDT = BSDT[queryHits(fo),] + #} + + if (is.null(jExpr)) { + cols=c(sumCols, keepCols) + funcs <- c(rep("sum", length(sumCols)), rep("unique", length(keepCols))) + jExpr <- buildJ(cols, funcs) + } + # message("jExpr: ", jExpr) + + # Define aggregation column. aggregate by region or by region group? + if (byRegionGroup) { + agCol <- "regionGroupID" + } else { + agCol <- "regionID" # Default + } + + # Build the by string + if (is.null(splitFactor)) { + byString <- paste0("list(regionID)") + } else { + byString <- paste0("list(", paste("regionID", + paste0(splitFactor, ""), + collapse=", ", sep=", "), ")") + } + + # Now actually do the aggregate: (aggregate within each region) + # message("Combining...") + bsCombined <- BSDT[,eval(parse(text=jExpr)), by=eval(parse(text=byString))] + setkey(bsCombined, regionID) + + if (meanPerRegion) { + setkey(region2group, regionID) + + avPerRegion <- merge(bsCombined, region2group) + avPerRegion[, c("regionID", + # "withinGroupID", + "regionGroupID") := NULL] + + if (returnQuantile) { + + for (i in seq_along(sumCols)) { + # perhaps this could be more efficient with mapply + # ecdf example: ecdf(distributionData)(getPercentileOfThis) + avPerRegion[, c(sumCols[i]) := ecdf(origBSDT[[sumCols[i]]])(avPerRegion[[sumCols[i]]])] + # I tried set() to improve performance but it took about the same time + } + + } + return(avPerRegion) + } + # Now aggregate across groups. + # I do this in 2 steps to avoid assigning regions to groups, + # which takes awhile. I think this preserves memory and is faster. + + # Define aggregation column. aggregate by region or by region group? + # byRegionGroup=TRUE means to aggregate within each individiual GRanges from + # regions.GRL + if (byRegionGroup) { + # must set allow=TRUE here in case there are multiple IDs (splitCol) + bsCombined[region2group, regionGroupID:=regionGroupID, allow=TRUE] + if (! is.null(splitFactor) ) { + byStringGroup <- paste0("list(", paste("regionGroupID", + paste0(splitFactor, + collapse=", "), + sep=", "), ")") + } else { + byStringGroup <- "list(regionGroupID)" + } + # aggregate by regionGroupID (region set/GRanges in regionsGRL) + bsCombined=bsCombined[,eval(parse(text=jExpr)), + by=eval(parse(text=byStringGroup))] + if (returnOLInfo) { + bsCombined[, signalCoverage := signalCoverage] + bsCombined[, regionSetCoverage := regionSetCoverage] + } + return(bsCombined) + } else { + e <- region2group[bsCombined,] + setkey(e, regionID) + return(e) + } + # WARNING: There are now 2^2 ways to aggregate, sum vs mean + # at each level: across regions, then across region sets. This + # doesn't give you a choice at this point. +} + + + +# Given a BSDT (bisulfite data.table), remove any entries that overlap +# regions given in the excludeGR argument and/or filter out sites +# that have lower than a minimum number of reads. +# @param BSDT Bisulfite data.table to filter +# @param minReads Require at least this level of coverage at a cpg. +# @param excludeGR GRanges object with regions to filter. +# +# @return The BSDT with appropriate regions removed. +BSFilter <- function(BSDT, minReads = 10, excludeGR = NULL) { + # First, filter for minimum reads. + if (minReads > 0) { + BSDT <- BSDT[coverage >= minReads, ] + } + if (NROW(BSDT) == 0) { return(data.table(NULL)) } + # Now, filter entries overlapping a region in excludeGR. 
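+    # e.g. (a toy sketch with hypothetical coordinates): if excludeGR contains
+    # chr1:100-200 and BSDT has a CpG at chr1:150, findOverlaps() flags that
+    # row below and it is removed from BSDT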
+    if (!is.null(excludeGR)) {
+        gr <- dtToGr(BSDT)
+        fo <- findOverlaps(gr, excludeGR)
+        qh <- unique(queryHits(fo))
+        if (length(qh) > 0) {
+            BSDT <- BSDT[-qh, ]
+        }
+    }
+    return(BSDT)
+}
+
+
+#' Get indices for top scored region sets
+#'
+#' For each target variable, get the indices of the original region sets,
+#' ordered by that target variable's ranking in rsScores.
+#' The original index refers to that region set's position
+#' in the `GRList` param given to `aggregateSignalGRList`, which is
+#' also that region set's
+#' row index in the COCOA output. The first number in a given column
+#' of this function's output will be the
+#' original index of the region set ranked first for that target variable.
+#' The second row for a
+#' column will be the original index of the region set that ranked second
+#' for that target variable, etc. This function makes it easier
+#' to select the top region sets for further analysis or
+#' simply to sort the results. Region set scores are sorted in decreasing
+#' or increasing order according to the `decreasing` parameter.
+#'
+#' @template rsScores
+#' @templateVar isRSRankingIndex TRUE
+#' @template signalCol
+#' @param decreasing Logical. Whether to sort rsScores in decreasing
+#' or increasing order.
+#' @param newColName Character. The names of the columns of the output data.frame.
+#' The order should correspond to the order of the
+#' input columns given by signalCol.
+#' @return A data.frame with one column for each `signalCol`.
+#' Column names are given by `signalCol` or `newColName` (if used).
+#' Each column has been
+#' sorted by score for region sets for that target variable
+#' (order given by `decreasing`
+#' param).
+#' Original indices for region sets that were used to create rsScores
+#' are given. Region sets with a score of NA are counted as having the
+#' lowest scores and indices for these region sets will be at the bottom of the
+#' returned data.frame (na.last=TRUE in sorting).
+#' @examples data("rsScores")
+#' rsRankInd = rsRankingIndex(rsScores=rsScores,
+#'                            signalCol=c("PC1", "PC2"))
+#' # region sets sorted by score for PC1
+#' rsScores[rsRankInd$PC1, ]
+#' # region sets sorted by score for PC2
+#' rsScores[rsRankInd$PC2, ]
+#'
+#' @export
+#'
+rsRankingIndex <- function(rsScores, signalCol,
+                           decreasing=TRUE, newColName = signalCol) {
+
+    if (!(is(rsScores, "data.frame") || is(rsScores, "matrix"))) {
+        stop("rsScores should be a data.frame. Check object class.")
+    }
+    rsScores <- as.data.table(rsScores)
+
+    if (!(is(signalCol, "character") | is(signalCol, "list"))) {
+        stop("signalCol should be a character vector or list of character vectors.")
+    }
+    if (is(signalCol, "list")) {
+        if (!all(vapply(X = signalCol, FUN = class, FUN.VALUE = "a") %in% "character")) {
+            stop("Items of signalCol should be character vectors.")
+        }
+        if (length(unique(vapply(signalCol, FUN = length, FUN.VALUE = 2))) != 1) {
+            stop("Items of signalCol should be the same length as each other.")
+        }
+        if (!all(unlist(signalCol) %in% colnames(rsScores))) {
+            stop("Some column names in signalCol are not present in rsScores.")
+        }
+
+        # the first item in the list is taken for newColName
+        if (is(newColName, "list")) {
+            newColName <- signalCol[[1]]
+        }
+
+    } else { # signalCol is character
+        if (!all(signalCol %in% colnames(rsScores))) {
+            stop("Some column names in signalCol are not present in rsScores.")
+        }
+    }
+
+
+    dtOrder <- rep(-99L, length(decreasing))
+    # how to sort scores
+    # -1 for decreasing order of scores
+    dtOrder[decreasing] <- -1L
+    # +1 for increasing order of scores
+    dtOrder[!decreasing] <- 1L
+
+    # copy so that by-reference changes will not affect the caller's object
+    rsScores <- copy(rsScores)
+    rsScores[, rsIndex := seq_len(nrow(rsScores))]
+
+    if (is(signalCol, "list")) {
+        if (length(newColName) != length(signalCol[[1]])) {
+            stop("newColName is not the same length as columns given in signalCol.")
+        }
+
+        rsEnSortedInd <- subset(rsScores, select= signalCol[[1]])
+        setnames(rsEnSortedInd, newColName)
+
+        colNameMat <- do.call(rbind, signalCol)
+
+        # sort scores by each target variable and make a column with the
+        # original index for the sorted region sets; this object will be used
+        # to pull out region sets that were top hits for each target variable
+        for (i in seq_along(signalCol[[1]])) {
+            theseOrderCols <- colNameMat[, i]
+
+            setorderv(rsScores, cols = theseOrderCols, order=dtOrder, na.last=TRUE)
+
+            rsEnSortedInd[, newColName[i] := rsScores[, rsIndex]]
+        }
+    } else if (is(signalCol, "character")) {
+
+        if (length(newColName) != length(signalCol)) {
+            stop("newColName is not the same length as columns given in signalCol.")
+        }
+
+        signalCol <- signalCol[signalCol %in% colnames(rsScores)]
+
+        rsEnSortedInd <- subset(rsScores, select= signalCol)
+        setnames(rsEnSortedInd, newColName)
+
+        # sort scores by each target variable and make a column with the
+        # original index for the sorted region sets; this object will be used
+        # to pull out region sets that were top hits for each target variable
+        for (i in seq_along(signalCol)) {
+
+            setorderv(rsScores, cols = signalCol[i], order=dtOrder, na.last=TRUE)
+
+            rsEnSortedInd[, newColName[i] := rsScores[, rsIndex]]
+        }
+
+    } else {
+        stop("signalCol should be a character vector or list of character vectors.")
+    }
+
+    # reset order
+    # setorderv(rsScores, cols = "rsIndex", order=1L)
+    return(as.data.frame(rsEnSortedInd))
+}
+
+#################### Metric functions ########################################
+# scores, metrics, or statistical tests
+
+# Instead of averaging within regions first as BSAggregate does,
+# this function does a simple average and standard deviation
+# for all CpGs that overlap
+# with regions of a region set, and also the average and
+# standard deviation for non-overlapping CpGs. Created to
+# get metrics of loading values for each PC.
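+#
+# For example (assuming signalCol = c("PC1", "PC2") and metrics = c("mean", "sd")),
+# the returned data.table has one row per signalCol (testCol = "PC1", "PC2")
+# and columns mean_OL, sd_OL, mean_nonOL, sd_nonOL, plus coverage columns.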
+# +# Faster if given total average for each column of interest +# +# @param dataDT a data.table with +# columns to get metrics of eg (PC1, PC2). All columns +# will be considered +# columns to get the metrics from so no unnecessary columns should be +# included. +# @param dataGR GRanges. Coordinates for dataDT. +# @template regionSet +# Metrics will be calculated on +# only coordinates within this region set (and optionally separately +# on those outside this region set with alsoNonOLMet parameter) +# @param signalCol the columns to calculate the metrics on. The +# metrics will be calculated on each one of these columns separately. +# @param metrics character vector with the name of a function or functions +# to calculate on selected cytosines. Function should only require one +# input which should be the values of the cytosines. +# @param alsoNonOLMet also include same metrics +# for non overlapping CpGs (still also returns metrics for overlapping CpGs) +# param columnMeans Not functional/is deprecated. The idea was to use +# the mean of the column to speed up calculations (eg when calculating +# mean of overlapping CpGs, use that info and column mean to get +# mean for non overlapping CpGs without manually calculating it) +# +signalOLMetrics <- function(dataDT, + dataGR, + regionSet, + signalCol = colnames(dataDT)[!(colnames(dataDT) %in% c("chr", "start", "end"))], + metrics=c("mean", "sd"), + alsoNonOLMet=TRUE, rsOL=NULL) { + + # convert DT to GR for finding overlaps + # dataGR <- BSdtToGRanges(list(dataDT))[[1]] + + if (is.null(rsOL)) { + OL <- findOverlaps(query = dataGR, subject = regionSet) + # region set info + totalRegionNumber <- length(regionSet) + meanRegionSize <- round(mean(width(regionSet)), 1) + } else { + OL <- rsOL + } + + # if no overlap, exit + if (length(OL) == 0) { + return(NULL) + } + + # get indices for overlapping and non overlapping CpGs + olCpG <- queryHits(OL) + + # get info on degree of overlap + # number of CpGs that overlap + signalCoverage <- length(unique(olCpG)) + # number of regions that overlap + regionSetCoverage <- length(unique(subjectHits(OL))) + + # gets metrics for all columns except chr, start, end + jExpr <- buildJ(cols=rep(signalCol, each=length(metrics)), + funcs=rep(metrics, length(signalCol)), + newColNames = paste0(rep(signalCol, + each=length(metrics)), + "_", metrics)) + + # getting the metrics + olMetrics <- as.data.frame(dataDT[olCpG, eval(parse(text=jExpr))]) + + # calculate average of nonOLCpGs based on columnMean if given + # if (!is.null()) + # + # formatting so there is one row per PC/testCol + # output is a matrix with ncol = length(metrics) + # for vapply, FUN.VALUE should have length equal to a single output of FUN + olResults <- vapply(X = metrics, + FUN = function(x) as.numeric(olMetrics[, grepl(pattern = x, colnames(olMetrics))]), + as.numeric(seq_along(signalCol))) + olResults <- as.data.table(olResults) + setnames(olResults, old = colnames(olResults), new = paste0(colnames(olResults), "_OL")) + + if (alsoNonOLMet) { + nonOLCpG <- (seq_len(nrow(dataDT)))[-olCpG] + # if no OL for region set, don't calculate for non region set + # TODO make conditional on region set having any overlap + nonOLMetrics <- as.data.frame(dataDT[nonOLCpG, eval(parse(text=jExpr))]) + + nonOLResults <- vapply(X = metrics, + FUN = function(x) as.numeric(nonOLMetrics[, grepl(pattern = x, colnames(nonOLMetrics))]), + as.numeric(seq_along(signalCol))) + nonOLResults <- as.data.table(nonOLResults) + setnames(nonOLResults, old = colnames(nonOLResults), new 
= paste0(colnames(nonOLResults), "_nonOL")) + + if (is.null(rsOL)) { + metricDT <- cbind(data.table(testCol=signalCol), + olResults, + nonOLResults, + data.table(signalCoverage, + regionSetCoverage, + totalRegionNumber, + meanRegionSize)) + } else { + metricDT <- cbind(data.table(testCol=signalCol), + olResults, + nonOLResults, + data.table(signalCoverage, + regionSetCoverage)) + } + } else { + if (is.null(rsOL)) { + metricDT <- cbind(data.table(testCol=signalCol), + olResults, + data.table(signalCoverage, + regionSetCoverage, + totalRegionNumber, + meanRegionSize)) + } else { + metricDT <- cbind(data.table(testCol=signalCol), + olResults, + data.table(signalCoverage, + regionSetCoverage)) + } + + } + return(metricDT) +} + + +# Wilcoxon rank sum test for a region set +# @param dataDT a data.table with chr, start, end columns as well +# as columns to get metrics of eg (PC1, PC2). All columns +# except chr, start, and end will be considered +# columns to get the metrics from so no unnecessary columns should be +# included. +# @template regionSet +# @param signalCol the columns of interest. You will do ranksum test separately +# on each of these columns (given test only uses info in one column) +# @param ... Additional parameters of wilcox.test function. See ?wilcox.test. +# For instance specify alternative hypothesis: alternative = "greater". +# @return A vector with a p value for each column other than chr, start or end. + +# @examples data("brcaLoadings1") +# data("brcaMCoord1") +# data("nrf1_chr1") +# dataDT = as.data.table(cbind(brcaMCoord1, brcaLoadings1)) +# rsWilcox(dataDT = dataDT, regionSet = nrf1_chr1, conf.int=TRUE) + +rsWilcox <- function(dataDT, + regionSet, + signalCol = colnames(dataDT)[!(colnames(dataDT) %in% c("chr", "start", "end"))], + conf.int=FALSE, + ...) { + + # region set info + totalRegionNumber <- length(regionSet) + meanRegionSize <- round(mean(width(regionSet)), 1) + + + # convert DT to GR for finding overlaps + dataGR <- BSdtToGRanges(list(dataDT))[[1]] + + OL <- findOverlaps(query = regionSet, subject = dataGR) + + # if no overlap, exit + if (length(OL) == 0) { + return(NULL) + } + + # get indices for overlapping and non overlapping CpGs + olCpG <- unique(subjectHits(OL)) + nonOLCpG <- (seq_len(nrow(dataDT)))[-olCpG] + + # get info on degree of overlap + # number of CpGs that overlap + signalCoverage <- length(unique(olCpG)) + # number of regions that overlap + regionSetCoverage <- length(unique(queryHits(OL))) + + + + # each confidence interval has length of 2: [low, high] + + if (conf.int) { + # calculate Wilcoxon rank sum test for each column + # additional parameters given with ... + # confIntervals will be [low1, high1, low2, high2, etc.] + confIntervals <- as.numeric(vapply(X = signalCol, FUN = function(x) wilcox.test(x = as.numeric(as.matrix(dataDT[olCpG, x, with=FALSE])), + y = as.numeric(as.matrix(dataDT[nonOLCpG, x, with=FALSE])), + conf.int = conf.int, ...)$conf.int, c(1, 1))) + + names(confIntervals) <- paste0(rep(signalCol, each=2), c("_low", "_high")) + wRes <- data.frame(t(confIntervals), + signalCoverage, + regionSetCoverage, + totalRegionNumber, + meanRegionSize) + + } else { + # calculate Wilcoxon rank sum test for each column + # additional parameters given with ... 
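+        # in effect, for each column x in signalCol this computes
+        # wilcox.test(x = values overlapping the region set,
+        #             y = non-overlapping values, ...)$p.value,
+        # so passing alternative = "greater" tests whether values inside the
+        # region set tend to be larger than those outside it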
+ pVals <- vapply(X = signalCol, FUN = function(x) wilcox.test(x = as.numeric(as.matrix(dataDT[olCpG, x, with=FALSE])), + y = as.numeric(as.matrix(dataDT[nonOLCpG, x, with=FALSE])), ...)$p.value, 1) + wRes <- data.frame(t(pVals), + signalCoverage, + regionSetCoverage, + totalRegionNumber, + meanRegionSize) + } + + return(wRes) +} + + +# I optimized upstream so that a matrix would be given to this function +# if this function is rewritten and no longer requires a matrix input, +# then in order to prevent unnecessary object copying, +# rewrite upstream code that converts signalDT to matrix class + +# @param signalMat Data to be aggregated (e.g. raw data: ATAC-seq, +# region based DNA methylation or loading values) +# @param signalGR GRanges object with coordinates for signalMat +# @template regionSet +# The region set to score. +# @param calcCols character object. Column names. A weighted sum will be done +# for each of these columns (columns should be numeric). +# @template rsOL +# @param pOlap see "?aggregateSignal" +# @template returnCovInfo +# @value Returns data.frame with columns 'calcCols', signalCoverage col has +# number of signalGR regions that overlapped with any regionSet regions, +# regionSetCoverage has the sum of all proportion overlaps of regions from +# signalGR with regionSet (regionSet region is denominator) +# containing weighted mean for each col. +# Returns NULL if there is no overlap between signalGR and regionSet + +regionOLWeightedMean <- function(signalMat, signalGR, + regionSet, calcCols, rsOL=NULL, + pOlap=NULL, returnCovInfo=TRUE) { + + if (!is(signalMat, "matrix")) { + signalMat <- as.matrix(signalMat) + } + + if (is.null(rsOL)) { + hits <- findOverlaps(query = signalGR, subject = regionSet) + } else { + hits <- rsOL + + } + + # if no overlap, return NULL + if (length(hits) == 0) { + return(NULL) + } + + if (is.null(pOlap)) { + olap <- pintersect(signalGR[queryHits(hits)], + regionSet[subjectHits(hits)]) + pOlap <- width(olap) / width(regionSet[subjectHits(hits)]) + } + + + # some rows may be duplicated if a signalMat region overlapped multiple + # regions from signalGR but that is ok + # signalMat <- signalMat[queryHits(hits), ] # done in next step to prevent extra copying + + # weight the signalMat values by the proportion overlap (weighted average) + weightedSum <- t(pOlap) %*% signalMat[queryHits(hits), calcCols] + + # weighted average + denom <- sum(pOlap) + weightedAve <- as.data.frame(weightedSum / denom) + colnames(weightedAve) <- calcCols + + # add columns for coverage info + if (returnCovInfo) { + weightedAve$signalCoverage = length(unique(queryHits(hits))) + weightedAve$regionSetCoverage = length(unique(subjectHits(hits))) + weightedAve$sumProportionOverlap = denom + } + + return(weightedAve) +} + + +# @param signalDT Data to be aggregated (e.g. raw data: ATAC-seq, +# region based DNA methylation or loading values) +# @param signalGR GRanges object with coordinates for signalDT +# @template regionSet +# The region set to score. +# @param calcCols character object. Column names. A mean will be calculated for +# each of these columns (columns should be numeric). +# @param metric character. 
"mean" or "median" +# @template rsOL +# @template returnCovInfo +# @value Returns data.frame with columns 'calcCols', signalCoverage col has +# number of signalGR regions that overlapped with any regionSet regions, +# regionSetCoverage has the number of regions from +# signalGR that overlapped with regionSet +# Returns NULL if there is no overlap between signalGR and regionSet + +regionOLMean <- function(signalDT, signalGR, regionSet, + calcCols, metric="mean", rsOL=NULL, returnCovInfo=TRUE) { + + if (is.null(rsOL)) { + hits <- findOverlaps(query = signalGR, subject = regionSet) + } else { + hits <- rsOL + } + # if no overlap, return NULL + if (length(hits) == 0) { + return(NULL) + } + + # some rows may be duplicated if a signalDT region overlapped multiple + # regions from signalGR but that is ok + signalDT <- signalDT[queryHits(hits), ] + + if (metric == "mean") { + # mean of the overlapping signalDT values + signalAve <- as.data.frame(t(colMeans(signalDT[,..calcCols]))) + } else if (metric == "median") { + # median of the overlapping signalDT values + signalAve <- as.data.frame(t(apply(X = signalDT[,..calcCols], 2, median))) + + } else { + stop("Error in regionOLMean function. Invalid metric specified.") + } + + if (returnCovInfo) { + # add columns for coverage info + signalAve$signalCoverage <- length(unique(queryHits(hits))) + signalAve$regionSetCoverage <- length(unique(subjectHits(hits))) + } + + return(signalAve) +} + +########################################################################## +# matrix scoring +# 1. make a region set matrix. The dimensions are nrows=nfeatures of +# epigenetic data segmentation (e.g. ATAC consensus peaks), ncol=nregionsets. +# It has a 1 for data regions that are overlapped by a given region set +# and a zero for data regions that do not overlap the region set +# This will produce an unweighted mean. +# 2. Multiply the region set matrix times the loading/correlation matrix. +# 3. Divide by total covered regions for that region set to get the mean. This +# is the region set score. + +# @template signalCoord +# @template GRList +# @template scoringMetric +# @value Returns a matrix where each column corresponds to one region set +# and rows are data regions. Weights depend on the scoringMetric +# All signalCoord regions and region set regions should be run through function +# at the same time so that there will be proper weighting if a region set +# region overlaps multiple signalCoord regions and a scoringMetric other than +# "simpleMean" is being used. 
+olToMat = function(signalCoord, GRList, scoringMetric) { + # calculate overlaps only once + # region set must be subject to fit with scoring functions + olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, + subject = x)) + + # each column is a region set + rsMat = matrix(data = rep(0, length(GRList) * length(signalCoord)), + nrow=length(signalCoord)) + if (scoringMetric == "simpleMean") { + for (i in seq_along(GRList)) { + rsMat[unique(queryHits(olList[[i]])), i] = 1 + } + } else if (scoringMetric == "regionMean") { + # instead of 1 give weight proportional to how many signalCoord are + # overlapped + + for (i in seq_along(GRList)) { + # aggregate number of overlaps by region set region + tmp = as.data.table(olList[[i]]) + # want the count per region set number + tmp[, .N, by=subjectHits] + + # rsMat[unique(queryHits(olList[[i]])), i] = 1 + } + + } else if (scoringMetric == "proportionWeightedMean") { + + } else { + stop("The given scoringMetric cannot be used with matrix scoring.") + } + + + colnames(rsMat) = names(GRList) + return(rsMat) + # totalRegionNumber = sapply(X = GRList, length) + # meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x)))) +} +# rsMat = olToMat() + +# loadings=loadings[, c("PC1", "PC2")] +# +# rsScoresMatrix = t(rsMat) %*% loadings +# covCount = colSums(rsMat) +# rsScoresMatrix = apply(rsScoresMatrix, MARGIN = 2, FUN = function(x) x/covCount) +# View(rsScoresMatrix[order(rsScoresMatrix[,"PC1"], decreasing = T),]) +# View(rsScoresMatrix[order(rsScoresMatrix[,"PC2"], decreasing = T),]) + From 7281c7762e7b4b68af823fb41296c6eed373721b Mon Sep 17 00:00:00 2001 From: j-lawson Date: Sat, 6 Feb 2021 21:33:28 -0500 Subject: [PATCH 03/38] Function to make region set overlap matrix and matrix related changes --- R/COCOA.R | 151 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 124 insertions(+), 27 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index dcb6c1d..d92dcc7 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -294,9 +294,9 @@ aggregateSignal <- function(signal, # if a lot of entries became 0 # the scoring metrics that support matrix scoring - if (scoringMetric %in% c("simpleMean", + if ((scoringMetric %in% c("simpleMean", "regionMean", - "proportionWeightedMean")) { + "proportionWeightedMean")) & !is.null(rsOLMat)) { # dim(rsMat) # loadings<-loadings[, c("PC1", "PC2")] @@ -312,18 +312,11 @@ aggregateSignal <- function(signal, MARGIN = 2, FUN = function(x) x/covCount)) results$signalCoverage <- covCount - # rsOLMat does not give the regionSetCoverage, totalRegionNumber - # or meanRegionSize - - # add these columns? - # results[, signalCoverage := 0] - # results[, regionSetCoverage := 0] - # results[, totalRegionNumber := numOfRegions] - # results[, meanRegionSize := round(mean(width(regionSet)), 1)] - - - + + # don't add coverage columns since they are the same every time + # and only need to be calculated once + } @@ -2251,47 +2244,151 @@ regionOLMean <- function(signalDT, signalGR, regionSet, # @template signalCoord # @template GRList # @template scoringMetric -# @value Returns a matrix where each column corresponds to one region set -# and rows are data regions. Weights depend on the scoringMetric +# @value Returns a list with two items: 1. a weight matrix where each +# column corresponds to one region set +# and rows are data regions. 2. A data.frame with size and coverage information +# about the region sets. 
+ +# Matrix weights depend on the scoringMetric # All signalCoord regions and region set regions should be run through function # at the same time so that there will be proper weighting if a region set # region overlaps multiple signalCoord regions and a scoringMetric other than # "simpleMean" is being used. -olToMat = function(signalCoord, GRList, scoringMetric) { +# @examples data("brcaMCoord1") +# data("nrf1_chr1") +# data("esr1_chr1") +# myGRL <- GRangesList(esr1_chr1, nrf1_chr1) +# +olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { + + # calculate overlaps only once # region set must be subject to fit with scoring functions olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, subject = x)) # each column is a region set - rsMat = matrix(data = rep(0, length(GRList) * length(signalCoord)), + rsMat <- matrix(data = rep(0, length(GRList) * length(signalCoord)), nrow=length(signalCoord)) + + nMat <- ceiling(length(signalCoord) / maxRow) + finalMatRows <- length(signalCoord) %% maxRow + if (finalMatRows == 0) { + finalMatRows = maxRow + } + # rep(x, 0) is fine if nMat=1 + matRowNVec = c(rep(maxRow, nMat-1), finalMatRows) + rsMatList <- list() + if (nMat > 1) { + for (matCount in 1:(nMat-1)) { + rsMatList[[matCount]] = matrix(data=rep(0, maxRow * length(GRList), + nrow=maxRow)) + } + # final matrix might not be the same size + rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList), + nrow=finalMatRows)) + + } else { # nMat = 1 + rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList), + nrow=finalMatRows)) + } + ####################################################################### + # tmpVec = rep(0, length(signalCoord)) if (scoringMetric == "simpleMean") { for (i in seq_along(GRList)) { - rsMat[unique(queryHits(olList[[i]])), i] = 1 - } + if (length(olList[[i]]) != 0) { + tmpCount = 1 + tmpVec = rep(0, length(signalCoord)) + # normalize by number so that matrix multiplication during COCOA scoring will produce mean + tmpVec[unique(queryHits(olList[[i]]))] <- 1 / length(unique(queryHits(olList[[i]]))) + + # for coordinate chunks + for (j in seq_along(rsMatList)) { + rsMatList[[j]][, i] <- tmpVec[tmpCount:(tmpCount + matRowNVec[j]-1)] + tmpCount = tmpCount + matRowNVec[j] + } + } + } } else if (scoringMetric == "regionMean") { # instead of 1 give weight proportional to how many signalCoord are # overlapped for (i in seq_along(GRList)) { - # aggregate number of overlaps by region set region - tmp = as.data.table(olList[[i]]) - # want the count per region set number - tmp[, .N, by=subjectHits] - - # rsMat[unique(queryHits(olList[[i]])), i] = 1 + if (length(olList[[i]]) != 0) { + tmpCount = 1 + tmpVec = rep(0, length(signalCoord)) + # aggregate number of overlaps by region set region + tmp <- as.data.table(olList[[i]]) + # want the count per region set number + tmp[, rCount := (1/.N), , by=subjectHits] + normFactor <- sum(tmp$rCount) + tmpVec[tmp$queryHits] <- tmp$rCount / normFactor + + # for coordinate chunks + for (j in seq_along(rsMatList)) { + rsMatList[[j]][, i] <- tmpVec[tmpCount:(tmpCount + matRowNVec[j]-1)] + tmpCount = tmpCount + matRowNVec[j] + } + } } } else if (scoringMetric == "proportionWeightedMean") { + for (i in seq_along(GRList)) { + if (length(olList[[i]]) != 0) { + tmpCount = 1 + tmpVec = rep(0, length(signalCoord)) + olap <- pintersect(GRList[[i]][subjectHits(olList[[i]])], + signalCoord[queryHits(olList[[i]])]) + pOlap <- width(olap) / width(GRList[[i]][subjectHits(olList[[i]])]) + + # weighted 
average + denom <- sum(pOlap) + + # aggregate pOlap by signalCoord region + olDT <- data.table(queryHits = queryHits(olList[[i]]), + pOlap=pOlap/denom) + normDT <- olDT[, .(coordSum := sum(pOlap)), by=queryHits)] + tmpVec[normDT$queryHits] <- normDT$coordSum + + # for coordinate chunks + for (j in seq_along(rsMatList)) { + rsMatList[[j]][, i] <- tmpVec[tmpCount:(tmpCount + matRowNVec[j]-1)] + tmpCount = tmpCount + matRowNVec[j] + } + + + # below comment outdated? + # some rows may be duplicated if a signalMat region overlapped multiple + # regions from signalGR but that is ok + # signalMat <- signalMat[queryHits(hits), ] # done in next step to prevent extra copying + + + + } + } } else { stop("The given scoringMetric cannot be used with matrix scoring.") } - colnames(rsMat) = names(GRList) - return(rsMat) + lapply(X = rsMat, FUN = function(x) colnames(x) <- names(GRList)) + + + totalRegionNumber <- vapply(X = GRList, FUN = length, FUN.VALUE = -1) + meanRegionSize <- + # list item 2 + rsInfo = data.frame(rsName=names(GRList), signalCoverage, + regionSetCoverage, totalRegionNumber, + meanRegionSize) + # results[, signalCoverage := 0] + # results[, regionSetCoverage := 0] + # results[, totalRegionNumber := numOfRegions] + # results[, meanRegionSize := round(mean(width(regionSet)), 1)] + + + # overlap matrix and region set coverage info as data.frame + return(list(rsMatList, rsInfo)) # totalRegionNumber = sapply(X = GRList, length) # meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x)))) } From 682daee83bebcc706d954e775b52012e0f3e700f Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 12:54:10 -0500 Subject: [PATCH 04/38] Fix bugs matrix scoring --- R/COCOA.R | 191 ++++++++++++++++++++++++++++++++---------------- R/permutation.R | 24 +++++- 2 files changed, 149 insertions(+), 66 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index d92dcc7..810c2ea 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -536,7 +536,8 @@ aggregateSignalGRList <- function(signal, scoringMetric = "default", verbose = TRUE, absVal=TRUE, olList=NULL, - pOlapList=NULL, rsOLMat=NULL, + pOlapList=NULL, rsMatList=NULL, + signalList=NULL, rsInfo=NULL, returnCovInfo=TRUE) { ################### checking inputs ################################# @@ -618,12 +619,6 @@ aggregateSignalGRList <- function(signal, Check spelling/capitalization.")) } } - - # create region set overlap matrix (only used for multiBase) - if (is.null(rsOLMat) && (signalCoordType == "multiBase")) { - olToMat(signalCoord = signalCoord, GRList = GRList) - } - # convert object class outside aggregateSignal to extra prevent copying # (one scoring method needs `signal` as a matrix though) @@ -644,7 +639,30 @@ aggregateSignalGRList <- function(signal, absVal <- FALSE } - if (is.null(olList)) { + # create region set overlap matrix + # this code should be after code modifying "signal" + if (is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + "regionMean", + "proportionWeightedMean"))) { + olMatRes <- olToMat(signalListCoord = signalCoord, + GRList = GRList, + scoringMetric = scoringMetric) + rsMatList <- olMatRes[[1]] + rsInfo <- olMatRes[[2]] + if (is.null(signalList)) { + signalMatList <- splitSignal(signal = signal, + maxRow = nrow(rsMatList[[1]])) + } + } + + + if (!is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + "regionMean", + "proportionWeightedMean"))) { + resultsDF <- matScore(rsMatList = rsMatList, + signalMatList=signalMatList, + rsInfo=rsInfo) + } else if (is.null(olList)) { # apply over the list of region sets 
resultsList <- lapplyAlias(GRList, function(x) aggregateSignal( @@ -712,19 +730,15 @@ aggregateSignalGRList <- function(signal, } - - resultsDT <- do.call(rbind, resultsList) - - # # add names if they are present - # if (!is.null(names(GRList))) { - # # resultsDT[, rsName := names(GRList)] - # row.names(resultsDT) <- names(GRList) - # } - - # rsName <- row.names(resultsDT) - resultsDF <- as.data.frame(resultsDT) #, row.names = rsName) - - return(resultsDF) + if (!is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + "regionMean", + "proportionWeightedMean"))) { + return(resultsDF) + } else { + resultsDT <- do.call(rbind, resultsList) + resultsDF <- as.data.frame(resultsDT) + return(resultsDF) + } } # @param dataMat columns of dataMat should be samples/patients, rows should be genomic signal @@ -753,7 +767,7 @@ createCorFeatureMat <- function(dataMat, featureMat, centerDataMat=TRUE, centerFeatureMat = TRUE, testType="cor", covariate=NULL) { - featureMat <- as.matrix(featureMat) + featureMat <- as.matrix(featureMat) # copies? featureNames <- colnames(featureMat) nFeatures <- ncol(featureMat) nDataDims <- nrow(dataMat) @@ -775,7 +789,7 @@ createCorFeatureMat <- function(dataMat, featureMat, } # avoid this copy and/or delay transpose until after calculating correlation? - dataMat <- as.data.frame(t(dataMat)) + dataMat <- as.data.frame(t(dataMat)) # copies if (testType == "cor") { @@ -2258,19 +2272,27 @@ regionOLMean <- function(signalDT, signalGR, regionSet, # data("nrf1_chr1") # data("esr1_chr1") # myGRL <- GRangesList(esr1_chr1, nrf1_chr1) +# res = olToMat(signalCoord=, GRList=myGRL, scoringMetric="regionMean") +# olMatList <- res[[1]] +# coverageInfo <- res[[2]] # olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { - + + # input checks + if (is.null(names(GRList))) { + names(GRList) <- paste0("regionSet", seq_along(GRList)) + } + ########################################################################### # calculate overlaps only once # region set must be subject to fit with scoring functions olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, subject = x)) - # each column is a region set - rsMat <- matrix(data = rep(0, length(GRList) * length(signalCoord)), - nrow=length(signalCoord)) - + # # each column is a region set + # rsMat <- matrix(data = rep(0, length(GRList) * length(signalCoord)), + # nrow=length(signalCoord)) + # nMat <- ceiling(length(signalCoord) / maxRow) finalMatRows <- length(signalCoord) %% maxRow if (finalMatRows == 0) { @@ -2281,22 +2303,32 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { rsMatList <- list() if (nMat > 1) { for (matCount in 1:(nMat-1)) { - rsMatList[[matCount]] = matrix(data=rep(0, maxRow * length(GRList), - nrow=maxRow)) + rsMatList[[matCount]] = matrix(data=rep(0, maxRow * length(GRList)), + nrow=maxRow) + colnames(rsMatList[[matCount]]) <- names(GRList) } # final matrix might not be the same size - rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList), - nrow=finalMatRows)) + rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList)), + nrow=finalMatRows) + colnames(rsMatList[[nMat]]) <- names(GRList) } else { # nMat = 1 - rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList), - nrow=finalMatRows)) + rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList)), + nrow=finalMatRows) + colnames(rsMatList[[nMat]]) <- names(GRList) } + + regionSetCoverage <- rep(0, length(GRList)) + signalCoverage <- rep(0, length(GRList)) + 
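    # (sketch of the weight semantics, with toy numbers: each column's weights
    # sum to 1 across a region set's overlapped rows, so t(weights) %*% signal
    # is a weighted mean, e.g. w <- c(0, 0.5, 0.5); sum(w * c(4, 6, 8)) == 7)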
####################################################################### # tmpVec = rep(0, length(signalCoord)) if (scoringMetric == "simpleMean") { for (i in seq_along(GRList)) { + if (length(olList[[i]]) != 0) { + signalCoverage[i] = length(unique(queryHits(olList[[i]]))) + regionSetCoverage[i] = length(unique(subjectHits(olList[[i]]))) tmpCount = 1 tmpVec = rep(0, length(signalCoord)) # normalize by number so that matrix multiplication during COCOA scoring will produce mean @@ -2306,8 +2338,8 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { for (j in seq_along(rsMatList)) { rsMatList[[j]][, i] <- tmpVec[tmpCount:(tmpCount + matRowNVec[j]-1)] tmpCount = tmpCount + matRowNVec[j] - } - } + } + } } } else if (scoringMetric == "regionMean") { # instead of 1 give weight proportional to how many signalCoord are @@ -2315,6 +2347,8 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { for (i in seq_along(GRList)) { if (length(olList[[i]]) != 0) { + signalCoverage[i] = length(unique(queryHits(olList[[i]]))) + regionSetCoverage[i] = length(unique(subjectHits(olList[[i]]))) tmpCount = 1 tmpVec = rep(0, length(signalCoord)) # aggregate number of overlaps by region set region @@ -2333,9 +2367,11 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { } } else if (scoringMetric == "proportionWeightedMean") { - + sumProportionOverlap = rep(0, length(GRList)) for (i in seq_along(GRList)) { - if (length(olList[[i]]) != 0) { + if (length(olList[[i]]) != 0) { + signalCoverage[i] = length(unique(queryHits(olList[[i]]))) + regionSetCoverage[i] = length(unique(subjectHits(olList[[i]]))) tmpCount = 1 tmpVec = rep(0, length(signalCoord)) olap <- pintersect(GRList[[i]][subjectHits(olList[[i]])], @@ -2344,11 +2380,12 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { # weighted average denom <- sum(pOlap) + sumProportionOverlap[i] = denom # aggregate pOlap by signalCoord region olDT <- data.table(queryHits = queryHits(olList[[i]]), pOlap=pOlap/denom) - normDT <- olDT[, .(coordSum := sum(pOlap)), by=queryHits)] + normDT <- olDT[, .(coordSum = sum(pOlap)), by=queryHits] tmpVec[normDT$queryHits] <- normDT$coordSum # for coordinate chunks @@ -2362,43 +2399,67 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { # some rows may be duplicated if a signalMat region overlapped multiple # regions from signalGR but that is ok # signalMat <- signalMat[queryHits(hits), ] # done in next step to prevent extra copying - - - } } } else { stop("The given scoringMetric cannot be used with matrix scoring.") } - - - lapply(X = rsMat, FUN = function(x) colnames(x) <- names(GRList)) - totalRegionNumber <- vapply(X = GRList, FUN = length, FUN.VALUE = -1) - meanRegionSize <- + meanRegionSize <- vapply(X = GRList, FUN = function(x) mean(width(x)), + FUN.VALUE = -1) # list item 2 - rsInfo = data.frame(rsName=names(GRList), signalCoverage, - regionSetCoverage, totalRegionNumber, - meanRegionSize) - # results[, signalCoverage := 0] - # results[, regionSetCoverage := 0] - # results[, totalRegionNumber := numOfRegions] - # results[, meanRegionSize := round(mean(width(regionSet)), 1)] - - - # overlap matrix and region set coverage info as data.frame + if (scoringMetric == "proportionWeightedMean") { + rsInfo = data.frame(rsName=names(GRList), signalCoverage, + regionSetCoverage, sumProportionOverlap, + totalRegionNumber, + meanRegionSize) + } else { + rsInfo = data.frame(rsName=names(GRList), signalCoverage, + regionSetCoverage, 
totalRegionNumber, + meanRegionSize) + } + + # overlap matrix list and region set coverage info as data.frame return(list(rsMatList, rsInfo)) # totalRegionNumber = sapply(X = GRList, length) # meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x)))) } -# rsMat = olToMat() -# loadings=loadings[, c("PC1", "PC2")] -# -# rsScoresMatrix = t(rsMat) %*% loadings -# covCount = colSums(rsMat) -# rsScoresMatrix = apply(rsScoresMatrix, MARGIN = 2, FUN = function(x) x/covCount) -# View(rsScoresMatrix[order(rsScoresMatrix[,"PC1"], decreasing = T),]) -# View(rsScoresMatrix[order(rsScoresMatrix[,"PC2"], decreasing = T),]) +# multiply region set matrices with data matrices to get COCOA score +matScore <- function(rsMatList, signalMatList, covInfo) { + + # rsMatList mats are features X region sets + # signalMatList is features X target variables + # multiply matrices + scoreL <- mapply(FUN = function(x, y) t(x) %*% y, + x=rsMatList, y=signalMatList) + # each item in scoreL is region sets X target variables + # combine results. Already normalized to one so can just add to get mean. + scoreDF <- do.call("+", scoreL) + + resultsDF <- cbind(scoreDF, covInfo) + return(resultsDF) +} + +splitSignal <- function(signal, maxRow=500000) { + + signalList <- list() + nMat <- ceiling(nrow(signal) / maxRow) + + finalMatRows <- nrow(signal) %% maxRow + if (finalMatRows == 0) { + finalMatRows = maxRow + } + # rep(x, 0) is fine if nMat=1 + matRowNVec = c(rep(maxRow, nMat-1), finalMatRows) + tmpCount = 1 + for (j in seq(nMat)) { + signalList[[j]] <- signal[tmpCount:(tmpCount + matRowNVec[j]-1), ] + tmpCount = tmpCount + matRowNVec[j] + } + + return(signalList) +} + diff --git a/R/permutation.R b/R/permutation.R index f1bd41d..7bdfda9 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -277,8 +277,25 @@ runCOCOAPerm <- function(genomicSignal, } else { pOlapList <- NULL } + ######################################################################### + # create region set overlap matrix + # this code should be after code modifying "signal" + if (is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + "regionMean", + "proportionWeightedMean"))) { + olMatRes <- olToMat(signalListCoord = signalCoord, + GRList = GRList, + scoringMetric = scoringMetric) + rsMatList <- olMatRes[[1]] + rsInfo <- olMatRes[[2]] + if (is.null(signalList)) { + signalMatList <- splitSignal(signal = signal, + maxRow = nrow(rsMatList[[1]])) + } + } - ################# + + ####################################################################### indList <- list() @@ -521,6 +538,8 @@ runCOCOA <- function(genomicSignal, variationMetric = "cor", scoringMetric="default", verbose=TRUE, absVal=TRUE, olList=NULL, pOlapList=NULL, + rsMatList=NULL, + signalList=NULL, rsInfo=NULL, centerGenomicSignal=TRUE, centerTargetVar=TRUE, returnCovInfo=TRUE) { @@ -545,6 +564,9 @@ runCOCOA <- function(genomicSignal, targetVar <- data.frame(targetVar[sampleOrder, ]) colnames(targetVar) <- featureNames + + + # calculate correlation featureLabelCor <- createCorFeatureMat(dataMat = genomicSignal, featureMat = targetVar, From fb6da71be8ed64aba802aaa0c9475273e50a4909 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 20:30:34 -0500 Subject: [PATCH 05/38] code to support matrix calculations --- R/COCOA.R | 113 ++++++++---------------------- R/permutation.R | 183 ++++++++++++++++++++++++++---------------------- 2 files changed, 132 insertions(+), 164 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 810c2ea..7e49cd5 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ 
-108,7 +108,7 @@ if (getRversion() >= "2.15.1") { #' is "proportionWeightedMean". This vector should contain the proportion of #' each regionSet region that is overlapped by a signalCoord region. The #' order of pOlap should be the same as the overlaps in rsOL. -#' @param rsOLMat Matrix. +#' @param rsMatList Matrix. #' @template returnCovInfo #' @template checkInput @@ -150,8 +150,7 @@ aggregateSignal <- function(signal, scoringMetric = "default", verbose = FALSE, absVal=TRUE, - rsOL=NULL, pOlap=NULL, - rsOLMat=NULL, + rsOL=NULL, pOlap=NULL, returnCovInfo=TRUE, .checkInput=TRUE) { @@ -296,7 +295,7 @@ aggregateSignal <- function(signal, # the scoring metrics that support matrix scoring if ((scoringMetric %in% c("simpleMean", "regionMean", - "proportionWeightedMean")) & !is.null(rsOLMat)) { + "proportionWeightedMean")) & !is.null(rsMatList)) { # dim(rsMat) # loadings<-loadings[, c("PC1", "PC2")] @@ -489,9 +488,7 @@ aggregateSignal <- function(signal, # method. returns a 95% confidence interval from the Wilcoxon rank sum test # instead of p value. #' @template absVal -#' @template olList -#' @template pOlapList -#' @param rsOLMat. +#' @param rsMatList. #' @template returnCovInfo #' @return Data.frame of results, one row for each region set. #' It has the following columns: @@ -535,8 +532,8 @@ aggregateSignalGRList <- function(signal, signalCoordType = "default", scoringMetric = "default", verbose = TRUE, - absVal=TRUE, olList=NULL, - pOlapList=NULL, rsMatList=NULL, + absVal=TRUE, + rsMatList=NULL, signalList=NULL, rsInfo=NULL, returnCovInfo=TRUE) { @@ -547,8 +544,8 @@ aggregateSignalGRList <- function(signal, signalCoord=signalCoord, regionSet=NULL, signalCol = signalCol, - GRList=GRList, - olList=olList) + GRList=GRList)#, + #olList=olList) ########## check that dimensions of inputs are consistent # length of signal coord = nrow of signal @@ -639,30 +636,34 @@ aggregateSignalGRList <- function(signal, absVal <- FALSE } - # create region set overlap matrix - # this code should be after code modifying "signal" - if (is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", - "regionMean", - "proportionWeightedMean"))) { - olMatRes <- olToMat(signalListCoord = signalCoord, - GRList = GRList, - scoringMetric = scoringMetric) - rsMatList <- olMatRes[[1]] - rsInfo <- olMatRes[[2]] - if (is.null(signalList)) { - signalMatList <- splitSignal(signal = signal, - maxRow = nrow(rsMatList[[1]])) - } - } + # ## if this function is only being run once, use data.table calculations + # # create region set overlap matrix + # # this code should be after code modifying "signal" + # if (scoringMetric %in% c("simpleMean", + # "regionMean", + # "proportionWeightedMean")) { + # if (is.null(rsMatList)) { + # olMatRes <- olToMat(signalListCoord = signalCoord, + # GRList = GRList, + # scoringMetric = scoringMetric) + # rsMatList <- olMatRes[[1]] + # rsInfo <- olMatRes[[2]] + # } + # + # if (is.null(signalList)) { + # signalMatList <- splitSignal(signal = signal, + # maxRow = nrow(rsMatList[[1]])) + # } + # } - if (!is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { resultsDF <- matScore(rsMatList = rsMatList, signalMatList=signalMatList, rsInfo=rsInfo) - } else if (is.null(olList)) { + } else { # not matrix COCOA. 
Old COCOA with data.table # apply over the list of region sets resultsList <- lapplyAlias(GRList, function(x) aggregateSignal( @@ -675,62 +676,10 @@ aggregateSignalGRList <- function(signal, verbose = verbose, absVal = absVal, pOlap=NULL, returnCovInfo=returnCovInfo)) - # resultsList <- mapply(FUN = function(x) aggregateSignal( - # signal = signal, - # signalCoord = signalCoord, - # signalCoordType = signalCoordType, - # regionSet = x, - # signalCol = signalCol, - # scoringMetric = scoringMetric, - # verbose = verbose, - # absVal = absVal, pOlap=y, - # returnCovInfo=returnCovInfo), x=GRList, y=pOlapList) - #wilcox.conf.int = wilcox.conf.int, - } else { - # # apply over the list of region sets - # resultsList <- lapplyAlias(olList, - # function(x) aggregateSignal( - # signal = signal, - # signalCoord = signalCoord, - # signalCoordType = signalCoordType, - # regionSet = NULL, - # signalCol = signalCol, - # scoringMetric = scoringMetric, - # verbose = verbose, - # absVal = absVal, - # rsOL=x, pOlap=pOlapList, returnCovInfo=returnCovInfo)) - # #wilcox.conf.int = wilcox.conf.int, - if (is.null(pOlapList)) { - # mapply does not work if pOlapList is null and other argument is not null - resultsList <- lapplyAlias(olList, FUN = function(x) aggregateSignal( - signal = signal, - signalCoord = signalCoord, - signalCoordType = signalCoordType, - regionSet = NULL, - signalCol = signalCol, - scoringMetric = scoringMetric, - verbose = verbose, - absVal = absVal, - rsOL=x, - returnCovInfo=returnCovInfo)) - } else { - resultsList <- mapply(FUN = function(x, y) aggregateSignal( - signal = signal, - signalCoord = signalCoord, - signalCoordType = signalCoordType, - regionSet = NULL, - signalCol = signalCol, - scoringMetric = scoringMetric, - verbose = verbose, - absVal = absVal, - rsOL=x, pOlap=y, - returnCovInfo=returnCovInfo), - x=olList, y=pOlapList) - } - + } } - if (!is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { return(resultsDF) diff --git a/R/permutation.R b/R/permutation.R index 7bdfda9..f0a589e 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -29,7 +29,6 @@ #' @template signalCol #' @template scoringMetric #' @template absVal -#' @template olList #' @param centerGenomicSignal Logical. Should rows in genomicSignal #' be centered based on #' their means? (subtracting row mean from each row) @@ -136,7 +135,6 @@ runCOCOAPerm <- function(genomicSignal, signalCol=c("PC1", "PC2"), scoringMetric="default", absVal=TRUE, - olList=NULL, centerGenomicSignal=TRUE, centerTargetVar=TRUE, variationMetric="cor", @@ -160,7 +158,7 @@ runCOCOAPerm <- function(genomicSignal, } rsScores = rsScores[, colsToAnnotate] # prevents error that occurs if extra column is factor - # more efficient to only do once (not that high impact though) + # more efficient to only do once if (centerGenomicSignal) { cpgMeans <- rowMeans(genomicSignal, na.rm = TRUE) # centering before calculating correlation @@ -207,76 +205,78 @@ runCOCOAPerm <- function(genomicSignal, } } - ################# - if (is.null(olList)) { - - ####### - # must take out NA rows before getting OL list. Otherwise later calculations - # will use wrong indices. - - # what happens if there are NAs or Inf in `signal`? 
- # any NAs that overlap the regionSet will cause the score to be NA - if (is(genomicSignal, "data.table")) { - naRows = apply(X = genomicSignal[, , with=FALSE, drop=FALSE], - MARGIN = 1, FUN = function(x) any(is.na(x))) - } else { - naRows = apply(X = genomicSignal[, , drop=FALSE], - MARGIN = 1, FUN = function(x) any(is.na(x))) - } - - if (any(naRows)) { - genomicSignal <- genomicSignal[!naRows, ] - signalCoord <- signalCoord[!naRows] - warning("Removing rows with NA from `genomicSignal`") - } - - ################################################################# - - # calculate overlaps only once - # region set must be subject to fit with scoring functions - olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, - subject = x)) - totalRegionNumber = sapply(X = GRList, length) - meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x)))) - } - - # also calculate coverage info - # @param rsOL - calculateCovInfo <- function(rsOL, - scoringMetric=scoringMetric, - pOlap=NULL) { - - covInfo <- data.frame(signalCoverage=length(unique(queryHits(rsOL))), - regionSetCoverage=length(unique(subjectHits(rsOL)))) - - if (scoringMetric == "proportionWeightedMean") { - - covInfo$sumProportionOverlap <- sum(pOlap) - } - - } - - covInfo <- lapply(X = olList, FUN = function(x) calculateCovInfo(rsOL=x, - scoringMetric = scoringMetric)) - - - if (scoringMetric == "proportionWeightedMean") { - getPOlap <- function(rsOL, signalGR, regionSet) { - olap <- pintersect(signalGR[queryHits(rsOL)], - regionSet[subjectHits(rsOL)]) - pOlap <- width(olap) / width(regionSet[subjectHits(rsOL)]) - return(pOlap) - } - - # list - pOlapList <- mapply(FUN = function(x, y) getPOlap(rsOL = x, - signalGR=signalCoord, - regionSet = y), - x=olList, y=GRList, SIMPLIFY = FALSE) - covInfo$sumProportionOverlap <- sapply(X = pOlapList, FUN = sum) - } else { - pOlapList <- NULL - } + ######################################################################## + # # deprecated. Now using matrix calculations instead + # + # if (is.null(olList)) { + # + # ####### + # # must take out NA rows before getting OL list. Otherwise later calculations + # # will use wrong indices. + # + # # what happens if there are NAs or Inf in `signal`? 
+ # # any NAs that overlap the regionSet will cause the score to be NA + # if (is(genomicSignal, "data.table")) { + # naRows = apply(X = genomicSignal[, , with=FALSE, drop=FALSE], + # MARGIN = 1, FUN = function(x) any(is.na(x))) + # } else { + # naRows = apply(X = genomicSignal[, , drop=FALSE], + # MARGIN = 1, FUN = function(x) any(is.na(x))) + # } + # + # if (any(naRows)) { + # genomicSignal <- genomicSignal[!naRows, ] + # signalCoord <- signalCoord[!naRows] + # warning("Removing rows with NA from `genomicSignal`") + # } + # + # ################################################################# + # + # # calculate overlaps only once + # # region set must be subject to fit with scoring functions + # olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, + # subject = x)) + # totalRegionNumber = sapply(X = GRList, length) + # meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x)))) + # } + # + # # also calculate coverage info + # # @param rsOL + # calculateCovInfo <- function(rsOL, + # scoringMetric=scoringMetric, + # pOlap=NULL) { + # + # covInfo <- data.frame(signalCoverage=length(unique(queryHits(rsOL))), + # regionSetCoverage=length(unique(subjectHits(rsOL)))) + # + # if (scoringMetric == "proportionWeightedMean") { + # + # covInfo$sumProportionOverlap <- sum(pOlap) + # } + # + # } + # + # covInfo <- lapply(X = olList, FUN = function(x) calculateCovInfo(rsOL=x, + # scoringMetric = scoringMetric)) + # + # + # if (scoringMetric == "proportionWeightedMean") { + # getPOlap <- function(rsOL, signalGR, regionSet) { + # olap <- pintersect(signalGR[queryHits(rsOL)], + # regionSet[subjectHits(rsOL)]) + # pOlap <- width(olap) / width(regionSet[subjectHits(rsOL)]) + # return(pOlap) + # } + # + # # list + # pOlapList <- mapply(FUN = function(x, y) getPOlap(rsOL = x, + # signalGR=signalCoord, + # regionSet = y), + # x=olList, y=GRList, SIMPLIFY = FALSE) + # covInfo$sumProportionOverlap <- sapply(X = pOlapList, FUN = sum) + # } else { + # pOlapList <- NULL + # } ######################################################################### # create region set overlap matrix # this code should be after code modifying "signal" @@ -328,8 +328,8 @@ runCOCOAPerm <- function(genomicSignal, centerGenomicSignal = centerGenomicSignal, centerTargetVar = centerTargetVar, verbose=verbose, - olList=olList, - pOlapList=pOlapList, + rsMatList = rsMatList, + rsInfo = rsInfo, returnCovInfo = returnCovInfo) message(y) # must be ahead of object that is saved as cache, not after tmp @@ -354,8 +354,8 @@ runCOCOAPerm <- function(genomicSignal, scoringMetric=scoringMetric, absVal=absVal, verbose=verbose, - olList=olList, - pOlapList=pOlapList, + rsMatList=rsMatList, + rsInfo = rsInfo, returnCovInfo = returnCovInfo) message(".") return(tmp) @@ -486,8 +486,6 @@ runCOCOAPerm <- function(genomicSignal, #' @template scoringMetric #' @template verbose #' @template absVal -#' @template olList -#' @template pOlapList #' @param centerGenomicSignal Logical. Should rows in genomicSignal #' be centered based on #' their means? 
(subtracting row mean from each row) @@ -537,13 +535,14 @@ runCOCOA <- function(genomicSignal, sampleOrder=1:nrow(targetVar), variationMetric = "cor", scoringMetric="default", verbose=TRUE, - absVal=TRUE, olList=NULL, pOlapList=NULL, + absVal=TRUE, rsMatList=NULL, - signalList=NULL, rsInfo=NULL, + rsInfo=NULL, centerGenomicSignal=TRUE, centerTargetVar=TRUE, returnCovInfo=TRUE) { + signalList <- NULL # if vector is given, return error if (is.null(dim(targetVar))) { stop("`targetVar` should be a matrix or data.frame") @@ -580,13 +579,33 @@ runCOCOA <- function(genomicSignal, absVal <- FALSE } + if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", + "proportionWeightedMean"))) { + + if (is.null(rsMatList)) { + olMatRes <- olToMat(signalListCoord = signalCoord, + GRList = GRList, + scoringMetric = scoringMetric) + rsMatList <- olMatRes[[1]] + rsInfo <- olMatRes[[2]] + } + if (is.null(signalList)) { + signalList <- splitSignal(signal = featureLabelCor, + maxRow = nrow(rsMatList[[1]])) + } + + } + # run COCOA thisPermRes <- aggregateSignalGRList(signal=featureLabelCor, signalCoord=signalCoord, GRList=GRList, signalCol = signalCol, scoringMetric = scoringMetric, verbose = verbose, - absVal = absVal, olList = olList, pOlapList=pOlapList, + absVal = absVal, # olList = olList, pOlapList=pOlapList, + rsMatList=rsMatList, + rsInfo = rsInfo, + signalList=signalList, returnCovInfo=returnCovInfo) # return From 6719706d9180539e07752d6ce02f0b2df3936e98 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 20:35:40 -0500 Subject: [PATCH 06/38] Fix typo --- R/COCOA.R | 1 - R/permutation.R | 7 ------- 2 files changed, 8 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 7e49cd5..7e71f3b 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -678,7 +678,6 @@ aggregateSignalGRList <- function(signal, returnCovInfo=returnCovInfo)) } - } if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { diff --git a/R/permutation.R b/R/permutation.R index f0a589e..ba25dba 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -582,13 +582,6 @@ runCOCOA <- function(genomicSignal, if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { - if (is.null(rsMatList)) { - olMatRes <- olToMat(signalListCoord = signalCoord, - GRList = GRList, - scoringMetric = scoringMetric) - rsMatList <- olMatRes[[1]] - rsInfo <- olMatRes[[2]] - } if (is.null(signalList)) { signalList <- splitSignal(signal = featureLabelCor, maxRow = nrow(rsMatList[[1]])) From 172948b1c4b2767c1cec46777f96eea285b38617 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 21:03:39 -0500 Subject: [PATCH 07/38] Fix typo --- R/permutation.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/permutation.R b/R/permutation.R index ba25dba..caca4bf 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -280,7 +280,7 @@ runCOCOAPerm <- function(genomicSignal, ######################################################################### # create region set overlap matrix # this code should be after code modifying "signal" - if (is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + if (is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { olMatRes <- olToMat(signalListCoord = signalCoord, From 15ccc6de6e59bb40cc68f048e190a8dcc9aa4d54 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 21:12:32 -0500 Subject: [PATCH 08/38] Fix bug --- R/permutation.R | 8 ++------ 1 file 
changed, 2 insertions(+), 6 deletions(-) diff --git a/R/permutation.R b/R/permutation.R index caca4bf..c2b8e95 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -280,18 +280,14 @@ runCOCOAPerm <- function(genomicSignal, ######################################################################### # create region set overlap matrix # this code should be after code modifying "signal" - if (is.null(rsMatList) && (scoringMetric %in% c("simpleMean", + if (scoringMetric %in% c("simpleMean", "regionMean", - "proportionWeightedMean"))) { + "proportionWeightedMean")) { olMatRes <- olToMat(signalListCoord = signalCoord, GRList = GRList, scoringMetric = scoringMetric) rsMatList <- olMatRes[[1]] rsInfo <- olMatRes[[2]] - if (is.null(signalList)) { - signalMatList <- splitSignal(signal = signal, - maxRow = nrow(rsMatList[[1]])) - } } From 9a02d29f4b64d712e27095f7d01c756d09950ad3 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 21:23:16 -0500 Subject: [PATCH 09/38] Fix typo --- R/COCOA.R | 2 +- R/permutation.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 7e71f3b..415f111 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -643,7 +643,7 @@ aggregateSignalGRList <- function(signal, # "regionMean", # "proportionWeightedMean")) { # if (is.null(rsMatList)) { - # olMatRes <- olToMat(signalListCoord = signalCoord, + # olMatRes <- olToMat(signalCoord = signalCoord, # GRList = GRList, # scoringMetric = scoringMetric) # rsMatList <- olMatRes[[1]] diff --git a/R/permutation.R b/R/permutation.R index c2b8e95..ea9e2d6 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -283,7 +283,7 @@ runCOCOAPerm <- function(genomicSignal, if (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean")) { - olMatRes <- olToMat(signalListCoord = signalCoord, + olMatRes <- olToMat(signalCoord = signalCoord, GRList = GRList, scoringMetric = scoringMetric) rsMatList <- olMatRes[[1]] From e899e5fa1b3f7b67d7ab2a1af39f2c1ece5e8007 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 21:44:29 -0500 Subject: [PATCH 10/38] Fix param name --- R/COCOA.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 415f111..a5d745d 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -2375,7 +2375,7 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { } # multiply region set matrices with data matrices to get COCOA score -matScore <- function(rsMatList, signalMatList, covInfo) { +matScore <- function(rsMatList, signalMatList, rsInfo) { # rsMatList mats are features X region sets # signalMatList is features X target variables @@ -2386,7 +2386,7 @@ matScore <- function(rsMatList, signalMatList, covInfo) { # combine results. Already normalized to one so can just add to get mean. scoreDF <- do.call("+", scoreL) - resultsDF <- cbind(scoreDF, covInfo) + resultsDF <- cbind(scoreDF, rsInfo) return(resultsDF) } From aeb0447cade64e970ce9e0f5d4174827dba6bcfc Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 22:03:12 -0500 Subject: [PATCH 11/38] Fix typo --- R/COCOA.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/COCOA.R b/R/COCOA.R index a5d745d..503b5fe 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -661,7 +661,7 @@ aggregateSignalGRList <- function(signal, "regionMean", "proportionWeightedMean"))) { resultsDF <- matScore(rsMatList = rsMatList, - signalMatList=signalMatList, + signalMatList=signalList, rsInfo=rsInfo) } else { # not matrix COCOA. 
Old COCOA with data.table # apply over the list of region sets From 109d66653086f70f07410146f5a7f29e6eec1a8e Mon Sep 17 00:00:00 2001 From: j-lawson Date: Fri, 12 Feb 2021 10:15:18 -0500 Subject: [PATCH 12/38] Fix bug --- R/COCOA.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 503b5fe..1e816ff 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -2381,10 +2381,10 @@ matScore <- function(rsMatList, signalMatList, rsInfo) { # signalMatList is features X target variables # multiply matrices scoreL <- mapply(FUN = function(x, y) t(x) %*% y, - x=rsMatList, y=signalMatList) + x=rsMatList, y=signalMatList, SIMPLIFY = FALSE) # each item in scoreL is region sets X target variables # combine results. Already normalized to one so can just add to get mean. - scoreDF <- do.call("+", scoreL) + scoreDF <- Reduce("+", scoreL) resultsDF <- cbind(scoreDF, rsInfo) return(resultsDF) From d31c4042851f2bfe1cd800e52a5063b3d522bb58 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Sat, 6 Feb 2021 21:33:28 -0500 Subject: [PATCH 13/38] Function to make region set overlap matrix and matrix related changes --- R/COCOA.R | 248 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 192 insertions(+), 56 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index efedccb..5de415b 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -108,7 +108,7 @@ if (getRversion() >= "2.15.1") { #' is "proportionWeightedMean". This vector should contain the proportion of #' each regionSet region that is overlapped by a signalCoord region. The #' order of pOlap should be the same as the overlaps in rsOL. -#' @param rsOLMat +#' @param rsOLMat Matrix. #' @template returnCovInfo #' @template checkInput @@ -293,43 +293,46 @@ aggregateSignal <- function(signal, # would rounding speed up aggregation?, potentially make a sparse matrix # if a lot of entries became 0 + # the scoring metrics that support matrix scoring + if ((scoringMetric %in% c("simpleMean", + "regionMean", + "proportionWeightedMean")) & !is.null(rsOLMat)) { + + # dim(rsMat) + # loadings<-loadings[, c("PC1", "PC2")] + # dim(loadings) + # + # loadings <- abs(loadings) + + # sum per region set + rsScoresMatrix <- t(rsMat) %*% loadings + # normalize for regions covered (for average) + covCount <- colSums(rsMat) + results <- as.data.frame(apply(rsScoresMatrix, + MARGIN = 2, + FUN = function(x) x/covCount)) + results$signalCoverage <- covCount + + + # don't add coverage columns since they are the same every time + # and only need to be calculated once + + } + + # works for both singleBase and multiBase (UPDATE: did, but not matrix scoring) if (scoringMetric == "simpleMean") { - if ((signalCoordType == "multiBase") & !is.null(rsOLMat)) { - - dim(rsMat) - loadings<-loadings[, c("PC1", "PC2")] - dim(loadings) - - loadings <- abs(loadings) - rsScoresMatrix <- t(rsMat) %*% loadings - covCount <- colSums(rsMat) - results <- as.data.frame(apply(rsScoresMatrix, - MARGIN = 2, - FUN = function(x) x/covCount)) - results$signalCoverage <- covCount - # rsOLMat does not give the regionSetCoverage, totalRegionNumber - # or meanRegionSize - - # add these columns? 
- # results[, signalCoverage := 0] - # results[, regionSetCoverage := 0] - # results[, totalRegionNumber := numOfRegions] - # results[, meanRegionSize := round(mean(width(regionSet)), 1)] - - } else { - loadAgMain <- as.data.table(regionOLMean(signalDT = loadingDT, - signalGR = signalCoord, - regionSet = regionSet, - calcCols= signalCol, - metric = "mean", - rsOL = rsOL, - returnCovInfo=returnCovInfo)) - results <- .formatResults(loadAgMain, scoringMetric = scoringMetric, - regionSet=regionSet, signalCol = signalCol, - returnCovInfo=returnCovInfo) - } + loadAgMain <- as.data.table(regionOLMean(signalDT = loadingDT, + signalGR = signalCoord, + regionSet = regionSet, + calcCols= signalCol, + metric = "mean", + rsOL = rsOL, + returnCovInfo=returnCovInfo)) + results <- .formatResults(loadAgMain, scoringMetric = scoringMetric, + regionSet=regionSet, signalCol = signalCol, + returnCovInfo=returnCovInfo) } else if (scoringMetric == "simpleMedian") { # scoring singleBase and multiBase both with this function for @@ -488,6 +491,7 @@ aggregateSignal <- function(signal, #' @template absVal #' @template olList #' @template pOlapList +#' @param rsOLMat. #' @template returnCovInfo #' @return Data.frame of results, one row for each region set. #' It has the following columns: @@ -532,7 +536,8 @@ aggregateSignalGRList <- function(signal, scoringMetric = "default", verbose = TRUE, absVal=TRUE, olList=NULL, - pOlapList=NULL, returnCovInfo=TRUE) { + pOlapList=NULL, rsOLMat=NULL, + returnCovInfo=TRUE) { ################### checking inputs ################################# @@ -613,6 +618,12 @@ aggregateSignalGRList <- function(signal, Check spelling/capitalization.")) } } + + # create region set overlap matrix (only used for multiBase) + if (is.null(rsOLMat) && (signalCoordType == "multiBase")) { + olToMat(signalCoord = signalCoord, GRList = GRList) + } + # convert object class outside aggregateSignal to extra prevent copying # (one scoring method needs `signal` as a matrix though) @@ -2232,36 +2243,161 @@ regionOLMean <- function(signalDT, signalGR, regionSet, # @template signalCoord # @template GRList -# @value Returns a matrix where each column corresponds to one region set -# and rows are data regions -olToMat = function(signalCoord, GRList) { +# @template scoringMetric +# @value Returns a list with two items: 1. a weight matrix where each +# column corresponds to one region set +# and rows are data regions. 2. A data.frame with size and coverage information +# about the region sets. + +# Matrix weights depend on the scoringMetric +# All signalCoord regions and region set regions should be run through function +# at the same time so that there will be proper weighting if a region set +# region overlaps multiple signalCoord regions and a scoringMetric other than +# "simpleMean" is being used. 
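+# (Illustration with made-up counts: under "regionMean" a region overlapped by
+# 3 signalCoord rows gives each of those rows weight 1/3; dividing by the
+# number of overlapped regions then makes the column sum to 1, so the matrix
+# product is a mean of per-region means.)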
+# @examples data("brcaMCoord1") +# data("nrf1_chr1") +# data("esr1_chr1") +# myGRL <- GRangesList(esr1_chr1, nrf1_chr1) +# +olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { + + # calculate overlaps only once # region set must be subject to fit with scoring functions olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, subject = x)) # each column is a region set - rsMat = matrix(data = rep(0, length(GRList) * length(signalCoord)), + rsMat <- matrix(data = rep(0, length(GRList) * length(signalCoord)), nrow=length(signalCoord)) - for (i in seq_along(GRList)) { - rsMat[unique(queryHits(olList[[i]])), i] = 1 + + nMat <- ceiling(length(signalCoord) / maxRow) + finalMatRows <- length(signalCoord) %% maxRow + if (finalMatRows == 0) { + finalMatRows = maxRow + } + # rep(x, 0) is fine if nMat=1 + matRowNVec = c(rep(maxRow, nMat-1), finalMatRows) + rsMatList <- list() + if (nMat > 1) { + for (matCount in 1:(nMat-1)) { + rsMatList[[matCount]] = matrix(data=rep(0, maxRow * length(GRList), + nrow=maxRow)) + } + # final matrix might not be the same size + rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList), + nrow=finalMatRows)) + + } else { # nMat = 1 + rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList), + nrow=finalMatRows)) + } + ####################################################################### + # tmpVec = rep(0, length(signalCoord)) + if (scoringMetric == "simpleMean") { + for (i in seq_along(GRList)) { + if (length(olList[[i]]) != 0) { + tmpCount = 1 + tmpVec = rep(0, length(signalCoord)) + # normalize by number so that matrix multiplication during COCOA scoring will produce mean + tmpVec[unique(queryHits(olList[[i]]))] <- 1 / length(unique(queryHits(olList[[i]]))) + + # for coordinate chunks + for (j in seq_along(rsMatList)) { + rsMatList[[j]][, i] <- tmpVec[tmpCount:(tmpCount + matRowNVec[j]-1)] + tmpCount = tmpCount + matRowNVec[j] + } + } + } + } else if (scoringMetric == "regionMean") { + # instead of 1 give weight proportional to how many signalCoord are + # overlapped + + for (i in seq_along(GRList)) { + if (length(olList[[i]]) != 0) { + tmpCount = 1 + tmpVec = rep(0, length(signalCoord)) + # aggregate number of overlaps by region set region + tmp <- as.data.table(olList[[i]]) + # want the count per region set number + tmp[, rCount := (1/.N), , by=subjectHits] + normFactor <- sum(tmp$rCount) + tmpVec[tmp$queryHits] <- tmp$rCount / normFactor + + # for coordinate chunks + for (j in seq_along(rsMatList)) { + rsMatList[[j]][, i] <- tmpVec[tmpCount:(tmpCount + matRowNVec[j]-1)] + tmpCount = tmpCount + matRowNVec[j] + } + } + } + + } else if (scoringMetric == "proportionWeightedMean") { + + for (i in seq_along(GRList)) { + if (length(olList[[i]]) != 0) { + tmpCount = 1 + tmpVec = rep(0, length(signalCoord)) + olap <- pintersect(GRList[[i]][subjectHits(olList[[i]])], + signalCoord[queryHits(olList[[i]])]) + pOlap <- width(olap) / width(GRList[[i]][subjectHits(olList[[i]])]) + + # weighted average + denom <- sum(pOlap) + + # aggregate pOlap by signalCoord region + olDT <- data.table(queryHits = queryHits(olList[[i]]), + pOlap=pOlap/denom) + normDT <- olDT[, .(coordSum := sum(pOlap)), by=queryHits)] + tmpVec[normDT$queryHits] <- normDT$coordSum + + # for coordinate chunks + for (j in seq_along(rsMatList)) { + rsMatList[[j]][, i] <- tmpVec[tmpCount:(tmpCount + matRowNVec[j]-1)] + tmpCount = tmpCount + matRowNVec[j] + } + + + # below comment outdated? 
+ # some rows may be duplicated if a signalMat region overlapped multiple + # regions from signalGR but that is ok + # signalMat <- signalMat[queryHits(hits), ] # done in next step to prevent extra copying + + + + } + } + } else { + stop("The given scoringMetric cannot be used with matrix scoring.") } + + + lapply(X = rsMat, FUN = function(x) colnames(x) <- names(GRList)) - colnames(rsMat) = names(GRList) - return(rsMat) + + totalRegionNumber <- vapply(X = GRList, FUN = length, FUN.VALUE = -1) + meanRegionSize <- + # list item 2 + rsInfo = data.frame(rsName=names(GRList), signalCoverage, + regionSetCoverage, totalRegionNumber, + meanRegionSize) + # results[, signalCoverage := 0] + # results[, regionSetCoverage := 0] + # results[, totalRegionNumber := numOfRegions] + # results[, meanRegionSize := round(mean(width(regionSet)), 1)] + + + # overlap matrix and region set coverage info as data.frame + return(list(rsMatList, rsInfo)) # totalRegionNumber = sapply(X = GRList, length) # meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x)))) } -rsMat = olToMat() - -dim(rsMat) -loadings=loadings[, c("PC1", "PC2")] -dim(loadings) - -loadings = abs(loadings) -rsScoresMatrix = t(rsMat) %*% loadings -covCount = colSums(rsMat) -rsScoresMatrix = apply(rsScoresMatrix, MARGIN = 2, FUN = function(x) x/covCount) -View(rsScoresMatrix[order(rsScoresMatrix[,"PC1"], decreasing = T),]) -View(rsScoresMatrix[order(rsScoresMatrix[,"PC2"], decreasing = T),]) +# rsMat = olToMat() +# loadings=loadings[, c("PC1", "PC2")] +# +# rsScoresMatrix = t(rsMat) %*% loadings +# covCount = colSums(rsMat) +# rsScoresMatrix = apply(rsScoresMatrix, MARGIN = 2, FUN = function(x) x/covCount) +# View(rsScoresMatrix[order(rsScoresMatrix[,"PC1"], decreasing = T),]) +# View(rsScoresMatrix[order(rsScoresMatrix[,"PC2"], decreasing = T),]) From b13acd2d6ac358189566a557669cc34e6d72655c Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 12:54:10 -0500 Subject: [PATCH 14/38] Fix bugs matrix scoring --- R/COCOA.R | 192 ++++++++++++++++++++++++++++++++---------------- R/permutation.R | 24 +++++- 2 files changed, 150 insertions(+), 66 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 5de415b..8b26695 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -536,7 +536,8 @@ aggregateSignalGRList <- function(signal, scoringMetric = "default", verbose = TRUE, absVal=TRUE, olList=NULL, - pOlapList=NULL, rsOLMat=NULL, + pOlapList=NULL, rsMatList=NULL, + signalList=NULL, rsInfo=NULL, returnCovInfo=TRUE) { ################### checking inputs ################################# @@ -618,12 +619,6 @@ aggregateSignalGRList <- function(signal, Check spelling/capitalization.")) } } - - # create region set overlap matrix (only used for multiBase) - if (is.null(rsOLMat) && (signalCoordType == "multiBase")) { - olToMat(signalCoord = signalCoord, GRList = GRList) - } - # convert object class outside aggregateSignal to extra prevent copying # (one scoring method needs `signal` as a matrix though) @@ -644,7 +639,30 @@ aggregateSignalGRList <- function(signal, absVal <- FALSE } - if (is.null(olList)) { + # create region set overlap matrix + # this code should be after code modifying "signal" + if (is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + "regionMean", + "proportionWeightedMean"))) { + olMatRes <- olToMat(signalListCoord = signalCoord, + GRList = GRList, + scoringMetric = scoringMetric) + rsMatList <- olMatRes[[1]] + rsInfo <- olMatRes[[2]] + if (is.null(signalList)) { + signalMatList <- splitSignal(signal = signal, + maxRow = 
nrow(rsMatList[[1]])) + } + } + + + if (!is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + "regionMean", + "proportionWeightedMean"))) { + resultsDF <- matScore(rsMatList = rsMatList, + signalMatList=signalMatList, + rsInfo=rsInfo) + } else if (is.null(olList)) { # apply over the list of region sets resultsList <- lapplyAlias(GRList, function(x) aggregateSignal( @@ -712,19 +730,15 @@ aggregateSignalGRList <- function(signal, } - - resultsDT <- do.call(rbind, resultsList) - - # # add names if they are present - # if (!is.null(names(GRList))) { - # # resultsDT[, rsName := names(GRList)] - # row.names(resultsDT) <- names(GRList) - # } - - # rsName <- row.names(resultsDT) - resultsDF <- as.data.frame(resultsDT) #, row.names = rsName) - - return(resultsDF) + if (!is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + "regionMean", + "proportionWeightedMean"))) { + return(resultsDF) + } else { + resultsDT <- do.call(rbind, resultsList) + resultsDF <- as.data.frame(resultsDT) + return(resultsDF) + } } # @param dataMat columns of dataMat should be samples/patients, rows should be genomic signal @@ -753,7 +767,7 @@ createCorFeatureMat <- function(dataMat, featureMat, centerDataMat=TRUE, centerFeatureMat = TRUE, testType="cor", covariate=NULL) { - featureMat <- as.matrix(featureMat) + featureMat <- as.matrix(featureMat) # copies? featureNames <- colnames(featureMat) nFeatures <- ncol(featureMat) nDataDims <- nrow(dataMat) @@ -775,7 +789,7 @@ createCorFeatureMat <- function(dataMat, featureMat, } # avoid this copy and/or delay transpose until after calculating correlation? - dataMat <- as.data.frame(t(dataMat)) + dataMat <- as.data.frame(t(dataMat)) # copies if (testType == "cor") { @@ -2258,19 +2272,27 @@ regionOLMean <- function(signalDT, signalGR, regionSet, # data("nrf1_chr1") # data("esr1_chr1") # myGRL <- GRangesList(esr1_chr1, nrf1_chr1) +# res = olToMat(signalCoord=, GRList=myGRL, scoringMetric="regionMean") +# olMatList <- res[[1]] +# coverageInfo <- res[[2]] # olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { - + + # input checks + if (is.null(names(GRList))) { + names(GRList) <- paste0("regionSet", seq_along(GRList)) + } + ########################################################################### # calculate overlaps only once # region set must be subject to fit with scoring functions olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, subject = x)) - # each column is a region set - rsMat <- matrix(data = rep(0, length(GRList) * length(signalCoord)), - nrow=length(signalCoord)) - + # # each column is a region set + # rsMat <- matrix(data = rep(0, length(GRList) * length(signalCoord)), + # nrow=length(signalCoord)) + # nMat <- ceiling(length(signalCoord) / maxRow) finalMatRows <- length(signalCoord) %% maxRow if (finalMatRows == 0) { @@ -2281,22 +2303,32 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { rsMatList <- list() if (nMat > 1) { for (matCount in 1:(nMat-1)) { - rsMatList[[matCount]] = matrix(data=rep(0, maxRow * length(GRList), - nrow=maxRow)) + rsMatList[[matCount]] = matrix(data=rep(0, maxRow * length(GRList)), + nrow=maxRow) + colnames(rsMatList[[matCount]]) <- names(GRList) } # final matrix might not be the same size - rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList), - nrow=finalMatRows)) + rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList)), + nrow=finalMatRows) + colnames(rsMatList[[nMat]]) <- names(GRList) } else { # nMat = 1 - rsMatList[[nMat]] = 
matrix(data=rep(0, finalMatRows * length(GRList), - nrow=finalMatRows)) + rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList)), + nrow=finalMatRows) + colnames(rsMatList[[nMat]]) <- names(GRList) } + + regionSetCoverage <- rep(0, length(GRList)) + signalCoverage <- rep(0, length(GRList)) + ####################################################################### # tmpVec = rep(0, length(signalCoord)) if (scoringMetric == "simpleMean") { for (i in seq_along(GRList)) { + if (length(olList[[i]]) != 0) { + signalCoverage[i] = length(unique(queryHits(olList[[i]]))) + regionSetCoverage[i] = length(unique(subjectHits(olList[[i]]))) tmpCount = 1 tmpVec = rep(0, length(signalCoord)) # normalize by number so that matrix multiplication during COCOA scoring will produce mean @@ -2306,8 +2338,8 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { for (j in seq_along(rsMatList)) { rsMatList[[j]][, i] <- tmpVec[tmpCount:(tmpCount + matRowNVec[j]-1)] tmpCount = tmpCount + matRowNVec[j] - } - } + } + } } } else if (scoringMetric == "regionMean") { # instead of 1 give weight proportional to how many signalCoord are @@ -2315,6 +2347,8 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { for (i in seq_along(GRList)) { if (length(olList[[i]]) != 0) { + signalCoverage[i] = length(unique(queryHits(olList[[i]]))) + regionSetCoverage[i] = length(unique(subjectHits(olList[[i]]))) tmpCount = 1 tmpVec = rep(0, length(signalCoord)) # aggregate number of overlaps by region set region @@ -2333,9 +2367,11 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { } } else if (scoringMetric == "proportionWeightedMean") { - + sumProportionOverlap = rep(0, length(GRList)) for (i in seq_along(GRList)) { - if (length(olList[[i]]) != 0) { + if (length(olList[[i]]) != 0) { + signalCoverage[i] = length(unique(queryHits(olList[[i]]))) + regionSetCoverage[i] = length(unique(subjectHits(olList[[i]]))) tmpCount = 1 tmpVec = rep(0, length(signalCoord)) olap <- pintersect(GRList[[i]][subjectHits(olList[[i]])], @@ -2344,11 +2380,12 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { # weighted average denom <- sum(pOlap) + sumProportionOverlap[i] = denom # aggregate pOlap by signalCoord region olDT <- data.table(queryHits = queryHits(olList[[i]]), pOlap=pOlap/denom) - normDT <- olDT[, .(coordSum := sum(pOlap)), by=queryHits)] + normDT <- olDT[, .(coordSum = sum(pOlap)), by=queryHits] tmpVec[normDT$queryHits] <- normDT$coordSum # for coordinate chunks @@ -2362,42 +2399,67 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { # some rows may be duplicated if a signalMat region overlapped multiple # regions from signalGR but that is ok # signalMat <- signalMat[queryHits(hits), ] # done in next step to prevent extra copying - - - } } } else { stop("The given scoringMetric cannot be used with matrix scoring.") } - - - lapply(X = rsMat, FUN = function(x) colnames(x) <- names(GRList)) - totalRegionNumber <- vapply(X = GRList, FUN = length, FUN.VALUE = -1) - meanRegionSize <- + meanRegionSize <- vapply(X = GRList, FUN = function(x) mean(width(x)), + FUN.VALUE = -1) # list item 2 - rsInfo = data.frame(rsName=names(GRList), signalCoverage, - regionSetCoverage, totalRegionNumber, - meanRegionSize) - # results[, signalCoverage := 0] - # results[, regionSetCoverage := 0] - # results[, totalRegionNumber := numOfRegions] - # results[, meanRegionSize := round(mean(width(regionSet)), 1)] - - - # overlap matrix and region set coverage 
info as data.frame + if (scoringMetric == "proportionWeightedMean") { + rsInfo = data.frame(rsName=names(GRList), signalCoverage, + regionSetCoverage, sumProportionOverlap, + totalRegionNumber, + meanRegionSize) + } else { + rsInfo = data.frame(rsName=names(GRList), signalCoverage, + regionSetCoverage, totalRegionNumber, + meanRegionSize) + } + + # overlap matrix list and region set coverage info as data.frame return(list(rsMatList, rsInfo)) # totalRegionNumber = sapply(X = GRList, length) # meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x)))) } -# rsMat = olToMat() -# loadings=loadings[, c("PC1", "PC2")] -# -# rsScoresMatrix = t(rsMat) %*% loadings -# covCount = colSums(rsMat) -# rsScoresMatrix = apply(rsScoresMatrix, MARGIN = 2, FUN = function(x) x/covCount) -# View(rsScoresMatrix[order(rsScoresMatrix[,"PC1"], decreasing = T),]) -# View(rsScoresMatrix[order(rsScoresMatrix[,"PC2"], decreasing = T),]) +# multiply region set matrices with data matrices to get COCOA score +matScore <- function(rsMatList, signalMatList, covInfo) { + + # rsMatList mats are features X region sets + # signalMatList is features X target variables + # multiply matrices + scoreL <- mapply(FUN = function(x, y) t(x) %*% y, + x=rsMatList, y=signalMatList) + # each item in scoreL is region sets X target variables + # combine results. Already normalized to one so can just add to get mean. + scoreDF <- do.call("+", scoreL) + + resultsDF <- cbind(scoreDF, covInfo) + return(resultsDF) +} + +splitSignal <- function(signal, maxRow=500000) { + + signalList <- list() + nMat <- ceiling(nrow(signal) / maxRow) + + finalMatRows <- nrow(signal) %% maxRow + if (finalMatRows == 0) { + finalMatRows = maxRow + } + # rep(x, 0) is fine if nMat=1 + matRowNVec = c(rep(maxRow, nMat-1), finalMatRows) + tmpCount = 1 + for (j in seq(nMat)) { + signalList[[j]] <- signal[tmpCount:(tmpCount + matRowNVec[j]-1), ] + tmpCount = tmpCount + matRowNVec[j] + } + + return(signalList) +} + + diff --git a/R/permutation.R b/R/permutation.R index f1bd41d..7bdfda9 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -277,8 +277,25 @@ runCOCOAPerm <- function(genomicSignal, } else { pOlapList <- NULL } + ######################################################################### + # create region set overlap matrix + # this code should be after code modifying "signal" + if (is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + "regionMean", + "proportionWeightedMean"))) { + olMatRes <- olToMat(signalListCoord = signalCoord, + GRList = GRList, + scoringMetric = scoringMetric) + rsMatList <- olMatRes[[1]] + rsInfo <- olMatRes[[2]] + if (is.null(signalList)) { + signalMatList <- splitSignal(signal = signal, + maxRow = nrow(rsMatList[[1]])) + } + } - ################# + + ####################################################################### indList <- list() @@ -521,6 +538,8 @@ runCOCOA <- function(genomicSignal, variationMetric = "cor", scoringMetric="default", verbose=TRUE, absVal=TRUE, olList=NULL, pOlapList=NULL, + rsMatList=NULL, + signalList=NULL, rsInfo=NULL, centerGenomicSignal=TRUE, centerTargetVar=TRUE, returnCovInfo=TRUE) { @@ -545,6 +564,9 @@ runCOCOA <- function(genomicSignal, targetVar <- data.frame(targetVar[sampleOrder, ]) colnames(targetVar) <- featureNames + + + # calculate correlation featureLabelCor <- createCorFeatureMat(dataMat = genomicSignal, featureMat = targetVar, From 4c9cfa70a332e6f1c2c9eb07d27045c95bc399bf Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 20:30:34 -0500 Subject: [PATCH 15/38] code to 
support matrix calculations --- R/COCOA.R | 113 ++++++++---------------------- R/permutation.R | 183 ++++++++++++++++++++++++++---------------------- 2 files changed, 132 insertions(+), 164 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 8b26695..deac025 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -108,7 +108,7 @@ if (getRversion() >= "2.15.1") { #' is "proportionWeightedMean". This vector should contain the proportion of #' each regionSet region that is overlapped by a signalCoord region. The #' order of pOlap should be the same as the overlaps in rsOL. -#' @param rsOLMat Matrix. +#' @param rsMatList Matrix. #' @template returnCovInfo #' @template checkInput @@ -150,8 +150,7 @@ aggregateSignal <- function(signal, scoringMetric = "default", verbose = FALSE, absVal=TRUE, - rsOL=NULL, pOlap=NULL, - rsOLMat=NULL, + rsOL=NULL, pOlap=NULL, returnCovInfo=TRUE, .checkInput=TRUE) { @@ -296,7 +295,7 @@ aggregateSignal <- function(signal, # the scoring metrics that support matrix scoring if ((scoringMetric %in% c("simpleMean", "regionMean", - "proportionWeightedMean")) & !is.null(rsOLMat)) { + "proportionWeightedMean")) & !is.null(rsMatList)) { # dim(rsMat) # loadings<-loadings[, c("PC1", "PC2")] @@ -489,9 +488,7 @@ aggregateSignal <- function(signal, # method. returns a 95% confidence interval from the Wilcoxon rank sum test # instead of p value. #' @template absVal -#' @template olList -#' @template pOlapList -#' @param rsOLMat. +#' @param rsMatList. #' @template returnCovInfo #' @return Data.frame of results, one row for each region set. #' It has the following columns: @@ -535,8 +532,8 @@ aggregateSignalGRList <- function(signal, signalCoordType = "default", scoringMetric = "default", verbose = TRUE, - absVal=TRUE, olList=NULL, - pOlapList=NULL, rsMatList=NULL, + absVal=TRUE, + rsMatList=NULL, signalList=NULL, rsInfo=NULL, returnCovInfo=TRUE) { @@ -547,8 +544,8 @@ aggregateSignalGRList <- function(signal, signalCoord=signalCoord, regionSet=NULL, signalCol = signalCol, - GRList=GRList, - olList=olList) + GRList=GRList)#, + #olList=olList) ########## check that dimensions of inputs are consistent # length of signal coord = nrow of signal @@ -639,30 +636,34 @@ aggregateSignalGRList <- function(signal, absVal <- FALSE } - # create region set overlap matrix - # this code should be after code modifying "signal" - if (is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", - "regionMean", - "proportionWeightedMean"))) { - olMatRes <- olToMat(signalListCoord = signalCoord, - GRList = GRList, - scoringMetric = scoringMetric) - rsMatList <- olMatRes[[1]] - rsInfo <- olMatRes[[2]] - if (is.null(signalList)) { - signalMatList <- splitSignal(signal = signal, - maxRow = nrow(rsMatList[[1]])) - } - } + # ## if this function is only being run once, use data.table calculations + # # create region set overlap matrix + # # this code should be after code modifying "signal" + # if (scoringMetric %in% c("simpleMean", + # "regionMean", + # "proportionWeightedMean")) { + # if (is.null(rsMatList)) { + # olMatRes <- olToMat(signalListCoord = signalCoord, + # GRList = GRList, + # scoringMetric = scoringMetric) + # rsMatList <- olMatRes[[1]] + # rsInfo <- olMatRes[[2]] + # } + # + # if (is.null(signalList)) { + # signalMatList <- splitSignal(signal = signal, + # maxRow = nrow(rsMatList[[1]])) + # } + # } - if (!is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { resultsDF <- matScore(rsMatList = rsMatList, 
signalMatList=signalMatList, rsInfo=rsInfo) - } else if (is.null(olList)) { + } else { # not matrix COCOA. Old COCOA with data.table # apply over the list of region sets resultsList <- lapplyAlias(GRList, function(x) aggregateSignal( @@ -675,62 +676,10 @@ aggregateSignalGRList <- function(signal, verbose = verbose, absVal = absVal, pOlap=NULL, returnCovInfo=returnCovInfo)) - # resultsList <- mapply(FUN = function(x) aggregateSignal( - # signal = signal, - # signalCoord = signalCoord, - # signalCoordType = signalCoordType, - # regionSet = x, - # signalCol = signalCol, - # scoringMetric = scoringMetric, - # verbose = verbose, - # absVal = absVal, pOlap=y, - # returnCovInfo=returnCovInfo), x=GRList, y=pOlapList) - #wilcox.conf.int = wilcox.conf.int, - } else { - # # apply over the list of region sets - # resultsList <- lapplyAlias(olList, - # function(x) aggregateSignal( - # signal = signal, - # signalCoord = signalCoord, - # signalCoordType = signalCoordType, - # regionSet = NULL, - # signalCol = signalCol, - # scoringMetric = scoringMetric, - # verbose = verbose, - # absVal = absVal, - # rsOL=x, pOlap=pOlapList, returnCovInfo=returnCovInfo)) - # #wilcox.conf.int = wilcox.conf.int, - if (is.null(pOlapList)) { - # mapply does not work if pOlapList is null and other argument is not null - resultsList <- lapplyAlias(olList, FUN = function(x) aggregateSignal( - signal = signal, - signalCoord = signalCoord, - signalCoordType = signalCoordType, - regionSet = NULL, - signalCol = signalCol, - scoringMetric = scoringMetric, - verbose = verbose, - absVal = absVal, - rsOL=x, - returnCovInfo=returnCovInfo)) - } else { - resultsList <- mapply(FUN = function(x, y) aggregateSignal( - signal = signal, - signalCoord = signalCoord, - signalCoordType = signalCoordType, - regionSet = NULL, - signalCol = signalCol, - scoringMetric = scoringMetric, - verbose = verbose, - absVal = absVal, - rsOL=x, pOlap=y, - returnCovInfo=returnCovInfo), - x=olList, y=pOlapList) - } - + } } - if (!is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { return(resultsDF) diff --git a/R/permutation.R b/R/permutation.R index 7bdfda9..f0a589e 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -29,7 +29,6 @@ #' @template signalCol #' @template scoringMetric #' @template absVal -#' @template olList #' @param centerGenomicSignal Logical. Should rows in genomicSignal #' be centered based on #' their means? (subtracting row mean from each row) @@ -136,7 +135,6 @@ runCOCOAPerm <- function(genomicSignal, signalCol=c("PC1", "PC2"), scoringMetric="default", absVal=TRUE, - olList=NULL, centerGenomicSignal=TRUE, centerTargetVar=TRUE, variationMetric="cor", @@ -160,7 +158,7 @@ runCOCOAPerm <- function(genomicSignal, } rsScores = rsScores[, colsToAnnotate] # prevents error that occurs if extra column is factor - # more efficient to only do once (not that high impact though) + # more efficient to only do once if (centerGenomicSignal) { cpgMeans <- rowMeans(genomicSignal, na.rm = TRUE) # centering before calculating correlation @@ -207,76 +205,78 @@ runCOCOAPerm <- function(genomicSignal, } } - ################# - if (is.null(olList)) { - - ####### - # must take out NA rows before getting OL list. Otherwise later calculations - # will use wrong indices. - - # what happens if there are NAs or Inf in `signal`? 
- # any NAs that overlap the regionSet will cause the score to be NA - if (is(genomicSignal, "data.table")) { - naRows = apply(X = genomicSignal[, , with=FALSE, drop=FALSE], - MARGIN = 1, FUN = function(x) any(is.na(x))) - } else { - naRows = apply(X = genomicSignal[, , drop=FALSE], - MARGIN = 1, FUN = function(x) any(is.na(x))) - } - - if (any(naRows)) { - genomicSignal <- genomicSignal[!naRows, ] - signalCoord <- signalCoord[!naRows] - warning("Removing rows with NA from `genomicSignal`") - } - - ################################################################# - - # calculate overlaps only once - # region set must be subject to fit with scoring functions - olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, - subject = x)) - totalRegionNumber = sapply(X = GRList, length) - meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x)))) - } - - # also calculate coverage info - # @param rsOL - calculateCovInfo <- function(rsOL, - scoringMetric=scoringMetric, - pOlap=NULL) { - - covInfo <- data.frame(signalCoverage=length(unique(queryHits(rsOL))), - regionSetCoverage=length(unique(subjectHits(rsOL)))) - - if (scoringMetric == "proportionWeightedMean") { - - covInfo$sumProportionOverlap <- sum(pOlap) - } - - } - - covInfo <- lapply(X = olList, FUN = function(x) calculateCovInfo(rsOL=x, - scoringMetric = scoringMetric)) - - - if (scoringMetric == "proportionWeightedMean") { - getPOlap <- function(rsOL, signalGR, regionSet) { - olap <- pintersect(signalGR[queryHits(rsOL)], - regionSet[subjectHits(rsOL)]) - pOlap <- width(olap) / width(regionSet[subjectHits(rsOL)]) - return(pOlap) - } - - # list - pOlapList <- mapply(FUN = function(x, y) getPOlap(rsOL = x, - signalGR=signalCoord, - regionSet = y), - x=olList, y=GRList, SIMPLIFY = FALSE) - covInfo$sumProportionOverlap <- sapply(X = pOlapList, FUN = sum) - } else { - pOlapList <- NULL - } + ######################################################################## + # # deprecated. Now using matrix calculations instead + # + # if (is.null(olList)) { + # + # ####### + # # must take out NA rows before getting OL list. Otherwise later calculations + # # will use wrong indices. + # + # # what happens if there are NAs or Inf in `signal`? 
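+    # # (a concrete illustration, not package code: mean(c(0.2, NA))
+    # # is NA unless na.rm=TRUE, so a single NA value that overlaps a
+    # # region set leaves that region set's mean score as NA)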
+ # # any NAs that overlap the regionSet will cause the score to be NA + # if (is(genomicSignal, "data.table")) { + # naRows = apply(X = genomicSignal[, , with=FALSE, drop=FALSE], + # MARGIN = 1, FUN = function(x) any(is.na(x))) + # } else { + # naRows = apply(X = genomicSignal[, , drop=FALSE], + # MARGIN = 1, FUN = function(x) any(is.na(x))) + # } + # + # if (any(naRows)) { + # genomicSignal <- genomicSignal[!naRows, ] + # signalCoord <- signalCoord[!naRows] + # warning("Removing rows with NA from `genomicSignal`") + # } + # + # ################################################################# + # + # # calculate overlaps only once + # # region set must be subject to fit with scoring functions + # olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, + # subject = x)) + # totalRegionNumber = sapply(X = GRList, length) + # meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x)))) + # } + # + # # also calculate coverage info + # # @param rsOL + # calculateCovInfo <- function(rsOL, + # scoringMetric=scoringMetric, + # pOlap=NULL) { + # + # covInfo <- data.frame(signalCoverage=length(unique(queryHits(rsOL))), + # regionSetCoverage=length(unique(subjectHits(rsOL)))) + # + # if (scoringMetric == "proportionWeightedMean") { + # + # covInfo$sumProportionOverlap <- sum(pOlap) + # } + # + # } + # + # covInfo <- lapply(X = olList, FUN = function(x) calculateCovInfo(rsOL=x, + # scoringMetric = scoringMetric)) + # + # + # if (scoringMetric == "proportionWeightedMean") { + # getPOlap <- function(rsOL, signalGR, regionSet) { + # olap <- pintersect(signalGR[queryHits(rsOL)], + # regionSet[subjectHits(rsOL)]) + # pOlap <- width(olap) / width(regionSet[subjectHits(rsOL)]) + # return(pOlap) + # } + # + # # list + # pOlapList <- mapply(FUN = function(x, y) getPOlap(rsOL = x, + # signalGR=signalCoord, + # regionSet = y), + # x=olList, y=GRList, SIMPLIFY = FALSE) + # covInfo$sumProportionOverlap <- sapply(X = pOlapList, FUN = sum) + # } else { + # pOlapList <- NULL + # } ######################################################################### # create region set overlap matrix # this code should be after code modifying "signal" @@ -328,8 +328,8 @@ runCOCOAPerm <- function(genomicSignal, centerGenomicSignal = centerGenomicSignal, centerTargetVar = centerTargetVar, verbose=verbose, - olList=olList, - pOlapList=pOlapList, + rsMatList = rsMatList, + rsInfo = rsInfo, returnCovInfo = returnCovInfo) message(y) # must be ahead of object that is saved as cache, not after tmp @@ -354,8 +354,8 @@ runCOCOAPerm <- function(genomicSignal, scoringMetric=scoringMetric, absVal=absVal, verbose=verbose, - olList=olList, - pOlapList=pOlapList, + rsMatList=rsMatList, + rsInfo = rsInfo, returnCovInfo = returnCovInfo) message(".") return(tmp) @@ -486,8 +486,6 @@ runCOCOAPerm <- function(genomicSignal, #' @template scoringMetric #' @template verbose #' @template absVal -#' @template olList -#' @template pOlapList #' @param centerGenomicSignal Logical. Should rows in genomicSignal #' be centered based on #' their means? 
(subtracting row mean from each row) @@ -537,13 +535,14 @@ runCOCOA <- function(genomicSignal, sampleOrder=1:nrow(targetVar), variationMetric = "cor", scoringMetric="default", verbose=TRUE, - absVal=TRUE, olList=NULL, pOlapList=NULL, + absVal=TRUE, rsMatList=NULL, - signalList=NULL, rsInfo=NULL, + rsInfo=NULL, centerGenomicSignal=TRUE, centerTargetVar=TRUE, returnCovInfo=TRUE) { + signalList <- NULL # if vector is given, return error if (is.null(dim(targetVar))) { stop("`targetVar` should be a matrix or data.frame") @@ -580,13 +579,33 @@ runCOCOA <- function(genomicSignal, absVal <- FALSE } + if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", + "proportionWeightedMean"))) { + + if (is.null(rsMatList)) { + olMatRes <- olToMat(signalListCoord = signalCoord, + GRList = GRList, + scoringMetric = scoringMetric) + rsMatList <- olMatRes[[1]] + rsInfo <- olMatRes[[2]] + } + if (is.null(signalList)) { + signalList <- splitSignal(signal = featureLabelCor, + maxRow = nrow(rsMatList[[1]])) + } + + } + # run COCOA thisPermRes <- aggregateSignalGRList(signal=featureLabelCor, signalCoord=signalCoord, GRList=GRList, signalCol = signalCol, scoringMetric = scoringMetric, verbose = verbose, - absVal = absVal, olList = olList, pOlapList=pOlapList, + absVal = absVal, # olList = olList, pOlapList=pOlapList, + rsMatList=rsMatList, + rsInfo = rsInfo, + signalList=signalList, returnCovInfo=returnCovInfo) # return From 8c929fb47af4757acc8ec545830e28c39800b1ab Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 20:35:40 -0500 Subject: [PATCH 16/38] Fix typo --- R/COCOA.R | 2 -- R/permutation.R | 7 ------- 2 files changed, 9 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index deac025..8f0bd72 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -678,7 +678,6 @@ aggregateSignalGRList <- function(signal, returnCovInfo=returnCovInfo)) } - } if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { @@ -2411,4 +2410,3 @@ splitSignal <- function(signal, maxRow=500000) { return(signalList) } - diff --git a/R/permutation.R b/R/permutation.R index f0a589e..ba25dba 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -582,13 +582,6 @@ runCOCOA <- function(genomicSignal, if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { - if (is.null(rsMatList)) { - olMatRes <- olToMat(signalListCoord = signalCoord, - GRList = GRList, - scoringMetric = scoringMetric) - rsMatList <- olMatRes[[1]] - rsInfo <- olMatRes[[2]] - } if (is.null(signalList)) { signalList <- splitSignal(signal = featureLabelCor, maxRow = nrow(rsMatList[[1]])) From 6d42bd3c58a7935fa0acd47878970132a405b0ac Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 21:03:39 -0500 Subject: [PATCH 17/38] Fix typo --- R/permutation.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/permutation.R b/R/permutation.R index ba25dba..caca4bf 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -280,7 +280,7 @@ runCOCOAPerm <- function(genomicSignal, ######################################################################### # create region set overlap matrix # this code should be after code modifying "signal" - if (is.null(rsOLMat) && (scoringMetric %in% c("simpleMean", + if (is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { olMatRes <- olToMat(signalListCoord = signalCoord, From a7a9c6c76e613a16b22bffde85e6dc1c728e5f4e Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 
2021 21:12:32 -0500 Subject: [PATCH 18/38] Fix bug --- R/permutation.R | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/R/permutation.R b/R/permutation.R index caca4bf..c2b8e95 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -280,18 +280,14 @@ runCOCOAPerm <- function(genomicSignal, ######################################################################### # create region set overlap matrix # this code should be after code modifying "signal" - if (is.null(rsMatList) && (scoringMetric %in% c("simpleMean", + if (scoringMetric %in% c("simpleMean", "regionMean", - "proportionWeightedMean"))) { + "proportionWeightedMean")) { olMatRes <- olToMat(signalListCoord = signalCoord, GRList = GRList, scoringMetric = scoringMetric) rsMatList <- olMatRes[[1]] rsInfo <- olMatRes[[2]] - if (is.null(signalList)) { - signalMatList <- splitSignal(signal = signal, - maxRow = nrow(rsMatList[[1]])) - } } From ca515bc4d427e58853045ebdc9dafd2f60616273 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 21:23:16 -0500 Subject: [PATCH 19/38] Fix typo --- R/COCOA.R | 3 ++- R/permutation.R | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 8f0bd72..26669b8 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -643,7 +643,7 @@ aggregateSignalGRList <- function(signal, # "regionMean", # "proportionWeightedMean")) { # if (is.null(rsMatList)) { - # olMatRes <- olToMat(signalListCoord = signalCoord, + # olMatRes <- olToMat(signalCoord = signalCoord, # GRList = GRList, # scoringMetric = scoringMetric) # rsMatList <- olMatRes[[1]] @@ -2410,3 +2410,4 @@ splitSignal <- function(signal, maxRow=500000) { return(signalList) } + diff --git a/R/permutation.R b/R/permutation.R index c2b8e95..ea9e2d6 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -283,7 +283,7 @@ runCOCOAPerm <- function(genomicSignal, if (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean")) { - olMatRes <- olToMat(signalListCoord = signalCoord, + olMatRes <- olToMat(signalCoord = signalCoord, GRList = GRList, scoringMetric = scoringMetric) rsMatList <- olMatRes[[1]] From 1c7924d403d014719bf18e4bc91e2be4be2dda1e Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 21:44:29 -0500 Subject: [PATCH 20/38] Fix param name --- R/COCOA.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 26669b8..1a78a47 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -2375,7 +2375,7 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { } # multiply region set matrices with data matrices to get COCOA score -matScore <- function(rsMatList, signalMatList, covInfo) { +matScore <- function(rsMatList, signalMatList, rsInfo) { # rsMatList mats are features X region sets # signalMatList is features X target variables @@ -2386,7 +2386,7 @@ matScore <- function(rsMatList, signalMatList, covInfo) { # combine results. Already normalized to one so can just add to get mean. 
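# A small numeric sketch of that claim (illustrative only; `rsMat` and `sig`
# are hypothetical stand-ins for a column-normalized overlap matrix from
# olToMat() and a genomic signal matrix split into row chunks):
#   rsMat <- matrix(c(0.5, 0.5, 0, 0, 0.25, 0.75), nrow=3) # features x region sets; each column sums to 1
#   sig   <- matrix(1:6, nrow=3)                           # features x target variables
#   chunks <- list(1, 2:3)                                 # row chunks, as splitSignal() would produce
#   scoreL <- lapply(chunks, function(i) t(rsMat[i, , drop=FALSE]) %*% sig[i, , drop=FALSE])
#   isTRUE(all.equal(Reduce("+", scoreL), t(rsMat) %*% sig)) # TRUE: block products add up to the full weighted mean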
scoreDF <- do.call("+", scoreL) - resultsDF <- cbind(scoreDF, covInfo) + resultsDF <- cbind(scoreDF, rsInfo) return(resultsDF) } @@ -2410,4 +2410,3 @@ splitSignal <- function(signal, maxRow=500000) { return(signalList) } - From cef9801cdb86807aaf2db7f9f02248d41ec2cab7 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 11 Feb 2021 22:03:12 -0500 Subject: [PATCH 21/38] Fix typo --- R/COCOA.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 1a78a47..a2ae251 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -661,7 +661,7 @@ aggregateSignalGRList <- function(signal, "regionMean", "proportionWeightedMean"))) { resultsDF <- matScore(rsMatList = rsMatList, - signalMatList=signalMatList, + signalMatList=signalList, rsInfo=rsInfo) } else { # not matrix COCOA. Old COCOA with data.table # apply over the list of region sets @@ -2409,4 +2409,3 @@ splitSignal <- function(signal, maxRow=500000) { return(signalList) } - From 851b2c333556159103a172ff8fe12142be7a0374 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Fri, 12 Feb 2021 10:15:18 -0500 Subject: [PATCH 22/38] Fix bug --- R/COCOA.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index a2ae251..c7b1de5 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -2381,10 +2381,10 @@ matScore <- function(rsMatList, signalMatList, rsInfo) { # signalMatList is features X target variables # multiply matrices scoreL <- mapply(FUN = function(x, y) t(x) %*% y, - x=rsMatList, y=signalMatList) + x=rsMatList, y=signalMatList, SIMPLIFY = FALSE) # each item in scoreL is region sets X target variables # combine results. Already normalized to one so can just add to get mean. - scoreDF <- do.call("+", scoreL) + scoreDF <- Reduce("+", scoreL) resultsDF <- cbind(scoreDF, rsInfo) return(resultsDF) From dd15a108652caea390753ff93173215d4fdc458b Mon Sep 17 00:00:00 2001 From: j-lawson Date: Sat, 13 Feb 2021 23:08:31 -0500 Subject: [PATCH 23/38] Make new vignette and update main COCOA function --- NAMESPACE | 1 - R/COCOA.R | 37 +- R/permutation.R | 324 ++++---- man-roxygen/rsMatList.R | 5 + man/aggregateSignal.Rd | 2 +- man/aggregateSignalGRList.Rd | 32 +- man/runCOCOA.Rd | 216 ++--- man/runCOCOAPerm.Rd | 266 ------- man/signalAlongAxis.Rd | 10 + vignettes/COCOA_analysis_details.Rmd | 1100 ++++++++++++++++++++++++++ vignettes/IntroToCOCOA.Rmd | 922 +-------------------- 11 files changed, 1401 insertions(+), 1514 deletions(-) create mode 100755 man-roxygen/rsMatList.R delete mode 100755 man/runCOCOAPerm.Rd create mode 100755 vignettes/COCOA_analysis_details.Rmd diff --git a/NAMESPACE b/NAMESPACE index 1d8f82e..5fb5325 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,7 +12,6 @@ export(regionQuantileByTargetVar) export(rsRankingIndex) export(rsScoreHeatmap) export(runCOCOA) -export(runCOCOAPerm) export(signalAlongAxis) import(BiocGenerics) import(GenomicRanges) diff --git a/R/COCOA.R b/R/COCOA.R index 119bad8..2c0c18b 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -107,8 +107,7 @@ if (getRversion() >= "2.15.1") { #' @param pOlap Numeric vector. Only used if rsOL is given and scoringMetric #' is "proportionWeightedMean". This vector should contain the proportion of #' each regionSet region that is overlapped by a signalCoord region. The -#' order of pOlap should be the same as the overlaps in rsOL. -#' @param rsMatList Matrix. +#' order of pOlap should be the same as the overlaps in rsOL. 
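+#' As a rough sketch (one possible way, mirroring how the package derives
+#' proportion overlaps internally), pOlap could be computed from rsOL with
+#' \code{olap <- pintersect(signalCoord[queryHits(rsOL)], regionSet[subjectHits(rsOL)])}
+#' followed by \code{pOlap <- width(olap) / width(regionSet[subjectHits(rsOL)])}.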
#' @template returnCovInfo #' @template checkInput @@ -287,38 +286,8 @@ aggregateSignal <- function(signal, ########################################################################### # scoring - - # would rounding speed up aggregation?, potentially make a sparse matrix - # if a lot of entries became 0 - - # the scoring metrics that support matrix scoring - if ((scoringMetric %in% c("simpleMean", - "regionMean", - "proportionWeightedMean")) & !is.null(rsMatList)) { - - # dim(rsMat) - # loadings<-loadings[, c("PC1", "PC2")] - # dim(loadings) - # - # loadings <- abs(loadings) - - # sum per region set - rsScoresMatrix <- t(rsMat) %*% loadings - # normalize for regions covered (for average) - covCount <- colSums(rsMat) - results <- as.data.frame(apply(rsScoresMatrix, - MARGIN = 2, - FUN = function(x) x/covCount)) - results$signalCoverage <- covCount - - # don't add coverage columns since they are the same every time - # and only need to be calculated once - - } - - - # works for both singleBase and multiBase (UPDATE: did, but not matrix scoring) + # works for both singleBase and multiBase if (scoringMetric == "simpleMean") { loadAgMain <- as.data.table(regionOLMean(signalDT = loadingDT, @@ -484,7 +453,7 @@ aggregateSignal <- function(signal, # method. returns a 95% confidence interval from the Wilcoxon rank sum test # instead of p value. #' @template absVal -#' @param rsMatList. +#' @template rsMatList #' @template returnCovInfo #' @return Data.frame of results, one row for each region set. #' It has the following columns: diff --git a/R/permutation.R b/R/permutation.R index ea9e2d6..1964c3b 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -88,32 +88,8 @@ #' targetVarCols <- c("PC1", "PC2") #' targetVar <- pcScores[, targetVarCols] #' -#' # give the actual order of samples to `runCOCOA` to get the real scores -#' correctSampleOrder=1:nrow(targetVar) -#' realRSScores <- runCOCOA(genomicSignal=brcaMethylData1, -#' signalCoord=brcaMCoord1, -#' GRList=GRangesList(esr1_chr1, nrf1_chr1), -#' signalCol=c("PC1", "PC2"), -#' targetVar=targetVar, -#' sampleOrder=correctSampleOrder, -#' variationMetric="cor") -#' -#' # give random order of samples to get random COCOA scores -#' # so you start building a null distribution for each region set -#' # (see vignette for example of building a null distribution with `runCOCOA`) -#' randomOrder <- sample(1:nrow(targetVar), -#' size=nrow(targetVar), -#' replace=FALSE) -#' randomRSScores <- runCOCOA(genomicSignal=brcaMethylData1, -#' signalCoord=brcaMCoord1, -#' GRList=GRangesList(esr1_chr1, nrf1_chr1), -#' signalCol=c("PC1", "PC2"), -#' targetVar=targetVar, -#' sampleOrder=randomOrder, -#' variationMetric="cor") -#' #' # runCOCOAPerm -#' permResults <- runCOCOAPerm(genomicSignal=brcaMethylData1, +#' permResults <- runCOCOA(genomicSignal=brcaMethylData1, #' signalCoord=brcaMCoord1, #' GRList=GRangesList(esr1_chr1, nrf1_chr1), #' rsScores=realRSScores, @@ -127,18 +103,18 @@ #' #' @export -runCOCOAPerm <- function(genomicSignal, +runCOCOA <- function(genomicSignal, signalCoord, GRList, - rsScores, targetVar, + rsScores=NULL, signalCol=c("PC1", "PC2"), scoringMetric="default", absVal=TRUE, centerGenomicSignal=TRUE, centerTargetVar=TRUE, variationMetric="cor", - nPerm=300, + nPerm=0, useSimpleCache=TRUE, cacheDir=getwd(), dataID="", @@ -206,78 +182,6 @@ runCOCOAPerm <- function(genomicSignal, } ######################################################################## - # # deprecated. 
Now using matrix calculations instead - # - # if (is.null(olList)) { - # - # ####### - # # must take out NA rows before getting OL list. Otherwise later calculations - # # will use wrong indices. - # - # # what happens if there are NAs or Inf in `signal`? - # # any NAs that overlap the regionSet will cause the score to be NA - # if (is(genomicSignal, "data.table")) { - # naRows = apply(X = genomicSignal[, , with=FALSE, drop=FALSE], - # MARGIN = 1, FUN = function(x) any(is.na(x))) - # } else { - # naRows = apply(X = genomicSignal[, , drop=FALSE], - # MARGIN = 1, FUN = function(x) any(is.na(x))) - # } - # - # if (any(naRows)) { - # genomicSignal <- genomicSignal[!naRows, ] - # signalCoord <- signalCoord[!naRows] - # warning("Removing rows with NA from `genomicSignal`") - # } - # - # ################################################################# - # - # # calculate overlaps only once - # # region set must be subject to fit with scoring functions - # olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, - # subject = x)) - # totalRegionNumber = sapply(X = GRList, length) - # meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x)))) - # } - # - # # also calculate coverage info - # # @param rsOL - # calculateCovInfo <- function(rsOL, - # scoringMetric=scoringMetric, - # pOlap=NULL) { - # - # covInfo <- data.frame(signalCoverage=length(unique(queryHits(rsOL))), - # regionSetCoverage=length(unique(subjectHits(rsOL)))) - # - # if (scoringMetric == "proportionWeightedMean") { - # - # covInfo$sumProportionOverlap <- sum(pOlap) - # } - # - # } - # - # covInfo <- lapply(X = olList, FUN = function(x) calculateCovInfo(rsOL=x, - # scoringMetric = scoringMetric)) - # - # - # if (scoringMetric == "proportionWeightedMean") { - # getPOlap <- function(rsOL, signalGR, regionSet) { - # olap <- pintersect(signalGR[queryHits(rsOL)], - # regionSet[subjectHits(rsOL)]) - # pOlap <- width(olap) / width(regionSet[subjectHits(rsOL)]) - # return(pOlap) - # } - # - # # list - # pOlapList <- mapply(FUN = function(x, y) getPOlap(rsOL = x, - # signalGR=signalCoord, - # regionSet = y), - # x=olList, y=GRList, SIMPLIFY = FALSE) - # covInfo$sumProportionOverlap <- sapply(X = pOlapList, FUN = sum) - # } else { - # pOlapList <- NULL - # } - ######################################################################### # create region set overlap matrix # this code should be after code modifying "signal" if (scoringMetric %in% c("simpleMean", @@ -303,6 +207,27 @@ runCOCOAPerm <- function(genomicSignal, if (useSimpleCache) { + if (is.null(rsScores)) { + simpleCache(paste0("rsScores_", variationMetric, "_", dataID), { + rsScores <- .runCOCOA_old(sampleOrder=1:nrow(targetVar), + genomicSignal=genomicSignal, + signalCoord=signalCoord, + GRList=GRList, + signalCol=colsToAnnotate, + targetVar=targetVar, + variationMetric = variationMetric, + scoringMetric=scoringMetric, + absVal=absVal, + centerGenomicSignal = centerGenomicSignal, + centerTargetVar = centerTargetVar, + verbose=verbose, + rsMatList = rsMatList, + rsInfo = rsInfo, + returnCovInfo = returnCovInfo) + rsScores + }, assignToVariable="rsScores") + } + # create the main permutation cache simpleCache(paste0("rsPermScores_", nPerm, "Perm_", variationMetric, "_", dataID), { @@ -312,7 +237,7 @@ runCOCOAPerm <- function(genomicSignal, # create sub caches, one for each permutation simpleCache(onePermCacheName, cacheSubDir = paste0("rsPermScores_", nPerm, "Perm_", variationMetric, "_", dataID), { - tmp <- runCOCOA(sampleOrder=x, + tmp <- 
.runCOCOA_old(sampleOrder=x, genomicSignal=genomicSignal, signalCoord=signalCoord, GRList=GRList, @@ -339,8 +264,27 @@ runCOCOAPerm <- function(genomicSignal, }, assignToVariable="rsPermScores", cacheDir=cacheDir, ...) } else { + if (is.null(rsScores)) { + rsScores <- .runCOCOA_old(sampleOrder=1:nrow(targetVar), + genomicSignal=genomicSignal, + signalCoord=signalCoord, + GRList=GRList, + signalCol=colsToAnnotate, + targetVar=targetVar, + variationMetric = variationMetric, + scoringMetric=scoringMetric, + absVal=absVal, + centerGenomicSignal = centerGenomicSignal, + centerTargetVar = centerTargetVar, + verbose=verbose, + rsMatList = rsMatList, + rsInfo = rsInfo, + returnCovInfo = returnCovInfo) + } + + helperFun2 <- function(x) { - tmp <- runCOCOA(sampleOrder=x, + tmp <- .runCOCOA_old(sampleOrder=x, genomicSignal=genomicSignal, signalCoord=signalCoord, GRList=GRList, @@ -353,7 +297,7 @@ runCOCOAPerm <- function(genomicSignal, rsMatList=rsMatList, rsInfo = rsInfo, returnCovInfo = returnCovInfo) - message(".") + message(":", appendLF=FALSE) return(tmp) } @@ -439,93 +383,93 @@ runCOCOAPerm <- function(genomicSignal, # zscores } -#' Run COCOA: quantify inter-sample variation, score region sets -#' -#' This is a convenience function that does the two steps of COCOA: -#' quantifying the epigenetic variation and scoring the region sets. -#' This function will return the real COCOA scores if using the default -#' `sampleOrder` parameter values. This -#' function also makes it easy to generate null distributions in order to -#' evaluate the statistical significance of the real COCOA results. -#' You can use the sampleOrder parameter to shuffle the samples, -#' then run COCOA to get fake scores for each region set. By doing -#' this many times, you can build a null distribution for each -#' region set composed of the region set's random scores from each -#' permutation. There are multiple options for quantifying the -#' epigenetic variation, specified by the `variationMetric` parameter. -#' Quantifying the variation for the real/non-permuted COCOA -#' scores should be done with the same -#' variation metric as is used for the random permutations. For an -#' unsupervised analysis using dimensionality reduction, first, the -#' dimensionality reduction is done outside `runCOCOA`, then the -#' latent factors/principal components are input to `runCOCOA` as the -#' sample labels (targetVar parameter) when calculating both the real and -#' also the permutated region set scores. For a supervised analysis, -#' the target variables/phenotypes are the targetVar. -#' See the vignettes for examples. -#' -#' @param sampleOrder numeric. A vector of length (number of samples). If -#' sampleOrder is 1:(number of samples) then this function will return the -#' real COCOA scores. -#' To generate random COCOA scores in order to make -#' null distributions, shuffle the samples in a random order. -#' E.g. sampleOrder = sample(1:ncol(genomicSignal), ncol(genomicSignal)) -#' where ncol(genomicSignal) is the number of samples. -#' Set the seed with set.seed() before making sampleOrder to ensure reproducibility. -#' @template genomicSignal -#' @template signalCoord -#' @template GRList -#' @templateVar usesTargetVar TRUE -#' @template signalCol -#' @template targetVar -#' @template variationMetric -#' @template scoringMetric -#' @template verbose -#' @template absVal -#' @param centerGenomicSignal Logical. Should rows in genomicSignal -#' be centered based on -#' their means? 
(subtracting row mean from each row) -#' @param centerTargetVar Logical. Should columns in targetVar be -#' centered based -#' on their means? (subtract column mean from each column) -#' @template returnCovInfo -#' @return data.frame. The output of aggregateSignalGRList for one permutation. -#' @examples -#' data("esr1_chr1") -#' data("nrf1_chr1") -#' data("brcaMethylData1") -#' data("brcaMCoord1") -#' pcScores <- prcomp(t(brcaMethylData1))$x -#' targetVarCols <- c("PC1", "PC2") -#' targetVar <- pcScores[, targetVarCols] -#' -#' # give the actual order of samples to `runCOCOA` to get the real scores -#' correctSampleOrder=1:nrow(targetVar) -#' realRSScores <- runCOCOA(genomicSignal=brcaMethylData1, -#' signalCoord=brcaMCoord1, -#' GRList=GRangesList(esr1_chr1, nrf1_chr1), -#' signalCol=c("PC1", "PC2"), -#' targetVar=targetVar, -#' sampleOrder=correctSampleOrder, -#' variationMetric="cor") -#' realRSScores -#' -#' # give random order of samples to get random COCOA scores -#' # so you start building a null distribution for each region set -#' # (see vignette for example of building a null distribution with `runCOCOA`) -#' randomOrder <- sample(1:nrow(targetVar), -#' size=nrow(targetVar), -#' replace=FALSE) -#' randomRSScores <- runCOCOA(genomicSignal=brcaMethylData1, -#' signalCoord=brcaMCoord1, -#' GRList=GRangesList(esr1_chr1, nrf1_chr1), -#' signalCol=c("PC1", "PC2"), -#' targetVar=targetVar, -#' sampleOrder=randomOrder, -#' variationMetric="cor") -#' randomRSScores -#' @export -runCOCOA <- function(genomicSignal, +# Run COCOA: quantify inter-sample variation, score region sets +# +# This is a convenience function that does the two steps of COCOA: +# quantifying the epigenetic variation and scoring the region sets. +# This function will return the real COCOA scores if using the default +# `sampleOrder` parameter values. This +# function also makes it easy to generate null distributions in order to +# evaluate the statistical significance of the real COCOA results. +# You can use the sampleOrder parameter to shuffle the samples, +# then run COCOA to get fake scores for each region set. By doing +# this many times, you can build a null distribution for each +# region set composed of the region set's random scores from each +# permutation. There are multiple options for quantifying the +# epigenetic variation, specified by the `variationMetric` parameter. +# Quantifying the variation for the real/non-permuted COCOA +# scores should be done with the same +# variation metric as is used for the random permutations. For an +# unsupervised analysis using dimensionality reduction, first, the +# dimensionality reduction is done outside `runCOCOA`, then the +# latent factors/principal components are input to `runCOCOA` as the +# sample labels (targetVar parameter) when calculating both the real and +# also the permutated region set scores. For a supervised analysis, +# the target variables/phenotypes are the targetVar. +# See the vignettes for examples. +# +# @param sampleOrder numeric. A vector of length (number of samples). If +# sampleOrder is 1:(number of samples) then this function will return the +# real COCOA scores. +# To generate random COCOA scores in order to make +# null distributions, shuffle the samples in a random order. +# E.g. sampleOrder = sample(1:ncol(genomicSignal), ncol(genomicSignal)) +# where ncol(genomicSignal) is the number of samples. +# Set the seed with set.seed() before making sampleOrder to ensure reproducibility. 
+# @template genomicSignal +# @template signalCoord +# @template GRList +# @templateVar usesTargetVar TRUE +# @template signalCol +# @template targetVar +# @template variationMetric +# @template scoringMetric +# @template verbose +# @template absVal +# @param centerGenomicSignal Logical. Should rows in genomicSignal +# be centered based on +# their means? (subtracting row mean from each row) +# @param centerTargetVar Logical. Should columns in targetVar be +# centered based +# on their means? (subtract column mean from each column) +# @template returnCovInfo +# @return data.frame. The output of aggregateSignalGRList for one permutation. +# @examples +# data("esr1_chr1") +# data("nrf1_chr1") +# data("brcaMethylData1") +# data("brcaMCoord1") +# pcScores <- prcomp(t(brcaMethylData1))$x +# targetVarCols <- c("PC1", "PC2") +# targetVar <- pcScores[, targetVarCols] +# +# # give the actual order of samples to `runCOCOA` to get the real scores +# correctSampleOrder=1:nrow(targetVar) +# realRSScores <- .runCOCOA_old(genomicSignal=brcaMethylData1, +# signalCoord=brcaMCoord1, +# GRList=GRangesList(esr1_chr1, nrf1_chr1), +# signalCol=c("PC1", "PC2"), +# targetVar=targetVar, +# sampleOrder=correctSampleOrder, +# variationMetric="cor") +# realRSScores +# +# # give random order of samples to get random COCOA scores +# # so you start building a null distribution for each region set +# # (see vignette for example of building a null distribution with `runCOCOA`) +# randomOrder <- sample(1:nrow(targetVar), +# size=nrow(targetVar), +# replace=FALSE) +# randomRSScores <- .runCOCOA_old(genomicSignal=brcaMethylData1, +# signalCoord=brcaMCoord1, +# GRList=GRangesList(esr1_chr1, nrf1_chr1), +# signalCol=c("PC1", "PC2"), +# targetVar=targetVar, +# sampleOrder=randomOrder, +# variationMetric="cor") +# randomRSScores +# +.runCOCOA_old <- function(genomicSignal, signalCoord, GRList, signalCol, targetVar, sampleOrder=1:nrow(targetVar), diff --git a/man-roxygen/rsMatList.R b/man-roxygen/rsMatList.R new file mode 100755 index 0000000..0f74ec0 --- /dev/null +++ b/man-roxygen/rsMatList.R @@ -0,0 +1,5 @@ +#' @param rsMatList list of matrices or data.frames. Rows should be . +#' Columns should be + +# (aggregateSignalGRList) +# diff --git a/man/aggregateSignal.Rd b/man/aggregateSignal.Rd index 500a93a..ac2931d 100755 --- a/man/aggregateSignal.Rd +++ b/man/aggregateSignal.Rd @@ -16,7 +16,7 @@ aggregateSignal( rsOL = NULL, pOlap = NULL, returnCovInfo = TRUE, - .checkInput = TRUE + .checkInput = FALSE ) } \arguments{ diff --git a/man/aggregateSignalGRList.Rd b/man/aggregateSignalGRList.Rd index 7f68cd6..854f0a1 100755 --- a/man/aggregateSignalGRList.Rd +++ b/man/aggregateSignalGRList.Rd @@ -13,8 +13,9 @@ aggregateSignalGRList( scoringMetric = "default", verbose = TRUE, absVal = TRUE, - olList = NULL, - pOlapList = NULL, + rsMatList = NULL, + signalList = NULL, + rsInfo = NULL, returnCovInfo = TRUE ) } @@ -103,31 +104,8 @@ regions in a region set). Choose FALSE if you expect regions in a given region set to all change in the same direction (all be positively correlated with each other).} -\item{olList}{list. Each list item should be a "SortedByQueryHits" object -(output of findOverlaps function). Each hits object should have the overlap -information between signalCoord and one item of GRList (one unique region set). -The region sets from GRList must be the "subject" in findOverlaps -and signalCoord must be the "query". E.g. findOverlaps(subject=regionSet, -query=signalCoord). 
-Providing this information can greatly improve permutation speed since the -overlaps will not have to be calculated for each permutation. -The "runCOCOAPerm" function calculates this information only once, internally, -so this does not have to be provided when using that function. When using -this parameter, signalCoord, -genomicSignal, and each region set must be in the same order as they were -when olList was created. Otherwise, the wrong genomic loci will be referenced -(e.g. if epigenetic features were filtered out of genomicSignal after olList -was created.)} - -\item{pOlapList}{list. This parameter is only used if the scoring metric is -"proportionWeightedMean" and olList is also provided as an argument. Each -item of the list should be a vector that contains the proportion overlap -between signalCoord and regions from one region set (one item of GRList). -Specifically, each value should be the proportion of the region set region -that is overlapped -by a signalCoord region. -The proportion overlap values should be in the same order as the overlaps -given by olList for the corresponding region set.} +\item{rsMatList}{list of matrices or data.frames. Rows should be . +Columns should be} \item{returnCovInfo}{logical. If TRUE, the following coverage and region set info will diff --git a/man/runCOCOA.Rd b/man/runCOCOA.Rd index 849d1ca..f25d49b 100755 --- a/man/runCOCOA.Rd +++ b/man/runCOCOA.Rd @@ -2,24 +2,31 @@ % Please edit documentation in R/permutation.R \name{runCOCOA} \alias{runCOCOA} -\title{Run COCOA: quantify inter-sample variation, score region sets} +\title{Run COCOA permutations to get p-values} \usage{ runCOCOA( genomicSignal, signalCoord, GRList, - signalCol, targetVar, - sampleOrder = 1:nrow(targetVar), - variationMetric = "cor", + rsScores = NULL, + signalCol = c("PC1", "PC2"), scoringMetric = "default", - verbose = TRUE, absVal = TRUE, - olList = NULL, - pOlapList = NULL, centerGenomicSignal = TRUE, centerTargetVar = TRUE, - returnCovInfo = TRUE + variationMetric = "cor", + nPerm = 0, + useSimpleCache = TRUE, + cacheDir = getwd(), + dataID = "", + testType = "greater", + gammaFitMethod = "mme", + realScoreInDist = TRUE, + force = FALSE, + verbose = TRUE, + returnCovInfo = FALSE, + ... ) } \arguments{ @@ -44,32 +51,26 @@ the same biological annotation). The region set database must be from the same reference genome as the coordinates for the actual data/samples (signalCoord).} -\item{signalCol}{A character vector with the names of the sample variables -of interest/target variables (e.g. PCs or sample phenotypes). - -The columns in `sampleLabels` for which to calculate -the variation related to the epigenetic data -(e.g. correlation) and then to run COCOA on.} - \item{targetVar}{Matrix or data.frame. Rows should be samples. Columns should be the target variables (whatever variable you want to test for association with the epigenetic signal: e.g. PC scores),} -\item{sampleOrder}{numeric. A vector of length (number of samples). If -sampleOrder is 1:(number of samples) then this function will return the -real COCOA scores. -To generate random COCOA scores in order to make -null distributions, shuffle the samples in a random order. -E.g. sampleOrder = sample(1:ncol(genomicSignal), ncol(genomicSignal)) -where ncol(genomicSignal) is the number of samples. -Set the seed with set.seed() before making sampleOrder to ensure reproducibility.} +\item{rsScores}{data.frame. A data.frame with region set +scores. The output of the 'aggregateSignalGRList' function. 
+Each row is a region set. One column for each sample +variable of interest (e.g. PC or sample phenotype). +Also can have columns with info on the overlap between the +region set and the epigenetic data. +Rows should be in the same order as the region sets in GRList +(the list of region sets used to create rsScores.)} -\item{variationMetric}{Character. The metric to use to quantify the -association between each feature in genomicSignal and each target -variable in sampleLabels. -Either "cor" (Pearson correlation), -"cov" (covariation), or "spearmanCor" (Spearman correlation).} +\item{signalCol}{A character vector with the names of the sample variables +of interest/target variables (e.g. PCs or sample phenotypes). + +The columns in `sampleLabels` for which to calculate +the variation related to the epigenetic data +(e.g. correlation) and then to run COCOA on.} \item{scoringMetric}{A character object with the scoring metric. There are different methods available for @@ -104,10 +105,6 @@ weight the signal value when calculating the mean. The denominator of the mean is the sum of all the proportion overlaps.} -\item{verbose}{A "logical" object. Whether progress -of the function should be shown. One -bar indicates the region set is completed.} - \item{absVal}{Logical. If TRUE, take the absolute value of values in signal. Choose TRUE if you think there may be some genomic loci in a region set that will increase and others @@ -116,32 +113,6 @@ regions in a region set). Choose FALSE if you expect regions in a given region set to all change in the same direction (all be positively correlated with each other).} -\item{olList}{list. Each list item should be a "SortedByQueryHits" object -(output of findOverlaps function). Each hits object should have the overlap -information between signalCoord and one item of GRList (one unique region set). -The region sets from GRList must be the "subject" in findOverlaps -and signalCoord must be the "query". E.g. findOverlaps(subject=regionSet, -query=signalCoord). -Providing this information can greatly improve permutation speed since the -overlaps will not have to be calculated for each permutation. -The "runCOCOAPerm" function calculates this information only once, internally, -so this does not have to be provided when using that function. When using -this parameter, signalCoord, -genomicSignal, and each region set must be in the same order as they were -when olList was created. Otherwise, the wrong genomic loci will be referenced -(e.g. if epigenetic features were filtered out of genomicSignal after olList -was created.)} - -\item{pOlapList}{list. This parameter is only used if the scoring metric is -"proportionWeightedMean" and olList is also provided as an argument. Each -item of the list should be a vector that contains the proportion overlap -between signalCoord and regions from one region set (one item of GRList). -Specifically, each value should be the proportion of the region set region -that is overlapped -by a signalCoord region. -The proportion overlap values should be in the same order as the overlaps -given by olList for the corresponding region set.} - \item{centerGenomicSignal}{Logical. Should rows in genomicSignal be centered based on their means? (subtracting row mean from each row)} @@ -150,39 +121,84 @@ their means? (subtracting row mean from each row)} centered based on their means? (subtract column mean from each column)} +\item{variationMetric}{Character. 
The metric to use to quantify the
+association between each feature in genomicSignal and each target
+variable in sampleLabels.
+Either "cor" (Pearson correlation),
+"cov" (covariation), or "spearmanCor" (Spearman correlation).}
+
+\item{nPerm}{Numeric. The number of permutations to do.}
+
+\item{useSimpleCache}{Logical. Whether to use and save caches. Caches
+will be created for each permutation so that if the function is disrupted
+it can restart where it left off. The final results are also saved
+as a cache. See the simpleCache package for more details.}
+
+\item{cacheDir}{Character. The path for the directory in which the
+caches should be saved.}
+
+\item{dataID}{Character. A unique identifier for this dataset
+(for saving results with simpleCache).}
+
+\item{testType}{Character. Parameter for `getPermStat`. Whether to
+create p values based on a two-sided test or a lesser/greater
+one-sided test. Options are: "greater", "lesser", "two-sided"}
+
+\item{gammaFitMethod}{Character. Method to use for fitting the gamma
+distribution to the null distribution. Options are
+"mme" (moment matching estimation), "mle" (maximum likelihood estimation),
+"qme" (quantile matching estimation), and "mge" (maximum goodness-of-fit
+estimation). See ?COCOA::getGammaPVal and
+?fitdistrplus::fitdist() for more info.}
+
+\item{realScoreInDist}{Logical. Should the actual score (from the
+test with no permutations) be included in the null distribution
+when fitting the gamma distribution? realScoreInDist=TRUE is
+recommended.}
+
+\item{force}{Logical. If force=TRUE, when fitting the gamma distribution
+returns an error (as may happen when a method other than "mme"
+is used) then allow the error. If force=FALSE, when fitting the
+gamma distribution returns an error then don't return an error but
+instead use the "mme" method
+for fitting that specific gamma distribution.}
+
+\item{verbose}{A "logical" object. Whether progress
+of the function should be shown. One
+bar indicates the region set is completed.}
+
\item{returnCovInfo}{logical. If TRUE, the following coverage and
region set info will
be calculated and included in function output: regionSetCoverage,
signalCoverage, totalRegionNumber, and meanRegionSize. For the
proportionWeightedMean scoring method,
sumProportionOverlap will also be calculated.}
+
+\item{...}{Character. Optional additional arguments for simpleCache.}
}
\value{
-data.frame. The output of aggregateSignalGRList for one permutation.
+Returns a list with the following 4 items: 1. a list of length nPerm
+where each item is a data.frame of the COCOA scores from a single
+permutation (each data.frame is the output of aggregateSignalGRList
+for one permuted sample order);
+2. a data.table/data.frame of empirical p-values (the
+output of `getPermStat`); 3. a
+data.table/data.frame of z-scores (the output of `getPermStat`);
+4. a data.frame of p-values based on
+the gamma approximation (the output of getGammaPVal()).
}
\description{
-This is a convenience function that does the two steps of COCOA:
-quantifying the epigenetic variation and scoring the region sets.
-This function will return the real COCOA scores if using the default
-`sampleOrder` parameter values. This
-function also makes it easy to generate null distributions in order to
-evaluate the statistical significance of the real COCOA results.
-You can use the sampleOrder parameter to shuffle the samples,
-then run COCOA to get fake scores for each region set. 
By doing
-this many times, you can build a null distribution for each
-region set composed of the region set's random scores from each
-permutation. There are multiple options for quantifying the
-epigenetic variation, specified by the `variationMetric` parameter.
-Quantifying the variation for the real/non-permuted COCOA
-scores should be done with the same
-variation metric as is used for the random permutations. For an
-unsupervised analysis using dimensionality reduction, first, the
-dimensionality reduction is done outside `runCOCOA`, then the
-latent factors/principal components are input to `runCOCOA` as the
-sample labels (targetVar parameter) when calculating both the real and
-also the permutated region set scores. For a supervised analysis,
-the target variables/phenotypes are the targetVar.
-See the vignettes for examples.
+This is a convenience function that runs multiple steps of the
+permutation process together: it runs COCOA permutations, converts these
+to null distributions, gets the empirical p value (which is limited by the
+number of permutations), gets z scores, and fits a gamma distribution
+to each null distribution to estimate p values (not limited by the
+number of permutations).
+If the real COCOA scores have not already been calculated, leave
+rsScores=NULL and they will be calculated before the permutations are run.
+See these individual functions for more info on each step: runCOCOA,
+convertToFromNullDist, getPermStat, and getGammaPVal.
+}
+\details{
+For reproducibility, set seed with the 'set.seed()' function before running.
}
\examples{
data("esr1_chr1")
data("nrf1_chr1")
data("brcaMethylData1")
data("brcaMCoord1")
pcScores <- prcomp(t(brcaMethylData1))$x
targetVarCols <- c("PC1", "PC2")
targetVar <- pcScores[, targetVarCols]

-# give the actual order of samples to `runCOCOA` to get the real scores
-correctSampleOrder=1:nrow(targetVar)
-realRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
-                         signalCoord=brcaMCoord1,
-                         GRList=GRangesList(esr1_chr1, nrf1_chr1),
-                         signalCol=c("PC1", "PC2"),
-                         targetVar=targetVar,
-                         sampleOrder=correctSampleOrder,
-                         variationMetric="cor")
-realRSScores
-
-# give random order of samples to get random COCOA scores
-# so you start building a null distribution for each region set
-# (see vignette for example of building a null distribution with `runCOCOA`)
-randomOrder <- sample(1:nrow(targetVar),
-                      size=nrow(targetVar),
-                      replace=FALSE)
-randomRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
-                           signalCoord=brcaMCoord1,
-                           GRList=GRangesList(esr1_chr1, nrf1_chr1),
-                           signalCol=c("PC1", "PC2"),
-                           targetVar=targetVar,
-                           sampleOrder=randomOrder,
-                           variationMetric="cor")
-randomRSScores
+# run COCOA permutations (the real scores are computed first when rsScores=NULL)
+permResults <- runCOCOA(genomicSignal=brcaMethylData1,
+                        signalCoord=brcaMCoord1,
+                        GRList=GRangesList(esr1_chr1, nrf1_chr1),
+                        rsScores=NULL,
+                        targetVar=targetVar,
+                        signalCol=c("PC1", "PC2"),
+                        variationMetric="cor",
+                        nPerm = 10,
+                        useSimpleCache=FALSE)
+permResults
+
+
}
diff --git a/man/runCOCOAPerm.Rd b/man/runCOCOAPerm.Rd
deleted file mode 100755
index 4b09bdd..0000000
--- a/man/runCOCOAPerm.Rd
+++ /dev/null
@@ -1,266 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/permutation.R
-\name{runCOCOAPerm}
-\alias{runCOCOAPerm}
-\title{Run COCOA permutations to get p-values}
-\usage{
-runCOCOAPerm(
-  genomicSignal,
-  signalCoord,
-  GRList,
-  rsScores,
-  targetVar,
-  signalCol = c("PC1", "PC2"),
-  scoringMetric = "default",
-  absVal = TRUE,
-  olList = NULL,
-  centerGenomicSignal = TRUE,
-  centerTargetVar = TRUE,
-  variationMetric = "cor",
-  nPerm = 300,
-  useSimpleCache = TRUE,
-  cacheDir = 
getwd(),
-  dataID = "",
-  testType = "greater",
-  gammaFitMethod = "mme",
-  realScoreInDist = TRUE,
-  force = FALSE,
-  verbose = TRUE,
-  returnCovInfo = FALSE,
-  ...
-)
-}
-\arguments{
-\item{genomicSignal}{Matrix/data.frame.
-The genomic signal (e.g. DNA methylation levels).
-Columns of genomicSignal should be samples/patients.
-Rows should be individual signals/features
-(each row corresponds to one genomic coordinate/range)}
-
-\item{signalCoord}{A GRanges object or data frame with coordinates
-for the genomic signal/original epigenetic data.
-Coordinates should be in the
-same order as the original data and the feature contribution scores
-(each item/row in signalCoord
-corresponds to a row in genomicSignal). If a data.frame,
-must have chr and start columns (optionally can have an end column,
-depending on the epigenetic data type).}
-
-\item{GRList}{GRangesList object. Each list item is
-a distinct region set to test (region set: regions that correspond to
-the same biological annotation). The region set database
-must be from the same reference genome
-as the coordinates for the actual data/samples (signalCoord).}
-
-\item{rsScores}{data.frame. A data.frame with region set
-scores. The output of the 'aggregateSignalGRList' function.
-Each row is a region set. One column for each sample
-variable of interest (e.g. PC or sample phenotype).
-Also can have columns with info on the overlap between the
-region set and the epigenetic data.
-Rows should be in the same order as the region sets in GRList
-(the list of region sets used to create rsScores).}
-
-\item{targetVar}{Matrix or data.frame. Rows should be samples.
-Columns should be the target variables
-(whatever variable you want to test for association with
-the epigenetic signal: e.g. PC scores).}
-
-\item{signalCol}{A character vector with the names of the sample variables
-of interest/target variables (e.g. PCs or sample phenotypes).
-
-The columns in `targetVar` for which to calculate
-the variation related to the epigenetic data
-(e.g. correlation) and on which to then run COCOA.}
-
-\item{scoringMetric}{A character object with the scoring metric.
-There are different methods available for
-signalCoordType="singleBase" vs signalCoordType="multiBase".
-For "singleBase", the available methods are "regionMean",
-"regionMedian", "simpleMean", and "simpleMedian".
-The default method is "regionMean".
-For "multiBase", the methods are "proportionWeightedMean",
-"simpleMean", and "simpleMedian". The default is "proportionWeightedMean".
-"regionMean" is a weighted
-average of the signal, weighted by region (absolute value of signal
-if absVal=TRUE). First the signal is
-averaged within each regionSet region,
-then all the regions are averaged. With the
-"regionMean" method, be cautious in interpretation for
-region sets with a low number of regions that overlap signalCoord. The
-"regionMedian" method is the same as "regionMean" but the median is taken
-at each step instead of the mean.
-The "simpleMean"
-method is just the unweighted average of all (absolute) signal values that
-overlap the given region set. For multiBase data, this includes any
-signal regions that overlap a regionSet region at all (1 base
-overlap or more), and the signal for each overlapping region is
-given the same weight in the average regardless of how much it overlaps.
-The "simpleMedian" method is the same as "simpleMean" but takes the median
-instead of the mean.
-"proportionWeightedMean" is a weighted average of all signalCoord
-regions that overlap with regionSet regions. For each signalCoord region
-that overlaps with a regionSet region, we calculate what proportion
-of the regionSet region is covered. Then this proportion is used to
-weight the signal value when calculating the mean.
-The denominator of the mean
-is the sum of all the proportion overlaps.}
-
-\item{absVal}{Logical. If TRUE, take the absolute value of values in
-signal. Choose TRUE if you think there may be some
-genomic loci in a region set whose signal will increase and others
-whose signal will decrease (if there may be anticorrelation between
-regions in a region set). Choose FALSE if you expect regions in a
-given region set to all change in the same direction (all be positively
-correlated with each other).}
-
-\item{olList}{list. Each list item should be a "SortedByQueryHits" object
-(output of the findOverlaps function). Each hits object should have the overlap
-information between signalCoord and one item of GRList (one unique region set).
-The region sets from GRList must be the "subject" in findOverlaps
-and signalCoord must be the "query". E.g. findOverlaps(subject=regionSet,
-query=signalCoord).
-Providing this information can greatly improve permutation speed since the
-overlaps will not have to be calculated for each permutation.
-The "runCOCOAPerm" function calculates this information only once, internally,
-so this does not have to be provided when using that function. When using
-this parameter, signalCoord,
-genomicSignal, and each region set must be in the same order as they were
-when olList was created. Otherwise, the wrong genomic loci will be referenced
-(e.g. if epigenetic features were filtered out of genomicSignal after olList
-was created).}
-
-\item{centerGenomicSignal}{Logical. Should rows in genomicSignal
-be centered based on
-their means? (subtracting the row mean from each row)}
-
-\item{centerTargetVar}{Logical. Should columns in targetVar be
-centered based
-on their means? (subtracting the column mean from each column)}
-
-\item{variationMetric}{Character. The metric to use to quantify the
-association between each feature in genomicSignal and each target
-variable in targetVar.
-Either "cor" (Pearson correlation),
-"cov" (covariation), or "spearmanCor" (Spearman correlation).}
-
-\item{nPerm}{Numeric. The number of permutations to do.}
-
-\item{useSimpleCache}{Logical. Whether to save and use caches. Caches
-will be created for each permutation so that if the function is disrupted
-it can restart where it left off. The final results are also saved
-as a cache. See the simpleCache package for more details.}
-
-\item{cacheDir}{Character. The path for the directory in which the
-caches should be saved.}
-
-\item{dataID}{Character. A unique identifier for this dataset
-(for saving results with simpleCache).}
-
-\item{testType}{Character. Parameter for `getPermStat`. Whether to
-create p-values based on a two-sided test or a lesser/greater
-one-sided test. Options are: "greater", "lesser", "two-sided".}
-
-\item{gammaFitMethod}{Character. The method to use for fitting the gamma
-distribution to the null distribution. Options are
-"mme" (moment matching estimation), "mle" (maximum likelihood estimation),
-"qme" (quantile matching estimation), and "mge" (maximum goodness-of-fit
-estimation). See ?COCOA::getGammaPVal and
-?fitdistrplus::fitdist() for more info.}
-
-\item{realScoreInDist}{Logical. Should the actual score (from the
-test with no permutations) be included in the null distribution
-when fitting the gamma distribution? realScoreInDist=TRUE is
-recommended.}
-
-\item{force}{Logical.
-If force=TRUE and fitting the gamma distribution
-returns an error (as may happen when a method other than "mme"
-is used), allow the error. If force=FALSE and fitting the
-gamma distribution returns an error, don't return the error but
-instead fall back to the "mme" method
-for fitting that specific gamma distribution.}
-
-\item{verbose}{A "logical" object. Whether progress
-of the function should be shown. One
-bar is printed for each region set as it is completed.}
-
-\item{returnCovInfo}{Logical. If TRUE, the following coverage and
-region set info will
-be calculated and included in the function output: regionSetCoverage,
-signalCoverage, totalRegionNumber, and meanRegionSize. For the
-proportionWeightedMean scoring method,
-sumProportionOverlap will also be calculated.}
-
-\item{...}{Character. Optional additional arguments for simpleCache.}
-}
-\value{
-Returns a list with the following 4 items: 1. a list of length nPerm
-where each item is a data.frame of the COCOA scores from a single
-permutation (each data.frame is the output of `runCOCOA()`);
-2. a data.table/data.frame of empirical p-values (the
-output of `getPermStat`); 3. a
-data.table/data.frame of z-scores (the output of `getPermStat`);
-4. a data.frame of p-values based on
-the gamma approximation (the output of `getGammaPVal()`).
-}
-\description{
-This is a convenience function that runs multiple steps of the
-permutation process together: it runs COCOA permutations, converts these
-to null distributions, gets the empirical p-values (which are limited by the
-number of permutations), gets z-scores, and fits a gamma distribution
-to each null distribution to estimate p-values (not limited by the
-number of permutations).
-Requires that the user has previously calculated the real COCOA scores.
-See these individual functions for more info on each step: runCOCOA,
-convertToFromNullDist, getPermStat, and getGammaPVal.
-}
-\details{
-For reproducibility, set the seed with the 'set.seed()' function before running.
-}
-\examples{
-data("esr1_chr1")
-data("nrf1_chr1")
-data("brcaMethylData1")
-data("brcaMCoord1")
-pcScores <- prcomp(t(brcaMethylData1))$x
-targetVarCols <- c("PC1", "PC2")
-targetVar <- pcScores[, targetVarCols]
-
-# give the actual order of samples to `runCOCOA` to get the real scores
-correctSampleOrder=1:nrow(targetVar)
-realRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
-                         signalCoord=brcaMCoord1,
-                         GRList=GRangesList(esr1_chr1, nrf1_chr1),
-                         signalCol=c("PC1", "PC2"),
-                         targetVar=targetVar,
-                         sampleOrder=correctSampleOrder,
-                         variationMetric="cor")
-
-# give a random order of samples to get random COCOA scores
-# so you can start building a null distribution for each region set
-# (see vignette for an example of building a null distribution with `runCOCOA`)
-randomOrder <- sample(1:nrow(targetVar),
-                      size=nrow(targetVar),
-                      replace=FALSE)
-randomRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
-                           signalCoord=brcaMCoord1,
-                           GRList=GRangesList(esr1_chr1, nrf1_chr1),
-                           signalCol=c("PC1", "PC2"),
-                           targetVar=targetVar,
-                           sampleOrder=randomOrder,
-                           variationMetric="cor")
-
-# runCOCOAPerm
-permResults <- runCOCOAPerm(genomicSignal=brcaMethylData1,
-                            signalCoord=brcaMCoord1,
-                            GRList=GRangesList(esr1_chr1, nrf1_chr1),
-                            rsScores=realRSScores,
-                            targetVar=targetVar,
-                            signalCol=c("PC1", "PC2"),
-                            variationMetric="cor",
-                            nPerm = 10,
-                            useSimpleCache=FALSE)
-permResults
-
-
-}
diff --git a/man/signalAlongAxis.Rd b/man/signalAlongAxis.Rd
index f5d23ee..15d3f3d 100755
--- a/man/signalAlongAxis.Rd
+++ b/man/signalAlongAxis.Rd
@@ -14,6 +14,8 @@ signalAlongAxis(
   topXVariables = NULL,
   variableScores = NULL,
   decreasing = TRUE,
+  regionAnnoGRList = NULL,
+  plotRegionMean = FALSE,
   cluster_columns = FALSE,
   cluster_rows = FALSE,
   row_title = "Sample",
@@ -73,6 +75,14 @@ nrow(genomicSignal)). Only used if topXVariables is given. The highest
 \item{decreasing}{Logical. Whether samples should be sorted in decreasing
 order of `orderByCol` or not (FALSE is increasing order).}
 
+\item{regionAnnoGRList}{GRangesList. Each GRanges in the list should be
+a genomic annotation. These will be displayed above or below the heatmap.}
+
+\item{plotRegionMean}{Logical. If TRUE, the genomicSignal will be averaged
+for each region in regionSet and those region averages
+will be plotted instead of the
+original variables in genomicSignal.}
+
 \item{cluster_columns}{Logical. Whether to cluster columns (the genomic
 signal, e.g. DNA methylation values for each CpG).}
 
diff --git a/vignettes/COCOA_analysis_details.Rmd b/vignettes/COCOA_analysis_details.Rmd
new file mode 100755
index 0000000..13f438c
--- /dev/null
+++ b/vignettes/COCOA_analysis_details.Rmd
@@ -0,0 +1,1100 @@
+---
+title: "COCOA Analysis Details"
+author: "John Lawson"
+date: "`r Sys.Date()`"
+output:
+  BiocStyle::html_document:
+    toc_float: true
+    toc_collapsed: true
+vignette: >
+  %\VignetteIndexEntry{COCOA Analysis Details}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+# The COCOA Bioconductor package
+
+
+**Coordinate Covariation Analysis (COCOA) is a method to understand epigenetic variation among samples. It does this by annotating inter-sample epigenetic variation using region sets as reference data. COCOA was designed for DNA methylation data and chromatin accessibility data, but can be used on any data that includes genomic coordinates.**
+
+
+Epigenetic differences between samples can be hard to interpret because epigenetic data is high dimensional.
+One way to improve interpretation is to group the high-dimensional data into functional categories. COCOA does this using groups of related genomic regions, which we refer to as region sets. A region set, commonly stored as a BED file, is a set of genomic regions that share a biological annotation. Some examples are: binding sites for a certain transcription factor; cell-type-specific open chromatin; or regions with a certain histone modification. COCOA uses reference region sets derived from public data to annotate inter-sample epigenetic variation.
+
+## COCOA modes: supervised and unsupervised
+
+COCOA has two modes: supervised and unsupervised. A supervised COCOA analysis annotates epigenetic variation that is related to a specific sample phenotype of interest (such as a molecular phenotype, or a higher-level phenotype). For example, we might want to see what epigenetic variation is associated with disease status, disease severity, survival, a certain mutation, or the expression of a certain gene. In contrast, an unsupervised COCOA analysis annotates inter-sample epigenetic variation without requiring a sample phenotype. Often, unsupervised analyses will look at the major sources of inter-sample variation by using a dimensionality reduction technique such as principal component analysis (PCA). COCOA can annotate those major sources of inter-sample variation (e.g. the principal components) for linear dimensionality reduction methods like PCA. To annotate the epigenetic variation of interest, whether supervised or unsupervised, COCOA finds the reference region sets that are most associated with that variation, giving you biologically meaningful insight into variation in your data.
+
+COCOA can be used in many different contexts, so feel free to skip to the section that is most relevant to you. You may want to start with the [Basic workflow] section for a high-level overview of the COCOA workflow. In this vignette, we'll show COCOA workflows with [DNA methylation](#DNAm) and [chromatin accessibility](#ATAC) data, and for each of those we will demonstrate a supervised and an unsupervised analysis. The sections are very similar since a similar COCOA workflow is used. For a more in-depth description of the method, see the "[Method details]" section and [the COCOA paper](https://doi.org/10.1186/s13059-020-02139-4).
+
+# Basic workflow
+
+We start off with a matrix of epigenetic data in which rows correspond to the epigenetic features (e.g. CpG or accessible chromatin region) and columns correspond to samples. Each epigenetic feature must have genomic coordinates, which are stored in a separate object.
+
+## Step 1: Define target variable
+
+For a supervised analysis, the target variable is defined by you. For an unsupervised analysis, you first do dimensionality reduction on your epigenetic data. Then, you take the principal component/latent factor sample scores for the principal components you are interested in and treat them as your target variables.
+
+## Step 2: Quantify association between inter-sample epigenetic variation and target variable
+
+Once you've defined your target variable, the next step is to quantify the association between your epigenetic data and the target variables. This produces a score for each epigenetic feature that represents how much it contributes to the target variable. We'll refer to these scores as **feature contribution scores**.
+
+### Ways to quantify association
+
+There are several ways to quantify association. The metrics that are built in to COCOA are covariation, Pearson correlation, and Spearman correlation. You could also use another metric of your choice that results in a score for each epigenetic feature representing its association with the target variable. For correlation, we take the correlation coefficient as the feature contribution score for each epigenetic feature. For example, if your target variable is a PC score, then the correlation coefficient for each epigenetic feature represents how much it is associated with a given principal component.
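+
+For instance, here is a minimal sketch of this step in base R on simulated toy data (the object names are made up for illustration; in the real workflows below, the epigenetic matrix and target variable come from your data):
+
+```{r, eval=FALSE}
+# toy data: 20 epigenetic features (rows) x 10 samples (columns)
+set.seed(1)
+epiMat <- matrix(rnorm(200), nrow=20, ncol=10)
+# one target variable value per sample (e.g. a phenotype or a PC score)
+target <- rnorm(10)
+# feature contribution scores: one correlation coefficient per feature
+fcs <- cor(t(epiMat), target)
+```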
+
+## Step 3: Annotate variation with region sets
+
+The next step in COCOA is to find out which reference region sets are most associated with variation in the target variable. To do this, we use the COCOA algorithm to score each reference region set based on the epigenetic features from your data that it overlaps. Essentially, the score for a region set is the average of the scores of the epigenetic features that it overlaps (see the `scoringMetric` parameter in ?runCOCOA for more details).
+
+After computing a score for each region set, the region sets can be ranked to see which ones are most associated with the target variable. High scores for small region sets are more likely to be due to noise, but the permutation test addresses this. Finding region sets where epigenetic variation is associated with variation in the target variable can give biological insight into your data.
+
+### Optional step: Permutation test
+
+To account for the different sizes of region sets and to assess the statistical significance of the results, we can do a permutation test to estimate p-values. We run COCOA on shuffled data to get null distributions for each region set. More specifically, we shuffle the samples' values for the target variable and recompute the COCOA scores as described above. After doing this many times, the permutation COCOA scores for a given region set make up a region set-specific null distribution. This null distribution accounts for the size of the region set. If multiple target variables are used, each target variable/region set combination will have its own unique null distribution. To avoid the need to run a large number of permutations, we can run a small number of permutations (a few hundred) and then fit a gamma distribution to each null distribution. We use the gamma distribution to estimate a p-value for the real COCOA score for each region set. It should be noted that the gamma approximation is accurate for high p-values, but may overestimate the significance of low p-values. Because of this, the approximation may be helpful for screening out region sets that are not significant but should be interpreted carefully for low p-values. Our analysis of the gamma approximation as used by COCOA is shown in the COCOA paper. Note that the unsupervised COCOA permutation test does not use the PCA loadings as the feature contribution scores. This is done so that PCA, a time-consuming computation, does not have to be recomputed for every permutation. Instead, the PC sample scores are treated as the target variable and shuffled so that the feature contribution scores can be recalculated.
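+
+To make the region set scoring and the permutation test concrete, here is a continuation of the toy sketch from above (base R only; this is a conceptual illustration, not the package implementation; the `runCOCOA` and `getPermStat` functions shown later handle this for you, with more bookkeeping):
+
+```{r, eval=FALSE}
+# suppose these toy features are the ones overlapping a region set
+inSet <- sample(c(TRUE, FALSE), nrow(epiMat), replace=TRUE)
+# Step 3: score the region set as the mean absolute feature contribution score
+scoreRS <- function(y) mean(abs(cor(t(epiMat), y))[inSet])
+realScore <- scoreRS(target)
+# permutation test: shuffle the target variable and re-score the region set
+nullDist <- replicate(500, scoreRS(sample(target)))
+mean(nullDist >= realScore) # empirical p-value
+```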
+
+# COCOA for DNA methylation data {#DNAm}
+
+COCOA uses a database of region sets to annotate inter-sample epigenetic variation in your data. In this vignette, we will see how COCOA can find meaningful sources of DNA methylation variation in breast cancer patients.
+
+First, we will show how to use COCOA to find region sets where DNA methylation variation is associated with variation in a phenotype of interest (the target variable) ([Supervised COCOA](#sCOCOADNAm)). Then, we will show how to find region sets that are associated with the principal components of a PCA of our DNA methylation data ([Unsupervised COCOA](#uCOCOADNAm)).
+
+## Our data
+
+We will use data from The Cancer Genome Atlas: 450k DNA methylation microarrays for breast cancer patients (TCGA-BRCA, https://portal.gdc.cancer.gov/). We (the authors) ran an unsupervised COCOA analysis outside of this vignette on the full methylation data for 657 patients and a large region set database of over 2000 region sets. We used those results to select region sets and a subset of the DNA methylation data for this vignette. We are using two of the highest and two of the lowest scoring region sets from that analysis. Those region sets are transcription factor binding regions from ChIP-seq experiments (with the same reference genome as our breast cancer data): ESR1 in MCF-7 cells, GATA3 in MCF-7 cells, NRF1 in HepG2 cells, and ATF3 in K562 cells. Only chr1 regions are included in the vignette. For a real analysis, we recommend using hundreds or thousands of region sets. For sources of region sets, see the "[Region set database]" section. We also subsetted the DNA methylation data to a small number of CpGs, which we determined was necessary in order to keep the vignette data small. Since we're using only a small subset of the data, the vignette results may not necessarily be generalizable to the full data. In your real analysis, we recommend using as many CpGs as possible.
+
+## Supervised COCOA {#sCOCOADNAm}
+
+Goal: Understand epigenetic variation related to a specific phenotype.
+
+In a supervised COCOA analysis, we want to annotate epigenetic variation that is related to variation in one or more sample phenotypes. These sample phenotypes could be molecular (e.g. expression of a protein marker, presence/variant allele frequency of a mutation, or expression of a certain gene). The sample phenotypes can also be higher-level organism phenotypes (e.g. patient survival, cancer stage, disease severity). COCOA will identify region sets where the epigenetic signal is most related to the phenotype. These region sets can then help us understand the relationship between the phenotype and the epigenetic state.
+
+Vignette analysis goal: As mentioned previously, we are looking at DNA methylation in breast cancer patients. Our goal is to understand epigenetic variation related to our phenotype of interest, estrogen receptor (ER) status.
+
+### Quantify relationship between chosen sample phenotype and epigenetic data
+
+First we will load the necessary data: BRCA DNA methylation data with genomic coordinates, region sets to test, and our phenotype of interest, ER status (part of `brcaMetadata`).
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+library(COCOA)
+data("esr1_chr1")
+data("gata3_chr1")
+data("nrf1_chr1")
+data("atf3_chr1")
+data("brcaMCoord1")
+data("brcaMethylData1")
+data("brcaMetadata")
+```
+
+ER status is our target variable, so we'll extract it from the patient metadata. The target variables must be in a `data.frame` object with columns corresponding to target variables (here, `drop=FALSE` keeps the data from turning into a vector, since we have only 1 target variable).
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+myPhen <- "ER_status"
+targetVarDF <- brcaMetadata[colnames(brcaMethylData1), myPhen, drop=FALSE]
+```
+
+Let's convert estrogen receptor status to a number so we can do calculations with it. Then we will quantify the association between ER status and the DNA methylation level at each cytosine using Pearson correlation. If NAs are present for any samples for a given CpG, the default correlation parameters will return NA. The parameter "pairwise.complete.obs" can be used to get correlation values even if some samples have NAs/missing data for a CpG (we use it in our code as an example even though our data does not have NAs). Whether to use this parameter is up to the judgment of the user; for example, you could filter out CpGs that are missing data for more than a certain percent of samples and use "pairwise.complete.obs" for the rest.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+targetVarDF$ER_status <- scale(as.numeric(targetVarDF$ER_status),
+                               center=TRUE, scale=FALSE)
+methylCor <- cor(t(brcaMethylData1), targetVarDF$ER_status,
+                 use = "pairwise.complete.obs")
+# if the standard deviation of the methylation level
+# for a CpG across samples is 0,
+# cor() will return NA, so manually set the correlation to 0 for these CpGs
+methylCor[is.na(methylCor)] <- 0
+colnames(methylCor) <- myPhen
+```
+
+Now we have a feature contribution score (the correlation coefficient in this case) for each methylation site that represents how much the methylation at that site is associated with ER status. We can use those individual cytosine scores to score region sets. We'll score our region sets with the `aggregateSignalGRList` function.
+
+### Score region sets
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+GRList <- GRangesList(esr1_chr1, gata3_chr1, atf3_chr1, nrf1_chr1)
+regionSetNames <- c("ESR1", "GATA3", "ATF3", "NRF1")
+rsScores <- aggregateSignalGRList(signal=methylCor,
+                                  signalCoord=brcaMCoord1,
+                                  GRList=GRList,
+                                  signalCol=myPhen,
+                                  scoringMetric="default",
+                                  absVal=TRUE)
+rsScores$regionSetName <- regionSetNames
+rsScores
+```
+
+Now we have a score for each region set that represents how much epigenetic variation in that region set is associated with the phenotype of interest, ER status. Since we are using the absolute value of the correlation coefficient for each CpG to score our region sets, the possible region set scores range from 0 to 1, with 0 representing no correlation and 1 representing complete correlation. From our initial results, it appears that the DNA methylation in the ER and GATA3 region sets is much more highly associated with ER status than that in the NRF1 and ATF3 region sets.
+
+Both the correlation and scoring steps can be combined into one step with the `runCOCOA` function, as will be shown in the next section.
+
+### Estimating statistical significance
+
+Now that we have a score for each region set, we can get an idea of the statistical significance of these results. We will do this using a permutation test. The permutation test is computationally expensive and, in some cases, may not be needed since the COCOA scores by themselves might be enough to gain insight into your data. However, if a p-value estimate is desired, the permutation test can provide that.
+
+#### Permutations
+
+By shuffling the sample phenotypes and recalculating COCOA scores, we can create a null distribution for each region set. The `runCOCOA` function does the two main COCOA steps: quantifying the variation and scoring the region sets. We will be running five permutations for the vignette, but many more should be done for a real analysis (probably at least a few hundred). We need to set the seed for reproducible results since we are doing random shuffling of the sample phenotypes.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+set.seed(100)
+
+nPerm <- 5
+
+permRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
+                         signalCoord=brcaMCoord1,
+                         GRList=GRList,
+                         signalCol=myPhen,
+                         targetVar=targetVarDF,
+                         nPerm=nPerm,
+                         variationMetric="cor",
+                         useSimpleCache = FALSE)
+
+permRSScores[1:3]
+```
+
+We now have a list where each item is the result of `aggregateSignalGRList` for one permutation of the sample phenotypes. We have `nPerm` list items in permRSScores (only showing 3 above). We'll reformat it so that we have a list where each item is a null distribution for a single region set:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+nullDistList <- convertToFromNullDist(permRSScores)
+names(nullDistList) <- regionSetNames
+nullDistList
+```
+
+We have four null distributions, one for each region set in `GRList`.
+
+#### Fit gamma distribution to null distributions and get p-values
+
+To reduce the number of permutations (and the run time for COCOA), we will approximate each null distribution with a gamma distribution [(Winkler et al., 2016)](#refs). After fitting the gamma distribution to each null distribution, we can use the gamma distributions to get p-values. We fit the gamma distribution and get the p-value with `getGammaPVal`. Take a look at the COCOA paper for more information on the best use of the gamma approximation (it may overestimate the significance of low p-values).
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+# p-values based on fitted gamma distributions
+gPValDF <- getGammaPVal(rsScores = rsScores,
+                        nullDistList = nullDistList,
+                        signalCol = myPhen,
+                        method = "mme", realScoreInDist = TRUE)
+gPValDF <- cbind(gPValDF,
+                 rsScores[, colnames(rsScores)[!(colnames(rsScores)
+                                                 %in% myPhen)]])
+gPValDF <- cbind(gPValDF, regionSetNames)
+gPValDF
+```
+
+Fitting a gamma distribution with only 5 permutations does not really make sense, which is one reason why the p-values for ER and GATA3 (which we would expect to be lower than those of NRF1/ATF3) are higher than those of NRF1 and ATF3. If you increase the number of permutations to 50, you will see that ER and GATA3 have lower p-values than ATF3 and NRF1 in that more realistic scenario. Also, keep in mind that a low p-value does not necessarily indicate a large effect size. The actual COCOA scores for ER and GATA3 were much larger than those of NRF1 and ATF3.
+
+After getting p-values, we also want to do multiple testing correction to account for the number of region sets in our region set database (not shown here but see `?p.adjust`).
+
+We can also use the null distributions to directly calculate empirical p-values:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+getPermStat(rsScores = rsScores,
+            nullDistList = nullDistList,
+            signalCol = myPhen)
+```
+
+An empirical p-value of zero is returned when the real region set score was more extreme than all the scores in its null distribution. What this really suggests is that the true p-value is less than 1/nPerm, although it is unclear how much less.
+
+The empirical p-values are limited by the number of permutations done. When correcting for testing many region sets from a large region set database, the empirical test normally cannot achieve significant p-values with a reasonable number of permutations. For that reason, the gamma distribution approximation is recommended to allow the possibility of lower p-values.
+
+## Unsupervised COCOA {#uCOCOADNAm}
+
+Goal: Understand the major sources of epigenetic variation present in the data without knowing beforehand what they are.
+
+In an unsupervised COCOA analysis, we normally begin with a dimensionality reduction technique such as PCA that identifies major sources/axes of inter-sample variation. Then the PCs/latent factors are treated as the target variables for COCOA and a workflow similar to the supervised COCOA analysis is used. We identify region sets where the epigenetic signal varies along the PC axis. These region sets can help us understand the biological meaning of the PCs/latent factors.
+
+Vignette analysis goal: As mentioned previously, we are looking at DNA methylation in breast cancer patients. Our goal is to understand sources of inter-sample epigenetic variation in an unsupervised way (without using sample phenotypes or groups).
+
+### Quantify relationship between latent factors and epigenetic data
+
+First we will load the required packages and the necessary data: BRCA DNA methylation data with genomic coordinates and region sets to test:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+library(COCOA)
+library(data.table)
+library(ggplot2)
+data("esr1_chr1")
+data("gata3_chr1")
+data("nrf1_chr1")
+data("atf3_chr1")
+data("brcaMCoord1")
+data("brcaMethylData1")
+data("brcaMetadata")
+```
+
+Next, we will do PCA on our breast cancer DNA methylation data. PCA could take a while for large datasets (e.g. longer than 30 minutes).
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+pca <- prcomp(t(brcaMethylData1))
+pcScores <- pca$x
+
+plot(pcScores[, c("PC1", "PC2")])
+```
+
+Each point is a sample. It is not immediately clear what the meaning of PC1 (or PC2) is, and we see a wide spectrum of variation among samples. COCOA will help us understand the biological meaning of the PCs.
+
+After the PCA, you might want to look at plots of the first few PCs and consider removing extreme outliers and rerunning the PCA, since PCA can be affected by outliers, although this depends on your analysis. We note that COCOA will still work even if there are not distinct clusters of samples in your PCA plot and if you do not have known groups of samples. In this vignette, we will look at principal components 1-4, but this choice also depends on the context of your analysis.
+
+We treat the PC scores as our "target variables" for this analysis and calculate the correlation between the DNA methylation level of each CpG and each PC.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+PCsToAnnotate <- paste0("PC", 1:4)
+targetVar <- pcScores[, PCsToAnnotate]
+targetVar <- as.matrix(scale(targetVar,
+                             center=TRUE, scale=FALSE))
+methylCor <- cor(t(brcaMethylData1), targetVar,
+                 use = "pairwise.complete.obs")
+# if the standard deviation of the methylation level
+# for a CpG across samples is 0,
+# cor() will return NA, so manually set the correlation to 0 for these CpGs
+methylCor[is.na(methylCor)] <- 0
+```
+
+### Score region sets
+
+Let's put our region sets into one object to simplify the next steps of the analysis.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+# prepare data
+GRList <- GRangesList(esr1_chr1, gata3_chr1, nrf1_chr1, atf3_chr1)
+regionSetNames <- c("esr1_chr1", "gata3_chr1", "nrf1_chr1", "atf3_chr1")
+```
+
+Now let's give each region set a score with `aggregateSignalGRList()` to quantify how much it is associated with each principal component:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+regionSetScores <- aggregateSignalGRList(signal=methylCor,
+                                         signalCoord=brcaMCoord1,
+                                         GRList=GRList,
+                                         signalCol=PCsToAnnotate,
+                                         scoringMetric="default")
+regionSetScores$regionSetName <- regionSetNames
+regionSetScores
+```
+
+Now we have a score for each region set that represents how much epigenetic variation in that region set is associated with each PC. Since we are using the absolute value of the correlation coefficient for each CpG to score our region sets, the possible region set scores range from 0 to 1, with 0 representing no correlation and 1 representing complete correlation. From our initial results, it appears that the DNA methylation in the ER and GATA3 region sets is more highly associated with the PC scores than the DNA methylation in the NRF1 and ATF3 region sets is. Out of PCs 1-4, ER and GATA3 have the highest scores for PC1. This result makes sense when we visualize ER status on the PCA plot. As you can see in the figure below, PC1 separates samples pretty well based on ER status (even though ER status was not used when doing the PCA).
+
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+annoPCScores <- data.frame(pcScores, ER_status=as.factor(brcaMetadata[row.names(pcScores), "ER_status"]))
+ggplot(data = annoPCScores, mapping = aes(x=PC1, y=PC2, col=ER_status)) + geom_point() + ggtitle("PCA of a subset of DNA methylation data from breast cancer patients") + theme_classic()
+```
+
+We can visualize the scores of region sets of interest with the `plotAnnoScoreDist` function. This function allows you to supply regular expressions matching one or more groups of region sets and will color the region set scores according to their group. This can be useful for seeing trends when using a large region set database. Here are the scores for PC1, coloring region sets related to ER:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+plotAnnoScoreDist(rsScores = regionSetScores,
+                  colToPlot = "PC1",
+                  pattern = "GATA3|ESR1",
+                  patternName = "ER-related",
+                  rsNameCol = "regionSetName",
+                  alpha=1)
+```
+
+As an easy way to visualize how the region sets are ranked for multiple target variables, we can use `rsScoreHeatmap`:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+rsScoreHeatmap(regionSetScores,
+               signalCol=paste0("PC", 1:4),
+               rsNameCol = "regionSetName",
+               orderByCol = "PC1",
+               column_title = "Region sets ordered by score for PC1")
+```
+
+We can see that GATA3 had the highest score for PC1 but that ESR1 had a higher score for PCs 2, 3, and 4. If you want to arrange the heatmap by region set scores for another PC, just change the orderByCol parameter like so:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+rsScoreHeatmap(regionSetScores,
+               signalCol=paste0("PC", 1:4),
+               rsNameCol = "regionSetName",
+               orderByCol = "PC2",
+               column_title = "Region sets ordered by score for PC2")
+```
+
+### Estimating statistical significance
+
+Now that we have a score for each region set for each PC, we can get an idea of the statistical significance of these results. We will do this using a permutation test. The permutation test is computationally expensive and, in some cases, may not be needed since the COCOA scores by themselves might be enough to gain insight into your data. However, if a p-value estimate is desired, the permutation test can provide that.
+
+#### Permutations
+
+By shuffling the target variables and recalculating COCOA scores, we can create a null distribution for each region set. The `runCOCOA` function does the two main COCOA steps: quantifying the variation and scoring the region sets. We will be running five permutations for the vignette, but many more should be done for a real analysis (probably at least a few hundred). We need to set the seed for reproducible results since we are doing random shuffling of the samples' target variables.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+set.seed(100)
+
+nPerm <- 5
+
+permRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
+                         signalCoord=brcaMCoord1,
+                         GRList=GRList,
+                         signalCol=PCsToAnnotate,
+                         targetVar=targetVar,
+                         nPerm=nPerm,
+                         variationMetric="cor",
+                         useSimpleCache = FALSE)
+
+permRSScores[1:3]
+```
+
+We now have a list where each item is the result of `aggregateSignalGRList` for one permutation. We have `nPerm` list items in permRSScores (only showing 3 above). We'll reformat it so that we have a list where each item is a null distribution for a single region set:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+nullDistList <- convertToFromNullDist(permRSScores)
+names(nullDistList) <- regionSetNames
+nullDistList
+```
+
+We have four null distributions, one for each region set in `GRList`.
+
+#### Fit gamma distribution to null distributions and get p-values
+
+To reduce the number of permutations (and the run time for COCOA), we will approximate each null distribution with a gamma distribution [(Winkler et al., 2016)](#refs). After fitting the gamma distribution to each null distribution, we can use the gamma distributions to get p-values. We fit the gamma distribution and get the p-value with `getGammaPVal`. Take a look at the COCOA paper for more information on the best use of the gamma approximation (it may overestimate the significance of low p-values).
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+# p-values based on fitted gamma distributions
+gPValDF <- getGammaPVal(rsScores = regionSetScores,
+                        nullDistList = nullDistList,
+                        signalCol = PCsToAnnotate,
+                        method = "mme", realScoreInDist = TRUE)
+gPValDF <- cbind(gPValDF,
+                 regionSetScores[, colnames(regionSetScores)[!(colnames(regionSetScores)
+                                                               %in% PCsToAnnotate)]])
+gPValDF <- cbind(gPValDF, regionSetNames)
+gPValDF
+```
+
+You might find it surprising that the p-values of NRF1 and ATF3 are lower than those of ER and GATA3. Keep in mind that a low p-value does not necessarily indicate a large effect size. The actual COCOA scores for ER and GATA3 were much larger than those of NRF1 and ATF3, even if NRF1 and ATF3 might have lower p-values.
+
+After getting p-values, we also want to do multiple testing correction to account for the number of region sets in our region set database (not shown here but see `?p.adjust`).
+
+We can also use the null distributions to directly calculate empirical p-values:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+getPermStat(rsScores = regionSetScores,
+            nullDistList = nullDistList,
+            signalCol = PCsToAnnotate)
+```
+
+An empirical p-value of zero is returned when the real region set score was more extreme than all the scores in its null distribution.
+What this really suggests is that the true p-value is less than 1/nPerm, although it is unclear how much less.
+
+The empirical p-values are limited by the number of permutations done. When correcting for testing many region sets from a large region set database, the empirical test normally cannot achieve significant p-values with a reasonable number of permutations. For that reason, the gamma distribution approximation is recommended to allow the possibility of lower p-values.
+
+## Further understanding the results (visualization)
+
+We can further understand the variability in these region sets in several ways:
+
+1. Look at whether variability is specific to the regions of interest compared to the genome around these regions.
+2. Visualize the epigenetic signal in these regions and whether it correlates with the target variable.
+3. Look at the feature contribution scores (e.g. correlation coefficients) in each region of a given region set to see whether all regions have high contribution scores or just a subset of the regions. We can also see whether the same regions have high contribution scores for multiple target variables.
+
+To demonstrate these techniques, we'll be using results from [the unsupervised analysis above](#uCOCOADNAm).
+
+### Specificity of variation to the regions of interest
+
+We can see whether variability associated with the target variable is specific to the region of interest by comparing the region of interest to the surrounding genome. To do this, we will calculate the average feature contribution scores (FCS) of a wide area surrounding the regions of interest (14 kb centered on each region set region).
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+wideGRList <- lapply(GRList, resize, width=14000, fix="center")
+fcsProfile <- lapply(wideGRList, function(x) getMetaRegionProfile(signal=methylCor,
+                                                                  signalCoord=brcaMCoord1,
+                                                                  regionSet=x,
+                                                                  signalCol=PCsToAnnotate,
+                                                                  binNum=21))
+```
+
+We will normalize the result for each PC so we can better compare them. Here we normalize by subtracting the mean absolute FCS of all epigenetic features for each PC from the region set profiles for the corresponding PC. Then we get the plot scale so we can easily compare the different profiles. These normalization steps are helpful for comparing the meta-region profiles but not necessarily required, so it's not essential that you understand the code below.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+# average FCS for each PC, used for normalization so PCs can be compared with each other
+avFCS <- apply(X=methylCor[, PCsToAnnotate],
+               MARGIN=2,
+               FUN=function(x) mean(abs(x)))
+
+# normalize
+fcsProfile <- lapply(fcsProfile,
+                     FUN=function(x) as.data.frame(mapply(FUN = function(y, z) x[, y] - z,
+                                                          y=PCsToAnnotate, z=avFCS)))
+binID <- 1:nrow(fcsProfile[[1]])
+fcsProfile <- lapply(fcsProfile, FUN=function(x) cbind(binID, x))
+
+# for the plot scale
+maxVal <- max(sapply(fcsProfile, FUN=function(x) max(x[, PCsToAnnotate])))
+minVal <- min(sapply(fcsProfile, FUN=function(x) min(x[, PCsToAnnotate])))
+
+# convert to long format for plots
+fcsProfile <- lapply(X=fcsProfile, FUN=function(x) tidyr::gather(data=x, key="PC", value="meanFCS", PCsToAnnotate))
+fcsProfile <- lapply(fcsProfile,
+                     function(x){x$PC <- factor(x$PC, levels=PCsToAnnotate); return(x)})
+```
+
+Let's look at the plots!
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+wrapper <- function(x, ...) paste(strwrap(x, ...), collapse="\n")
+profilePList <- list()
+for (i in seq_along(fcsProfile)) {
+
+    thisRS <- fcsProfile[[i]]
+
+    profilePList[[i]] <- ggplot(data=thisRS,
+                                mapping=aes(x=binID, y=meanFCS)) +
+        geom_line() + ylim(c(minVal, maxVal)) + facet_wrap(facets="PC") +
+        ggtitle(label=wrapper(regionSetNames[i], width=30)) +
+        xlab("Genome around region set, 14 kb") +
+        ylab("Normalized mean FCS") +
+        theme(panel.grid.major.x=element_blank(),
+              panel.grid.minor.x=element_blank(),
+              axis.text.x=element_blank(),
+              axis.ticks.x=element_blank())
+
+}
+profilePList[[1]]
+profilePList[[2]]
+profilePList[[3]]
+profilePList[[4]]
+```
+
+These plots show the average magnitude of the feature contribution scores in the genome around and including the regions of interest. The FCS for an input epigenetic feature indicates how much that input feature varies in the same direction as the target variable (here, the PC scores). If you used correlation or covariation to get the feature contribution scores, a peak in the middle of the profile indicates that there is increased covariation in the regions of interest compared to the surrounding genome. It suggests that those regions are changing in a coordinated way whereas the surrounding genome is not changing in a coordinated way to the same extent. A peak suggests that the variation in the target variable (the PCs) may be somehow specifically related to the region set, although it is not clear whether the region set is causally linked to the variation or just affected by other things that are causing the variation associated with the target variable. Some region sets may have an increased FCS but no peak, for example, some histone modification region sets like H3K27me3 or H3K9me3. This doesn't necessarily mean these regions are not relevant. It could just mean that there is variability in larger blocks of the genome around these histone modifications (the expanded regions might also overlap with each other). For details on how the meta-region profile was created, check out [the section about it in the "Method details" section](#mrProfile) of this vignette or see the docs for `getMetaRegionProfile` with `?COCOA::getMetaRegionProfile`.
+
+These plots show that ESR1 and GATA3 binding regions demonstrate higher covariation along the PC1 axis than the surrounding genome does (because they have a peak in the middle) while NRF1 and ATF3 do not. So for example, if you line up samples by their PC1 score, then as you go from low to high PC1 score, the DNA methylation of ESR1 binding regions will generally change in a coordinated way across samples (PCs are based on covariation), but the DNA methylation in the surrounding genome would not change as much or would not change in a coordinated way. The results from our four region sets suggest that ESR1 and GATA3 regions specifically contribute to the variation along PC1 (as well as perhaps other PCs), helping us understand the biological meaning of the variation captured by those PCs: at least in part related to estrogen receptor.
+
+### The raw data
+
+If a region set has a high score for a certain target variable, we would expect that the epigenetic signal in at least some of those regions would correlate with the target variable. In other words, as you go from a high value of the target variable to a low value of the target variable, in this case as you go along the PC axis, the epigenetic signal will either go up or down. Let's look at the epigenetic signal in ESR1 regions. In the following plot, each column is a CpG in an ESR1 region and each row is a patient. We're only showing the 100 CpGs with the highest feature contribution scores, but you can adjust or remove the `topXVariables` parameter to visualize more CpGs. Patients are ordered by their PC score for PC1.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+signalAlongAxis(genomicSignal=brcaMethylData1,
+                signalCoord=brcaMCoord1,
+                regionSet=esr1_chr1,
+                sampleScores=pcScores,
+                topXVariables = 100,
+                variableScores = abs(methylCor[, "PC1"]),
+                orderByCol="PC1", cluster_columns=TRUE,
+                column_title = "Individual cytosine/CpG",
+                name = "DNA methylation level",
+                show_row_names=FALSE)
+```
+
+The CpGs that show high variation along the PC axis are the ones that contribute to the ESR1 region set being ranked highly in our analysis.
+
+Now let's look at one of the region sets, NRF1, that had a lower score for PC1:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+signalAlongAxis(genomicSignal=brcaMethylData1,
+                signalCoord=brcaMCoord1,
+                regionSet=nrf1_chr1,
+                sampleScores=pcScores,
+                topXVariables = 100,
+                variableScores = abs(methylCor[, "PC1"]),
+                orderByCol="PC1",
+                cluster_columns=TRUE,
+                column_title = "Individual cytosine/CpG",
+                name = "DNA methylation level",
+                show_row_names=FALSE)
+```
+
+When patients are ordered according to PC1 score, we can see that there is very little covariation of the epigenetic signal in these regions. Therefore, it is not surprising that the NRF1 region set had a low score for this PC.
+
+Since COCOA ranks region sets based on their scores relative to the other region sets tested, there will always be a region set with a best score. The permutation test gives an idea of statistical significance but not necessarily of effect size. The actual COCOA scores give an idea of the magnitude of variation of a region set, but it can be a good idea to also visually check the raw genomic signal in your top region sets to see how great the extent of variation along the PC is.
+
+### Feature contribution scores of individual regions
+
+This plot can help you learn more about the contribution of individual region set regions to the region set score for each target variable. For example, if the estrogen receptor region set was associated with PCs 1, 3, and 4, we might wonder whether the same regions are causing the association with these PCs or whether different regions are associated with each PC. To do this, we will first calculate the average absolute feature contribution score for each region in a region set (obtained by averaging a given target variable's FCS for the epigenetic features that overlap that region). Then we can use the distribution of FCS for each target variable to convert each region's FCS to a percentile to see how extreme/high that region is for each target variable. Let's look at the plot for estrogen receptor:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+rsName <- "Estrogen receptor (chr1)"
+regionQuantileByTargetVar(signal = methylCor,
+                          signalCoord = brcaMCoord1,
+                          regionSet = esr1_chr1,
+                          rsName = rsName,
+                          signalCol=paste0("PC", 1:4),
+                          maxRegionsToPlot = 8000,
+                          cluster_rows = TRUE,
+                          cluster_columns = FALSE,
+                          column_title = rsName,
+                          name = "Percentile of feature contribution scores in PC")
+```
+
+We can see that some of the same regions have high FCS for multiple PCs (i.e. these regions are important for these PCs). Also, there are some regions that do not have high FCS for PCs 1-3, suggesting that these regions are not associated with the largest sources of covariation in the data. Overall, PC1 has the highest FCS, consistent with our meta-region profiles (highest peak for PC1).
+
+For contrast, we can look at the regions of NRF1:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+rsName <- "NRF1 (chr1)"
+regionQuantileByTargetVar(signal = methylCor,
+                          signalCoord = brcaMCoord1,
+                          regionSet = nrf1_chr1,
+                          rsName = rsName,
+                          signalCol=paste0("PC", 1:4),
+                          maxRegionsToPlot = 8000,
+                          cluster_rows = TRUE,
+                          cluster_columns = FALSE,
+                          column_title = rsName,
+                          name = "Percentile of feature contribution scores in PC")
+```
+
+
+We can see that fewer regions in the NRF1 region set have high FCS for PCs 1-4. This is consistent with this region set being ranked lower by COCOA for association with these PCs.
+
+# COCOA for chromatin accessibility data (ATAC-seq) {#ATAC}
+
+COCOA uses a database of region sets to annotate inter-sample epigenetic variation in your data. In this vignette, we will see how COCOA can find meaningful sources of chromatin accessibility variation in breast cancer patients.
+
+First, we will show how to use COCOA to find region sets where chromatin accessibility variation is associated with variation in a phenotype of interest (the target variable) ([Supervised COCOA](#sCOCOAATAC)). Then, we will show how to find region sets that are associated with the principal components of a PCA of our chromatin accessibility data ([Unsupervised COCOA](#uCOCOAATAC)).
+
+## Our data
+
+We will use chromatin accessibility data (ATAC-seq) from breast cancer patients from The Cancer Genome Atlas (TCGA-BRCA), from [Corces et al., 2018](#refs). We have a matrix of counts in peak regions for each sample. We (the authors) subsetted this data to a small number of peak regions, which we determined was necessary in order to keep the vignette data small. Since we're using only a small subset of the data, the vignette results may not necessarily be generalizable to the full data. In your real analysis, we recommend using as many ATAC-seq regions as possible. We selected two region sets for this vignette that we expected to show variation in chromatin accessibility between breast cancer patients and two region sets that we expected not to show variation. Those region sets are transcription factor binding regions from ChIP-seq experiments (with the same reference genome as our breast cancer data): ESR1 in MCF-7 cells, GATA3 in MCF-7 cells, NRF1 in HepG2 cells, and ATF3 in K562 cells. Only chr1 regions are included in the vignette. For a real analysis, we recommend using hundreds or thousands of region sets. For sources of region sets, see the "[Region set database]" section.
+
+## Supervised COCOA {#sCOCOAATAC}
+
+Goal: Understand epigenetic variation related to a specific phenotype.
+
+In a supervised COCOA analysis, we want to annotate epigenetic variation that is related to variation in one or more sample phenotypes. These sample phenotypes could be molecular (e.g. expression of a protein marker, presence/variant allele frequency of a mutation, or expression of a certain gene). The sample phenotypes can also be higher-level organism phenotypes (e.g. patient survival, cancer stage, disease severity). COCOA will identify region sets where the epigenetic signal is most related to the phenotype. These region sets can then help us understand the relationship between the phenotype and the epigenetic state.
+ +Vignette analysis goal: As mentioned previously, we are looking at chromatin accessibility data in breast cancer patients. Our goal is to understand epigenetic variation related to our phenotype of interest, estrogen receptor (ER) status. + +### Quantify relationship between chosen sample phenotype and epigenetic data + +First we will load the necessary data: BRCA chromatin accessibility data with genomic coordinates, region sets to test, and our phenotype of interest, ER status (part of `brcaMetadata`). + +```{r, eval=TRUE, message=FALSE, warning=FALSE} +library(COCOA) +data("esr1_chr1") +data("gata3_chr1") +data("nrf1_chr1") +data("atf3_chr1") +data("brcaATACCoord1") +data("brcaATACData1") +data("brcaMetadata") +``` + +As mentioned, ER status is our target phenotype so we'll pull that info out of the patient metadata. Since we only have one target variable, we are keeping the data as a data.frame object using drop=FALSE (instead of targetVarDF becoming a vector). + +```{r, eval=TRUE, message=FALSE, warning=FALSE} +myPhen <- "ER_status" +targetVarDF <- brcaMetadata[colnames(brcaATACData1), myPhen, drop=FALSE] +``` + +Let's convert estrogen receptor status to a number so we can do calculations with it. Then we will quantify the association between ER status and the ATAC-seq counts in each peak region using Pearson correlation. + +```{r, eval=TRUE, message=FALSE, warning=FALSE} +targetVarDF$ER_status <- scale(as.numeric(targetVarDF$ER_status), + center=TRUE, scale=FALSE) +atacCor <- cor(t(brcaATACData1), targetVarDF$ER_status, + use = "pairwise.complete.obs") +# if the standard deviation of the epigenetic signal +# for a peak region across samples is 0, +# cor() will return NA, so manually set the correlation to 0 for these regions +atacCor[is.na(atacCor)] <- 0 +colnames(atacCor) <- myPhen +``` + +Now we have a feature contribution score (the correlation coefficient in this case) for each peak region that represents how much the accessibility at that site is associated with ER status. We can use those individual peak region scores to score reference region sets from our region set database. We'll score our region sets with the `aggregateSignalGRList` function. + +### Score region sets + +```{r, eval=TRUE, message=FALSE, warning=FALSE} +GRList <- GRangesList(esr1_chr1, gata3_chr1, atf3_chr1, nrf1_chr1) +regionSetNames <- c("ESR1", "GATA3", "ATF3", "NRF1") +rsScores <- aggregateSignalGRList(signal=atacCor, + signalCoord=brcaATACCoord1, + GRList=GRList, + signalCol=myPhen, + scoringMetric="default", + absVal=TRUE) +rsScores$regionSetName <- regionSetNames +rsScores +``` + +Now we have a score for each region set that represents how much epigenetic variation in that region set is associated with the phenotype of interest, ER status. Since we are using the absolute value of the correlation coefficient for each peak region to score our region sets, the possible region set scores range from 0 to 1, with 0 representing no correlation and 1 representing complete correlation. From our initial results, it appears that there is not much difference in the chromatin accessibility associated with ER status in the ER and GATA3 region sets compared to the NRF1 and ATF3 region sets. While this is a little unexpected (we would expect chromatin accessibility in ER and GATA3 region sets to be associated with ER status), we hypothesize that this might be because the clinical ER+/- designation does not accurately represent the molecular phenotype of some samples. 
+This hypothesis is based on [our unsupervised analysis](#uCOCOAATAC), where some ER- samples cluster with the ER+ samples and vice versa, so you can check that out if you are curious.
+
+Both the correlation and scoring steps can be combined into one step with the `runCOCOA` function, as will be shown in the next section.
+
+### Estimating statistical significance
+
+Now that we have a score for each region set, we can get an idea of the statistical significance of these results. We will do this using a permutation test. The permutation test is computationally expensive and, in some cases, may not be needed since the COCOA scores by themselves might be enough to gain insight into your data. However, if a p-value estimate is desired, the permutation test can provide that.
+
+#### Permutations
+
+By shuffling the sample phenotypes and recalculating COCOA scores, we can create a null distribution for each region set. The `runCOCOA` function does the two main COCOA steps: quantifying the variation and scoring the region sets. We will be running five permutations for the vignette, but many more should be done for a real analysis (probably at least a few hundred). We need to set the seed for reproducible results since we are doing random shuffling of the sample phenotypes.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+set.seed(100)
+
+nPerm <- 5
+
+permRSScores <- runCOCOA(genomicSignal=brcaATACData1,
+                         signalCoord=brcaATACCoord1,
+                         GRList=GRList,
+                         signalCol=myPhen,
+                         targetVar=targetVarDF,
+                         nPerm=nPerm,
+                         variationMetric="cor",
+                         useSimpleCache = FALSE)
+
+permRSScores[1:3]
+```
+
+We now have a list where each item is the result of `aggregateSignalGRList` for one permutation of the sample phenotypes. We have `nPerm` list items in permRSScores (only showing 3 above). We'll reformat it so that we have a list where each item is a null distribution for a single region set:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+nullDistList <- convertToFromNullDist(permRSScores)
+names(nullDistList) <- regionSetNames
+nullDistList
+```
+
+We have four null distributions, one for each region set in `GRList`.
+
+#### Fit gamma distribution to null distributions and get p-values
+
+To reduce the number of permutations (and the run time for COCOA), we will approximate each null distribution with a gamma distribution [(Winkler et al., 2016)](#refs). After fitting the gamma distribution to each null distribution, we can use the gamma distributions to get p-values. We fit the gamma distribution and get the p-value with `getGammaPVal`. Take a look at the COCOA paper for more information on the best use of the gamma approximation (it may overestimate the significance of low p-values).
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+# p-values based on fitted gamma distributions
+gPValDF <- getGammaPVal(rsScores = rsScores,
+                        nullDistList = nullDistList,
+                        signalCol = myPhen,
+                        method = "mme", realScoreInDist = TRUE)
+gPValDF <- cbind(gPValDF,
+                 rsScores[, colnames(rsScores)[!(colnames(rsScores)
+                                                 %in% myPhen)]])
+gPValDF <- cbind(gPValDF, regionSetNames)
+gPValDF
+```
+
+Fitting a gamma distribution with only 5 permutations does not really make sense, but we do it here just to illustrate function usage. You can increase the number of permutations and see how that affects the p-values (with 100 permutations, ER has the lowest p-value, with p < 0.05). Also, keep in mind that a low p-value does not necessarily indicate a large effect size. The actual COCOA scores can be used as an indicator of effect size.
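+
+Since a real analysis tests hundreds or thousands of region sets, the raw p-values should then be adjusted for multiple testing. A minimal sketch with base R's `p.adjust` (this assumes the gamma-based p-values for our target variable are in the `ER_status` column of `gPValDF`, as above):
+
+```{r, eval=FALSE}
+# Benjamini-Hochberg adjustment across all region sets tested
+gPValDF$ER_status_padj <- p.adjust(gPValDF$ER_status, method="BH")
+```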
+
+After getting p-values, we also want to do multiple testing correction to account for the number of region sets in our region set database, as sketched above (see `?p.adjust` for the available methods).
+
+We can also use the null distributions to directly calculate empirical p-values:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+getPermStat(rsScores = rsScores,
+            nullDistList = nullDistList,
+            signalCol = myPhen)
+```
+
+An empirical p-value of zero is returned when the real region set score was more extreme than all the scores in its null distribution. What this really suggests is that the true p-value is less than 1/nPerm, although it is unclear how much less.
+
+The empirical p-values are limited by the number of permutations done. When correcting for testing many region sets from a large region set database, the empirical test normally cannot achieve significant p-values with a reasonable number of permutations. For that reason, the gamma distribution approximation is recommended to allow the possibility of lower p-values.
+
+## Unsupervised COCOA {#uCOCOAATAC}
+
+Goal: Understand the major sources of epigenetic variation present in the data without knowing beforehand what they are.
+
+In an unsupervised COCOA analysis, we normally begin with a dimensionality reduction technique such as PCA that identifies major sources/axes of inter-sample variation. Then the principal components/latent factors are treated as the target variables for COCOA and a workflow similar to the supervised COCOA analysis is used. We identify region sets where the epigenetic signal varies along the PC axis. These region sets can help us understand the biological meaning of the PCs/latent factors.
+
+Vignette analysis goal: As mentioned previously, we are looking at chromatin accessibility in breast cancer patients. Our goal is to understand sources of inter-sample epigenetic variation in an unsupervised way (without using sample phenotypes or groups).
+
+### Quantify relationship between latent factors and epigenetic data
+
+First we will load the required packages and the necessary data: BRCA ATAC-seq count data with genomic coordinates and region sets to test:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+library(COCOA)
+library(data.table)
+library(ggplot2)
+data("esr1_chr1")
+data("gata3_chr1")
+data("nrf1_chr1")
+data("atf3_chr1")
+data("brcaATACCoord1")
+data("brcaATACData1")
+```
+
+Next, we will do PCA on our breast cancer chromatin accessibility data. PCA could take a while for large datasets (e.g. longer than 30 minutes).
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+pca <- prcomp(t(brcaATACData1))
+pcScores <- pca$x
+
+plot(pcScores[, c("PC1", "PC2")])
+```
+
+Each point is a sample. It is not immediately clear what the meaning of PC1 (or PC2) is, and we see a wide spectrum of variation among samples. COCOA will help us understand the biological meaning of the PCs.
+
+After the PCA, you might want to look at plots of the first few PCs and consider removing extreme outliers and rerunning the PCA, since PCA can be affected by outliers, although this depends on your analysis. We note that COCOA will still work even if there are not distinct clusters of samples in your PCA plot and if you do not have known groups of samples. In this vignette, we will look at principal components 1-4, but this choice also depends on the context of your analysis.
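+
+One quick way to sanity-check the choice of PCs is to look at the proportion of variance explained by each PC (a short sketch, not evaluated here, using the `pca` object from above):
+
+```{r, eval=FALSE}
+# proportion of variance explained by each principal component
+pve <- pca$sdev^2 / sum(pca$sdev^2)
+round(head(pve, 8), 3)
+```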
+
+We treat the PC scores as our "target variables" for this analysis and calculate the correlation between the ATAC-seq counts in each peak region and each PC.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+PCsToAnnotate <- paste0("PC", 1:4)
+targetVar <- pcScores[, PCsToAnnotate]
+targetVar <- as.matrix(scale(targetVar,
+                             center=TRUE, scale=FALSE))
+atacCor <- cor(t(brcaATACData1), targetVar,
+               use = "pairwise.complete.obs")
+# if the standard deviation of the ATAC-seq counts
+# for a peak region across samples is 0,
+# cor() will return NA, so manually set the correlation to 0 for these regions
+atacCor[is.na(atacCor)] <- 0
+```
+
+### Score region sets
+
+Let's put our region sets into one object to simplify the next steps of the analysis.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+# prepare data
+GRList <- GRangesList(esr1_chr1, gata3_chr1, nrf1_chr1, atf3_chr1)
+regionSetNames <- c("esr1_chr1", "gata3_chr1", "nrf1_chr1", "atf3_chr1")
+```
+
+Now let's give each region set a score with `aggregateSignalGRList()` to quantify how much it is associated with each principal component:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+regionSetScores <- aggregateSignalGRList(signal=atacCor,
+                                         signalCoord=brcaATACCoord1,
+                                         GRList=GRList,
+                                         signalCol=PCsToAnnotate,
+                                         scoringMetric="default")
+regionSetScores$regionSetName <- regionSetNames
+regionSetScores
+```
+
+Now we have a score for each region set that represents how much epigenetic variation in that region set is associated with each PC. Since we are using the absolute value of the correlation coefficient for each peak region to score our region sets, the possible region set scores range from 0 to 1, with 0 representing no correlation and 1 representing complete correlation. From our initial results, it appears that chromatin accessibility in the ER and GATA3 region sets is more highly associated with the PC scores than chromatin accessibility in the NRF1 and ATF3 region sets. Out of PCs 1-4, ER and GATA3 have the highest scores for PC1. This result makes sense when we visualize ER status on the PCA plot. As you can see in the figure below, PC1 has some ability to separate samples based on ER status (even though ER status was not used when doing the PCA).
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+annoPCScores <- data.frame(pcScores, ER_status=as.factor(brcaMetadata[row.names(pcScores), "ER_status"]))
+ggplot(data = annoPCScores, mapping = aes(x=PC1, y=PC2, col=ER_status)) + geom_point() + ggtitle("PCA of a subset of chromatin accessibility data from breast cancer patients") + theme_classic()
+```
+
+Given that some ER+ samples cluster with the ER- samples and vice versa, it is interesting to consider whether the clinical ER+/- designation accurately represents the molecular phenotype of these samples, although there are multiple potential explanations, including tumor heterogeneity (this trend is still present when using the full ATAC-seq count matrix, which includes additional samples).
+
+We can visualize the scores of region sets of interest with the `plotAnnoScoreDist` function. This function allows you to supply regular expressions matching one or more groups of region sets and will color the region set scores according to their group. This can be useful to see trends when using a large region set database.
Here are scores for PC1, coloring region sets related to ER:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+plotAnnoScoreDist(rsScores = regionSetScores,
+                  colToPlot = "PC1",
+                  pattern = "GATA3|ESR1",
+                  patternName = "ER-related",
+                  rsNameCol = "regionSetName",
+                  alpha=1)
+```
+
+As an easy way to visualize how the region sets are ranked for multiple target variables, we can use `rsScoreHeatmap`:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+rsScoreHeatmap(regionSetScores,
+               signalCol=paste0("PC", 1:4),
+               rsNameCol = "regionSetName",
+               orderByCol = "PC1",
+               column_title = "Region sets ordered by score for PC1")
+```
+
+We can see that GATA3 had the highest score for PC1, followed by ESR1, but PCs 2, 3, and 4 seem more associated with NRF1 and ATF3. If you want to arrange the heatmap by region set scores for another PC, just change the `orderByCol` parameter like so:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+rsScoreHeatmap(regionSetScores,
+               signalCol=paste0("PC", 1:4),
+               rsNameCol = "regionSetName",
+               orderByCol = "PC2",
+               column_title = "Region sets ordered by score for PC2")
+```
+
+### Estimating statistical significance
+
+Now that we have a score for each region set for each PC, we want to get an idea of the statistical significance of these results. We will do this using a permutation test. The permutation test is computationally expensive and, in some cases, may not be needed since the COCOA scores by themselves might be enough to gain insight into your data. However, if a p-value estimate is desired, the permutation test can return that.
+
+#### Permutations
+
+By shuffling the target variable and calculating COCOA scores, we can create a null distribution for each region set. The `runCOCOA` function does the two main COCOA steps: quantifying the variation and scoring the region sets. We will be running five permutations for the vignette, but many more should be done for a real analysis (probably at least a few hundred). We need to set the seed for reproducible results since we are doing random shuffling of the samples' target variables.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+set.seed(100)
+
+nPerm <- 5
+
+permRSScores <- runCOCOA(genomicSignal=brcaATACData1,
+                         signalCoord=brcaATACCoord1,
+                         GRList=GRList,
+                         signalCol=PCsToAnnotate,
+                         targetVar=targetVar,
+                         nPerm=nPerm,
+                         variationMetric="cor",
+                         useSimpleCache = FALSE)
+
+permRSScores[1:3]
+```
+
+We now have a list where each item is a result of `aggregateSignalGRList` for one permutation. We have `nPerm` list items in `permRSScores` (only showing 3 above). We'll reformat it so that we have a list where each item is a null distribution for a single region set:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+nullDistList <- convertToFromNullDist(permRSScores)
+names(nullDistList) <- regionSetNames
+nullDistList
+```
+
+We have four null distributions, one for each region set in `GRList`.
+
+#### Fit gamma distribution to null distributions and get p-values
+
+To reduce the number of permutations (and the run time for COCOA), we will approximate each null distribution with a gamma distribution [(Winkler et al., 2016)](#refs). After fitting the gamma distribution to each null distribution, we can use the gamma distributions to get p-values. We fit the gamma distribution and get the p-value with `getGammaPVal`. Take a look at the COCOA paper for more information on the best use of the gamma approximation (it may overestimate the significance of low p-values).
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+# p-values based on fitted gamma distributions
+gPValDF <- getGammaPVal(rsScores = regionSetScores,
+                        nullDistList = nullDistList,
+                        signalCol = PCsToAnnotate,
+                        method = "mme", realScoreInDist = TRUE)
+gPValDF <- cbind(gPValDF,
+                 regionSetScores[, colnames(regionSetScores)[!(colnames(regionSetScores)
+                                                               %in% PCsToAnnotate)]])
+gPValDF <- cbind(gPValDF, regionSetNames)
+gPValDF
+```
+
+Fitting a gamma distribution with only 5 permutations does not really make sense, but we do it here just to illustrate function usage. You can increase the number of permutations and see how that will affect the p-values. Also, keep in mind that a low p-value does not necessarily indicate a large effect size. The actual COCOA scores can be used as an indicator of effect size.
+
+After getting p-values, we also want to do multiple testing correction to account for the number of region sets in our region set database (not shown here but see `?p.adjust`).
+
+We can also use the null distributions to directly calculate empirical p-values:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+getPermStat(rsScores = regionSetScores,
+            nullDistList = nullDistList,
+            signalCol = PCsToAnnotate)
+```
+
+An empirical p-value of zero is returned when the real region set score was more extreme than any scores in its null distribution. What this really suggests is that the true p-value is less than 1/nPerm, although it is unclear how much less.
+
+The empirical p-values are limited by the number of permutations done. When multiple testing correction is done to correct for testing many region sets in a large region set database, the empirical test normally cannot achieve significant p-values with a reasonable number of permutations. For that reason, the gamma distribution approximation is recommended to allow the possibility of lower p-values.
+
+## Further understanding the results (visualization)
+
+We can further understand the variability in these region sets in several ways:
+
+1. Look at whether variability is specific to the regions of interest compared to the genome around these regions.
+2. Visualize the epigenetic signal in these regions and whether it correlates with the target variable.
+3. Look at the feature contribution scores (e.g. correlation coefficients) in each region of a given region set to see whether all regions have high contribution scores or just a subset of the regions. We can also see whether the same regions have high contribution scores for multiple target variables.
+
+To demonstrate these techniques, we'll be using results from [the unsupervised analysis above](#uCOCOAATAC).
+
+### Specificity of variation to the regions of interest
+
+We can see whether variability associated with the target variable is specific to the region of interest by comparing the region of interest to the surrounding genome. To do this, we will calculate the average feature contribution scores (FCS) of a wide area surrounding the regions of interest (14 kb centered on each region set region).
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+wideGRList <- lapply(GRList, resize, width=14000, fix="center")
+fcsProfile <- lapply(wideGRList, function(x) getMetaRegionProfile(signal=atacCor,
+                                                                  signalCoord=brcaATACCoord1,
+                                                                  regionSet=x,
+                                                                  signalCol=PCsToAnnotate,
+                                                                  binNum=21))
+```
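+
+As a quick sanity check on these parameters: with `width=14000` and `binNum=21`, each meta-region profile bin covers roughly 14000 / 21 ≈ 667 bp.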
We will normalize the result for each PC so we can better compare them. Here we normalize by subtracting the mean absolute FCS of all epigenetic features for each PC from the region set profiles for the corresponding PC. Then we get the plot scale so we can easily compare the different profiles. These normalization steps are helpful for comparing the meta-region profiles but not strictly required, so it's not essential that you understand the code below.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+# average FCS from each PC to normalize so PCs can be compared with each other
+avFCS <- apply(X=atacCor[, PCsToAnnotate],
+               MARGIN=2,
+               FUN=function(x) mean(abs(x)))
+
+# normalize
+fcsProfile <- lapply(fcsProfile,
+                     FUN=function(x) as.data.frame(mapply(FUN = function(y, z) x[, y] - z,
+                                                          y=PCsToAnnotate, z=avFCS)))
+binID <- 1:nrow(fcsProfile[[1]])
+fcsProfile <- lapply(fcsProfile, FUN=function(x) cbind(binID, x))
+
+# for the plot scale
+maxVal <- max(sapply(fcsProfile, FUN=function(x) max(x[, PCsToAnnotate])))
+minVal <- min(sapply(fcsProfile, FUN=function(x) min(x[, PCsToAnnotate])))
+
+# convert to long format for plots
+fcsProfile <- lapply(X=fcsProfile, FUN=function(x) tidyr::gather(data=x, key="PC", value="meanFCS", PCsToAnnotate))
+fcsProfile <- lapply(fcsProfile,
+                     function(x){x$PC <- factor(x$PC, levels=PCsToAnnotate); return(x)})
+```
+
+Let's look at the plots!
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+wrapper <- function(x, ...) paste(strwrap(x, ...), collapse="\n")
+profilePList <- list()
+for (i in seq_along(fcsProfile)) {
+
+    thisRS <- fcsProfile[[i]]
+
+    profilePList[[i]] <- ggplot(data=thisRS,
+                                mapping=aes(x=binID, y=meanFCS)) +
+        geom_line() + ylim(c(minVal, maxVal)) + facet_wrap(facets="PC") +
+        ggtitle(label=wrapper(regionSetNames[i], width=30)) +
+        xlab("Genome around region set, 14 kb") +
+        ylab("Normalized mean FCS") +
+        theme(panel.grid.major.x=element_blank(),
+              panel.grid.minor.x=element_blank(),
+              axis.text.x=element_blank(),
+              axis.ticks.x=element_blank())
+
+}
+profilePList[[1]]
+profilePList[[2]]
+profilePList[[3]]
+profilePList[[4]]
+```
+
+These plots show the average magnitude of the feature contribution scores in the genome around and including the regions of interest. The FCS for an input epigenetic feature indicates how much that input feature varies in the same direction as the target variable (here, the PC scores). If you used correlation or covariation to get the feature contribution scores, a peak in the middle of the profile indicates that there is increased covariation in the regions of interest compared to the surrounding genome. It suggests that those regions are changing in a coordinated way, whereas the surrounding genome is not changing in a coordinated way to the same extent. A peak suggests that the variation in the target variable (the PCs) may be specifically related to the region set, although it is not clear whether the region set is causally linked to the variation or just affected by other things that are causing the variation associated with the target variable. Some region sets may have an increased FCS but no peak, for example, some histone modification region sets like H3K27me3 or H3K9me3. This doesn't necessarily mean these regions are not relevant. It could just mean that there is variability in larger blocks of the genome around these histone modifications (the expanded regions might also overlap with each other).
For details on how the meta-region profile was created, check out [the section about it in the "Method details" section](#mrProfile) of this vignette or see the docs for `getMetaRegionProfile` with `?COCOA::getMetaRegionProfile`.
+
+These plots show that ESR1 and GATA3 binding regions demonstrate higher covariation along the PC1 axis than the surrounding genome does (because they have a peak in the middle), while NRF1 and ATF3 do not. So for example, if you line up samples by their PC1 score, then as you go from low to high PC1 score, the chromatin accessibility of ESR1 binding regions will generally change in a coordinated way across samples (PCs are based on covariation), but the chromatin accessibility in the surrounding genome would not change as much or would not change in a coordinated way. The results from our four region sets suggest that ESR1 and GATA3 regions specifically contribute to the variation along PC1, helping us understand the biological meaning of the variation captured by that PC: at least in part related to estrogen receptor.
+
+### The raw data
+
+If a region set has a high score for a certain target variable, we would expect that the epigenetic signal in at least some of those regions would correlate with the target variable. In other words, as you go from a high value of the target variable to a low value, in this case as you go along the PC axis, the epigenetic signal will either go up or down. Let's look at the epigenetic signal in ESR1 regions. In the following plot, each column is an ATAC-seq peak region that overlaps an ESR1 region and each row is a patient. Patients are ordered by their PC score for PC1.
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+signalAlongAxis(genomicSignal=brcaATACData1,
+                signalCoord=brcaATACCoord1,
+                regionSet=esr1_chr1,
+                sampleScores=pcScores,
+                orderByCol="PC1", cluster_columns=TRUE,
+                column_title = "Individual ATAC-seq region",
+                name = "Normalized signal in ATAC-seq regions",
+                show_row_names=FALSE,
+                show_column_names=FALSE)
+```
+
+Looking at the raw data confirms that chromatin accessibility is in fact varying along the PC axis. It appears that some but not all peak regions vary greatly along the PC axis. The peak regions that show high variation along the PC axis are the ones that contribute to the ESR1 region set being ranked highly in our analysis.
+
+Now let's look at one of the region sets, NRF1, which had a lower score for PC1:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+signalAlongAxis(genomicSignal=brcaATACData1,
+                signalCoord=brcaATACCoord1,
+                regionSet=nrf1_chr1,
+                sampleScores=pcScores,
+                orderByCol="PC1",
+                cluster_columns=TRUE,
+                column_title = "Individual ATAC-seq region",
+                name = "Normalized signal in ATAC-seq regions",
+                show_row_names=FALSE,
+                show_column_names=FALSE)
+```
+
+When patients are ordered according to PC1 score, we can see that there is less covariation of the epigenetic signal in these regions compared to the ESR1 regions. Therefore, it is not surprising that the NRF1 region set had a low score for this PC.
+
+Since COCOA ranks region sets based on their relative scores in comparison to other region sets tested, there will always be a region set with a best score. The permutation test gives an idea of statistical significance but not necessarily effect size.
The actual COCOA scores give an idea of the magnitude of variation of a region set, but it can be a good idea to visually check the raw genomic signal in your top region sets to see how great the extent of variation is along the PC.
+
+### Feature contribution scores of individual regions
+
+This plot can help you learn more about the contribution of individual region set regions to the region set score for each target variable. For example, if the estrogen receptor region set was associated with PCs 1, 3, and 4, we might wonder whether the same regions are causing the association with these PCs or whether different regions are associated with each PC. To do this, we will first calculate the average absolute feature contribution score for each region in a region set (obtained by averaging a given target variable's FCS for epigenetic features that overlap that region). Then we can use the distribution of FCS for each target variable to convert each region's FCS to a percentile to see how extreme/high that region is for each target variable. Let's look at the plot for estrogen receptor:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+regionQuantileByTargetVar(signal = atacCor,
+                          signalCoord = brcaATACCoord1,
+                          regionSet = esr1_chr1,
+                          rsName = "Estrogen receptor (chr1)",
+                          signalCol=paste0("PC", 1:4),
+                          maxRegionsToPlot = 8000,
+                          cluster_rows = TRUE,
+                          cluster_columns = FALSE,
+                          column_title = "Estrogen receptor (chr1)",
+                          name = "Percentile of feature contribution scores in PC")
+```
+
+A high FCS indicates that a region is important for a PC. We can see that not many regions have high FCS for all four PCs. Overall, PC1 has the highest FCS, consistent with our meta-region profiles (peak for PC1).
+
+For contrast, we can look at the regions of NRF1:
+
+```{r, eval=TRUE, message=FALSE, warning=FALSE}
+regionQuantileByTargetVar(signal = atacCor,
+                          signalCoord = brcaATACCoord1,
+                          regionSet = nrf1_chr1,
+                          rsName = "NRF1 (chr1)",
+                          signalCol=paste0("PC", 1:4),
+                          maxRegionsToPlot = 8000,
+                          cluster_rows = TRUE,
+                          cluster_columns = FALSE,
+                          column_title = "NRF1 (chr1)",
+                          name = "Percentile of feature contribution scores in PC")
+```
+
+We can see that fewer regions in the NRF1 region set have high FCS for PC1. This is consistent with this region set being ranked lower by COCOA for that PC.
+
+# Additional details
+
+## Method details
+
+The overall conceptual steps are described in the [Basic workflow] section. In this section, we give more details about some of the individual parts of that workflow. For further information, see [the COCOA paper](https://doi.org/10.1186/s13059-020-02139-4).
+
+### Region set database
+
+COCOA uses a database of region sets to gain biological insight into sources of variability in your data. A region set is a set of genomic regions that share a biological annotation. This includes transcription factor (TF) binding regions (e.g. from ChIP-seq), regions with a certain histone modification (e.g. from ChIP-seq), or chromatin accessibility regions (e.g. from DNase/ATAC-seq). Most of these region sets are from experimental data, but they don't have to be. For instance, you could use predicted TF binding regions based on the TF motif. The big picture goal of using a region set database is to connect variation between samples to an interpretable biological meaning: the known annotation of a region set. For each target variable (phenotype or latent factor), COCOA will give a score to each region set that quantifies how much that region set is associated with the inter-sample variation of that target variable.
+
+COCOA should be done with many region sets (e.g. hundreds or more than 1000). A region set can be a simple ".bed" file with three columns containing the genomic locations of the regions: chr (for chromosome), start, and end. In R, this data can be represented as a data.frame or as a GRanges object.
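+
+As a minimal sketch of reading a BED file into a GRanges object (the file name here is hypothetical; any three-column chr/start/end table works the same way):
+
+```{r, eval=FALSE}
+library(GenomicRanges)
+
+# read a three-column BED file: chr, start, end
+bedDF <- read.table("myRegionSet.bed",
+                    col.names = c("chr", "start", "end"))
+# BED start coordinates are 0-based, while GRanges uses 1-based
+# coordinates, so shift the starts by 1
+myRegionSet <- GRanges(seqnames = bedDF$chr,
+                       ranges = IRanges(start = bedDF$start + 1,
+                                        end = bedDF$end))
+```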
Publicly available collections of region sets can be found online (e.g. http://databio.org/regiondb) and region sets can be accessed through Bioconductor packages (e.g. LOLA and AnnotationHub). The region sets must be from the same reference genome as your sample data (although you could use [the liftOver tool](https://genome.ucsc.edu/cgi-bin/hgLiftOver) to convert from one genome version to another). The region sets can come from anywhere, so if you experimentally or computationally generate your own region sets, you can just include those with the others when running the COCOA analysis.
+
+We'll show some sample code for loading a LOLA region set database (unevaluated). Loading the database may take a few minutes:
+
+```{r, eval=FALSE, message=FALSE, warning=FALSE}
+library(LOLA)
+
+# reading in the region sets
+# load LOLA database
+lolaPath <- "path/to/LOLACore/genomeVersion/"
+regionSetDB <- loadRegionDB(lolaPath)
+
+# metadata about the region sets
+loRegionAnno <- regionSetDB$regionAnno
+lolaCoreRegionAnno <- loRegionAnno
+collections <- c("cistrome_cistrome", "cistrome_epigenome", "codex",
+                 "encode_segmentation", "encode_tfbs", "ucsc_features")
+collectionInd <- lolaCoreRegionAnno$collection %in% collections
+lolaCoreRegionAnno <- lolaCoreRegionAnno[collectionInd, ]
+regionSetName <- lolaCoreRegionAnno$filename
+regionSetDescription <- lolaCoreRegionAnno$description
+
+# the actual region sets
+GRList <- GRangesList(regionSetDB$regionGRL[collectionInd])
+
+# since we have what we need, we can delete this to free up memory
+rm("regionSetDB")
+```
+
+This `GRList` object can be used with COCOA as the region set database, along with any other region sets you may have from other sources.
+
+### Aggregating info from individual features
+
+Since differences between samples in individual epigenetic features may be hard to interpret, COCOA uses region sets to aggregate nucleotide/region-level info into a more condensed, interpretable form. As mentioned in the vignette, each epigenetic feature has a "feature contribution score" (FCS) for a given target variable, and the magnitude of the FCS represents how much that feature contributes to that target variable. Also, each original epigenetic feature is associated with a genomic coordinate or a region. COCOA will use this information to give each region set in the region set database a score for each target variable. For a given target variable-region set combination (for example, PC1 and the region set esr1_chr1), we first identify all the epigenetic features that overlap with the region set. Then the scoring of the region set depends on the scoring metric chosen. The "regionMean" method is described here; the other methods are described in the function docs. For the "regionMean" method, which is used for single-base data like DNA methylation, we identify the FCS for the features that overlap the region set and average the (absolute) FCS by region (average FCS in each region to get one value per region of the region set). Then we average the region values to get a single average for that region set, which is its score.
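+
+Conceptually, the "regionMean" calculation looks something like the following sketch (simplified and with made-up numbers; in practice `aggregateSignalGRList` handles this for you):
+
+```{r, eval=FALSE}
+library(GenomicRanges)
+
+# hypothetical absolute FCS for five CpGs and their coordinates
+fcs <- abs(c(-0.8, 0.7, 0.1, 0.2, 0.6))
+cpgGR <- GRanges("chr1", IRanges(start = c(100, 150, 300, 310, 500), width = 1))
+# a hypothetical region set with two regions
+regionSet <- GRanges("chr1", IRanges(start = c(90, 290), end = c(200, 320)))
+
+ol <- findOverlaps(query = regionSet, subject = cpgGR)
+# average the FCS within each region...
+# (regions that overlap no features simply do not contribute)
+regionMeans <- tapply(fcs[subjectHits(ol)], queryHits(ol), mean)
+# ...then average the region means to get the region set's score
+rsScore <- mean(regionMeans)
+```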
We repeat this calculation for all target variable-region set combinations. Now, for a given target variable, we can rank the region sets by their score/FCS average to see which region sets are most associated with that target variable (a higher FCS average means a greater association with the target variable). The biological annotation of the top-ranked region sets for the target variable can help you understand variation among your samples.
+
+### Making a "meta-region" profile {#mrProfile}
+
+A "meta-region" profile is a summary of the FCS in the genome in and around the regions of a region set. This is created with the `getMetaRegionProfile` function. The calculations are similar to those of `aggregateSignalGRList` with a few major differences. Instead of using the region set as is, we will expand each region in the region set on both sides so we can also look at the surrounding genome. We will then split each region into the same number of bins (of approximately equal size). Then we average (the absolute value of) all FCS that overlap a bin to get a single average FCS for each bin. For region-based epigenetic data like ATAC-seq, this average can be a weighted average based on how much each ATAC-seq region overlaps each region set region. Then (for both single-base and region-based data) we combine information from all the regions by averaging the corresponding bins from the different regions (all first bins averaged together, all second bins averaged together, etc.). Finally, we average the profile symmetrically over the center (the first bin becomes the average of the first and last bin, the second bin becomes the average of the second and second-to-last bin, etc.). We do this because the orientation of the regions in the genome is arbitrary: we did not use any strand information or relate the regions to any directional information. The "meta-region" profile gives a summary in a single profile of all the regions in a region set and allows you to compare the regions to the surrounding genome. See `?getMetaRegionProfile` for more details and options.
+
+## Q and A
+
+1. Where can I get more information about COCOA?
+
+For more in-depth info on COCOA methods and examples of use, check out our paper:
+
+Lawson, J.T., Smith, J.P., Bekiranov, S. et al. COCOA: coordinate covariation analysis of epigenetic heterogeneity. Genome Biol 21, 240 (2020). https://doi.org/10.1186/s13059-020-02139-4
+
+For package documentation, see the vignettes and reference manual on the Bioconductor website. You can also check out ongoing development or report an issue with COCOA [on Github](https://github.com/databio/COCOA).
+
+2. What data types can COCOA be used with?
+
+So far, COCOA has been validated on single base pair resolution DNA methylation data, chromatin accessibility data, and a multi-omics analysis that included DNA methylation. Theoretically, COCOA could work with any type of genomic coordinate-based data: data where you have a genomic coordinate or range and an associated value. This could include histone modification data, single nucleotide polymorphism/mutation data, copy number variation, etc., although COCOA would probably work better for data where smaller regions or single bases are measured.
+
+3. Can COCOA be used with other dimensionality reduction techniques such as t-SNE?
+
+Short answer for t-SNE: no. In general though, it depends.
COCOA must have a score for each original dimension that quantifies how much it contributes to the new dimension. Since t-SNE maps the original dimensions to new dimensions in a nonlinear way, the mappings of the original dimensions to the new dimensions are not comparable to each other and cannot be aggregated into a single score for a region set in a uniform way. + + +4. What does the name COCOA mean? + +The method is called Coordinate Covariation Analysis because it looks at covariation/correlation of individual signals/features at genomic coordinates and how those features relate to a target variable. COCOA annotates the covariation/correlation of individual genomic features with region sets in order to gain insight into variation between samples. + + +## Related references {#refs} + +Lawson, J.T., Smith, J.P., Bekiranov, S. et al. COCOA: coordinate covariation analysis of epigenetic heterogeneity. Genome Biol 21, 240 (2020). https://doi.org/10.1186/s13059-020-02139-4 + +ATACseq data source: +Corces MR, Granja JM, Shams S, et al. The chromatin accessibility landscape of primary human cancers. Science. 2018;362(6413):eaav1898. doi:10.1126/science.aav1898 + +This paper involves gene sets instead of region sets but provided some inspiration for our method: +Frost HR, Li Z, Moore JH. Principal component gene set enrichment (PCGSE). BioData Mining. 2015;8:25. doi:10.1186/s13040-015-0059-z. + +Gamma p-value approximation for permutation tests: +Winkler AM, Ridgway GR, Douaud G, Nichols TE, Smith SM. Faster permutation inference in brain imaging. Neuroimage. 2016;141:502-516. doi:10.1016/j.neuroimage.2016.05.068 diff --git a/vignettes/IntroToCOCOA.Rmd b/vignettes/IntroToCOCOA.Rmd index 65d3b6a..b4fd11b 100755 --- a/vignettes/IntroToCOCOA.Rmd +++ b/vignettes/IntroToCOCOA.Rmd @@ -1,5 +1,5 @@ --- -title: "Introduction to Coordinate Covariation Analysis" +title: "Introduction to COCOA" author: "John Lawson" date: "`r Sys.Date()`" output: @@ -7,108 +7,52 @@ output: toc_float: true toc_collapsed: true vignette: > - %\VignetteIndexEntry{Introduction to Coordinate Covariation Analysis} + %\VignetteIndexEntry{Introduction to COCOA} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- +COCOA annotates epigenetic variation. -# The COCOA Bioconductor package +This vignette will show a very simple example of how the main COCOA functions work with minimal details to get you off to a quick start. For more in-depth information, please see our other vignette. - -**Coordinate Covariation Analysis (COCOA) is a method to understand epigenetic variation among samples. It does this by annotating your sample epigenetic variation using region sets as reference data. COCOA was designed for DNA methylation data and chromatin accessibility data, but can be used on any data includes genomic coordinates.** - - -Epigenetic differences between samples can be hard to interpret because epigenetic data is high dimensional. One way to improve interpretation is to group high dimensional data into functional categories. COCOA does this using groups of related genomic regions, which we refer to as region sets. A region set, commonly stored as a BED file, is a set of genomic regions that share a biological annotation. Some examples are: binding sites for a certain transcription factor; cell-type specific open chromatin; or regions with a certain histone modification. COCOA uses reference region sets derived from public data to annotate inter-sample epigenetic variation. 
- -## COCOA modes: supervised and unsupervised - -COCOA has two modes: supervised, and unsupervised. A supervised COCOA analysis annotates epigenetic variation that is related to a specific sample phenotype of interest (such as a molecular phenotype, or a higher-level phenotype). For example, we might want to see what epigenetic variation is associated with disease status, disease severity, survival, a certain mutation, or the expression of a certain gene. In contrast, an unsupervised COCOA analysis annotates inter-sample epigenetic variation without requiring a sample phenotype. Often, unsupervised analyses will look at the major sources of inter-sample variation by using a dimensionality reduction technique such as principal component analysis (PCA). COCOA can annotate those major sources of inter-sample variation (e.g. the principal components) for linear dimensionality reduction methods like PCA. To annotate the epigenetic variation of interest, whether supervised or unsupervised, COCOA finds reference region sets that are most associated with that variation, giving you biologically meaningful insight into variation in your data. - -COCOA can be used in many different contexts so feel free to skip to the section that is most relevant to you. You may want to start with the [Basic workflow] section for a high level overview of the COCOA workflow. In this vignette, we'll show COCOA workflows with [DNA methylation](#DNAm) and [chromatin accessibility](#ATAC) data and for each of those we will demonstrate a supervised and unsupervised analysis. The vignettes are very similar since a similar COCOA workflow is used. For a more in-depth description of the method, see the "[Method details]" section and [the COCOA paper](https://doi.org/10.1186/s13059-020-02139-4). - -# Basic workflow - -We start off with a matrix of epigenetic data in which rows correspond to the epigenetic features (e.g. CpG or accessible chromatin region) and columns correspond to samples. Each epigenetic feature must have genomic coordinates, which are stored in a separate object. - -## Step 1: Define target variable - -For a supervised analysis, the target variable is defined by you. For an unsupervised analysis, you first do dimensionality reduction on your epigenetic data. Then, you take the principal component/latent factor sample scores for the principal components you are interested in and treat them as your target variables. - -## Step 2: Quantify association between inter-sample epigenetic variation and target variable - -Once you've defined your target variable, the next step is to quantify the association between your epigenetic data and the target variables. This produces a score for each epigenetic feature that represents how much it contributes to the target variable. We'll refer to these scores as **feature contribution scores**. - -### Ways to quantify association - -There are several ways to quantify association. The metrics that are built in to COCOA are covariation, Pearson correlation, and Spearman correlation. You could also use another metric of your choice that results in a score for each epigenetic feature representing its association with the target variable. For correlation, we take the correlation coefficient as the feature contribution score for each epigenetic feature. For example, if your target variable is a PC score, then the correlation coefficient for each epigenetic feature represents how much it is associated with a given principal component. 
- -## Step 3: Annotate variation with region sets - -The next step in COCOA is to find out which reference region sets are most associated with variation in the target variable. To do this, we use the COCOA algorithm to score each reference region set based on the epigenetic features from your data that it overlaps. Essentially, the score for a region set is the average of the scores of the epigenetic features that it overlaps (see the `scoringMetric` parameter in ?runCOCOA for more details). - -After computing a score for each region set, the region sets can be ranked to see which region sets are most associated with the target variable. High scores for small region sets are more likely to be due to noise but this is addressed by the permutation test. Finding region sets where epigenetic variation is associated with variation in the target variable can give biological insight into your data. - -### Optional step: Permutation test - -To account for the different sizes of region sets and to assess statistical significance of the results, we can do a permutation test to estimate p-values. We run COCOA on shuffled data to get null distributions for each region set. More specifically, we shuffle the samples' values for the target variable and recompute COCOA scores as described above. After doing this many times, the permutation COCOA scores for a given region set make up a region set-specific null distribution. This null distribution accounts for the size of the region set. If multiple target variables are used, each target variable, region set combination will have its own unique null distribution. To avoid the need to run a large number of permutations, we can run a small number of permutations (a few hundred) and then fit a gamma distribution to each null distribution. We use the gamma distribution to estimate a p-value for the real COCOA score for each region set. It should be noted that the gamma approximation is accurate for high p values, but may overestimate the significance of low p values. Because of this, the approximation may be helpful for screening out region sets that are not significant but should be interpreted carefully for low p-values. Our analysis of the gamma approximation as used by COCOA is shown in the COCOA paper. As mentioned before, the unsupervised COCOA test does not use PCA loadings as the feature contribution scores. This is done so that PCA, a time-consuming computation, does not have to be recomputed for every permutation. Instead, the PC sample scores are treated as the target variable and shuffled so that the feature contribution scores can be recalculated. - -# COCOA for DNA methylation data {#DNAm} - -COCOA uses a database of region sets to annotate inter-sample epigenetic variation in your data. In this vignette, we will see how COCOA can find meaningful sources of DNA methylation variation in breast cancer patients. - -First, we will show how to use COCOA to find region sets where DNA methylation variation is associated with variation in a phenotype of interest (the target variable) ([Supervised COCOA](#sCOCOADNAm)). Then, we will show how to find region sets that are associated with the principal components of PCA of our DNA methylation data ([Unsupervised COCOA](#uCOCOADNAm)). - -## Our data - -We will use data from The Cancer Genome Atlas: 450k DNA methylation microarrays for breast cancer patients (TCGA-BRCA, https://portal.gdc.cancer.gov/). 
We (the authors) ran an unsupervised COCOA analysis outside of this vignette on the full methylation data for 657 patients and a large region set database of over 2000 region sets. We used those results to select region sets and a subset of the DNA methylation data for this vignette. We are using two of the highest and two of the lowest scoring region sets from that analysis. Those region sets are transcription factor binding regions from ChIP-seq experiments (with the same reference genome as our breast cancer data): ESR1 in MCF-7 cells, GATA3 in MCF-7 cells, NRF1 in HepG2 cells, and Atf1 in K562 cells. Only chr1 regions are included in the vignette. For a real analysis, we recommend using hundreds or thousands of region sets. For sources of region sets, see the "[Region set database]" section. We also have subsetted the DNA methylation data to a small number of CpGs that we determined were necessary for the vignette in order to keep the vignette data small. Since we're using only a small subset of the data, the vignette results may not necessarily be generalizable to the full data. In your real analysis, we recommend using as many CpGs as possible. - -## Supervised COCOA {#sCOCOADNAm} - -Goal: Understand epigenetic variation related to a specific phenotype - -In a supervised COCOA analysis, we want to annotate epigenetic variation that is related to variation in one or more sample phenotypes. These sample phenotypes could be molecular (e.g. expression of a protein marker, presence/variant allele frequency of a mutation, or expression of a certain gene). The sample phenotypes can also be higher-level organism phenotypes (e.g. patient survival, cancer stage, disease severity). COCOA will identify region sets where the epigenetic signal is most related to the phenotype. These region sets then can help to understand the relationship between the phenotype and epigenetic state. - -Vignette analysis goal: As mentioned previously, we are looking at DNA methylation in breast cancer patients. Our goal is to understand epigenetic variation related to our phenotype of interest, estrogen receptor (ER) status. - -### Quantify relationship between chosen sample phenotype and epigenetic data - -First we will load the necessary data: BRCA DNA methylation data with genomic coordinates, region sets to test, and our phenotype of interest, ER status (part of `brcaMetadata`). +First we load COCOA and the data for the analysis: region sets and epigenetic data with genomic coordinates. ```{r, eval=TRUE, message=FALSE, warning=FALSE} library(COCOA) + +# region sets data("esr1_chr1") data("gata3_chr1") data("nrf1_chr1") data("atf3_chr1") + +# DNA methylation data data("brcaMCoord1") data("brcaMethylData1") -data("brcaMetadata") -``` -ER status is our target variable so we'll extract that from the patient metadata. The target variables must be a `data.frame` object with columns corresponding to target variables (here, `drop=FALSE` keeps the data from turning into a vector, since we have only 1 target variable). +``` ```{r, eval=TRUE, message=FALSE, warning=FALSE} -myPhen <- "ER_status" -targetVarDF <- brcaMetadata[colnames(brcaMethylData1), myPhen, drop=FALSE] +pca <- prcomp(t(brcaMethylData1)) +pcScores <- pca$x + +plot(pcScores[, c("PC1", "PC2")]) ``` -Let's convert estrogen receptor status to a number so we can do calculations with it. Then we will quantify the association between ER status and the DNA methylation level at each cytosine using Pearson correlation. 
If NA's are present for any samples for a given CpG, the default correlation parameters will return NA. The parameter "pairwise.complete.obs" can be used to get correlation values even if some samples have NAs/missing data for a CpG (we use this in our code as an example even though our data does not have NAs). Whether to use this parameter is up to the judgment of the user, which could be decided based on what percent of samples have no data for a given CpG (filter out CpGs missing data for a certain percent of samples, use the rest). ```{r, eval=TRUE, message=FALSE, warning=FALSE} -targetVarDF$ER_status <- scale(as.numeric(targetVarDF$ER_status), - center=TRUE, scale=FALSE) -methylCor <- cor(t(brcaMethylData1), targetVarDF$ER_status, +PCsToAnnotate <- paste0("PC", 1:4) +targetVar <- pcScores[, PCsToAnnotate] +targetVar <- as.matrix(scale(targetVar, + center=TRUE, scale=FALSE)) +methylCor <- cor(t(brcaMethylData1), targetVar, use = "pairwise.complete.obs") # if the standard deviation of the methylation level # for a CpG across samples is 0, # cor() will return NA, so manually set the correlation to 0 for these CpGs methylCor[is.na(methylCor)] <- 0 -colnames(methylCor) <- myPhen ``` -Now we have a feature contribution score (the correlation coefficient in this case) for each methylation site that represents how much the methylation at that site is associated with ER status. We can use those individual cytosine scores to score region sets. We'll score our region sets with the `aggregateSignalGRList` function. - -### Score region sets ```{r, eval=TRUE, message=FALSE, warning=FALSE} GRList <- GRangesList(esr1_chr1, gata3_chr1, atf3_chr1, nrf1_chr1) @@ -123,41 +67,23 @@ rsScores$regionSetName <- regionSetNames rsScores ``` -Now we have a score for each region set that represents how much epigenetic variation in that region set is associated with the phenotype of interest, ER status. Since we are using the absolute value of the correlation coefficient for each CpG to score our region sets, the possible region set scores range from 0 to 1, with 0 representing no correlation and 1 representing complete correlation. From our initial results, it appears that the DNA methylation in ER and GATA3 region sets is much more highly associated with ER status than in the NRF1 and ATF3 region sets. - -Both the correlation and scoring steps can be combined into one step with the `runCOCOA` function as will be shown in the next section. - -### Estimating statistical significance - -Now that we have a score for each region set, we can to get an idea of the statistical significance of these results. We will do this using a permutation test. The permutation test is computationally expensive and, in some cases, may not be needed since the COCOA scores by themselves might be enough to gain insight your data. However, if a p-value estimate is desired, the permutation test can return that. - -#### Permutations - -By shuffling the sample phenotypes and calculating fake COCOA scores, we can create a null distribution for each region set. The `runCOCOA` function does the two main COCOA steps: quantifying the variation and scoring the region sets. If the samples were in the correct order, `runCOCOA` would return the real COCOA scores but since we are giving a shuffled sample order (`sampleOrder` parameter), we will be correlating the epigenetic data with the shuffled sample phenotypes to create null distribution COCOA scores. 
We will be running five permutations for the vignette but many more should be done for a real analysis (probably at least a few hundred). We need to set the seed for reproducible results since we are doing random shuffling of the sample phenotypes. - ```{r, eval=TRUE, message=FALSE, warning=FALSE} set.seed(100) nPerm <- 5 -permRSScores <- list() -for (i in 1:nPerm) { - # shuffling sample labels - sampleOrder <- sample(1:nrow(targetVarDF), nrow(targetVarDF)) - permRSScores[[i]] <- runCOCOA(sampleOrder=sampleOrder, - genomicSignal=brcaMethylData1, - signalCoord=brcaMCoord1, - GRList=GRList, - signalCol=myPhen, - targetVar=targetVarDF, - variationMetric="cor") - permRSScores[[i]]$regionSetName <- regionSetNames -} +permRSScores <- runCOCOA(genomicSignal=brcaMethylData1, + signalCoord=brcaMCoord1, + GRList=GRList, + signalCol=myPhen, + targetVar=targetVarDF, + nPerm=nPerm, + variationMetric="cor", + useSimpleCache = FALSE) permRSScores[1:3] ``` -We now have a list where each item is a result of `aggregateSignalGRList` for a permutation of the sample phenotypes. We have `nPerm` list items in permRSScores (only showing 3 above). We'll reformat it so that we have a list where each item is a null distribution for a single region set: ```{r, eval=TRUE, message=FALSE, warning=FALSE} nullDistList <- convertToFromNullDist(permRSScores) @@ -165,11 +91,6 @@ names(nullDistList) <- regionSetNames nullDistList ``` -We have four null distributions, one for each region set in `GRList`. - -#### Fit gamma distribution to null distributions and get p-values - -To reduce the number of permutations (and the run time for COCOA), we will approximate each null distribution with a gamma distribution [(Winkler et al., 2016)](#refs). After fitting the gamma distribution to each null distribution, we can use the gamma distributions to get p-values. We fit the gamma distribution and get the p-value with `getGammaPVal`. Take a look at the COCOA paper for more information on the best use of the gamma approximation (it may overestimate significance of low p-values). ```{r, eval=TRUE, message=FALSE, warning=FALSE} # p-values based on fitted gamma distributions @@ -184,11 +105,6 @@ gPValDF <- cbind(gPValDF, regionSetNames) gPValDF ``` -Fitting a gamma distribution with only 5 permutations does not really make sense so that is one reason the p-values for ER and GATA3 (which we would expect to be lower than NRF1/ATF3) are higher than those of NRF1 and ATF3. If you increase the number of permutations to 50, you will see that ER and GATA3 have lower p-values than ATF3 and NRF1 in that more realistic scenario. Also, keep in mind that a low p-value does not necessarily indicate a large effect size. The actual COCOA scores for ER and GATA3 were much larger than those of NRF1 and ATF3. - -After getting p-values, we also want to do multiple testing correction to account for the number of region sets in our region set database (not shown here but see `?p.adjust`). - -We can also use the null distributions to directly calculate empirical p-values: ```{r, eval=TRUE, message=FALSE, warning=FALSE} getPermStat(rsScores = rsScores, @@ -196,94 +112,9 @@ getPermStat(rsScores = rsScores, signalCol = myPhen) ``` -An empirical p-value of zero is returned when the real region set score was more extreme than any scores in its null distribution. What this really suggests is that the true p-value is less than ( 1/nPerm ) although it is unclear how much less. - -The empirical p-values are limited by the number of permutations done. 
When multiple testing correction is done to correct for testing many region sets in a large region set database, the empirical test normally cannot achieve significant p-values with a reasonable number of permutations. For that reason, the gamma distribution approximation is recommended to allow the possibility of lower p-values. - -## Unsupervised COCOA {#uCOCOADNAm} - -Goal: Understand major sources of epigenetic variation present in data without knowing beforehand what they are. - -In an unsupervised COCOA analysis, we normally begin with a dimensionality reduction technique such as PCA that identifies major sources/axes of inter-sample variation. Then the PC/latent factors are treated as the target variables for COCOA and a workflow similar to the supervised COCOA analysis is used. We identify region sets where the epigenetic signal varies along the PC axis. These region sets can help understand the biological meaning of the PCs/latent factors. - -Vignette analysis goal: As mentioned previously, we are looking at DNA methylation in breast cancer patients. Our goal is to understand sources of inter-sample epigenetic variation in an unsupervised way (without using sample phenotypes or groups). - -### Quantify relationship between latent factors and epigenetic data - -First we will load the required packages and necessary data: BRCA DNA methylation data with genomic coordinates and region sets to test: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -library(COCOA) -library(data.table) -library(ggplot2) -data("esr1_chr1") -data("gata3_chr1") -data("nrf1_chr1") -data("atf3_chr1") -data("brcaMCoord1") -data("brcaMethylData1") -data("brcaMetadata") -``` - -Next, we will do PCA on our breast cancer DNA methylation data. PCA could take a while for large datasets (e.g. longer than 30 minutes). - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -pca <- prcomp(t(brcaMethylData1)) -pcScores <- pca$x - -plot(pcScores[, c("PC1", "PC2")]) -``` - -Each point is a sample. It is not immediately clear what the meaning of PC1 (or PC2) is and we see a wide spectrum of variation among samples. COCOA will help us understand the biological meaning of the PCs. - -After the PCA, you might want to look at plots of the first few PCs and consider removing extreme outliers and rerunning the PCA since PCA can be affected by outliers, although this depends on your analysis. We note that COCOA will still work even if there are not distinct clusters of samples in your PCA plot and if you do not have known groups of samples. In this vignette, we will look at principal components 1-4 but this choice also depends on the context of your analysis. - -We treat the PC scores as our "target variables" for this analysis and calculate the correlation between the DNA methylation level of each CpG and each PC. - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -PCsToAnnotate <- paste0("PC", 1:4) -targetVar <- pcScores[, PCsToAnnotate] -targetVar <- as.matrix(scale(targetVar, - center=TRUE, scale=FALSE)) -methylCor <- cor(t(brcaMethylData1), targetVar, - use = "pairwise.complete.obs") -# if the standard deviation of the methylation level -# for a CpG across samples is 0, -# cor() will return NA, so manually set the correlation to 0 for these CpGs -methylCor[is.na(methylCor)] <- 0 -``` - -### Score region sets - -Let's put our region sets into one object to simplify the next steps of the analysis. 
- -```{r, eval=TRUE, message=FALSE, warning=FALSE} -# prepare data -GRList <- GRangesList(esr1_chr1, gata3_chr1, nrf1_chr1, atf3_chr1) -regionSetNames <- c("esr1_chr1", "gata3_chr1", "nrf1_chr1", "atf3_chr1") -``` - -Now let's give each region set a score with `aggregateSignalGRList()` to quantify how much it is associated with each principal component: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -regionSetScores <- aggregateSignalGRList(signal=methylCor, - signalCoord=brcaMCoord1, - GRList=GRList, - signalCol=PCsToAnnotate, - scoringMetric="default") -regionSetScores$regionSetName <- regionSetNames -regionSetScores -``` - -Now we have a score for each region set that represents how much epigenetic variation in that region set is associated with each PC. Since we are using the absolute value of the correlation coefficient for each CpG to score our region sets, the possible region set scores range from 0 to 1, with 0 representing no correlation and 1 representing complete correlation. From our initial results, it appears that the DNA methylation in ER and GATA3 region sets is more highly associated with the PC scores than the DNA methylation in the NRF1 and ATF3 region sets is. Out of PCs 1-4, ER and GATA3 have the highest scores for PC1. This result makes sense when we visualize ER status on the PCA plot. As you can see in the figure below, PC1 separates samples pretty well based on ER status (even though ER status was not used when doing the PCA). - - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -annoPCScores <- data.frame(pcScores, ER_status=as.factor(brcaMetadata[row.names(pcScores), "ER_status"])) -ggplot(data = annoPCScores, mapping = aes(x=PC1, y=PC2, col=ER_status)) + geom_point() + ggtitle("PCA of a subset of DNA methylation data from breast cancer patients") + theme_classic() -``` +# Visualization -We can visualize the scores of region sets of interest with the `plotAnnoScoreDist` function. This function allows you to supply regular expressions matching one or more groups of region sets and will color the region set scores according to their group. This can be useful to see trends when using a large region set database. Here are scores for PC1, coloring region sets related to ER: +Visualize the region set scores, coloring by region set group of interest. ```{r, eval=TRUE, message=FALSE, warning=FALSE} plotAnnoScoreDist(rsScores = regionSetScores, @@ -294,7 +125,7 @@ plotAnnoScoreDist(rsScores = regionSetScores, alpha=1) ``` -As an easy way to visualize how the region sets are ranked for multiple target variables, we can use `rsScoreHeatmap`: +Visualize how the region sets are ranked for multiple target variables, we can use `rsScoreHeatmap`: ```{r, eval=TRUE, message=FALSE, warning=FALSE} rsScoreHeatmap(regionSetScores, @@ -304,102 +135,7 @@ rsScoreHeatmap(regionSetScores, column_title = "Region sets ordered by score for PC1") ``` -We can see that GATA3 had the highest score for PC 1 but that ESR1 had a higher score for PCs 2, 3, and 4. If you want to arrange the heatmap by region set scores for another PC, just change the orderByCol parameter like so: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -rsScoreHeatmap(regionSetScores, - signalCol=paste0("PC", 1:4), - rsNameCol = "regionSetName", - orderByCol = "PC2", - column_title = "Region sets ordered by score for PC2") -``` - -### Estimating statistical significance - -Now that we have a score for each region set for each PC, we can to get an idea of the statistical significance of these results. 
We will do this using a permutation test. The permutation test is computationally expensive and, in some cases, may not be needed since the COCOA scores by themselves might be enough to gain insight your data. However, if a p-value estimate is desired, the permutation test can return that. - -#### Permutations - -By shuffling the target variable and calculating COCOA scores, we can create a null distribution for each region set. The `runCOCOA` function does the two main COCOA steps: quantifying the variation and scoring the region sets. If the samples were in the correct order, `runCOCOA` would return the real COCOA scores but since we are giving a shuffled sample order (`sampleOrder` parameter), we will be correlating the epigenetic data with the shuffled sample target variables to create null distribution COCOA scores. We will be running five permutations for the vignette but many more should be done for a real analysis (probably at least a few hundred). We need to set the seed for reproducible results since we are doing random shuffling of the samples' target variables. - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -set.seed(100) - -nPerm <- 5 -permRSScores <- list() - -for (i in 1:nPerm) { - # shuffling sample labels - sampleOrder <- sample(1:nrow(targetVar), nrow(targetVar)) - permRSScores[[i]] <- runCOCOA(sampleOrder=sampleOrder, - genomicSignal=brcaMethylData1, - signalCoord=brcaMCoord1, - GRList=GRList, - signalCol=PCsToAnnotate, - targetVar=targetVar, - variationMetric="cor") - permRSScores[[i]]$regionSetName <- regionSetNames -} - -permRSScores[1:3] -``` - -We now have a list where each item is a result of `aggregateSignalGRList`. We have `nPerm` list items in permRSScores (only showing 3 above). We'll reformat it so that we have a list where each item is a null distribution for a single region set: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -nullDistList <- convertToFromNullDist(permRSScores) -names(nullDistList) <- regionSetNames -nullDistList -``` - -We have four null distributions, one for each region set in `GRList`. - -#### Fit gamma distribution to null distributions and get p-values - -To reduce the number of permutations (and the run time for COCOA), we will approximate each null distribution with a gamma distribution [(Winkler et al., 2016)](#refs). After fitting the gamma distribution to each null distribution, we can use the gamma distributions to get p-values. We fit the gamma distribution and get the p-value with `getGammaPVal`. Take a look at the COCOA paper for more information on the best use of the gamma approximation (it may overestimate significance of low p-values). - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -# p-values based on fitted gamma distributions -gPValDF <- getGammaPVal(rsScores = regionSetScores, - nullDistList = nullDistList, - signalCol = PCsToAnnotate, - method = "mme", realScoreInDist = TRUE) -gPValDF <- cbind(gPValDF, - regionSetScores[, colnames(regionSetScores)[!(colnames(regionSetScores) - %in% PCsToAnnotate)]]) -gPValDF <- cbind(gPValDF, regionSetNames) -gPValDF -``` - -You might find it surprising that the p-values of NRF1 and ATF3 are lower than those of ER and GATA3. Keep in mind that a low p-value does not necessarily indicate a large effect size. The actual COCOA scores for ER and GATA3 were much larger than those of NRF1 and ATF3, even if NRF1 and ATF3 might have lower p-values. 
-
-After getting p-values, we also want to do multiple testing correction to account for the number of region sets in our region set database (not shown here but see `?p.adjust`).
-
-We can also use the null distributions to directly calculate empirical p-values:
-
-```{r, eval=TRUE, message=FALSE, warning=FALSE}
-getPermStat(rsScores = regionSetScores,
-            nullDistList = nullDistList,
-            signalCol = PCsToAnnotate)
-```
-
-An empirical p-value of zero is returned when the real region set score was more extreme than any scores in its null distribution. What this really suggests is that the true p-value is less than (1/nPerm), although it is unclear how much less.
-
-The empirical p-values are limited by the number of permutations done. When multiple testing correction is done to correct for testing many region sets in a large region set database, the empirical test normally cannot achieve significant p-values with a reasonable number of permutations. For that reason, the gamma distribution approximation is recommended to allow the possibility of lower p-values.
-
-## Further understanding the results (visualization)
-
-We can further understand the variability in these region sets in several ways:
-
-1. Look at whether variability is specific to the regions of interest compared to the genome around these regions.
-2. Visualize the epigenetic signal in these regions and whether it correlates with the target variable.
-3. Look at the feature contribution scores (e.g. correlation coefficients) in each region of a given region set to see whether all regions have high contribution scores or just a subset of the regions. We can also see if the same regions have high contribution scores for multiple target variables.
-
-To demonstrate these techniques, we'll be using results from [the unsupervised analysis above](#uCOCOADNAm).
-
-### Specificity of variation to the regions of interest
-
-We can see whether variability associated with the target variable is specific to the region of interest by comparing the region of interest to the surrounding genome. To do this, we will calculate the average feature contribution scores (FCS) of a wide area surrounding the regions of interest (14 kb centered on each region set region).
+Visualize the specificity of epigenetic variation to the regions of interest compared to the surrounding genome:

```{r, eval=TRUE, message=FALSE, warning=FALSE}
wideGRList <- lapply(GRList, resize, width=14000, fix="center")
@@ -408,11 +144,7 @@ fcsProfile <- lapply(wideGRList, function(x) getMetaRegionProfile(signal=methylC
                                             regionSet=x,
                                             signalCol=PCsToAnnotate,
                                             binNum=21))
-```

-We will normalize the result for each PC so we can better compare them. Here we normalize by subtracting the mean absolute FCS of all epigenetic features for each PC from the region set profiles for the corresponding PC. Then we get the plot scale so we can easily compare the different profiles. These normalization steps are helpful for comparing the meta-region profiles but not necessarily required, so it's not essential that you understand the below code.
- -```{r, eval=TRUE, message=FALSE, warning=FALSE} # average FCS from each PC to normalize so PCs can be compared with each other avFCS <- apply(X=methylCor[, PCsToAnnotate], MARGIN=2, @@ -433,11 +165,7 @@ minVal <- min(sapply(fcsProfile, FUN=function(x) min(x[, PCsToAnnotate]))) fcsProfile <- lapply(X=fcsProfile, FUN=function(x) tidyr::gather(data=x, key="PC", value="meanFCS", PCsToAnnotate)) fcsProfile <- lapply(fcsProfile, function(x){x$PC <- factor(x$PC, levels=PCsToAnnotate); return(x)}) -``` - -Let's look at the plots! -```{r, eval=TRUE, message=FALSE, warning=FALSE} wrapper <- function(x, ...) paste(strwrap(x, ...), collapse="\n") profilePList <- list() for (i in seq_along(fcsProfile)) { @@ -463,13 +191,7 @@ profilePList[[3]] profilePList[[4]] ``` -These plots show the average magnitude of the feature contribution scores in the genome around and including the regions of interest. The FCS for an input epigenetic feature, indicates how much that input feature varies in the same direction as the target variable (here, the PC scores). If you used correlation or covariation to get the feature contribution scores, a peak in the middle of the profile indicates that there is increased covariation in the regions of interest compared to the surrounding genome. It suggests that those regions are changing in a coordinated way whereas the surrounding genome is not changing in a coordinated way to the same extent. A peak suggests that the variation in the target variable (the PCs) may be somehow specifically related to the region set although it is not clear whether the region set is causally linked to the variation or just affected by other things that are causing the variation associated with the target variable. Some region sets may have an increased FCS but no peak, for example, some histone modification region sets like H3K27me3 or H3K9me3. This doesn't necessarily mean these regions are not relevant. It could just mean that there is variability in larger blocks of the genome around these histone modifications (the expanded regions might also overlap with each other). For details on how the meta-region profile was created, check out [the section about it in the "Method details" section](#mrProfile) of this vignette or see the docs for `getMetaRegionProfile` with `?COCOA::getMetaRegionProfile`. - -These plots show that ESR1 and GATA3 binding regions demonstrate higher covariation along the PC1 axis than the surrounding genome does (because they have a peak in the middle) while NRF1 and ATF3 do not. So for example, if you line up samples by their PC1 score, then as you go from low to high PC1 score, the DNA methylation of ESR1 binding regions will generally change in a coordinated way across samples (PCs are based on covariation) but the DNA methylation in the surrounding genome would not change as much or would not change in a coordinated way. The results from our four region sets suggest that ESR1 and GATA3 regions specifically contribute to the variation along the PC 1 (as well as perhaps other PCs), helping us understand the biological meaning of the variation captured by those PCs: at least in part related to estrogen receptor. - -### The raw data - -If a region set has a high score for a certain target variable, we would expect that the epigenetic signal in at least some of those regions would correlate with the target variable. 
In other words, as you go from a high value of the target variable to a low value of the target variable, in this case as you go along the PC axis, the epigenetic signal will either go up or down. Let's look at the epigenetic signal in ESR1 regions. In the following plot, each column is a CpG in an ESR1 region and each row is a patient. We're only showing the 100 CpGs with the highest feature contribution scores but you can adjust or remove the `topXVariables` parameter to visualize more CpGs. Patients are ordered by their PC score for PC1. +Visualize the epigenetic signal in regions of interest, with samples ordered by a chosen variable: ```{r, eval=TRUE, message=FALSE, warning=FALSE} signalAlongAxis(genomicSignal=brcaMethylData1, @@ -484,9 +206,6 @@ signalAlongAxis(genomicSignal=brcaMethylData1, show_row_names=FALSE) ``` -The CpGs that show high variation along the PC axis are the ones that contribute to the ESR1 region set being ranked highly in our analysis. - -Now let's look at one of the region sets, NRF1, that had a lower score for PC1: ```{r, eval=TRUE, message=FALSE, warning=FALSE} signalAlongAxis(genomicSignal=brcaMethylData1, @@ -502,13 +221,7 @@ signalAlongAxis(genomicSignal=brcaMethylData1, show_row_names=FALSE) ``` -When patients are ordered according to PC1 score, we can see that there is very little covariation of the epigenetic signal in these regions. Therefore, it is not surprising that the NRF1 region set had a low score for this PC. - -Since COCOA ranks region sets based on their relative scores in comparison to other region sets tested, there will always be a region set with a best score. The permutation test gives an idea of statistical significance but not necessarily effect size. The actual COCOA scores give an idea of the magnitude of variation of a region set but it can be a good idea to also visually check the raw genomic signal in your top region sets to see how great the extent of variation is along the PC. - -### Feature contribution scores of individual regions - -This plot can help you learn more about the contribution of individual region set regions to the region set score for each target variable. For example, if the estrogen receptor region set was associated with PCs 1, 3, and 4, we might wonder whether the same regions are causing the association with these PCs or whether different regions are associated with each PC. To do this, we will first calculate the average absolute feature contribution score for each region in a region set (obtained by averaging a given target variable's FCS for epigenetic features that overlap that region). Then we can use the distribution of FCS for each target variable to convert each region's FCS to a percentile to see how extreme/high that region is for each target variable. Let's look at the plot for estrogen receptor: +Visualize scores for individual regions in a region set for each target variable: ```{r, eval=TRUE, message=FALSE, warning=FALSE} regionQuantileByTargetVar(signal = methylCor, @@ -523,9 +236,6 @@ regionQuantileByTargetVar(signal = methylCor, name = "Percentile of feature contribution scores in PC") ``` -We can see that some of the same regions have high FCS for multiple PCs (i.e. these regions are important for these PCs). Also, there are some regions that do not have high FCS for PCs 1-3, suggesting that these regions are not associated with the largest sources of covariation in the data. Overall, PC1 has the highest FCS, consistent with our meta-region profiles (highest peak for PC1). 
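For intuition, the percentile conversion behind this heatmap can be sketched in a few lines of base R. This is a toy illustration, not the function's internals: `regionFCS` is a hypothetical matrix of per-region average absolute FCS (regions x PCs), and here each PC's percentiles are computed within that same small matrix purely for simplicity:

```{r, eval=FALSE}
# hypothetical per-region average absolute FCS (10 regions x 4 PCs)
regionFCS <- abs(matrix(rnorm(40), nrow = 10,
                        dimnames = list(NULL, paste0("PC", 1:4))))

# convert each PC's column to percentiles of its empirical distribution
regionPctl <- apply(regionFCS, 2, function(x) ecdf(x)(x) * 100)
head(regionPctl)
```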
-
-For contrast, we can look at the regions of NRF1:

```{r, eval=TRUE, message=FALSE, warning=FALSE}
regionQuantileByTargetVar(signal = methylCor,
@@ -540,572 +250,6 @@ regionQuantileByTargetVar(signal = methylCor,
                          signalCoord = brcaMCoord1,
                          regionSet = nrf1_chr1,
                          rsName = "NRF1 (chr1)",
                          signalCol=paste0("PC", 1:4),
                          maxRegionsToPlot = 8000,
                          cluster_rows = TRUE,
                          cluster_columns = FALSE,
                          column_title = rsName,
                          name = "Percentile of feature contribution scores in PC")
```
-
-We can see that fewer regions in the NRF1 region set have high FCS for PCs 1-4. This is consistent with this region set being ranked lower by COCOA for association with these PCs.
-
-# COCOA for chromatin accessibility data (ATAC-seq) {#ATAC}
-
-COCOA uses a database of region sets to annotate inter-sample epigenetic variation in your data. In this vignette, we will see how COCOA can find meaningful sources of chromatin accessibility variation in breast cancer patients.
-
-First, we will show how to use COCOA to find region sets where chromatin accessibility variation is associated with variation in a phenotype of interest (the target variable) ([Supervised COCOA](#sCOCOAATAC)). Then, we will show how to find region sets that are associated with the principal components of PCA of our chromatin accessibility data ([Unsupervised COCOA](#uCOCOAATAC)).
-
-## Our data
-
-We will use chromatin accessibility data (ATAC-seq) from breast cancer patients in The Cancer Genome Atlas (TCGA-BRCA) ([Corces et al., 2018](#refs)). We have a matrix of counts in peak regions for each sample. We (the authors) have subsetted this data to a small number of peak regions that we determined were necessary for the vignette in order to keep the vignette data small. Since we're using only a small subset of the data, the vignette results may not necessarily be generalizable to the full data. In your real analysis, we recommend using as many ATAC-seq regions as possible. We selected two region sets for this vignette that we expected to show variation in chromatin accessibility between breast cancer patients and two region sets that we expected not to show variation. Those region sets are transcription factor binding regions from ChIP-seq experiments (with the same reference genome as our breast cancer data): ESR1 in MCF-7 cells, GATA3 in MCF-7 cells, NRF1 in HepG2 cells, and ATF3 in K562 cells. Only chr1 regions are included in the vignette. For a real analysis, we recommend using hundreds or thousands of region sets. For sources of region sets, see the "[Region set database]" section.
-
-## Supervised COCOA {#sCOCOAATAC}
-
-Goal: Understand epigenetic variation related to a specific phenotype
-
-In a supervised COCOA analysis, we want to annotate epigenetic variation that is related to variation in one or more sample phenotypes. These sample phenotypes could be molecular (e.g. expression of a protein marker, presence/variant allele frequency of a mutation, or expression of a certain gene). The sample phenotypes can also be higher-level organism phenotypes (e.g. patient survival, cancer stage, disease severity). COCOA will identify region sets where the epigenetic signal is most related to the phenotype. These region sets can then help us understand the relationship between the phenotype and epigenetic state.
-
-Vignette analysis goal: As mentioned previously, we are looking at chromatin accessibility data in breast cancer patients. Our goal is to understand epigenetic variation related to our phenotype of interest, estrogen receptor (ER) status.
- -### Quantify relationship between chosen sample phenotype and epigenetic data - -First we will load the necessary data: BRCA chromatin accessibility data with genomic coordinates, region sets to test, and our phenotype of interest, ER status (part of `brcaMetadata`). - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -library(COCOA) -data("esr1_chr1") -data("gata3_chr1") -data("nrf1_chr1") -data("atf3_chr1") -data("brcaATACCoord1") -data("brcaATACData1") -data("brcaMetadata") -``` - -As mentioned, ER status is our target phenotype so we'll pull that info out of the patient metadata. Since we only have one target variable, we are keeping the data as a data.frame object using drop=FALSE (instead of targetVarDF becoming a vector). - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -myPhen <- "ER_status" -targetVarDF <- brcaMetadata[colnames(brcaATACData1), myPhen, drop=FALSE] -``` - -Let's convert estrogen receptor status to a number so we can do calculations with it. Then we will quantify the association between ER status and the ATAC-seq counts in each peak region using Pearson correlation. - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -targetVarDF$ER_status <- scale(as.numeric(targetVarDF$ER_status), - center=TRUE, scale=FALSE) -atacCor <- cor(t(brcaATACData1), targetVarDF$ER_status, - use = "pairwise.complete.obs") -# if the standard deviation of the epigenetic signal -# for a peak region across samples is 0, -# cor() will return NA, so manually set the correlation to 0 for these regions -atacCor[is.na(atacCor)] <- 0 -colnames(atacCor) <- myPhen -``` - -Now we have a feature contribution score (the correlation coefficient in this case) for each peak region that represents how much the accessibility at that site is associated with ER status. We can use those individual peak region scores to score reference region sets from our region set database. We'll score our region sets with the `aggregateSignalGRList` function. - -### Score region sets - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -GRList <- GRangesList(esr1_chr1, gata3_chr1, atf3_chr1, nrf1_chr1) -regionSetNames <- c("ESR1", "GATA3", "ATF3", "NRF1") -rsScores <- aggregateSignalGRList(signal=atacCor, - signalCoord=brcaATACCoord1, - GRList=GRList, - signalCol=myPhen, - scoringMetric="default", - absVal=TRUE) -rsScores$regionSetName <- regionSetNames -rsScores -``` - -Now we have a score for each region set that represents how much epigenetic variation in that region set is associated with the phenotype of interest, ER status. Since we are using the absolute value of the correlation coefficient for each peak region to score our region sets, the possible region set scores range from 0 to 1, with 0 representing no correlation and 1 representing complete correlation. From our initial results, it appears that there is not much difference in the chromatin accessibility associated with ER status in the ER and GATA3 region sets compared to the NRF1 and ATF3 region sets. While this is a little unexpected (we would expect chromatin accessibility in ER and GATA3 region sets to be associated with ER status), we hypothesize that this might be because the clinical ER+/- designation does not accurately represent the molecular phenotype of some samples. This hypothesis is based on [our unsupervised analysis](#uCOCOAATAC) where some ER- samples cluster with the ER+ samples and vice versa so you can check that out if you are curious. 
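To see the ER-status ranking at a glance, the score table can simply be sorted; a quick sketch using the `rsScores` data frame created above:

```{r, eval=FALSE}
# rank region sets by their ER status score, highest first
rsScores[order(rsScores$ER_status, decreasing = TRUE), ]
```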
- -Both the correlation and scoring steps can be combined into one step with the `runCOCOA` function as will be shown in the next section. - -### Estimating statistical significance - -Now that we have a score for each region set, we can to get an idea of the statistical significance of these results. We will do this using a permutation test. The permutation test is computationally expensive and, in some cases, may not be needed since the COCOA scores by themselves might be enough to gain insight your data. However, if a p-value estimate is desired, the permutation test can return that. - -#### Permutations - -By shuffling the sample phenotypes and calculating fake COCOA scores, we can create a null distribution for each region set. The `runCOCOA` function does the two main COCOA steps: quantifying the variation and scoring the region sets. If the samples were in the correct order, `runCOCOA` would return the real COCOA scores but since we are giving a shuffled sample order (`sampleOrder` parameter), we will be correlating the epigenetic data with the shuffled sample phenotypes to create null distribution COCOA scores. We will be running five permutations for the vignette but many more should be done for a real analysis (probably at least a few hundred). We need to set the seed for reproducible results since we are doing random shuffling of the sample phenotypes. - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -set.seed(100) - -nPerm <- 5 -permRSScores <- list() - -for (i in 1:nPerm) { - # shuffling sample labels - sampleOrder <- sample(1:nrow(targetVarDF), nrow(targetVarDF)) - permRSScores[[i]] <- runCOCOA(sampleOrder=sampleOrder, - genomicSignal=brcaATACData1, - signalCoord=brcaATACCoord1, - GRList=GRList, - signalCol=myPhen, - targetVar=targetVarDF, - variationMetric="cor") - permRSScores[[i]]$regionSetName <- regionSetNames -} - -permRSScores[1:3] -``` - -We now have a list where each item is a result of `aggregateSignalGRList` for a permutation of the sample phenotypes. We have `nPerm` list items in permRSScores (only showing 3 above). We'll reformat it so that we have a list where each item is a null distribution for a single region set: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -nullDistList <- convertToFromNullDist(permRSScores) -names(nullDistList) <- regionSetNames -nullDistList -``` - -We have four null distributions, one for each region set in `GRList`. - -#### Fit gamma distribution to null distributions and get p-values - -To reduce the number of permutations (and the run time for COCOA), we will approximate each null distribution with a gamma distribution [(Winkler et al., 2016)](#refs). After fitting the gamma distribution to each null distribution, we can use the gamma distributions to get p-values. We fit the gamma distribution and get the p-value with `getGammaPVal`. Take a look at the COCOA paper for more information on the best use of the gamma approximation (it may overestimate significance of low p-values). - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -# p-values based on fitted gamma distributions -gPValDF <- getGammaPVal(rsScores = rsScores, - nullDistList = nullDistList, - signalCol = myPhen, - method = "mme", realScoreInDist = TRUE) -gPValDF <- cbind(gPValDF, - rsScores[, colnames(rsScores)[!(colnames(rsScores) - %in% myPhen)]]) -gPValDF <- cbind(gPValDF, regionSetNames) -gPValDF -``` - -Fitting a gamma distribution with only 5 permutations does not really make sense but we do it here just to illustrate function usage. 
You can increase the number of permutations and see how that will affect the p-values (with 100 permutations, ER has the lowest p-value, with p < 0.05). Also, keep in mind that a low p-value does not necessarily indicate a large effect size. The actual COCOA scores can be used as an indicator of effect size.
-
-After getting p-values, we also want to do multiple testing correction to account for the number of region sets in our region set database (not shown here but see `?p.adjust`).
-
-We can also use the null distributions to directly calculate empirical p-values:
-
-```{r, eval=TRUE, message=FALSE, warning=FALSE}
-getPermStat(rsScores = rsScores,
-            nullDistList = nullDistList,
-            signalCol = myPhen)
-```
-
-An empirical p-value of zero is returned when the real region set score was more extreme than any scores in its null distribution. What this really suggests is that the true p-value is less than (1/nPerm), although it is unclear how much less.
-
-The empirical p-values are limited by the number of permutations done. When multiple testing correction is done to correct for testing many region sets in a large region set database, the empirical test normally cannot achieve significant p-values with a reasonable number of permutations. For that reason, the gamma distribution approximation is recommended to allow the possibility of lower p-values.
-
-## Unsupervised COCOA {#uCOCOAATAC}
-
-Goal: Understand major sources of epigenetic variation present in data without knowing beforehand what they are.
-
-In an unsupervised COCOA analysis, we normally begin with a dimensionality reduction technique such as PCA that identifies major sources/axes of inter-sample variation. Then the principal components/latent factors are treated as the target variables for COCOA and a workflow similar to the supervised COCOA analysis is used. We identify region sets where the epigenetic signal varies along the PC axis. These region sets can help understand the biological meaning of the PCs/latent factors.
-
-Vignette analysis goal: As mentioned previously, we are looking at chromatin accessibility in breast cancer patients. Our goal is to understand sources of inter-sample epigenetic variation in an unsupervised way (without using sample phenotypes or groups).
-
-### Quantify relationship between latent factors and epigenetic data
-
-First, we will load the required packages and necessary data: BRCA ATAC-seq count data with genomic coordinates and region sets to test:
-
-```{r, eval=TRUE, message=FALSE, warning=FALSE}
-library(COCOA)
-library(data.table)
-library(ggplot2)
-data("esr1_chr1")
-data("gata3_chr1")
-data("nrf1_chr1")
-data("atf3_chr1")
-data("brcaATACCoord1")
-data("brcaATACData1")
-```
-
-Next, we will do PCA on our breast cancer chromatin accessibility data. PCA could take a while for large datasets (e.g. longer than 30 minutes).
-
-```{r, eval=TRUE, message=FALSE, warning=FALSE}
-pca <- prcomp(t(brcaATACData1))
-pcScores <- pca$x
-
-plot(pcScores[, c("PC1", "PC2")])
-```
-
-Each point is a sample. It is not immediately clear what the meaning of PC1 (or PC2) is and we see a wide spectrum of variation among samples. COCOA will help us understand the biological meaning of the PCs.
-
-After the PCA, you might want to look at plots of the first few PCs and consider removing extreme outliers and rerunning the PCA since PCA can be affected by outliers, although this depends on your analysis.
We note that COCOA will still work even if there are not distinct clusters of samples in your PCA plot and if you do not have known groups of samples. In this vignette, we will look at principal components 1-4 but this choice also depends on the context of your analysis. - -We treat the PC scores as our "target variables" for this analysis and calculate the correlation between the ATAC-seq counts in each peak region and each PC. - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -PCsToAnnotate <- paste0("PC", 1:4) -targetVar <- pcScores[, PCsToAnnotate] -targetVar <- as.matrix(scale(targetVar, - center=TRUE, scale=FALSE)) -atacCor <- cor(t(brcaATACData1), targetVar, - use = "pairwise.complete.obs") -# if the standard deviation of the ATAC-seq counts -# for a peak region across samples is 0, -# cor() will return NA, so manually set the correlation to 0 for these regions -atacCor[is.na(atacCor)] <- 0 -``` - -### Score region sets - -Let's put our region sets into one object to simplify the next steps of the analysis. - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -# prepare data -GRList <- GRangesList(esr1_chr1, gata3_chr1, nrf1_chr1, atf3_chr1) -regionSetNames <- c("esr1_chr1", "gata3_chr1", "nrf1_chr1", "atf3_chr1") -``` - -Now let's give each region set a score with `aggregateSignalGRList()` to quantify how much it is associated with each principal component: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -regionSetScores <- aggregateSignalGRList(signal=atacCor, - signalCoord=brcaATACCoord1, - GRList=GRList, - signalCol=PCsToAnnotate, - scoringMetric="default") -regionSetScores$regionSetName <- regionSetNames -regionSetScores -``` - -Now we have a score for each region set that represents how much epigenetic variation in that region set is associated with each PC. Since we are using the absolute value of the correlation coefficient for each peak region to score our region sets, the possible region set scores range from 0 to 1, with 0 representing no correlation and 1 representing complete correlation. From our initial results, it appears that the chromatin accessibility in ER and GATA3 region sets is more highly associated with the PC scores than chromatin accessibility in the NRF1 and ATF3 region sets is. Out of PCs 1-4, ER and GATA3 have the highest scores for PC1. This result makes sense when we visualize ER status on the PCA plot. As you can see in the figure below, PC1 has some ability to separate samples based on ER status (even though ER status was not used when doing the PCA). - - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -annoPCScores <- data.frame(pcScores, ER_status=as.factor(brcaMetadata[row.names(pcScores), "ER_status"])) -ggplot(data = annoPCScores, mapping = aes(x=PC1, y=PC2, col=ER_status)) + geom_point() + ggtitle("PCA of a subset of chromatin accessibility data from breast cancer patients") + theme_classic() -``` -Given that some ER+ samples cluster with the ER- samples and vice versa, it is interesting to consider whether the clinical ER+/- designation is accurately representing the molecular phenotype of the samples although there are multiple potential reasons for this including tumor heterogeneity (this trend is still present when using the full ATAC-seq count matrix which includes additional samples). - -We can visualize the scores of region sets of interest with the `plotAnnoScoreDist` function. 
This function allows you to supply regular expressions matching one or more groups of region sets and will color the region set scores according to their group. This can be useful to see trends when using a large region set database. Here are scores for PC1, coloring region sets related to ER: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -plotAnnoScoreDist(rsScores = regionSetScores, - colToPlot = "PC1", - pattern = "GATA3|ESR1", - patternName = "ER-related", - rsNameCol = "regionSetName", - alpha=1) -``` - -As an easy way to visualize how the region sets are ranked for multiple target variables, we can use `rsScoreHeatmap`: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -rsScoreHeatmap(regionSetScores, - signalCol=paste0("PC", 1:4), - rsNameCol = "regionSetName", - orderByCol = "PC1", - column_title = "Region sets ordered by score for PC1") -``` - -We can see that GATA3 had the highest score for PC 1 followed by ESR1 but PCs 2, 3, and 4 seem more associated with NRF1 and ATF3. If you want to arrange the heatmap by region set scores for another PC, just change the orderByCol parameter like so: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -rsScoreHeatmap(regionSetScores, - signalCol=paste0("PC", 1:4), - rsNameCol = "regionSetName", - orderByCol = "PC2", - column_title = "Region sets ordered by score for PC2") -``` - -### Estimating statistical significance - -Now that we have a score for each region set for each PC, we can to get an idea of the statistical significance of these results. We will do this using a permutation test. The permutation test is computationally expensive and, in some cases, may not be needed since the COCOA scores by themselves might be enough to gain insight your data. However, if a p-value estimate is desired, the permutation test can return that. - -#### Permutations - -By shuffling the target variable and calculating COCOA scores, we can create a null distribution for each region set. The `runCOCOA` function does the two main COCOA steps: quantifying the variation and scoring the region sets. If the samples were in the correct order, `runCOCOA` would return the real COCOA scores but since we are giving a shuffled sample order (`sampleOrder` parameter), we will be correlating the epigenetic data with the shuffled sample target variables to create null distribution COCOA scores. We will be running five permutations for the vignette but many more should be done for a real analysis (probably at least a few hundred). We need to set the seed for reproducible results since we are doing random shuffling of the samples' target variables. - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -set.seed(100) - -nPerm <- 5 -permRSScores <- list() - -for (i in 1:nPerm) { - # shuffling sample labels - sampleOrder <- sample(1:nrow(targetVar), nrow(targetVar)) - permRSScores[[i]] <- runCOCOA(sampleOrder=sampleOrder, - genomicSignal=brcaATACData1, - signalCoord=brcaATACCoord1, - GRList=GRList, - signalCol=PCsToAnnotate, - targetVar=targetVar, - variationMetric="cor") - permRSScores[[i]]$regionSetName <- regionSetNames -} - -permRSScores[1:3] -``` - -We now have a list where each item is a result of `aggregateSignalGRList`. We have `nPerm` list items in permRSScores (only showing 3 above). 
We'll reformat it so that we have a list where each item is a null distribution for a single region set:
-
-```{r, eval=TRUE, message=FALSE, warning=FALSE}
-nullDistList <- convertToFromNullDist(permRSScores)
-names(nullDistList) <- regionSetNames
-nullDistList
-```
-
-We have four null distributions, one for each region set in `GRList`.
-
-#### Fit gamma distribution to null distributions and get p-values
-
-To reduce the number of permutations (and the run time for COCOA), we will approximate each null distribution with a gamma distribution [(Winkler et al., 2016)](#refs). After fitting the gamma distribution to each null distribution, we can use the gamma distributions to get p-values. We fit the gamma distribution and get the p-value with `getGammaPVal`. Take a look at the COCOA paper for more information on the best use of the gamma approximation (it may overestimate the significance of low p-values).
-
-```{r, eval=TRUE, message=FALSE, warning=FALSE}
-# p-values based on fitted gamma distributions
-gPValDF <- getGammaPVal(rsScores = regionSetScores,
-                        nullDistList = nullDistList,
-                        signalCol = PCsToAnnotate,
-                        method = "mme", realScoreInDist = TRUE)
-gPValDF <- cbind(gPValDF,
-                 regionSetScores[, colnames(regionSetScores)[!(colnames(regionSetScores)
-                                                               %in% PCsToAnnotate)]])
-gPValDF <- cbind(gPValDF, regionSetNames)
-gPValDF
-```
-
-Fitting a gamma distribution with only 5 permutations does not really make sense but we do it here just to illustrate function usage. You can increase the number of permutations and see how that will affect the p-values. Also, keep in mind that a low p-value does not necessarily indicate a large effect size. The actual COCOA scores can be used as an indicator of effect size.
-
-After getting p-values, we also want to do multiple testing correction to account for the number of region sets in our region set database (not shown here but see `?p.adjust`).
-
-We can also use the null distributions to directly calculate empirical p-values:
-
-```{r, eval=TRUE, message=FALSE, warning=FALSE}
-getPermStat(rsScores = regionSetScores,
-            nullDistList = nullDistList,
-            signalCol = PCsToAnnotate)
-```
-
-An empirical p-value of zero is returned when the real region set score was more extreme than any scores in its null distribution. What this really suggests is that the true p-value is less than (1/nPerm), although it is unclear how much less.
-
-The empirical p-values are limited by the number of permutations done. When multiple testing correction is done to correct for testing many region sets in a large region set database, the empirical test normally cannot achieve significant p-values with a reasonable number of permutations. For that reason, the gamma distribution approximation is recommended to allow the possibility of lower p-values.
-
-## Further understanding the results (visualization)
-
-We can further understand the variability in these region sets in several ways:
-
-1. Look at whether variability is specific to the regions of interest compared to the genome around these regions.
-2. Visualize the epigenetic signal in these regions and whether it correlates with the target variable.
-3. Look at the feature contribution scores (e.g. correlation coefficients) in each region of a given region set to see whether all regions have high contribution scores or just a subset of the regions. We can also see if the same regions have high contribution scores for multiple target variables.
- -To demonstrate these techniques, we'll be using results from [the unsupervised analysis above](#uCOCOAATAC). - -### Specificity of variation to the regions of interest - -We can see whether variability associated with the target variable is specific to the region of interest by comparing the region of interest to the surrounding genome. To do this, we will calculate the average feature contribution scores (FCS) of a wide area surrounding the regions of interest (14 kb centered on each region set region). - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -wideGRList <- lapply(GRList, resize, width=14000, fix="center") -fcsProfile <- lapply(wideGRList, function(x) getMetaRegionProfile(signal=atacCor, - signalCoord=brcaATACCoord1, - regionSet=x, - signalCol=PCsToAnnotate, - binNum=21)) -``` - -We will normalize the result for each PC so we can better compare them. Here we normalize by subtracting the mean absolute FCS of all epigenetic features for each PC from the region set profiles for the corresponding PC. Then we get the plot scale so we can easily compare the different profiles. These normalization steps are helpful for comparing the meta-region profiles but not necessarily required so it's not essential that you understand the below code. - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -# average FCS from each PC to normalize so PCs can be compared with each other -avFCS <- apply(X=atacCor[, PCsToAnnotate], - MARGIN=2, - FUN=function(x) mean(abs(x))) - -# normalize -fcsProfile <- lapply(fcsProfile, - FUN=function(x) as.data.frame(mapply(FUN = function(y, z) x[, y] - z, - y=PCsToAnnotate, z=avFCS))) -binID = 1:nrow(fcsProfile[[1]]) -fcsProfile <- lapply(fcsProfile, FUN=function(x) cbind(binID, x)) - -# for the plot scale -maxVal <- max(sapply(fcsProfile, FUN=function(x) max(x[, PCsToAnnotate]))) -minVal <- min(sapply(fcsProfile, FUN=function(x) min(x[, PCsToAnnotate]))) - -# convert to long format for plots -fcsProfile <- lapply(X=fcsProfile, FUN=function(x) tidyr::gather(data=x, key="PC", value="meanFCS", PCsToAnnotate)) -fcsProfile <- lapply(fcsProfile, - function(x){x$PC <- factor(x$PC, levels=PCsToAnnotate); return(x)}) -``` - -Let's look at the plots! - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -wrapper <- function(x, ...) paste(strwrap(x, ...), collapse="\n") -profilePList <- list() -for (i in seq_along(fcsProfile)) { - - thisRS <- fcsProfile[[i]] - - profilePList[[i]] <- ggplot(data=thisRS, - mapping=aes(x=binID , y=meanFCS)) + - geom_line() + ylim(c(minVal, maxVal)) + facet_wrap(facets="PC") + - ggtitle(label=wrapper(regionSetNames[i], width=30)) + - xlab("Genome around region set, 14 kb") + - ylab("Normalized mean FCS") + - theme(panel.grid.major.x=element_blank(), - panel.grid.minor.x=element_blank(), - axis.text.x=element_blank(), - axis.ticks.x=element_blank()) - profilePList[[i]] - -} -profilePList[[1]] -profilePList[[2]] -profilePList[[3]] -profilePList[[4]] -``` - -These plots show the average magnitude of the feature contribution scores in the genome around and including the regions of interest. The FCS for an input epigenetic feature, indicates how much that input feature varies in the same direction as the target variable (here, the PC scores). If you used correlation or covariation to get the feature contribution scores, a peak in the middle of the profile indicates that there is increased covariation in the regions of interest compared to the surrounding genome. 
It suggests that those regions are changing in a coordinated way whereas the surrounding genome is not changing in a coordinated way to the same extent. A peak suggests that the variation in the target variable (the PCs) may be somehow specifically related to the region set, although it is not clear whether the region set is causally linked to the variation or just affected by other things that are causing the variation associated with the target variable. Some region sets may have an increased FCS but no peak, for example, some histone modification region sets like H3K27me3 or H3K9me3. This doesn't necessarily mean these regions are not relevant. It could just mean that there is variability in larger blocks of the genome around these histone modifications (the expanded regions might also overlap with each other). For details on how the meta-region profile was created, check out [the section about it in the "Method details" section](#mrProfile) of this vignette or see the docs for `getMetaRegionProfile` with `?COCOA::getMetaRegionProfile`.
-
-These plots show that ESR1 and GATA3 binding regions demonstrate higher covariation along the PC1 axis than the surrounding genome does (because they have a peak in the middle) while NRF1 and ATF3 do not. So for example, if you line up samples by their PC1 score, then as you go from low to high PC1 score, the chromatin accessibility of ESR1 binding regions will generally change in a coordinated way across samples (PCs are based on covariation) but the chromatin accessibility in the surrounding genome would not change as much or would not change in a coordinated way. The results from our four region sets suggest that ESR1 and GATA3 regions specifically contribute to the variation along PC1, helping us understand the biological meaning of the variation captured by that PC: at least in part related to estrogen receptor.
-
-### The raw data
-
-If a region set has a high score for a certain target variable, we would expect that the epigenetic signal in at least some of those regions would correlate with the target variable. In other words, as you go from a high value of the target variable to a low value of the target variable, in this case as you go along the PC axis, the epigenetic signal will either go up or down. Let's look at the epigenetic signal in ESR1 regions. In the following plot, each column is an ATAC-seq peak region that overlaps with an ESR1 region and each row is a patient. Patients are ordered by their PC score for PC1.
-
-```{r, eval=TRUE, message=FALSE, warning=FALSE}
-signalAlongAxis(genomicSignal=brcaATACData1,
-                signalCoord=brcaATACCoord1,
-                regionSet=esr1_chr1,
-                sampleScores=pcScores,
-                orderByCol="PC1", cluster_columns=TRUE,
-                column_title = "Individual ATAC-seq region",
-                name = "Normalized signal in ATAC-seq regions",
-                show_row_names=FALSE,
-                show_column_names=FALSE)
-```
-
-Looking at the raw data confirms that chromatin accessibility is in fact varying along the PC axis. It appears that some but not all peak regions vary greatly along the PC axis. The peak regions that show high variation along the PC axis are the ones that contribute to the ESR1 region set being ranked highly in our analysis.
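One way to list the individual peaks doing the work is to rank the ESR1-overlapping peaks by the magnitude of their PC1 feature contribution scores. A quick sketch, assuming (as constructed above) that the rows of `atacCor` are in the same order as `brcaATACCoord1`:

```{r, eval=FALSE}
# peaks that overlap ESR1 regions
olInd <- unique(queryHits(findOverlaps(brcaATACCoord1, esr1_chr1)))

# the ten overlapping peaks with the largest absolute PC1 contribution scores
topInd <- olInd[order(abs(atacCor[olInd, "PC1"]), decreasing = TRUE)[1:10]]
atacCor[topInd, "PC1", drop = FALSE]
```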
- -Now let's look at one of the region sets, NRF1, that had a lower score for PC1: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -signalAlongAxis(genomicSignal=brcaATACData1, - signalCoord=brcaATACCoord1, - regionSet=nrf1_chr1, - sampleScores=pcScores, - orderByCol="PC1", - cluster_columns=TRUE, - column_title = "Individual ATAC-seq region", - name = "Normalized signal in ATAC-seq regions", - show_row_names=FALSE, - show_column_names=FALSE) -``` - -When patients are ordered according to PC1 score, we can see that there is less covariation of the epigenetic signal in these regions compared to the ESR1 regions. Therefore, it is not surprising that the NRF1 region set had a low score for this PC. - -Since COCOA ranks region sets based on their relative scores in comparison to other region sets tested, there will always be a region set with a best score. The permutation test gives an idea of statistical significance but not necessarily effect size. The actual COCOA scores give an idea of the magnitude of variation of a region set but it can be a good idea to visually check the raw genomic signal in your top region sets to see how great the extent of variation is along the PC. - -### Feature contribution scores of individual regions - -This plot can help you learn more about the contribution of individual region set regions to the region set score for each target variable. For example, if the estrogen receptor region set was associated with PCs 1, 3, and 4, we might wonder whether the same regions are causing the association with these PCs or whether different regions are associated with each PC. To do this, we will first calculate the average absolute feature contribution score for each region in a region set (obtained by averaging a given target variable's FCS for epigenetic features that overlap that region). Then we can use the distribution of FCS for each target variable to convert each region's FCS to a percentile to see how extreme/high that region is for each target variable. Let's look at the plot for estrogen receptor: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -regionQuantileByTargetVar(signal = atacCor, - signalCoord = brcaATACCoord1, - regionSet = esr1_chr1, - rsName = "Estrogen receptor (chr1)", - signalCol=paste0("PC", 1:4), - maxRegionsToPlot = 8000, - cluster_rows = TRUE, - cluster_columns = FALSE, - column_title = rsName, - name = "Percentile of feature contribution scores in PC") -``` - -A high FCS indicates that a region is important for a PC. We can see that not many regions have high FCS for all four PCs. Overall, PC1 has the highest FCS, consistent with our meta-region profiles (peak for PC1). - -For contrast, we can look at the regions of NRF1: - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -regionQuantileByTargetVar(signal = atacCor, - signalCoord = brcaATACCoord1, - regionSet = nrf1_chr1, - rsName = "NRF1 (chr1)", - signalCol=paste0("PC", 1:4), - maxRegionsToPlot = 8000, - cluster_rows = TRUE, - cluster_columns = FALSE, - column_title = rsName, - name = "Percentile of feature contribution scores in PC") -``` - - -We can see that fewer regions in the NRF1 region set have high FCS for PC 1. This is consistent with this region set being ranked lower by COCOA for that PC. - -# Additional details - -## Method details - -The overall conceptual steps are described in the [Basic workflow] section. In this section, we give more details about some of the individual parts of that workflow. 
For further information, see [the COCOA paper](https://doi.org/10.1186/s13059-020-02139-4). - -### Region set database - -COCOA uses a database of region sets to gain biological insight into sources of variability in your data. A region set is a set of genomic regions that share a biological annotation. This includes transcription factor (TF) binding regions (e.g. from ChIP-seq), regions with a certain histone modification (e.g. ChIP-seq) or chromatin accessibility regions (e.g. DNase/ATAC-seq). Most of these region sets are from experimental data but don't necessarily have to be. For instance, you could use predicted TF binding regions based on the TF motif. The big picture goal of using a region set database is to connect variation between samples to an interpretable biological meaning: the known annotation of a region set. For each target variable (phenotype or latent factor), COCOA will give a score to each region set that quantifies how much that region set is associated with the inter-sample variation of that target variable. - -COCOA should be done with many region sets (i.e. hundreds or > 1000). A region set can be a simple ".bed"" file with three columns containing the genomic locations of the regions: chr (for chromosome), start, and end. In R, this data can be represented as a data.frame or as a GRanges object. Publicly available collections of region sets can be found online (e.g. http://databio.org/regiondb) and region sets can be accessed through Bioconductor packages (e.g. LOLA and AnnotationHub). The region sets must be from the same reference genome as your sample data (although you could use [the liftOver tool](https://genome.ucsc.edu/cgi-bin/hgLiftOver) to convert from one genome version to another). The region sets can come from anywhere so if you experimentally or computationally generate your own region sets, you can just include those with the others when running the COCOA analysis. - -We'll show some sample code for loading a LOLA region set database (unevaluated). Loading the database may take a few minutes: - -```{r, eval=FALSE, message=FALSE, warning=FALSE} -library(LOLA) - -# reading in the region sets -# load LOLA database -lolaPath <- paste0("path/to/LOLACore/genomeVersion/") -regionSetDB <- loadRegionDB(lolaPath) - -# metadata about the region sets -loRegionAnno <- regionSetDB$regionAnno -lolaCoreRegionAnno <- loRegionAnno -collections <- c("cistrome_cistrome", "cistrome_epigenome", "codex", - "encode_segmentation", "encode_tfbs", "ucsc_features") -collectionInd <- lolaCoreRegionAnno$collection %in% collections -lolaCoreRegionAnno <- lolaCoreRegionAnno[collectionInd, ] -regionSetName <- lolaCoreRegionAnno$filename -regionSetDescription <- lolaCoreRegionAnno$description - -# the actual region sets -GRList <- GRangesList(regionSetDB$regionGRL[collectionInd]) - -# since we have what we need, we can delete this to free up memory -rm("regionSetDB") -``` - -This `GRList` object can be used with COCOA as the region set database, along with any other region sets you may have from other sources. - -### Aggregating info from individual features - -Since differences between samples in individual epigenetic features may be hard to interpret, COCOA uses region sets to aggregate nucleotide/region level info into a more condensed, interpretable form. As mentioned in the vignette, each epigenetic feature has a "feature contribution score" (FCS) for a given target variable and the magnitude of the FCS represents how much that feature contributes to that target variable. 
Also, each original epigenetic feature is associated with a genomic coordinate or a region. COCOA will use this information to give each region set in the region set database a score for each target variable. For a given target variable-region set combination (for example PC1 and the region set esr1_chr1), we first identify all the epigenetic features that overlap with the region set. Then the scoring of the region set depends on the scoring metric chosen. The "regionMean" method is described here although the other methods are described in function docs. For the "regionMean" method that is used for single base data like DNA methylation, we identify the FCS for the features that overlap the region set and average the (absolute) FCS by region (average FCS in each region to get one value per region of the region set). Then we average the region values to get a single average for that region set which is its score. We repeat this calculation for all target variable-region set combinations. Now for a given target variable, we can rank the region sets by their score/FCS average to see which region sets are most associated with that target variable (higher FCS average means a greater association with the target variable). The biological annotation of the top ranked region sets for the target variable can help you understand variation among your samples. - -### Making a "meta-region" profile {#mrProfile} -A "meta-region" profile is a summary of the FCS in the genome in and around the regions of a region set. This is created with the `getMetaRegionProfile` function. The calculations are similar to those of `aggregateSignalGRList` with a few major differences. Instead of using the region set as is, we will expand each region in the region set on both sides so we can also look at the surrounding genome. We will then split each region into the same number of bins (of approximately equal size). Then we average (the absolute value of) all FCS that overlap a bin to get a single average FCS for each bin. For region-based epigenetic data like ATAC-seq, this average can be a weighted average based on how much each ATAC-seq region overlaps each region set region. Then (for both single base and region-based data) we combine information from all the regions by averaging the corresponding bins from the different regions (all bin1's averaged together, all bin2's averaged together, etc.). Finally, we average the profile symmetrically over the center (the first bin becomes the average of the first and last bin, the second bin becomes the average of the second and second to last bin, etc.). We do this because the orientation of the regions in the genome is arbitrary: we did not use any strand information or relate the regions to any directional information. The "meta-region" profile gives a summary in a single profile of all the regions in a region set and allows you to compare the regions to the surrounding genome. See ?getMetaRegionProfile for more details and options. - -## Q and A - -1. Where can I get more information about COCOA? - -For more in-depth info on COCOA methods and examples of use, check out our paper: - -Lawson, J.T., Smith, J.P., Bekiranov, S. et al. COCOA: coordinate covariation analysis of epigenetic heterogeneity. Genome Biol 21, 240 (2020). https://doi.org/10.1186/s13059-020-02139-4 - -For package documentation, see the vignettes and reference manual on the Bioconductor website. 
You can also check out ongoing development or report an issue with COCOA [on Github](https://github.com/databio/COCOA). - - -2. What data types can COCOA be used with? - -So far, COCOA has been validated on single base pair resolution DNA methylation data, chromatin accessibility data, and a multi-omics analysis that included DNA methylation. Theoretically, COCOA could work with any type of genomic coordinate-based data: data where you have a genomic coordinate or range and an associated value. This could include histone modification data, single nucleotide polymorphism/mutation data, copy number variation etc. although COCOA would probably work better for data where smaller regions or single bases are measured. - - -3. Can COCOA be used with other dimensionality reduction techniques such as t-SNE? - -Short answer for t-SNE: no. In general though, it depends. COCOA must have a score for each original dimension that quantifies how much it contributes to the new dimension. Since t-SNE maps the original dimensions to new dimensions in a nonlinear way, the mappings of the original dimensions to the new dimensions are not comparable to each other and cannot be aggregated into a single score for a region set in a uniform way. - - -4. What does the name COCOA mean? - -The method is called Coordinate Covariation Analysis because it looks at covariation/correlation of individual signals/features at genomic coordinates and how those features relate to a target variable. COCOA annotates the covariation/correlation of individual genomic features with region sets in order to gain insight into variation between samples. - - ## Related references {#refs} Lawson, J.T., Smith, J.P., Bekiranov, S. et al. COCOA: coordinate covariation analysis of epigenetic heterogeneity. Genome Biol 21, 240 (2020). https://doi.org/10.1186/s13059-020-02139-4 From f5f232b147d6366bf98fa07abe2c754098ecd17e Mon Sep 17 00:00:00 2001 From: j-lawson Date: Tue, 16 Feb 2021 17:16:41 -0500 Subject: [PATCH 24/38] Add unit tests, modify code to prevent extra copying --- R/COCOA.R | 35 ++++++++++++-------- R/permutation.R | 57 ++++++++++++++++--------------- tests/testthat/test_all.R | 70 ++++++++++++++++++++++++++++++++------- 3 files changed, 110 insertions(+), 52 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 2c0c18b..6af425b 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -646,7 +646,7 @@ aggregateSignalGRList <- function(signal, if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { - return(resultsDF) + return(as.data.frame(resultsDF)) } else { resultsDT <- do.call(rbind, resultsList) resultsDF <- as.data.frame(resultsDT) @@ -654,9 +654,10 @@ aggregateSignalGRList <- function(signal, } } -# @param dataMat columns of dataMat should be samples/patients, rows should be genomic signal -# (each row corresponds to one genomic coordinate/range) -# @param featureMat Rows should be samples, columns should be "features" +# @param dataMat rows of dataMat should be samples/patients, +# columns should be genomic signal +# (each column corresponds to one genomic coordinate/range) +# @param featureMat Matrix. 
Rows should be samples, columns should be "features" # (whatever you want to get correlation with: eg PC scores), # all columns in featureMat will be used (subset when passing to function # in order to not use all columns) @@ -680,15 +681,15 @@ createCorFeatureMat <- function(dataMat, featureMat, centerDataMat=TRUE, centerFeatureMat = TRUE, testType="cor", covariate=NULL) { - featureMat <- as.matrix(featureMat) # copies? + # featureMat <- as.matrix(featureMat) # copies? featureNames <- colnames(featureMat) nFeatures <- ncol(featureMat) nDataDims <- nrow(dataMat) if (centerDataMat) { - cpgMeans <- rowMeans(dataMat, na.rm = TRUE) + cpgMeans <- colMeans(dataMat, na.rm = TRUE) # centering before calculating correlation - dataMat <- apply(X = dataMat, MARGIN = 2, function(x) x - cpgMeans) + dataMat <- apply(X = dataMat, MARGIN = 1, function(x) x - cpgMeans) } @@ -702,7 +703,7 @@ createCorFeatureMat <- function(dataMat, featureMat, } # avoid this copy and/or delay transpose until after calculating correlation? - dataMat <- as.data.frame(t(dataMat)) # copies + # dataMat <- as.data.frame(t(dataMat)) # copies, expect transposed form as input instead if (testType == "cor") { @@ -2252,8 +2253,10 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { rsMatList[[j]][, i] <- tmpVec[tmpCount:(tmpCount + matRowNVec[j]-1)] tmpCount = tmpCount + matRowNVec[j] } - } - } + } + } + # put in correct orientation for matrix multiplication to prevent transposition + rsMatList <- lapply(rsMatList, FUN = t) # transpose } else if (scoringMetric == "regionMean") { # instead of 1 give weight proportional to how many signalCoord are # overlapped @@ -2278,6 +2281,8 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { } } } + # put in correct orientation for matrix multiplication to prevent transposition + rsMatList <- lapply(rsMatList, FUN = t) # transpose } else if (scoringMetric == "proportionWeightedMean") { sumProportionOverlap = rep(0, length(GRList)) @@ -2297,9 +2302,9 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { # aggregate pOlap by signalCoord region olDT <- data.table(queryHits = queryHits(olList[[i]]), - pOlap=pOlap/denom) + pOlap=pOlap) normDT <- olDT[, .(coordSum = sum(pOlap)), by=queryHits] - tmpVec[normDT$queryHits] <- normDT$coordSum + tmpVec[normDT$queryHits] <- normDT$coordSum / denom # for coordinate chunks for (j in seq_along(rsMatList)) { @@ -2314,6 +2319,8 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { # signalMat <- signalMat[queryHits(hits), ] # done in next step to prevent extra copying } } + # put in correct orientation for matrix multiplication to prevent transposition + rsMatList <- lapply(rsMatList, FUN = t) # transpose } else { stop("The given scoringMetric cannot be used with matrix scoring.") } @@ -2340,12 +2347,14 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { } # multiply region set matrices with data matrices to get COCOA score +# rsMatList mats should have rows as region sets and cols as features +# ^ so data won't have to be copied during transpose matScore <- function(rsMatList, signalMatList, rsInfo) { # rsMatList mats are features X region sets # signalMatList is features X target variables # multiply matrices - scoreL <- mapply(FUN = function(x, y) t(x) %*% y, + scoreL <- mapply(FUN = function(x, y) x %*% y, x=rsMatList, y=signalMatList, SIMPLIFY = FALSE) # each item in scoreL is region sets X target variables # combine results. 
Already normalized to one so can just add to get mean. diff --git a/R/permutation.R b/R/permutation.R index 1964c3b..5bfd995 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -134,40 +134,21 @@ runCOCOA <- function(genomicSignal, } rsScores = rsScores[, colsToAnnotate] # prevents error that occurs if extra column is factor - # more efficient to only do once - if (centerGenomicSignal) { - cpgMeans <- rowMeans(genomicSignal, na.rm = TRUE) - # centering before calculating correlation - genomicSignal <- apply(X = genomicSignal, MARGIN = 2, function(x) x - cpgMeans) - # don't do later - centerGenomicSignal <- FALSE - } - if (centerTargetVar) { - featureMeans <- colMeans(targetVar, na.rm = TRUE) - # centering before calculating correlation (also, t() converts to matrix) - targetVar <- t(apply(X = t(targetVar), MARGIN = 2, function(x) x - featureMeans)) - if (dim(targetVar)[1] == 1) { - targetVar <- t(targetVar) - } - # don't do later - centerTargetVar <- FALSE - } - checkConvertInputClasses(signal=genomicSignal, signalCoord=signalCoord, regionSet=NULL, signalCol=signalCol, rsOL=NULL) - # detect signalCoordType - # when signalCoord is a GRanges object - if (any(start(signalCoord) != end(signalCoord))) { - signalCoordType <- "multiBase" - } else { - signalCoordType <- "singleBase" - } - + # detect signalCoordType + # when signalCoord is a GRanges object + if (any(start(signalCoord) != end(signalCoord))) { + signalCoordType <- "multiBase" + } else { + signalCoordType <- "singleBase" + } + # if "default" scoring method is given, choose based on signalCoordType if (scoringMetric == "default") { @@ -181,6 +162,28 @@ runCOCOA <- function(genomicSignal, } } + # more efficient to only do once + if (centerGenomicSignal) { + cpgMeans <- rowMeans(genomicSignal, na.rm = TRUE) + # centering before calculating correlation + genomicSignal <- apply(X = genomicSignal, MARGIN = 2, function(x) x - cpgMeans) + # don't do later + centerGenomicSignal <- FALSE + } + if (centerTargetVar) { + featureMeans <- colMeans(targetVar, na.rm = TRUE) + # centering before calculating correlation (also, t() converts to matrix) + targetVar <- t(apply(X = t(targetVar), MARGIN = 2, function(x) x - featureMeans)) + if (dim(targetVar)[1] == 1) { + targetVar <- t(targetVar) + } + # don't do later + centerTargetVar <- FALSE + } + + # must be transposed now so it is not (and not copied) during each permutation + genomicSignal <- t(genomicSignal) + ######################################################################## # create region set overlap matrix # this code should be after code modifying "signal" diff --git a/tests/testthat/test_all.R b/tests/testthat/test_all.R index e834b57..df00169 100755 --- a/tests/testthat/test_all.R +++ b/tests/testthat/test_all.R @@ -41,22 +41,11 @@ regionSet2 <- data.table(chr = c("chr1", "chr1", "chr1", "chr2", "chr2"), end = c(1400, 1700, 3800, 4100, 4700)) regionSet2 <- MIRA:::dtToGr(regionSet2) - - - # don't use built in package data for tests because it will take longer -# running the tests - - -# aggregateSignal(signal = loadingMat, -# coordinateDT = coordinates, -# regionSet = regionSet) - # signalOLMetrics dataDT <- cbind(coordinateDT, as.data.frame(loadingMat)) - # making test data for Wilcoxon rank sum test chr3 <- seq(from=100, to = 700, by = 100) coordinateDTW <- data.table(chr=rep("chr3", length(chr3)), @@ -70,9 +59,66 @@ colnames(loadingMatW) <- c("PC2", "PC3") loadingMatW[7, "PC3"] <- 10 dataDTW <- cbind(coordinateDTW, as.data.frame(loadingMatW)) -test_that("aggregateSignal, scoring 
metrics, and aggregateSignalGRList", { +test_that("olToMat", { + myGRL <- GRangesList(regionSet1, regionSet2) + rmOLMat <- COCOA:::olToMat(signalCoord = COCOA:::dtToGr(regionCoordDT), + GRList = myGRL, scoringMetric = "simpleMean", maxRow = 2) + # each column (region set) should be normalized to equal 1 + oneVec <- c(regionSet1 = 1, regionSet2 = 1) + expect_equal(colSums(do.call(rmOLMat[[1]], what = rbind)), oneVec) + rs1Ex <- rep(0.2, 6) + rs1Ex[4] <- 0 + rs1Act <- do.call(rmOLMat[[1]], what = rbind)[, "regionSet1"] + expect_equal(rs1Act, rs1Ex) + rs2Ex <- c(0, 0.5, 0.5, 0, 0, 0) + rs2Act <- do.call(rmOLMat[[1]], what = rbind)[, "regionSet2"] + expect_equal(rs2Act, rs2Ex) + rmOLMat <- COCOA:::olToMat(signalCoord = COCOA:::dtToGr(regionCoordDT), + GRList = myGRL, scoringMetric = "proportionWeightedMean", + maxRow = 2) + expect_equal(colSums(do.call(rmOLMat[[1]], what = rbind)), oneVec) + # proportion of regionSet region overlapped by each signalCoord region + rs1Ex <- c(101/400+101/201, 51/201, 1, 0, 26/51, 151/451) + rs1Ex <- rs1Ex / sum(rs1Ex) + rs1Act <- do.call(rmOLMat[[1]], what = rbind)[, "regionSet1"] + expect_equal(rs1Ex, rs1Act) + + # DNA methylation like data + rmOLMat <- COCOA:::olToMat(signalCoord = COCOA:::dtToGr(coordinateDT), + GRList = myGRL, scoringMetric = "regionMean", maxRow = 6) + expect_equal(colSums(do.call(rmOLMat[[1]], what = rbind)), oneVec) + # proportion of regionSet region overlapped by each signalCoord region + rs2Ex <- c(rep(0,5), 0.5, 0.5, 0, 1, rep(0,11), 1, 0, 0.5, 0.5, rep(0,8)) + rs2Ex <- rs2Ex / sum(rs2Ex) + rs2Act <- do.call(rmOLMat[[1]], what = rbind)[, "regionSet2"] + expect_equal(rs2Ex, rs2Act) + +}) +test_that("aggregateSignal, scoring metrics, and aggregateSignalGRList", { + + ####################################################################### + # test matrix scoring + tmpLoading <- loadingMat + tmpLoading[1, "PC1"] = 5 + tmpLoading[6, "PC3"] = 7 + + agRes <- aggregateSignalGRList(signal=tmpLoading, + signalCoord = COCOA:::dtToGr(coordinateDT), + GRList = myGRL, signalCol = c("PC1", "PC3"), + scoringMetric = "regionMean", + rsMatList = COCOA:::olToMat(COCOA:::dtToGr(coordinateDT), + GRList = myGRL, + scoringMetric = "regionMean", + maxRow=3)[[1]], + signalList = COCOA:::splitSignal(tmpLoading, maxRow = 3)) + expPC1 <- c(regionSet1=((1+5)/2 + 3) / 4, regionSet2=1) + expPC3 <- c(regionSet1=1, regionSet2= ((1+7)/2 + 3)/4) + expect_equal(agRes[, "PC1"], expPC1) + expect_equal(agRes[, "PC3"], expPC3) + + ######################################################################### # # test wilcoxon rank sum scoring metric rsWResults <- COCOA:::rsWilcox(dataDT = dataDTW, regionSet = regionSetW) PC2W <- wilcox.test(x = c(-2, 0, 1), y=c(-1, 2:4))$p.value From bad8b0015e1ef499c91473e9687a3e9d94cdd0ad Mon Sep 17 00:00:00 2001 From: j-lawson Date: Tue, 16 Feb 2021 21:45:44 -0500 Subject: [PATCH 25/38] Fix bug: mismatched dimensions --- R/permutation.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/permutation.R b/R/permutation.R index 5bfd995..f2f6e6a 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -195,6 +195,7 @@ runCOCOA <- function(genomicSignal, scoringMetric = scoringMetric) rsMatList <- olMatRes[[1]] rsInfo <- olMatRes[[2]] + GRList <- NULL } @@ -527,7 +528,7 @@ runCOCOA <- function(genomicSignal, if (is.null(signalList)) { signalList <- splitSignal(signal = featureLabelCor, - maxRow = nrow(rsMatList[[1]])) + maxRow = ncol(rsMatList[[1]])) } } From e4997c6cbf1a8b1c9da0d76a3ac9d9d86f48c14b Mon Sep 17 00:00:00 2001 From: 
j-lawson Date: Thu, 18 Feb 2021 10:13:43 -0500 Subject: [PATCH 26/38] Start making correlation more efficient, change cache names --- R/COCOA.R | 90 ++++++++++++++++++++++++------------- R/permutation.R | 8 ++-- man/getMetaRegionProfile.Rd | 2 +- man/runCOCOA.Rd | 4 ++ 4 files changed, 68 insertions(+), 36 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 6af425b..9605889 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -654,9 +654,10 @@ aggregateSignalGRList <- function(signal, } } -# @param dataMat rows of dataMat should be samples/patients, +# @param dataMat If noNA=FALSE, rows of dataMat should be samples/patients, # columns should be genomic signal -# (each column corresponds to one genomic coordinate/range) +# (each column corresponds to one genomic coordinate/range). If noNA=TRUE, rows +# and columns should be flipped. # @param featureMat Matrix. Rows should be samples, columns should be "features" # (whatever you want to get correlation with: eg PC scores), # all columns in featureMat will be used (subset when passing to function @@ -669,6 +670,11 @@ aggregateSignalGRList <- function(signal, # "spearmanCor (Spearman correlation) # "pcor" (partial correlation), "cov" (covariance (Pearson)), # @param covariate +# @param alreadyCenteredDM logical. Whether dataMat has already been centered. If +# known, can do matrix multiplication to calculate covariance (and correlation +# if scaling has been done) +# @param noNA logical. Assume there might be NA, Inf or NaN. If TRUE, the +# function will use matrix multiplication which is faster than cor/cov # # If a row in dataMat has 0 stand. deviation, correlation will be set to 0 # instead of NA as would be done by cor() @@ -679,7 +685,10 @@ aggregateSignalGRList <- function(signal, # featureMat = matrix(rnorm(20), 10, 2) createCorFeatureMat <- function(dataMat, featureMat, centerDataMat=TRUE, centerFeatureMat = TRUE, - testType="cor", covariate=NULL) { + testType="cor", covariate=NULL, + alreadyCenteredDM=FALSE, alreadyScaledDM=FALSE, + alreadyCenteredFM=FALSE, alreadyScaledFM=FALSE, + noNA=FALSE) { # featureMat <- as.matrix(featureMat) # copies? featureNames <- colnames(featureMat) @@ -690,6 +699,7 @@ createCorFeatureMat <- function(dataMat, featureMat, cpgMeans <- colMeans(dataMat, na.rm = TRUE) # centering before calculating correlation dataMat <- apply(X = dataMat, MARGIN = 1, function(x) x - cpgMeans) + alreadyCenteredDM <- TRUE } @@ -700,42 +710,58 @@ createCorFeatureMat <- function(dataMat, featureMat, if (dim(featureMat)[1] == 1) { featureMat <- t(featureMat) } + alreadyCenteredFM <- TRUE } # avoid this copy and/or delay transpose until after calculating correlation? # dataMat <- as.data.frame(t(dataMat)) # copies, expect transposed form as input instead - - if (testType == "cor") { - # create feature correlation matrix with PCs (rows: features/CpGs, columns:PCs) - # how much do features correlate with each PC? 
- - # put epigenetic data first in cor() - featurePCCor <- cor(dataMat, featureMat, use="pairwise.complete.obs", method="pearson") - - } else if (testType == "spearmanCor") { - # xtfrm(x) ranking - featurePCCor <- cor(dataMat, featureMat, use="pairwise.complete.obs", method="spearman") - - # } else if (testType == "pcor") { - # # partial correlation (account for covariates), ppcor package - # - # featurePCCor <- apply(X = featureMat, MARGIN = 2, function(y) apply(X = dataMat, 2, - # FUN = function(x) pcor.test(x = x, y=y, - # z=covariate, - # method="pearson")$estimate)) - # - } else if (testType == "cov") { - featurePCCor <- cov(dataMat, featureMat, use="pairwise.complete.obs") - + # spearman not implemented yet + if (noNA & (testType != "spearmanCor")) { + if (testType == "cor") { + # create feature correlation matrix with PCs (rows: features/CpGs, columns:PCs) + # how much do features correlate with each PC? + + # put epigenetic data first in cor() + featurePCCor <- dataMat %*% featureMat ## UPDATE + + } else if (testType == "cov") { + featurePCCor <- dataMat %*% featureMat ## UDPATE + + } else { + stop("invalid testType") + } } else { - stop("invalid testType") + if (testType == "cor") { + # create feature correlation matrix with PCs (rows: features/CpGs, columns:PCs) + # how much do features correlate with each PC? + + # put epigenetic data first in cor() + featurePCCor <- cor(dataMat, featureMat, use="pairwise.complete.obs", method="pearson") + + } else if (testType == "spearmanCor") { + # xtfrm(x) ranking + featurePCCor <- cor(dataMat, featureMat, use="pairwise.complete.obs", method="spearman") + + # } else if (testType == "pcor") { + # # partial correlation (account for covariates), ppcor package + # + # featurePCCor <- apply(X = featureMat, MARGIN = 2, function(y) apply(X = dataMat, 2, + # FUN = function(x) pcor.test(x = x, y=y, + # z=covariate, + # method="pearson")$estimate)) + # + } else if (testType == "cov") { + featurePCCor <- cov(dataMat, featureMat, use="pairwise.complete.obs") + + } else { + stop("invalid testType") + } + # if standard deviation of the data was zero, NA will be produced + # set to 0 because no standard deviation means no correlation with attribute of interest + featurePCCor[is.na(featurePCCor)] <- 0 } - - # if standard deviation of the data was zero, NA will be produced - # set to 0 because no standard deviation means no correlation with attribute of interest - featurePCCor[is.na(featurePCCor)] <- 0 colnames(featurePCCor) <- featureNames return(featurePCCor) @@ -749,7 +775,7 @@ createCorFeatureMat <- function(dataMat, featureMat, #' This profile can show enrichment #' of genomic signals with high feature contribution scores #' in the region set but not in the -#' surrounding genome, suggesting that variation is linked specifically +#' surrounding genome, suggiiiesting that variation is linked specifically #' to that region set. #' #' All regions in a given region set diff --git a/R/permutation.R b/R/permutation.R index f2f6e6a..4ad7027 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -68,6 +68,8 @@ #' for fitting that specific gamma distribution. #' @template verbose #' @template returnCovInfo +#' @param minRSCov integer. Region sets must have at least "minRSCov" regions +#' covered at least partially by epigenetic data to be scored. #' @param ... Character. Optional additional arguments for simpleCache. #' #' @@ -123,7 +125,7 @@ runCOCOA <- function(genomicSignal, realScoreInDist=TRUE, force=FALSE, verbose=TRUE, - returnCovInfo=FALSE, ...) 
{ + returnCovInfo=FALSE, minRSCov=100, ...) { colsToAnnotate <- signalCol @@ -237,9 +239,9 @@ runCOCOA <- function(genomicSignal, helperFun <- function(x, y, ...) { # for (i in (length(rsPermScores) + 1):nPerm) { - onePermCacheName <- paste0("rsPermScores_", nPerm, "Perm_", variationMetric, "_", dataID, "_Cache", y) + onePermCacheName <- paste0("rsPermScores_", variationMetric, "_", dataID, "_Cache", y) # create sub caches, one for each permutation - simpleCache(onePermCacheName, cacheSubDir = paste0("rsPermScores_", nPerm, "Perm_", variationMetric, "_", dataID), { + simpleCache(onePermCacheName, cacheSubDir = paste0("rsPermScores_", variationMetric, "_", dataID), { tmp <- .runCOCOA_old(sampleOrder=x, genomicSignal=genomicSignal, diff --git a/man/getMetaRegionProfile.Rd b/man/getMetaRegionProfile.Rd index ace8c51..f729e37 100755 --- a/man/getMetaRegionProfile.Rd +++ b/man/getMetaRegionProfile.Rd @@ -115,7 +115,7 @@ NULL returned). This profile can show enrichment of genomic signals with high feature contribution scores in the region set but not in the -surrounding genome, suggesting that variation is linked specifically +surrounding genome, suggiiiesting that variation is linked specifically to that region set. } \details{ diff --git a/man/runCOCOA.Rd b/man/runCOCOA.Rd index f25d49b..43b0258 100755 --- a/man/runCOCOA.Rd +++ b/man/runCOCOA.Rd @@ -26,6 +26,7 @@ runCOCOA( force = FALSE, verbose = TRUE, returnCovInfo = FALSE, + minRSCov = 100, ... ) } @@ -174,6 +175,9 @@ signalCoverage, totalRegionNumber, and meanRegionSize. For the proportionWeightedMean scoring method, sumProportionOverlap will also be calculated.} +\item{minRSCov}{integer. Region sets must have at least "minRSCov" regions +covered at least partially by epigenetic data to be scored.} + \item{...}{Character. 
Optional additional arguments for simpleCache.} } \value{ From 47fadb02efc59ccf3fdd3f6ca1135416ae38566d Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 18 Feb 2021 11:27:11 -0500 Subject: [PATCH 27/38] Remove low cov region sets when making rs ol mat --- R/COCOA.R | 73 +++++++++++++++++++++------------------ R/permutation.R | 2 +- tests/testthat/test_all.R | 14 ++++---- 3 files changed, 48 insertions(+), 41 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 9605889..0051403 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -2216,7 +2216,8 @@ regionOLMean <- function(signalDT, signalGR, regionSet, # olMatList <- res[[1]] # coverageInfo <- res[[2]] # -olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { +olToMat = function(signalCoord, GRList, scoringMetric, + maxRow=500000, minRSCov=0) { # input checks if (is.null(names(GRList))) { @@ -2229,6 +2230,19 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, subject = x)) + regionSetCoverage <- rep(0, length(GRList)) + signalCoverage <- rep(0, length(GRList)) + + regionSetCoverage <- vapply(X = olList, + FUN = function(x) length(unique(subjectHits(x))), + FUN.VALUE = -1) + keepInd <- which(regionSetCoverage >= minRSCov) + rsNames <- names(GRList)[keepInd] + regionSetCoverage <- regionSetCoverage[keepInd] + signalCoverage <- vapply(X = olList[keepInd], + FUN = function(x) length(unique(queryHits(x))), + FUN.VALUE = -1) + # # each column is a region set # rsMat <- matrix(data = rep(0, length(GRList) * length(signalCoord)), # nrow=length(signalCoord)) @@ -2243,36 +2257,33 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { rsMatList <- list() if (nMat > 1) { for (matCount in 1:(nMat-1)) { - rsMatList[[matCount]] = matrix(data=rep(0, maxRow * length(GRList)), + rsMatList[[matCount]] = matrix(data=rep(0, maxRow * length(keepInd)), nrow=maxRow) - colnames(rsMatList[[matCount]]) <- names(GRList) + colnames(rsMatList[[matCount]]) <- rsNames } # final matrix might not be the same size - rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList)), + rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(keepInd)), nrow=finalMatRows) - colnames(rsMatList[[nMat]]) <- names(GRList) + colnames(rsMatList[[nMat]]) <- rsNames } else { # nMat = 1 - rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(GRList)), + rsMatList[[nMat]] = matrix(data=rep(0, finalMatRows * length(keepInd)), nrow=finalMatRows) - colnames(rsMatList[[nMat]]) <- names(GRList) + colnames(rsMatList[[nMat]]) <- rsNames } - regionSetCoverage <- rep(0, length(GRList)) - signalCoverage <- rep(0, length(GRList)) + ####################################################################### - # tmpVec = rep(0, length(signalCoord)) + # calculate scores and assign to matrices in chunks if (scoringMetric == "simpleMean") { - for (i in seq_along(GRList)) { + for (i in seq_along(keepInd)) { - if (length(olList[[i]]) != 0) { - signalCoverage[i] = length(unique(queryHits(olList[[i]]))) - regionSetCoverage[i] = length(unique(subjectHits(olList[[i]]))) + if (signalCoverage[i] != 0) { tmpCount = 1 tmpVec = rep(0, length(signalCoord)) # normalize by number so that matrix multiplication during COCOA scoring will produce mean - tmpVec[unique(queryHits(olList[[i]]))] <- 1 / length(unique(queryHits(olList[[i]]))) + tmpVec[unique(queryHits(olList[[keepInd[i]]]))] <- 1 / signalCoverage[i] # for coordinate chunks for (j in seq_along(rsMatList)) { @@ -2287,14 
+2298,12 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { # instead of 1 give weight proportional to how many signalCoord are # overlapped - for (i in seq_along(GRList)) { - if (length(olList[[i]]) != 0) { - signalCoverage[i] = length(unique(queryHits(olList[[i]]))) - regionSetCoverage[i] = length(unique(subjectHits(olList[[i]]))) + for (i in seq_along(keepInd)) { + if (signalCoverage[i] != 0) { tmpCount = 1 tmpVec = rep(0, length(signalCoord)) # aggregate number of overlaps by region set region - tmp <- as.data.table(olList[[i]]) + tmp <- as.data.table(olList[[keepInd[i]]]) # want the count per region set number tmp[, rCount := (1/.N), , by=subjectHits] normFactor <- sum(tmp$rCount) @@ -2311,23 +2320,21 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { rsMatList <- lapply(rsMatList, FUN = t) # transpose } else if (scoringMetric == "proportionWeightedMean") { - sumProportionOverlap = rep(0, length(GRList)) - for (i in seq_along(GRList)) { - if (length(olList[[i]]) != 0) { - signalCoverage[i] = length(unique(queryHits(olList[[i]]))) - regionSetCoverage[i] = length(unique(subjectHits(olList[[i]]))) + sumProportionOverlap = rep(0, length(keepInd)) + for (i in seq_along(keepInd)) { + if (signalCoverage[i] != 0) { tmpCount = 1 tmpVec = rep(0, length(signalCoord)) - olap <- pintersect(GRList[[i]][subjectHits(olList[[i]])], - signalCoord[queryHits(olList[[i]])]) - pOlap <- width(olap) / width(GRList[[i]][subjectHits(olList[[i]])]) + olap <- pintersect(GRList[[keepInd[i]]][subjectHits(olList[[keepInd[i]]])], + signalCoord[queryHits(olList[[keepInd[i]]])]) + pOlap <- width(olap) / width(GRList[[keepInd[i]]][subjectHits(olList[[keepInd[i]]])]) # weighted average denom <- sum(pOlap) sumProportionOverlap[i] = denom # aggregate pOlap by signalCoord region - olDT <- data.table(queryHits = queryHits(olList[[i]]), + olDT <- data.table(queryHits = queryHits(olList[[keepInd[i]]]), pOlap=pOlap) normDT <- olDT[, .(coordSum = sum(pOlap)), by=queryHits] tmpVec[normDT$queryHits] <- normDT$coordSum / denom @@ -2351,17 +2358,17 @@ olToMat = function(signalCoord, GRList, scoringMetric, maxRow=500000) { stop("The given scoringMetric cannot be used with matrix scoring.") } - totalRegionNumber <- vapply(X = GRList, FUN = length, FUN.VALUE = -1) - meanRegionSize <- vapply(X = GRList, FUN = function(x) mean(width(x)), + totalRegionNumber <- vapply(X = GRList[keepInd], FUN = length, FUN.VALUE = -1) + meanRegionSize <- vapply(X = GRList[keepInd], FUN = function(x) mean(width(x)), FUN.VALUE = -1) # list item 2 if (scoringMetric == "proportionWeightedMean") { - rsInfo = data.frame(rsName=names(GRList), signalCoverage, + rsInfo = data.frame(rsName=rsNames, signalCoverage, regionSetCoverage, sumProportionOverlap, totalRegionNumber, meanRegionSize) } else { - rsInfo = data.frame(rsName=names(GRList), signalCoverage, + rsInfo = data.frame(rsName=rsNames, signalCoverage, regionSetCoverage, totalRegionNumber, meanRegionSize) } diff --git a/R/permutation.R b/R/permutation.R index 4ad7027..b202a47 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -194,7 +194,7 @@ runCOCOA <- function(genomicSignal, "proportionWeightedMean")) { olMatRes <- olToMat(signalCoord = signalCoord, GRList = GRList, - scoringMetric = scoringMetric) + scoringMetric = scoringMetric, minRSCov=minRSCov) rsMatList <- olMatRes[[1]] rsInfo <- olMatRes[[2]] GRList <- NULL diff --git a/tests/testthat/test_all.R b/tests/testthat/test_all.R index df00169..83c7d29 100755 --- a/tests/testthat/test_all.R +++ 
b/tests/testthat/test_all.R @@ -65,33 +65,33 @@ test_that("olToMat", { GRList = myGRL, scoringMetric = "simpleMean", maxRow = 2) # each column (region set) should be normalized to equal 1 oneVec <- c(regionSet1 = 1, regionSet2 = 1) - expect_equal(colSums(do.call(rmOLMat[[1]], what = rbind)), oneVec) + expect_equal(rowSums(do.call(rmOLMat[[1]], what = cbind)), oneVec) rs1Ex <- rep(0.2, 6) rs1Ex[4] <- 0 - rs1Act <- do.call(rmOLMat[[1]], what = rbind)[, "regionSet1"] + rs1Act <- do.call(rmOLMat[[1]], what = cbind)["regionSet1", ] expect_equal(rs1Act, rs1Ex) rs2Ex <- c(0, 0.5, 0.5, 0, 0, 0) - rs2Act <- do.call(rmOLMat[[1]], what = rbind)[, "regionSet2"] + rs2Act <- do.call(rmOLMat[[1]], what = cbind)["regionSet2", ] expect_equal(rs2Act, rs2Ex) rmOLMat <- COCOA:::olToMat(signalCoord = COCOA:::dtToGr(regionCoordDT), GRList = myGRL, scoringMetric = "proportionWeightedMean", maxRow = 2) - expect_equal(colSums(do.call(rmOLMat[[1]], what = rbind)), oneVec) + expect_equal(rowSums(do.call(rmOLMat[[1]], what = cbind)), oneVec) # proportion of regionSet region overlapped by each signalCoord region rs1Ex <- c(101/400+101/201, 51/201, 1, 0, 26/51, 151/451) rs1Ex <- rs1Ex / sum(rs1Ex) - rs1Act <- do.call(rmOLMat[[1]], what = rbind)[, "regionSet1"] + rs1Act <- do.call(rmOLMat[[1]], what = cbind)["regionSet1", ] expect_equal(rs1Ex, rs1Act) # DNA methylation like data rmOLMat <- COCOA:::olToMat(signalCoord = COCOA:::dtToGr(coordinateDT), GRList = myGRL, scoringMetric = "regionMean", maxRow = 6) - expect_equal(colSums(do.call(rmOLMat[[1]], what = rbind)), oneVec) + expect_equal(rowSums(do.call(rmOLMat[[1]], what = cbind)), oneVec) # proportion of regionSet region overlapped by each signalCoord region rs2Ex <- c(rep(0,5), 0.5, 0.5, 0, 1, rep(0,11), 1, 0, 0.5, 0.5, rep(0,8)) rs2Ex <- rs2Ex / sum(rs2Ex) - rs2Act <- do.call(rmOLMat[[1]], what = rbind)[, "regionSet2"] + rs2Act <- do.call(rmOLMat[[1]], what = cbind)["regionSet2", ] expect_equal(rs2Ex, rs2Act) }) From 0f6a0facfe24b6289407d09a5f745dbf1b7a4e0e Mon Sep 17 00:00:00 2001 From: j-lawson Date: Thu, 18 Feb 2021 23:03:09 -0500 Subject: [PATCH 28/38] Use matrix calculations for correlation/covariance --- R/COCOA.R | 58 +++++++++++----------- R/permutation.R | 124 ++++++++++++++++++++++++++++++++---------------- man/runCOCOA.Rd | 10 ---- 3 files changed, 112 insertions(+), 80 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 0051403..b854d9b 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -662,10 +662,6 @@ aggregateSignalGRList <- function(signal, # (whatever you want to get correlation with: eg PC scores), # all columns in featureMat will be used (subset when passing to function # in order to not use all columns) -# @param centerDataMat logical object. Should rows in dataMat be centered based on -# their means? (subtracting row mean from each row) -# @param centerFeatureMat logical. Should columns in featureMat be centered based -# on their means? (subtract column mean from each column) # @param testType character object. 
Can be "cor" (Pearson correlation), # "spearmanCor (Spearman correlation) # "pcor" (partial correlation), "cov" (covariance (Pearson)), @@ -683,8 +679,7 @@ aggregateSignalGRList <- function(signal, # columns are the columns of featureMat # @examples dataMat = matrix(rnorm(50), 5, 10) # featureMat = matrix(rnorm(20), 10, 2) -createCorFeatureMat <- function(dataMat, featureMat, - centerDataMat=TRUE, centerFeatureMat = TRUE, +createCorFeatureMat <- function(dataMat, featureMat, testType="cor", covariate=NULL, alreadyCenteredDM=FALSE, alreadyScaledDM=FALSE, alreadyCenteredFM=FALSE, alreadyScaledFM=FALSE, @@ -694,40 +689,47 @@ createCorFeatureMat <- function(dataMat, featureMat, featureNames <- colnames(featureMat) nFeatures <- ncol(featureMat) nDataDims <- nrow(dataMat) - - if (centerDataMat) { - cpgMeans <- colMeans(dataMat, na.rm = TRUE) - # centering before calculating correlation - dataMat <- apply(X = dataMat, MARGIN = 1, function(x) x - cpgMeans) - alreadyCenteredDM <- TRUE - - } - - if (centerFeatureMat) { - featureMeans <- colMeans(featureMat, na.rm = TRUE) - # centering before calculating correlation(also, t() converts to matrix) - featureMat <- t(apply(X = t(featureMat), MARGIN = 2, function(x) x - featureMeans)) - if (dim(featureMat)[1] == 1) { - featureMat <- t(featureMat) - } - alreadyCenteredFM <- TRUE - } + # avoid this copy and/or delay transpose until after calculating correlation? # dataMat <- as.data.frame(t(dataMat)) # copies, expect transposed form as input instead # spearman not implemented yet if (noNA & (testType != "spearmanCor")) { + if (!alreadyCenteredFM) { + featureMeans <- colMeans(featureMat, na.rm = TRUE) + # centering before calculating correlation(also, t() converts to matrix) + featureMat <- t(apply(X = t(featureMat), MARGIN = 2, function(x) x - featureMeans)) + if (dim(featureMat)[1] == 1) { + featureMat <- t(featureMat) + } + } + if (!alreadyCenteredDM) { + # more efficient to only do once + cpgMeans <- rowMeans(dataMat, na.rm = TRUE) + # centering before calculating correlation + dataMat <- apply(X = dataMat, MARGIN = 2, function(x) x - cpgMeans) + } + + ##### cor or cov calculations if (testType == "cor") { + if (!alreadyScaledFM) { + featureMat <- scale(x = featureMat, center = FALSE, scale = TRUE) + } + if (!alreadyScaledDM) { + featSD <- apply(X = dataMat, MARGIN = 1, FUN = sd) + featSD[featSD == 0] = 1 # 0 sd if no variation, 1 will leave unchanged and not cause error + dataMat <- dataMat / featSD + } + # create feature correlation matrix with PCs (rows: features/CpGs, columns:PCs) # how much do features correlate with each PC? - # put epigenetic data first in cor() - featurePCCor <- dataMat %*% featureMat ## UPDATE + # put epigenetic data first, inner dimensions are samples + featurePCCor <- dataMat %*% featureMat } else if (testType == "cov") { - featurePCCor <- dataMat %*% featureMat ## UDPATE - + featurePCCor <- dataMat %*% featureMat } else { stop("invalid testType") } diff --git a/R/permutation.R b/R/permutation.R index b202a47..c421921 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -29,12 +29,6 @@ #' @template signalCol #' @template scoringMetric #' @template absVal -#' @param centerGenomicSignal Logical. Should rows in genomicSignal -#' be centered based on -#' their means? (subtracting row mean from each row) -#' @param centerTargetVar Logical. Should columns in targetVar be -#' centered based -#' on their means? (subtract column mean from each column) #' @param dataID Character. 
A unique identifier for this dataset #' (for saving results with simpleCache) #' @template variationMetric @@ -113,8 +107,6 @@ runCOCOA <- function(genomicSignal, signalCol=c("PC1", "PC2"), scoringMetric="default", absVal=TRUE, - centerGenomicSignal=TRUE, - centerTargetVar=TRUE, variationMetric="cor", nPerm=0, useSimpleCache=TRUE, @@ -127,6 +119,11 @@ runCOCOA <- function(genomicSignal, verbose=TRUE, returnCovInfo=FALSE, minRSCov=100, ...) { + # if doing matrix calculations of cor/cov, center and/or scale only once ahead of time + alreadyCenteredFM <- FALSE + alreadyCenteredDM <- FALSE + alreadyScaledFM <- FALSE + alreadyScaledDM <- FALSE colsToAnnotate <- signalCol allResultsList <- list() @@ -164,15 +161,18 @@ runCOCOA <- function(genomicSignal, } } - # more efficient to only do once - if (centerGenomicSignal) { + # if no NA, NaN or +/-Inf, can do matrix correlation calculations + # different COCOA step than matrix scoring of region sets + noNA <- all(is.finite(genomicSignal)) + if (noNA & (variationMetric %in% c("cov", "cor"))) { + + # more efficient to only do once cpgMeans <- rowMeans(genomicSignal, na.rm = TRUE) # centering before calculating correlation genomicSignal <- apply(X = genomicSignal, MARGIN = 2, function(x) x - cpgMeans) # don't do later - centerGenomicSignal <- FALSE - } - if (centerTargetVar) { + alreadyCenteredDM <- TRUE + featureMeans <- colMeans(targetVar, na.rm = TRUE) # centering before calculating correlation (also, t() converts to matrix) targetVar <- t(apply(X = t(targetVar), MARGIN = 2, function(x) x - featureMeans)) @@ -180,18 +180,44 @@ runCOCOA <- function(genomicSignal, targetVar <- t(targetVar) } # don't do later - centerTargetVar <- FALSE + alreadyCenteredFM <- TRUE + } + if (noNA & (variationMetric == "cor")) { + targetVar <- scale(x = targetVar, center = FALSE, scale = TRUE) + alreadyScaledFM <- TRUE + + featSD <- apply(X = genomicSignal, MARGIN = 1, FUN = sd) + featSD[featSD == 0] = 1 # 0 sd if no variation, 1 will leave unchanged and not cause error + genomicSignal <- genomicSignal / featSD + alreadyScaledDM <- TRUE + } + + + # matrix-based region set scoring only works for mean-based scoringMetric's + if (scoringMetric %in% c("simpleMean", + "regionMean", + "proportionWeightedMean")) { + doRSMatScore <- TRUE + } else { + doRSMatScore <- FALSE } + + - # must be transposed now so it is not (and not copied) during each permutation - genomicSignal <- t(genomicSignal) + if (!noNA) { + # orientation for "cor()" and "cov()" + # for these functions, samples need to be rows for both objects + # must be transposed now so it is not (and not copied) during each permutation + genomicSignal <- t(genomicSignal) + } + # for matrix correlation, samples should be rows for only one object (genSig and tarVar) + # data is already in correct orientation + ######################################################################## # create region set overlap matrix # this code should be after code modifying "signal" - if (scoringMetric %in% c("simpleMean", - "regionMean", - "proportionWeightedMean")) { + if (doRSMatScore) { olMatRes <- olToMat(signalCoord = signalCoord, GRList = GRList, scoringMetric = scoringMetric, minRSCov=minRSCov) @@ -224,12 +250,15 @@ runCOCOA <- function(genomicSignal, variationMetric = variationMetric, scoringMetric=scoringMetric, absVal=absVal, - centerGenomicSignal = centerGenomicSignal, - centerTargetVar = centerTargetVar, verbose=verbose, rsMatList = rsMatList, rsInfo = rsInfo, - returnCovInfo = returnCovInfo) + returnCovInfo = returnCovInfo, + 
noNA=noNA, + alreadyCenteredDM = alreadyCenteredDM, + alreadyScaledDM = alreadyScaledDM, + alreadyCenteredFM = alreadyCenteredFM, + alreadyScaledFM = alreadyScaledFM) rsScores }, assignToVariable="rsScores") } @@ -252,12 +281,15 @@ runCOCOA <- function(genomicSignal, variationMetric = variationMetric, scoringMetric=scoringMetric, absVal=absVal, - centerGenomicSignal = centerGenomicSignal, - centerTargetVar = centerTargetVar, verbose=verbose, rsMatList = rsMatList, rsInfo = rsInfo, - returnCovInfo = returnCovInfo) + returnCovInfo = returnCovInfo, + noNA=noNA, + alreadyCenteredDM = alreadyCenteredDM, + alreadyScaledDM = alreadyScaledDM, + alreadyCenteredFM = alreadyCenteredFM, + alreadyScaledFM = alreadyScaledFM) message(y) # must be ahead of object that is saved as cache, not after tmp @@ -280,12 +312,15 @@ runCOCOA <- function(genomicSignal, variationMetric = variationMetric, scoringMetric=scoringMetric, absVal=absVal, - centerGenomicSignal = centerGenomicSignal, - centerTargetVar = centerTargetVar, verbose=verbose, rsMatList = rsMatList, rsInfo = rsInfo, - returnCovInfo = returnCovInfo) + returnCovInfo = returnCovInfo, + noNA=noNA, + alreadyCenteredDM = alreadyCenteredDM, + alreadyScaledDM = alreadyScaledDM, + alreadyCenteredFM = alreadyCenteredFM, + alreadyScaledFM = alreadyScaledFM) } @@ -302,7 +337,12 @@ runCOCOA <- function(genomicSignal, verbose=verbose, rsMatList=rsMatList, rsInfo = rsInfo, - returnCovInfo = returnCovInfo) + returnCovInfo = returnCovInfo, + noNA=noNA, + alreadyCenteredDM = alreadyCenteredDM, + alreadyScaledDM = alreadyScaledDM, + alreadyCenteredFM = alreadyCenteredFM, + alreadyScaledFM = alreadyScaledFM) message(":", appendLF=FALSE) return(tmp) } @@ -432,12 +472,6 @@ runCOCOA <- function(genomicSignal, # @template scoringMetric # @template verbose # @template absVal -# @param centerGenomicSignal Logical. Should rows in genomicSignal -# be centered based on -# their means? (subtracting row mean from each row) -# @param centerTargetVar Logical. Should columns in targetVar be -# centered based -# on their means? (subtract column mean from each column) # @template returnCovInfo # @return data.frame. The output of aggregateSignalGRList for one permutation. 
# @examples @@ -484,9 +518,12 @@ runCOCOA <- function(genomicSignal, absVal=TRUE, rsMatList=NULL, rsInfo=NULL, - centerGenomicSignal=TRUE, - centerTargetVar=TRUE, - returnCovInfo=TRUE) { + returnCovInfo=TRUE, + alreadyCenteredDM = FALSE, + alreadyScaledDM = FALSE, + alreadyCenteredFM = FALSE, + alreadyScaledFM = FALSE, + noNA=FALSE) { signalList <- NULL # if vector is given, return error @@ -514,10 +551,13 @@ runCOCOA <- function(genomicSignal, # calculate correlation featureLabelCor <- createCorFeatureMat(dataMat = genomicSignal, - featureMat = targetVar, - centerDataMat = centerGenomicSignal, - centerFeatureMat = centerTargetVar, - testType = variationMetric) + featureMat = targetVar, + testType = variationMetric, + noNA=noNA, + alreadyCenteredDM = alreadyCenteredDM, + alreadyScaledDM = alreadyScaledDM, + alreadyCenteredFM = alreadyCenteredFM, + alreadyScaledFM = alreadyScaledFM) # more efficient to do only once instead of for each region set later on if (absVal) { diff --git a/man/runCOCOA.Rd b/man/runCOCOA.Rd index 43b0258..309f7d7 100755 --- a/man/runCOCOA.Rd +++ b/man/runCOCOA.Rd @@ -13,8 +13,6 @@ runCOCOA( signalCol = c("PC1", "PC2"), scoringMetric = "default", absVal = TRUE, - centerGenomicSignal = TRUE, - centerTargetVar = TRUE, variationMetric = "cor", nPerm = 0, useSimpleCache = TRUE, @@ -114,14 +112,6 @@ regions in a region set). Choose FALSE if you expect regions in a given region set to all change in the same direction (all be positively correlated with each other).} -\item{centerGenomicSignal}{Logical. Should rows in genomicSignal -be centered based on -their means? (subtracting row mean from each row)} - -\item{centerTargetVar}{Logical. Should columns in targetVar be -centered based -on their means? (subtract column mean from each column)} - \item{variationMetric}{Character. The metric to use to quantify the association between each feature in genomicSignal and each target variable in sampleLabels. 
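This patch replaces cor()/cov() with a matrix product over pre-centered, pre-scaled inputs when the data contain no NA values (the noNA path); a later patch in the series adds the required division by n - 1. The sketch below is a standalone check of the complete algebra, assuming no missing values; every object name in it is an illustrative example rather than an identifier from the package.

```{r}
# Standalone check of the noNA matrix algebra (illustrative example data only).
set.seed(1)
dataMat    <- matrix(rnorm(50), nrow = 5, ncol = 10)   # features x samples
featureMat <- matrix(rnorm(20), nrow = 10, ncol = 2)   # samples x target variables
nSamples   <- nrow(featureMat)

# center each feature (row) of dataMat, then scale rows to unit standard deviation
dmC <- dataMat - rowMeans(dataMat)
dmC <- dmC / apply(dmC, 1, sd)

# center and scale the target variables (columns)
fmC <- scale(featureMat)

# Pearson correlation as one matrix product, normalized by n - 1
corMat <- (dmC %*% fmC) / (nSamples - 1)

# agrees with the conventional calculation when no values are missing
all.equal(unname(corMat), unname(cor(t(dataMat), featureMat)))  # TRUE
```

Dividing the cross-product by nSamples - 1 matches the sample convention used by cor() and cov(); the package code additionally maps zero row standard deviations to 1 before dividing, so a constant feature yields a correlation of 0 rather than NaN.
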
From 4467317a7adef668f7b63841b685fca531ef5510 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Fri, 19 Feb 2021 10:00:00 -0500 Subject: [PATCH 29/38] Fix bug: wrong object class --- R/COCOA.R | 6 ++++++ R/permutation.R | 13 ++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index b854d9b..ff5db12 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -710,6 +710,12 @@ createCorFeatureMat <- function(dataMat, featureMat, # centering before calculating correlation dataMat <- apply(X = dataMat, MARGIN = 2, function(x) x - cpgMeans) } + if (!is(featureMat, "matrix")) { + featureMat <- as.matrix(featureMat) + } + if (!is(dataMat, "matrix")) { + dataMat <- as.matrix(dataMat) + } ##### cor or cov calculations if (testType == "cor") { diff --git a/R/permutation.R b/R/permutation.R index c421921..23f5c6d 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -536,18 +536,13 @@ runCOCOA <- function(genomicSignal, } - # subset to only signalCol - targetVar <- targetVar[, signalCol, drop=FALSE] - # because names are dropped for a single column data.frame when indexing # single col data.frame is automatically converted to numeric - featureNames <- colnames(targetVar) - # reorder the sample labels - targetVar <- data.frame(targetVar[sampleOrder, ]) - colnames(targetVar) <- featureNames - - + # drop=FALSE + # subset to only signalCol and reorder sample labels + targetVar <- targetVar[sampleOrder, signalCol, drop=FALSE] + # calculate correlation featureLabelCor <- createCorFeatureMat(dataMat = genomicSignal, From 61d8fca4a08ab016cd55df7c2878712b401535cf Mon Sep 17 00:00:00 2001 From: j-lawson Date: Tue, 23 Feb 2021 12:25:46 -0500 Subject: [PATCH 30/38] Fix normalization bug, divide by n-1 for cor/cov --- R/COCOA.R | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index ff5db12..a0c34cf 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -687,8 +687,6 @@ createCorFeatureMat <- function(dataMat, featureMat, # featureMat <- as.matrix(featureMat) # copies? featureNames <- colnames(featureMat) - nFeatures <- ncol(featureMat) - nDataDims <- nrow(dataMat) # avoid this copy and/or delay transpose until after calculating correlation? @@ -696,10 +694,18 @@ createCorFeatureMat <- function(dataMat, featureMat, # spearman not implemented yet if (noNA & (testType != "spearmanCor")) { + # for matrix correlation, dataMat will be given with rows as features + # and cols as samples + # featureMat has the same orientation for noNA=T or F + + # must normalize covariation and correlation by sample number (n-1) + nSamples <- nrow(featureMat) + if (!alreadyCenteredFM) { featureMeans <- colMeans(featureMat, na.rm = TRUE) # centering before calculating correlation(also, t() converts to matrix) - featureMat <- t(apply(X = t(featureMat), MARGIN = 2, function(x) x - featureMeans)) + # apply also converts to matrix here I believe + featureMat <- apply(X = featureMat, MARGIN = 1, function(x) x - featureMeans) if (dim(featureMat)[1] == 1) { featureMat <- t(featureMat) } @@ -732,10 +738,10 @@ createCorFeatureMat <- function(dataMat, featureMat, # how much do features correlate with each PC? 
# put epigenetic data first, inner dimensions are samples - featurePCCor <- dataMat %*% featureMat + featurePCCor <- (dataMat %*% featureMat) / (nSamples - 1) } else if (testType == "cov") { - featurePCCor <- dataMat %*% featureMat + featurePCCor <- (dataMat %*% featureMat) / (nSamples - 1) } else { stop("invalid testType") } From 7408cc3eb74060afa22b60f427b10df0f840637d Mon Sep 17 00:00:00 2001 From: j-lawson Date: Tue, 23 Feb 2021 16:32:55 -0500 Subject: [PATCH 31/38] Fix error --- R/COCOA.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/COCOA.R b/R/COCOA.R index a0c34cf..3d912f2 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -705,7 +705,7 @@ createCorFeatureMat <- function(dataMat, featureMat, featureMeans <- colMeans(featureMat, na.rm = TRUE) # centering before calculating correlation(also, t() converts to matrix) # apply also converts to matrix here I believe - featureMat <- apply(X = featureMat, MARGIN = 1, function(x) x - featureMeans) + featureMat <- t(apply(X = featureMat, MARGIN = 1, function(x) x - featureMeans)) if (dim(featureMat)[1] == 1) { featureMat <- t(featureMat) } From 10c4411223e5793bced3c544dabe6bb8173fa620 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Wed, 24 Feb 2021 21:09:09 -0500 Subject: [PATCH 32/38] Add COCOA score normalization --- R/COCOA.R | 5 +++++ R/permutation.R | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/R/COCOA.R b/R/COCOA.R index 3d912f2..4376d6c 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -646,10 +646,15 @@ aggregateSignalGRList <- function(signal, if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { + resultsDF[, signalCol] <- gammaNormalize(rsScores=resultsDF[, signalCol], + featureVals=signal[, signalCol]) return(as.data.frame(resultsDF)) } else { resultsDT <- do.call(rbind, resultsList) resultsDF <- as.data.frame(resultsDT) + resultsDF[, signalCol] <- gammaNormalize(rsScores=resultsDF[, signalCol], + featureVals=signal[, signalCol]) + return(resultsDF) } } diff --git a/R/permutation.R b/R/permutation.R index 23f5c6d..3540e2d 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -638,7 +638,39 @@ permListToOneNullDist <- function(resultsList, rsInd) { return(rsNullDist) } +# for gammaNormalize() +pGammaSingle <- function(scoreVec, oneDist) { + + pValVec <- pgamma(q = scoreVec, shape = oneDist$estimate["shape"], + rate = oneDist$estimate["rate"], + lower.tail = FALSE) # lower.tail=FALSE means 1-x is given + return(pValVec) +} +# get gamma p-value for each region set based on feature values +# of all CpGs. This is done to normalize for the mean and variance of +# permutation feature value distributions +# @param featureVals matrix/data.frame. 
Should +gammaNormalize <- function(rsScores, featureVals, method="mme", force=FALSE) { + + # estimate distribution parameters + # returns a list where each item is a gamma dist for a column of featureVals + gList <- fitGammaNullDist(nullDistDF = featureVals, + method=method, + force=force) + + #### get p-val for score + pValList = list() + # once for each column + for (i in 1:ncol(rsScores)) { + pValList[[i]] <- as.data.frame(pGammaSingle(scoreVec = as.numeric(rsScores[, i]), + oneDist = gList[[i]])) + } + + pValDF <- as.data.frame(do.call("cbind", pValList)) + colnames(pValDF) <- colnames(rsScores) + return(pValDF) +} From c0657c1def9781fb8dfe6b150fb0d773f86c05c0 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Wed, 24 Feb 2021 22:14:41 -0500 Subject: [PATCH 33/38] Fix bug --- R/COCOA.R | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 4376d6c..af98f56 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -582,14 +582,24 @@ aggregateSignalGRList <- function(signal, } } - # convert object class outside aggregateSignal to extra prevent copying - # (one scoring method needs `signal` as a matrix though) - if (!is(signal, "data.table") && (scoringMetric != "proportionWeightedMean")) { - signal <- as.data.table(signal) - } else if (!is(signal, "matrix") && (scoringMetric == "proportionWeightedMean")) { - signal <- as.matrix(signal) + if (is.null(signalList)) { + # data.table is only used for non-matrix COCOA (legacy) + + # convert object class outside aggregateSignal to extra prevent copying + # (one scoring method needs `signal` as a matrix though) + if (!is(signal, "data.table") && (scoringMetric != "proportionWeightedMean")) { + signal <- as.data.table(signal) + } else if (!is(signal, "matrix") && (scoringMetric == "proportionWeightedMean")) { + signal <- as.matrix(signal) + } + + } else { + # needed for gamma normalization + if (!is(signal, "matrix")) { + signal <- as.matrix(signal) + } } - + # take absolute value outside aggregateSignal to prevent extra copying if (absVal) { From 6ae1e84b9007524b9123a7f6c2a0f1634a082d33 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Fri, 5 Mar 2021 21:30:38 -0500 Subject: [PATCH 34/38] Account for other cases --- R/COCOA.R | 3 ++ R/permutation.R | 105 +++++++++++++++++++++++++++--------------------- 2 files changed, 63 insertions(+), 45 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index af98f56..9144b11 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -662,6 +662,9 @@ aggregateSignalGRList <- function(signal, } else { resultsDT <- do.call(rbind, resultsList) resultsDF <- as.data.frame(resultsDT) + if (is(signal, "data.frame")) { + signal <- as.matrix(signal) + } resultsDF[, signalCol] <- gammaNormalize(rsScores=resultsDF[, signalCol], featureVals=signal[, signalCol]) diff --git a/R/permutation.R b/R/permutation.R index 3540e2d..75ffe33 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -67,13 +67,15 @@ #' @param ... Character. Optional additional arguments for simpleCache. #' #' -#' @return Returns a list with the following 4 items: 1. a list of length nPerm +#' @return Returns a list with the following 5 items: 1. a data.frame with +#' the region set scores. +#' 2. a list of length nPerm #' where each item is a data.frame of the COCOA scores from a single #' permutation. Each data.frame is the output of `runCOCOA()` -#' 2. a data.table/data.frame of empirical p-values (the -#' output of `getPermStat`) 3. a +#' 3. a data.table/data.frame of empirical p-values (the +#' output of `getPermStat`) 4. 
a #' data.table/data.frame of z-scores (the output of `getPermStat`. -#' 4. a data.frame of p-values based on +#' 5. a data.frame of p-values based on #' the gamma approximation (the output of getGammaPVal(). #' @examples #' data("esr1_chr1") @@ -224,18 +226,27 @@ runCOCOA <- function(genomicSignal, rsMatList <- olMatRes[[1]] rsInfo <- olMatRes[[2]] GRList <- NULL + if (!is.null(rsScores)) { + if (nrow(rsInfo) != nrow(rsScores)) { + stop("rsScores should be filtered beforehand to remove region sets with less than minRSCov coverage.") + } + } + + ## NOTE: there might be an error if rsScores is given but not screened by minRSCov } ####################################################################### - - indList <- list() - # generate random indices for shuffling of samples - for (i in 1:nPerm) { - indList[[i]] <- sample(1:nrow(targetVar), nrow(targetVar)) + if (nPerm > 0) { + indList <- list() + # generate random indices for shuffling of samples + # because a for loop is used, if nPerm is increased, the original + # items in indList will be the same. Keep for loop. + for (i in 1:nPerm) { + indList[[i]] <- sample(1:nrow(targetVar), nrow(targetVar)) + } } - # replicate(10, sample(seq_len(nrow(targetVar))) if (useSimpleCache) { @@ -263,43 +274,46 @@ runCOCOA <- function(genomicSignal, }, assignToVariable="rsScores") } - # create the main permutation cache - simpleCache(paste0("rsPermScores_", nPerm, "Perm_", variationMetric, "_", dataID), { - - helperFun <- function(x, y, ...) { - # for (i in (length(rsPermScores) + 1):nPerm) { - onePermCacheName <- paste0("rsPermScores_", variationMetric, "_", dataID, "_Cache", y) - # create sub caches, one for each permutation - simpleCache(onePermCacheName, cacheSubDir = paste0("rsPermScores_", variationMetric, "_", dataID), { + + if (nPerm > 0) { + # create the main permutation cache + simpleCache(paste0("rsPermScores_", nPerm, "Perm_", variationMetric, "_", dataID), { - tmp <- .runCOCOA_old(sampleOrder=x, - genomicSignal=genomicSignal, - signalCoord=signalCoord, - GRList=GRList, - signalCol=colsToAnnotate, - targetVar=targetVar, - variationMetric = variationMetric, - scoringMetric=scoringMetric, - absVal=absVal, - verbose=verbose, - rsMatList = rsMatList, - rsInfo = rsInfo, - returnCovInfo = returnCovInfo, - noNA=noNA, - alreadyCenteredDM = alreadyCenteredDM, - alreadyScaledDM = alreadyScaledDM, - alreadyCenteredFM = alreadyCenteredFM, - alreadyScaledFM = alreadyScaledFM) - message(y) # must be ahead of object that is saved as cache, not after - tmp + helperFun <- function(x, y, ...) { + # for (i in (length(rsPermScores) + 1):nPerm) { + onePermCacheName <- paste0("rsPermScores_", variationMetric, "_", dataID, "_Cache", y) + # create sub caches, one for each permutation + simpleCache(onePermCacheName, cacheSubDir = paste0("rsPermScores_", variationMetric, "_", dataID), { + + tmp <- .runCOCOA_old(sampleOrder=x, + genomicSignal=genomicSignal, + signalCoord=signalCoord, + GRList=GRList, + signalCol=colsToAnnotate, + targetVar=targetVar, + variationMetric = variationMetric, + scoringMetric=scoringMetric, + absVal=absVal, + verbose=verbose, + rsMatList = rsMatList, + rsInfo = rsInfo, + returnCovInfo = returnCovInfo, + noNA=noNA, + alreadyCenteredDM = alreadyCenteredDM, + alreadyScaledDM = alreadyScaledDM, + alreadyCenteredFM = alreadyCenteredFM, + alreadyScaledFM = alreadyScaledFM) + message(y) # must be ahead of object that is saved as cache, not after + tmp + + }, cacheDir=cacheDir, assignToVariable="tmp", ...) 
+ return(tmp) + } + rsPermScores = mapply(FUN = helperFun, x=indList, y=seq_along(indList), ..., SIMPLIFY=FALSE) + rsPermScores - }, cacheDir=cacheDir, assignToVariable="tmp", ...) - return(tmp) + }, assignToVariable="rsPermScores", cacheDir=cacheDir, ...) } - rsPermScores = mapply(FUN = helperFun, x=indList, y=seq_along(indList), ..., SIMPLIFY=FALSE) - rsPermScores - - }, assignToVariable="rsPermScores", cacheDir=cacheDir, ...) } else { if (is.null(rsScores)) { @@ -323,7 +337,7 @@ runCOCOA <- function(genomicSignal, alreadyScaledFM = alreadyScaledFM) } - + if (nPerm > 0) { helperFun2 <- function(x) { tmp <- .runCOCOA_old(sampleOrder=x, genomicSignal=genomicSignal, @@ -415,6 +429,7 @@ runCOCOA <- function(genomicSignal, } + allResultsList$rsScores <- rsScores allResultsList$permRSScores <- rsPermScores allResultsList$empiricalPVals <- rsPVals allResultsList$zScores <- rsZScores From 25762704f9a9527464b1992780b445a2200201a4 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Fri, 5 Mar 2021 22:03:41 -0500 Subject: [PATCH 35/38] Fix error --- R/permutation.R | 109 ++++++++++++++++++++---------------------------- 1 file changed, 45 insertions(+), 64 deletions(-) diff --git a/R/permutation.R b/R/permutation.R index 75ffe33..3524eff 100755 --- a/R/permutation.R +++ b/R/permutation.R @@ -243,33 +243,33 @@ runCOCOA <- function(genomicSignal, # generate random indices for shuffling of samples # because a for loop is used, if nPerm is increased, the original # items in indList will be the same. Keep for loop. - for (i in 1:nPerm) { + for (i in seq(nPerm)) { indList[[i]] <- sample(1:nrow(targetVar), nrow(targetVar)) } } - + if (useSimpleCache) { if (is.null(rsScores)) { simpleCache(paste0("rsScores_", variationMetric, "_", dataID), { rsScores <- .runCOCOA_old(sampleOrder=1:nrow(targetVar), - genomicSignal=genomicSignal, - signalCoord=signalCoord, - GRList=GRList, - signalCol=colsToAnnotate, - targetVar=targetVar, - variationMetric = variationMetric, - scoringMetric=scoringMetric, - absVal=absVal, - verbose=verbose, - rsMatList = rsMatList, - rsInfo = rsInfo, - returnCovInfo = returnCovInfo, - noNA=noNA, - alreadyCenteredDM = alreadyCenteredDM, - alreadyScaledDM = alreadyScaledDM, - alreadyCenteredFM = alreadyCenteredFM, - alreadyScaledFM = alreadyScaledFM) + genomicSignal=genomicSignal, + signalCoord=signalCoord, + GRList=GRList, + signalCol=colsToAnnotate, + targetVar=targetVar, + variationMetric = variationMetric, + scoringMetric=scoringMetric, + absVal=absVal, + verbose=verbose, + rsMatList = rsMatList, + rsInfo = rsInfo, + returnCovInfo = returnCovInfo, + noNA=noNA, + alreadyCenteredDM = alreadyCenteredDM, + alreadyScaledDM = alreadyScaledDM, + alreadyCenteredFM = alreadyCenteredFM, + alreadyScaledFM = alreadyScaledFM) rsScores }, assignToVariable="rsScores") } @@ -312,7 +312,7 @@ runCOCOA <- function(genomicSignal, rsPermScores = mapply(FUN = helperFun, x=indList, y=seq_along(indList), ..., SIMPLIFY=FALSE) rsPermScores - }, assignToVariable="rsPermScores", cacheDir=cacheDir, ...) + }, assignToVariable="rsPermScores", cacheDir=cacheDir, ...) 
} } else { @@ -338,31 +338,32 @@ runCOCOA <- function(genomicSignal, } if (nPerm > 0) { - helperFun2 <- function(x) { - tmp <- .runCOCOA_old(sampleOrder=x, - genomicSignal=genomicSignal, - signalCoord=signalCoord, - GRList=GRList, - signalCol=colsToAnnotate, - targetVar=targetVar, - variationMetric = variationMetric, - scoringMetric=scoringMetric, - absVal=absVal, - verbose=verbose, - rsMatList=rsMatList, - rsInfo = rsInfo, - returnCovInfo = returnCovInfo, - noNA=noNA, - alreadyCenteredDM = alreadyCenteredDM, - alreadyScaledDM = alreadyScaledDM, - alreadyCenteredFM = alreadyCenteredFM, - alreadyScaledFM = alreadyScaledFM) - message(":", appendLF=FALSE) - return(tmp) + helperFun2 <- function(x) { + tmp <- .runCOCOA_old(sampleOrder=x, + genomicSignal=genomicSignal, + signalCoord=signalCoord, + GRList=GRList, + signalCol=colsToAnnotate, + targetVar=targetVar, + variationMetric = variationMetric, + scoringMetric=scoringMetric, + absVal=absVal, + verbose=verbose, + rsMatList=rsMatList, + rsInfo = rsInfo, + returnCovInfo = returnCovInfo, + noNA=noNA, + alreadyCenteredDM = alreadyCenteredDM, + alreadyScaledDM = alreadyScaledDM, + alreadyCenteredFM = alreadyCenteredFM, + alreadyScaledFM = alreadyScaledFM) + message(":", appendLF=FALSE) + return(tmp) + } + + rsPermScores <- lapply(X = indList, helperFun2) + } - - rsPermScores <- lapply(X = indList, helperFun2) - } @@ -1003,23 +1004,3 @@ getPermStatSingle <- function(rsScore, nullDist, } -############################################################################## -############### old ################### - -# Run COCOA permutations to get null distributions for each region set. -# For one permutation, there are several steps: -# First, we shuffle the sample labels. Second, we calculate the association -# between the epigenetic data and the shuffled sample labels using the -# chosen metric (e.g. correlation). Third, the resulting feature coefficients -# are used as input to the aggregateSignalGRList function to score each region set. -# This process is repeated `nPerm` times. -# -# @param groupByRS logical - -# @return A list where each item is a data.frame with the null -# distribution for a single region set. The length of the list -# is equal to the number of region sets. The number of rows of -# each data.frame is equal to the number of permutations. 
-# getNullDist <- function(groupByRS=TRUE) { -# -# } \ No newline at end of file From e307bd963ce12409d340133c693f09664a8ffd78 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Mon, 22 Mar 2021 21:53:25 -0400 Subject: [PATCH 36/38] Take out gamma normalization --- R/COCOA.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/COCOA.R b/R/COCOA.R index 9144b11..6315eac 100755 --- a/R/COCOA.R +++ b/R/COCOA.R @@ -656,18 +656,18 @@ aggregateSignalGRList <- function(signal, if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean", "proportionWeightedMean"))) { - resultsDF[, signalCol] <- gammaNormalize(rsScores=resultsDF[, signalCol], - featureVals=signal[, signalCol]) + # resultsDF[, signalCol] <- gammaNormalize(rsScores=resultsDF[, signalCol], + # featureVals=signal[, signalCol]) return(as.data.frame(resultsDF)) } else { resultsDT <- do.call(rbind, resultsList) resultsDF <- as.data.frame(resultsDT) - if (is(signal, "data.frame")) { - signal <- as.matrix(signal) - } - resultsDF[, signalCol] <- gammaNormalize(rsScores=resultsDF[, signalCol], - featureVals=signal[, signalCol]) - + # if (is(signal, "data.frame")) { + # signal <- as.matrix(signal) + # } + # resultsDF[, signalCol] <- gammaNormalize(rsScores=resultsDF[, signalCol], + # featureVals=signal[, signalCol]) + # return(resultsDF) } } From 0b48ef67e5a7750857cbb6deaafbbcc95824e926 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Fri, 9 Apr 2021 09:34:04 -0400 Subject: [PATCH 37/38] Add rs names if missing --- R/visualization.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/visualization.R b/R/visualization.R index 49ec1be..b2ae1d5 100644 --- a/R/visualization.R +++ b/R/visualization.R @@ -52,7 +52,8 @@ #' @param decreasing Logical. Whether samples should be sorted in #' decreasing order of `orderByCol` or not (FALSE is increasing order). #' @param regionAnnoGRList GRangesList. Each GRanges in the list should be -#' a genomic annotation. These will be displayed above or below the heatmap. +#' a genomic annotation. These will be displayed above or below the heatmap. +#' Should be a named list. Names will be displayed on plot. #' @param plotRegionMean Logical. 
If TRUE, the genomicSignal will be averaged #' for each region in regionSet and those region averages #' will be plotted instead of the @@ -219,6 +220,9 @@ signalAlongAxis <- function(genomicSignal, signalCoord, regionSet, if (!is.null(regionAnnoGRList)) { regionAnno <- data.frame(tmp=rep(0, length(coordGR))) annoNames <- names(regionAnnoGRList) + if (is.null(annoNames)) { + annoNames <- paste0("Region set ", seq_along(regionAnnoGRList)) + } for (i in seq_along(regionAnnoGRList)) { ################# need to make these factors From f1d1398821d62202f5951853a263571a89ff3a82 Mon Sep 17 00:00:00 2001 From: j-lawson Date: Fri, 15 Oct 2021 14:00:07 -0400 Subject: [PATCH 38/38] Old updates --- vignettes/IntroToCOCOA.Rmd | 49 ++++++++------------------------------ 1 file changed, 10 insertions(+), 39 deletions(-) diff --git a/vignettes/IntroToCOCOA.Rmd b/vignettes/IntroToCOCOA.Rmd index b4fd11b..ab9811c 100755 --- a/vignettes/IntroToCOCOA.Rmd +++ b/vignettes/IntroToCOCOA.Rmd @@ -57,14 +57,6 @@ methylCor[is.na(methylCor)] <- 0 ```{r, eval=TRUE, message=FALSE, warning=FALSE} GRList <- GRangesList(esr1_chr1, gata3_chr1, atf3_chr1, nrf1_chr1) regionSetNames <- c("ESR1", "GATA3", "ATF3", "NRF1") -rsScores <- aggregateSignalGRList(signal=methylCor, - signalCoord=brcaMCoord1, - GRList=GRList, - signalCol=myPhen, - scoringMetric="default", - absVal=TRUE) -rsScores$regionSetName <- regionSetNames -rsScores ``` ```{r, eval=TRUE, message=FALSE, warning=FALSE} @@ -72,44 +64,23 @@ set.seed(100) nPerm <- 5 -permRSScores <- runCOCOA(genomicSignal=brcaMethylData1, +resultsList <- runCOCOA(genomicSignal=brcaMethylData1, signalCoord=brcaMCoord1, GRList=GRList, - signalCol=myPhen, + signalCol=PCsToAnnotate, targetVar=targetVarDF, nPerm=nPerm, variationMetric="cor", useSimpleCache = FALSE) -permRSScores[1:3] -``` - - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -nullDistList <- convertToFromNullDist(permRSScores) -names(nullDistList) <- regionSetNames -nullDistList -``` - - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -# p-values based on fitted gamma distributions -gPValDF <- getGammaPVal(rsScores = rsScores, - nullDistList = nullDistList, - signalCol = myPhen, - method = "mme", realScoreInDist = TRUE) -gPValDF <- cbind(gPValDF, - rsScores[, colnames(rsScores)[!(colnames(rsScores) - %in% myPhen)]]) -gPValDF <- cbind(gPValDF, regionSetNames) -gPValDF -``` - - -```{r, eval=TRUE, message=FALSE, warning=FALSE} -getPermStat(rsScores = rsScores, - nullDistList = nullDistList, - signalCol = myPhen) +names(resultsList) +# rsScores$regionSetName <- regionSetNames +# rsScores +# gPValDF <- cbind(gPValDF, +# rsScores[, colnames(rsScores)[!(colnames(rsScores) +# %in% PCsToAnnotate)]]) +# gPValDF <- cbind(gPValDF, regionSetNames) +# gPValDF ``` # Visualization
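
After this final patch, scoring and permutation testing in the vignette both go through runCOCOA(), which returns a named list of results rather than the separate objects assembled by the old workflow. The hedged sketch below unpacks that list; it assumes `resultsList` is the output of the vignette's runCOCOA() call above, uses "PC1" only as an example signal column, and takes the element names from the assignments shown earlier in permutation.R (the documented fifth element holding gamma-approximation p-values is omitted because its name does not appear in these patches).

```{r, eval=FALSE}
# Unpacking the runCOCOA() results list (names taken from permutation.R above;
# resultsList and the "PC1" column are assumed from the vignette example).
rsScores   <- resultsList$rsScores        # observed region set scores
permScores <- resultsList$permRSScores    # one data.frame per permutation
empiricalP <- resultsList$empiricalPVals  # empirical p-values (getPermStat)
zScores    <- resultsList$zScores         # z-scores (getPermStat)

# rank region sets by their observed score for one target variable
head(rsScores[order(rsScores$PC1, decreasing = TRUE), ])
```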