diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 49d4528a..8b31af7e 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -6,11 +6,13 @@ cmd_args <- commandArgs(trailingOnly = TRUE) mzml_filepath <- cmd_args[1] breaks_filepath <- cmd_args[2] -resol <- as.numeric(cmd_args[3]) +trim_parameters_filepath <- cmd_args[3] +resol <- as.numeric(cmd_args[4]) # load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, -# trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos load(breaks_filepath) +# load trim parameters file: contains trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos +load(trim_parameters_filepath) # get sample name techrep_name <- sub("\\..*$", "", basename(mzml_filepath)) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index d0a7098b..d3bc79ae 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -5,7 +5,7 @@ process AssignToBins { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(val(file_id), path(mzML_file), path(breaks_file)) + tuple(val(file_id), path(mzML_file), path(breaks_file), path(trim_params_file)) output: path("${file_id}.RData"), emit: rdata_file @@ -13,7 +13,7 @@ process AssignToBins { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $trim_params_file $params.resolution """ } diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R new file mode 100644 index 00000000..7114e3c4 --- /dev/null +++ b/DIMS/AveragePeaks.R @@ -0,0 +1,37 @@ +library(dplyr) + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +sample_name <- cmd_args[1] +techreps <- cmd_args[2] +scanmode <- cmd_args[3] +preprocessing_scripts_dir <- cmd_args[4] +tech_reps <- strsplit(techreps, ";")[[1]] + +# load in function scripts +source(paste0(preprocessing_scripts_dir, "average_peaks_functions.R")) + +# Initialize per sample +peaklist_allrepl <- NULL +nr_repl_persample <- 0 +averaged_peaks <- matrix(0, nrow = 0, ncol = 6) +colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + +# load RData files of technical replicates belonging to biological sample +for (file_nr in 1:length(tech_reps)) { + tech_repl_file <- paste0(tech_reps[file_nr], "_", scanmode, ".RData") + tech_repl <- get(load(tech_repl_file)) + # combine data for all technical replicates + peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl) +} +# sort on mass +peaklist_allrepl_df <- as.data.frame(peaklist_allrepl) +peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt) +peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) +peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt) + +# average over technical replicates +averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted, sample_name) +save(averaged_peaks, file = paste0("AvgPeaks_", sample_name, "_", scanmode, ".RData")) + diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf new file mode 100644 index 00000000..f50bd874 --- /dev/null +++ b/DIMS/AveragePeaks.nf @@ -0,0 +1,18 @@ +process AveragePeaks { + tag "DIMS AveragePeaks" + label 'AveragePeaks' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(rdata_files) + tuple val(sample_id), val(tech_reps), val(scanmode) + + output: + path 'AvgPeaks_*.RData' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R $sample_id $tech_reps $scanmode $params.preprocessing_scripts_dir + """ +} diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf deleted file mode 100644 index f03d553c..00000000 --- a/DIMS/AverageTechReplicates.nf +++ /dev/null @@ -1,36 +0,0 @@ -process AverageTechReplicates { - tag "DIMS AverageTechReplicates" - label 'AverageTechReplicates' - container = 'docker://umcugenbioinf/dims:1.3' - shell = ['/bin/bash', '-euo', 'pipefail'] - - input: - path(rdata_file) - path(tic_txt_files) - path(init_file) - val(nr_replicates) - val(analysis_id) - val(matrix) - path(highest_mz_file) - path(breaks_file) - - output: - path('*_repl_pattern.RData'), emit: pattern_files - path('*_avg.RData'), emit: binned_files - path('miss_infusions_negative.txt') - path('miss_infusions_positive.txt') - path('*_TICplots.pdf'), emit: tic_plots_pdf - - script: - """ - Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \ - $params.nr_replicates \ - $analysis_id \ - $matrix \ - $highest_mz_file \ - $breaks_file \ - $params.threshold_tics - """ -} - - diff --git a/DIMS/CollectAveraged.R b/DIMS/CollectAveraged.R new file mode 100755 index 00000000..e6466d9e --- /dev/null +++ b/DIMS/CollectAveraged.R @@ -0,0 +1,18 @@ +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +scripts_dir <- cmd_args[1] + +# for each scan mode, collect all averaged peak lists per biological sample +scanmodes <- c("positive", "negative") +for (scanmode in scanmodes) { + # get list of files + filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, ".RData")) + # load files and combine into one object + outlist_total <- NULL + for (file_nr in 1:length(filled_files)) { + peaklist_averaged <- get(load(filled_files[file_nr])) + outlist_total <- rbind(outlist_total, peaklist_averaged) + } + save(outlist_total, file = paste0("AvgPeaks_", scanmode, ".RData")) +} diff --git a/DIMS/CollectAveraged.nf b/DIMS/CollectAveraged.nf new file mode 100644 index 00000000..fc65bf21 --- /dev/null +++ b/DIMS/CollectAveraged.nf @@ -0,0 +1,17 @@ +process CollectAveraged { + tag "DIMS CollectAveraged" + label 'CollectAveraged' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(averaged_files) + + output: + path('AvgPeaks*.RData'), emit: averaged_peaks + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/CollectAveraged.R + """ +} diff --git a/DIMS/AverageTechReplicates.R b/DIMS/EvaluateTics.R similarity index 55% rename from DIMS/AverageTechReplicates.R rename to DIMS/EvaluateTics.R index 1c727d10..528d6f4b 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/EvaluateTics.R @@ -1,5 +1,3 @@ -# adapted from 3-AverageTechReplicates.R - # load packages library("ggplot2") library("gridExtra") @@ -13,39 +11,14 @@ run_name <- cmd_args[3] dims_matrix <- cmd_args[4] highest_mz_file <- cmd_args[5] highest_mz <- get(load(highest_mz_file)) -breaks_filepath <- cmd_args[6] -thresh2remove <- as.numeric(cmd_args[7]) - -remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { - # collect list of samples to remove from replication pattern - remove_from_group <- NULL - for (sample_nr in 1:length(repl_pattern)){ - repl_pattern_1sample <- repl_pattern[[sample_nr]] - remove <- NULL - for (file_nr in 1:length(repl_pattern_1sample)) { - if (repl_pattern_1sample[file_nr] %in% bad_samples) { - remove <- c(remove, file_nr) - } - } - if (length(remove) == nr_replicates) { - remove_from_group <- c(remove_from_group, sample_nr) - } - if (!is.null(remove)) { - repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove] - } - } - if (length(remove_from_group) != 0) { - repl_pattern <- repl_pattern[-remove_from_group] - } - return(list("pattern" = repl_pattern)) -} +trim_params_filepath <- cmd_args[6] +thresh2remove <- 1000000000 # load init_file: contains repl_pattern load(init_file) -# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, -# trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos -load(breaks_filepath) +# load trim_params_file: contains trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos +load(trim_params_filepath) # lower the threshold for non Plasma matrices if (dims_matrix != "Plasma") { @@ -59,78 +32,32 @@ if (highest_mz > 700) { thresh2remove <- 1000000 } +# find out which technical replicates are below the threshold +remove_tech_reps <- find_bad_replicates(repl_pattern, thresh2remove) +print(remove_tech_reps) -# remove technical replicates which are below the threshold -remove_neg <- NULL -remove_pos <- NULL -cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") -for (sample_nr in 1:length(repl_pattern)) { - tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) - tech_reps_array_pos <- NULL - tech_reps_array_neg <- NULL - sum_neg <- 0 - sum_pos <- 0 - nr_pos <- 0 - nr_neg <- 0 - for (file_nr in 1:length(tech_reps)) { - load(paste(tech_reps[file_nr], ".RData", sep = "")) - cat("\n\nParsing", tech_reps[file_nr]) - # negative scanmode - cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) - if (sum(peak_list$neg[, 1]) < thresh2remove) { - cat(" ... Removed") - remove_neg <- c(remove_neg, tech_reps[file_nr]) - } else { - nr_neg <- nr_neg + 1 - sum_neg <- sum_neg + peak_list$neg - } - tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg) - # positive scanmode - cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1])) - if (sum(peak_list$pos[, 1]) < thresh2remove) { - cat(" ... Removed") - remove_pos <- c(remove_pos, tech_reps[file_nr]) - } else { - nr_pos <- nr_pos + 1 - sum_pos <- sum_pos + peak_list$pos - } - tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos) - } - # save to file - if (nr_neg != 0) { - sum_neg[, 1] <- sum_neg[, 1] / nr_neg - colnames(sum_neg) <- names(repl_pattern)[sample_nr] - save(sum_neg, file = paste0(names(repl_pattern)[sample_nr], "_neg_avg.RData")) - } - if (nr_pos != 0) { - sum_pos[, 1] <- sum_pos[, 1] / nr_pos - colnames(sum_pos) <- names(repl_pattern)[sample_nr] - save(sum_pos, file = paste0(names(repl_pattern)[sample_nr], "_pos_avg.RData")) - } -} - -pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) -repl_pattern_filtered <- pattern_list$pattern +# negative scan mode +remove_neg <- remove_tech_reps$neg +repl_pattern_filtered <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) save(repl_pattern_filtered, file = "negative_repl_pattern.RData") -write.table( - remove_neg, - file = "miss_infusions_negative.txt", - row.names = FALSE, - col.names = FALSE, - sep = "\t" -) -pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) -repl_pattern_filtered <- pattern_list$pattern +# positive scan mode +remove_pos <- remove_tech_reps$pos +repl_pattern_filtered <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) save(repl_pattern_filtered, file = "positive_repl_pattern.RData") -write.table( - remove_pos, - file = "miss_infusions_positive.txt", - row.names = FALSE, - col.names = FALSE, - sep = "\t" + +# get an overview of suitable technical replicates for both scan modes +allsamples_techreps_neg <- get_overview_tech_reps(repl_pattern_filtered, "negative") +allsamples_techreps_pos <- get_overview_tech_reps(repl_pattern_filtered, "positive") +allsamples_techreps_both_scanmodes <- rbind(allsamples_techreps_pos, allsamples_techreps_neg) +write.table(allsamples_techreps_both_scanmodes, + file = "replicates_per_sample.txt", + col.names = FALSE, + row.names = FALSE, + sep = "," ) + ## generate TIC plots # get all txt files tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt") @@ -207,3 +134,5 @@ tic_plot_pdf <- marrangeGrob( # save to file ggsave(filename = paste0(run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm") + + diff --git a/DIMS/EvaluateTics.nf b/DIMS/EvaluateTics.nf new file mode 100644 index 00000000..fdf7f713 --- /dev/null +++ b/DIMS/EvaluateTics.nf @@ -0,0 +1,33 @@ +process EvaluateTics { + tag "DIMS EvaluateTics" + label 'EvaluateTics' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(rdata_file) + path(tic_txt_files) + path(init_file) + val(analysis_id) + path(highest_mz_file) + path(trim_params_file) + + output: + path('*_repl_pattern.RData'), emit: pattern_files + path('replicates_per_sample.txt'), emit: sample_techreps + path('miss_infusions_negative.txt') + path('miss_infusions_positive.txt') + path('*_TICplots.pdf'), emit: tic_plots_pdf + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/EvaluateTics.R $init_file \ + $params.nr_replicates \ + $analysis_id \ + $params.matrix \ + $highest_mz_file \ + $trim_params_file + """ +} + + diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index d07fd203..007d5ab7 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -1,5 +1,3 @@ -## adapted from 1-generateBreaksFwhm.HPC.R ## - # load required package suppressPackageStartupMessages(library("xcms")) @@ -57,5 +55,6 @@ for (i in 1:nr_segments) { } # generate output file -save(breaks_fwhm, breaks_fwhm_avg, trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "breaks.fwhm.RData") +save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData") +save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData") save(high_mz, file = "highest_mz.RData") diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index af617a8d..c486010d 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -10,6 +10,7 @@ process GenerateBreaks { output: path('breaks.fwhm.RData'), emit: breaks + path('trim_params.RData'), emit: trim_params path('highest_mz.RData'), emit: highest_mz script: diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index 44d49962..9ff22623 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -1,5 +1,3 @@ -## adapted from makeInit in old pipeline - # define parameters args <- commandArgs(trailingOnly = TRUE) @@ -15,12 +13,12 @@ group_names_unique <- unique(group_names) # generate the replication pattern repl_pattern <- c() for (sample_group in 1:nr_sample_groups) { - tmp <- c() + replicates_persample <- c() for (repl in nr_replicates:1) { index <- ((sample_group * nr_replicates) - repl) + 1 - tmp <- c(tmp, sample_names[index]) + replicates_persample <- c(replicates_persample, sample_names[index]) } - repl_pattern <- c(repl_pattern, list(tmp)) + repl_pattern <- c(repl_pattern, list(replicates_persample)) } names(repl_pattern) <- group_names_unique diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index 697f8f7e..2cf49d20 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -1,53 +1,58 @@ -## adapted from 4-peakFinding.R +library(dplyr) # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -sample_file <- cmd_args[1] -breaks_file <- cmd_args[2] -resol <- as.numeric(cmd_args[3]) -scripts_dir <- cmd_args[4] -thresh <- 2000 -outdir <- "./" - -# load in function scripts -source(paste0(scripts_dir, "do_peakfinding.R")) -source(paste0(scripts_dir, "check_overlap.R")) -source(paste0(scripts_dir, "search_mzrange.R")) -source(paste0(scripts_dir, "fit_optim.R")) -source(paste0(scripts_dir, "fit_gaussian.R")) -source(paste0(scripts_dir, "fit_init.R")) -source(paste0(scripts_dir, "get_fwhm.R")) -source(paste0(scripts_dir, "get_stdev.R")) -source(paste0(scripts_dir, "optimize_gaussfit.R")) -source(paste0(scripts_dir, "fit_peaks.R")) -source(paste0(scripts_dir, "fit_gaussians.R")) -source(paste0(scripts_dir, "estimate_area.R")) -source(paste0(scripts_dir, "get_fit_quality.R")) -source(paste0(scripts_dir, "check_overlap.R")) -source(paste0(scripts_dir, "sum_curves.R")) -source(paste0(scripts_dir, "within_ppm.R")) - -load(breaks_file) - -# Load output of AverageTechReplicates for a sample -sample_avgtechrepl <- get(load(sample_file)) -if (grepl("_pos", sample_file)) { - scanmode <- "positive" -} else if (grepl("_neg", sample_file)) { - scanmode <- "negative" -} +replicate_rdatafile <- cmd_args[1] +resol <- as.numeric(cmd_args[2]) +preprocessing_scripts_dir <- cmd_args[3] +# use fixed theshold between noise and signal for peak +peak_thresh <- 2000 + +# source functions script +source(paste0(preprocessing_scripts_dir, "peak_finding_functions.R")) + +# Load output of AssignToBins (peak_list) for a technical replicate +load(replicate_rdatafile) +techrepl_name <- colnames(peak_list$pos)[1] + +# load list of technical replicates per sample that passed threshold filter +techreps_passed <- read.table("replicates_per_sample.txt", sep=",") # Initialize options(digits = 16) -# Number used to calculate area under Gaussian curve -int_factor <- 1 * 10^5 -# Initial value used to estimate scaling parameter -scale <- 2 -width <- 1024 -height <- 768 -# run the findPeaks function +# do peak finding +scanmodes <- c("positive", "negative") +for (scanmode in scanmodes) { + # get intensities for scan mode + if (scanmode == "positive") { + ints_perscanmode <- peak_list$pos + } else if (scanmode == "negative") { + ints_perscanmode <- peak_list$neg + } + + # check whether technical replicate has passed threshold filter for this scanmode + techreps_scanmode <- techreps_passed[grep(scanmode, techreps_passed[, 3]), ] + # if techrep is ok, it will be found. If not, skip this techrep. + if (length(grep(techrepl_name, techreps_scanmode)) == 0) { + break + } + + # put mz and intensities into dataframe + ints_fullrange <- as.data.frame(cbind(mz = as.numeric(rownames(ints_perscanmode)), + int = as.numeric(ints_perscanmode))) + + # look for m/z range for all peaks + regions_of_interest <- search_regions_of_interest(ints_fullrange) + + # fit Gaussian curve and calculate integrated area under curve + integrated_peak_df <- integrate_peaks(ints_fullrange, regions_of_interest, resol, peak_thresh) + + # add sample name to dataframe + integrated_peak_df <- as.data.frame(cbind(samplenr = techrepl_name, integrated_peak_df)) + + # save output to file + save(integrated_peak_df, file = paste0(techrepl_name, "_", scanmode, ".RData")) +} -# do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) -do_peakfinding(sample_avgtechrepl, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf index 0097d128..04c82540 100644 --- a/DIMS/PeakFinding.nf +++ b/DIMS/PeakFinding.nf @@ -5,13 +5,14 @@ process PeakFinding { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(path(rdata_file), path(breaks_file)) + path(rdata_file) + each path(sample_techreps) output: - path '*tive.RData' + path '*tive.RData', optional: true script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $params.resolution $params.preprocessing_scripts_dir """ } diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index 4435b033..e0a5c828 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -34,7 +34,7 @@ if (grepl("negative", basename(hmdb_part_file))) { batch_number <- strsplit(basename(hmdb_part_file), ".", fixed = TRUE)[[1]][2] # load file with spectrum peaks -spec_peaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData") +spec_peaks_file <- paste0("AvgPeaks_", scanmode, ".RData") load(spec_peaks_file) outlist_df <- as.data.frame(outlist_total) outlist_df$mzmed.pkt <- as.numeric(outlist_df$mzmed.pkt) diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf index 0ecb18c1..6cc4ddb0 100644 --- a/DIMS/PeakGrouping.nf +++ b/DIMS/PeakGrouping.nf @@ -6,7 +6,8 @@ process PeakGrouping { input: path(hmdbpart_file) - each path(spectrumpeak_file) + path(averagedpeaks_file) + each path(pattern_file) output: path '*_identified.RData', emit: grouped_identified diff --git a/DIMS/preprocessing/average_peaks_functions.R b/DIMS/preprocessing/average_peaks_functions.R new file mode 100644 index 00000000..beed29bb --- /dev/null +++ b/DIMS/preprocessing/average_peaks_functions.R @@ -0,0 +1,41 @@ +average_peaks_per_sample <- function(peaklist_allrepl_sorted, sample_name) { + #' Average the intensity of peaks that occur in different technical replicates of a biological sample + #' + #' @param peaklist_allrepl_sorted: Dataframe with peaks sorted on median m/z (float) + #' + #' @return averaged_peaks: matrix of averaged peaks (float) + + # initialize + averaged_peaks <- peaklist_allrepl_sorted[0, ] + # set ppm as fixed value, not the same ppm as in peak grouping + ppm_peak <- 2 + + while (nrow(peaklist_allrepl_sorted) > 1) { + # store row numbers + peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted) + # find the peaks in the dataset with corresponding m/z plus or minus tolerance + reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1] + mz_tolerance <- (reference_mass * ppm_peak) / 10^6 + minmz_ref <- reference_mass - mz_tolerance + maxmz_ref <- reference_mass + mz_tolerance + select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref)) + select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ] + nrsamples <- length(select_peak_indices) + # put averaged intensities into a new row + averaged_1peak <- matrix(0, nrow = 1, ncol = 6) + colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + # calculate m/z values for peak group + averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt) + averaged_1peak[1, "fq"] <- nrsamples + averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt) + # remove rownr column and append to averaged_peaks + peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ] + averaged_peaks <- rbind(averaged_peaks, averaged_1peak) + } + # add sample name to first column + averaged_peaks[ , "samplenr"] <- sample_name + + return(averaged_peaks) +} diff --git a/DIMS/preprocessing/evaluate_tics_functions.R b/DIMS/preprocessing/evaluate_tics_functions.R new file mode 100644 index 00000000..318ce2b6 --- /dev/null +++ b/DIMS/preprocessing/evaluate_tics_functions.R @@ -0,0 +1,102 @@ +# EvaluateTics functions +find_bad_replicates <- function(repl_pattern, thresh2remove) { + #' Find technical replicates with a total intensity below a threshold + #' + #' @param repl_pattern: List of samples with corresponding technical replicates (strings) + #' @param thresh2remove: Threshold value for acceptance or rejection of total intensity (integer) + #' + #' @return remove_tech_reps: Array of rejected technical replicates (strings) + + remove_pos <- NULL + remove_neg <- NULL + cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") + for (sample_nr in 1:length(repl_pattern)) { + tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) + for (file_nr in 1:length(tech_reps)) { + load(paste0(tech_reps[file_nr], ".RData")) + cat("\n\nParsing", tech_reps[file_nr]) + # positive scan mode: determine whether sum of intensities is above threshold + cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1])) + if (sum(peak_list$pos[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_pos <- c(remove_pos, tech_reps[file_nr]) + } + # negative scan mode: determine whether sum of intensities is above threshold + cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) + if (sum(peak_list$neg[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_neg <- c(remove_neg, tech_reps[file_nr]) + } + } + } + cat("\n") + # write information on miss_infusions for both scan modes + write.table(remove_pos, + file = paste0("miss_infusions_positive.txt"), + row.names = FALSE, + col.names = FALSE, + sep = "\t" + ) + write.table(remove_neg, + file = paste0("miss_infusions_negative.txt"), + row.names = FALSE, + col.names = FALSE, + sep = "\t" + ) + + # combine removed technical replicates from pos and neg + remove_tech_reps <- list(pos = remove_pos, neg = remove_neg) + return(remove_tech_reps) +} + +remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { + #' Remove technical replicates with insufficient quality from a biological sample + #' + #' @param bad_samples: Array of technical replicates of insufficient quality (strings) + #' @param repl_pattern: List of samples with corresponding technical replicates (strings) + #' @param nr_replicates: Number of technical replicates per biological sample (integer) + #' + #' @return repl_pattern_filtered: list of technical replicates of sufficient quality (strings) + + # collect list of samples to remove from replication pattern + remove_from_group <- NULL + for (sample_nr in 1:length(repl_pattern)){ + repl_pattern_1sample <- repl_pattern[[sample_nr]] + remove <- NULL + for (file_nr in 1:length(repl_pattern_1sample)) { + if (repl_pattern_1sample[file_nr] %in% bad_samples) { + remove <- c(remove, file_nr) + } + } + if (length(remove) == nr_replicates) { + remove_from_group <- c(remove_from_group, sample_nr) + } + if (!is.null(remove)) { + repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove] + } + } + if (length(remove_from_group) != 0) { + repl_pattern_filtered <- repl_pattern[-remove_from_group] + } else { + repl_pattern_filtered <- repl_pattern + } + return(repl_pattern_filtered) +} + +get_overview_tech_reps <- function(repl_pattern_filtered, scanmode) { + #' Create an overview of technical replicates with sufficient quality from a biological sample + #' + #' @param repl_pattern_filtered: List of samples with corresponding technical replicates (strings) + #' @param scanmode: Scan mode "positive" or "negative" (string) + #' + #' @return allsamples_techreps_scanmode: Matrix of technical replicates of sufficient quality (strings) + + allsamples_techreps_scanmode <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered)) + for (sample_nr in 1:length(repl_pattern_filtered)) { + allsamples_techreps_scanmode[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr] + allsamples_techreps_scanmode[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";") + } + allsamples_techreps_scanmode[, 3] <- scanmode + return(allsamples_techreps_scanmode) +} + diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R new file mode 100644 index 00000000..8539c03c --- /dev/null +++ b/DIMS/preprocessing/peak_finding_functions.R @@ -0,0 +1,246 @@ +# functions for peak finding + +search_regions_of_interest <- function(ints_fullrange) { + #' Divide the full m/z range into regions of interest with indices + #' + #' @param ints_fullrange: Matrix with m/z values and intensities (float) + #' + #' @return regions_of_interest: matrix of m/z regions of interest (integer) + + # find indices where intensity is not equal to zero + nonzero_positions <- as.vector(which(ints_fullrange$int != 0)) + + # find regions of interest (look for consecutive numbers) + regions_of_interest_consec <- seqToIntervals(nonzero_positions) + # add length of regions of interest + regions_of_interest_diff <- regions_of_interest_consec[, 2] - regions_of_interest_consec[, 1] + 1 + regions_of_interest_length <- cbind(regions_of_interest_consec, length = regions_of_interest_diff) + # remove short lengths; a peak should have at least 3 data points + if (any(regions_of_interest_length[, "length"] < 3)) { + regions_of_interest_gte3 <- regions_of_interest_length[-which(regions_of_interest_length[, "length"] < 3), ] + } else { + regions_of_interest_gte3 <- regions_of_interest_length + } + # test for length of roi (region of interest). If length is greater than 11, break up into separate rois + remove_roi_index <- c() + new_rois_all <- regions_of_interest_gte3[0, ] + for (roi_nr in 1:nrow(regions_of_interest_gte3)) { + if (regions_of_interest_gte3[roi_nr, "length"] > 11) { + roi <- ints_fullrange[(regions_of_interest_gte3[roi_nr, "from"]:regions_of_interest_gte3[roi_nr, "to"]), ] + roi_intrange <- as.numeric(roi$int) + # look for local minima that separate the peaks + local_min_positions <- which(diff(sign(diff(roi_intrange))) == 2) + 1 + if (length(local_min_positions) > 0) { + remove_roi_index <- c(remove_roi_index, roi_nr) + # find new indices for rois after splitting + start_pos <- regions_of_interest_gte3[roi_nr, "from"] + new_rois <- data.frame(from = 0, to = 0, length = 0) + new_rois_splitroi <- regions_of_interest_gte3[0, ] + for (local_min_index in 1:length(local_min_positions)) { + new_rois[, 1] <- start_pos + new_rois[, 2] <- start_pos + local_min_positions[local_min_index] + new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1 + new_rois_splitroi <- rbind(new_rois_splitroi, new_rois) + start_pos <- new_rois[, 2] + } + # intensities after last local minimum + new_rois[, 1] <- start_pos + new_rois[, 2] <- regions_of_interest_gte3[roi_nr, "to"] + new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1 + new_rois_splitroi <- rbind(new_rois_splitroi, new_rois) + # append + new_rois_all <- rbind(new_rois_all, new_rois_splitroi) + } else { + # if there are no local minima, all intensities belong to one peak. + remove_roi_index <- c(remove_roi_index, roi_nr) + # look for maximum and take 5 intensities to the left and right + max_index <- which(roi_intrange == max(roi_intrange)) + max_pos <- as.numeric(rownames(roi)[max_index]) + new_roi <- data.frame(from = 0, to = 0, length = 0) + new_roi[, 1] <- max_pos - 5 + new_roi[, 2] <- max_pos + 5 + new_roi[, 3] <- new_roi[, 2] - new_roi[, 1] + 1 + # append + new_rois_all <- rbind(new_rois_all, new_roi) + } + } + } + + # remove rois that have been split into chunks or shortened + if (length(remove_roi_index) > 0) { + regions_of_interest_minus_short <- regions_of_interest_gte3[-remove_roi_index, ] + } else { + regions_of_interest_minus_short <- regions_of_interest_gte3 + } + # combine remaining rois with info on chunks + regions_of_interest_split <- rbind(regions_of_interest_minus_short, new_rois_all) + # remove regions of interest with short lengths again + if (any(regions_of_interest_split[, "length"] < 3)) { + regions_of_interest_final <- regions_of_interest_split[-which(regions_of_interest_split[, "length"] < 3), ] + } else { + regions_of_interest_final <- regions_of_interest_split + } + + # sort on first index + if (nrow(regions_of_interest_final) > 1){ + regions_of_interest_sorted <- regions_of_interest_final %>% dplyr::arrange(from) + } else { + regions_of_interest_sorted <- regions_of_interest_final + } + + return(regions_of_interest_sorted) +} + +integrate_peaks <- function(ints_fullrange, regions_of_interest, resol, peak_thresh) { + #' Fit Gaussian peak for each region of interest and integrate area under the curve + #' + #' @param ints_fullrange: Named list of intensities (float) + #' @param regions_of_interest: Named list of intensities (float) + #' @param resol: Value for resolution (integer) + #' @param peak_thresh: Value for noise level threshold (integer) # NOT USED YET + #' + #' @return allpeaks_values: matrix of integrated peaks + + # initialize dataframe to store results for all peaks + allpeaks_values <- matrix(0, nrow = nrow(regions_of_interest), ncol = 5) + colnames(allpeaks_values) <- c("mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + + for (roi_nr in 1:nrow(regions_of_interest)) { + # find m/z values and intensities corresponding to the region of interest + index_range <- regions_of_interest[roi_nr, "from"] : regions_of_interest[roi_nr, "to"] + roi_mzrange <- as.numeric(ints_fullrange$mz[index_range]) + roi_intrange <- as.numeric(ints_fullrange$int[index_range]) + # find m/z value for maximum of peak (mu in Gaussian function) + weighted_mzmean <- weighted.mean(roi_mzrange, roi_intrange) + # find expected peak width at this m/z value + fwhm_mzmed <- get_fwhm(weighted_mzmean, resol) + # calculate sigma for Gaussian curve (https://en.wikipedia.org/wiki/Full_width_at_half_maximum) + sigma_peak <- fwhm_mzmed / 2.355 + # find scale factor for Gaussian. Initial estimate is maximum intensity in roi + # divided by 2 for better correlation with intensities from old PeakFinding method + scale_factor <- max(roi_intrange) / 2 + # fit Gaussian peak. Find intensities according to normal distribution + normal_ints <- scale_factor * gaussfunc(roi_mzrange, weighted_mzmean, sigma_peak) + # sum intensities in roi + sum_ints <- sum(roi_intrange) + sum_gauss <- sum(normal_ints) + # calculate quality of fit + quality_fit <- 1 - sum(abs(normal_ints - roi_intrange)) / sum_ints + # put all values into dataframe + allpeaks_values[roi_nr, "mzmed.pkt"] <- weighted_mzmean + allpeaks_values[roi_nr, "fq"] <- quality_fit + allpeaks_values[roi_nr, "mzmax.pkt"] <- max(roi_mzrange) + allpeaks_values[roi_nr, "mzmin.pkt"] <- min(roi_mzrange) + allpeaks_values[roi_nr, "height.pkt"] <- sum_gauss + } + + # remove peaks with height = 0, look for NA or NaN + remove_na <- which(is.na(as.numeric(allpeaks_values[, "height.pkt"]))) + if (length(remove_na) > 0) { + allpeaks_values <- allpeaks_values[-remove_na, ] + } + + return(allpeaks_values) +} + +# in the next version of the docker image, package R.utils will be included +seqToIntervals <- function(idx) { + #' Find consecutive stretches of numbers + #' function seqToIntervals copied from R.utils library + #' see https://rdrr.io/cran/R.utils/src/R/seqToIntervals.R + #' + #' @param idx: Sequence of indices (integers) + #' + #' @return res: Matrix of start and end positions of consecutive numbers (matrix) + + # Clean up sequence + idx <- as.integer(idx) + idx <- unique(idx) + idx <- sort(idx) + + len_idx <- length(idx) + if (len_idx == 0L) { + res <- matrix(NA_integer_, nrow = 0L, ncol = 2L) + colnames(res) <- c("from", "to") + return(res) + } + + # Identify end points of intervals + diff_idx <- diff(idx) + diff_idx <- (diff_idx > 1) + diff_idx <- which(diff_idx) + nr_intervals <- length(diff_idx) + 1 + + # Allocate return matrix + res <- matrix(NA_integer_, nrow = nr_intervals, ncol = 2L) + colnames(res) <- c("from", "to") + + from_value <- idx[1] + to_value <- from_value - 1 + last_value <- from_value + + count <- 1 + for (running_index in seq_along(idx)) { + value <- idx[running_index] + if (value - last_value > 1) { + to_value <- last_value + res[count, ] <- c(from_value, to_value) + from_value <- value + count <- count + 1 + } + last_value <- value + } + + if (to_value < from_value) { + to_value <- last_value + res[count, ] <- c(from_value, to_value) + } + + return(res) +} + +get_fwhm <- function(query_mass, resol) { + #' Calculate fwhm (full width at half maximum intensity) for a peak + #' + #' @param query_mass: Value for mass (float) + #' @param resol: Value for resolution (integer) + #' + #' @return fwhm: Value for full width at half maximum (float) + + # set aberrant values of query_mass to default value of 200 + if (is.na(query_mass)) { + query_mass <- 200 + } + if (query_mass < 0) { + query_mass <- 200 + } + # calculate resolution at given m/z value + resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200))) + # calculate full width at half maximum + fwhm <- query_mass / resol_mz + + return(fwhm) +} + + +# from https://rdrr.io/cran/rvmethod/src/R/gaussfit.R +#' Gaussian Function from package rvmethod +#' +#' This function returns the unnormalized (height of 1.0) Gaussian curve with a +#' given center and spread. +#' +#' @param x the vector of values at which to evaluate the Gaussian +#' @param mu the center of the Gaussian +#' @param sigma the spread of the Gaussian (must be greater than 0) +#' @return vector of values of the Gaussian +#' @examples x = seq(-4, 4, length.out = 100) +#' y = gaussfunc(x, 0, 1) +#' plot(x, y) +#' +#' @import stats +#' +#' @export +gaussfunc <- function(x, mu, sigma) { + return(exp(-((x - mu) ^ 2) / (2 * (sigma ^ 2)))) +} + diff --git a/DIMS/tests/testthat/fixtures/test_evaluate_tics_file1.RData b/DIMS/tests/testthat/fixtures/test_evaluate_tics_file1.RData new file mode 100644 index 00000000..6d9f40d8 Binary files /dev/null and b/DIMS/tests/testthat/fixtures/test_evaluate_tics_file1.RData differ diff --git a/DIMS/tests/testthat/fixtures/test_evaluate_tics_file2.RData b/DIMS/tests/testthat/fixtures/test_evaluate_tics_file2.RData new file mode 100644 index 00000000..77d650f9 Binary files /dev/null and b/DIMS/tests/testthat/fixtures/test_evaluate_tics_file2.RData differ diff --git a/DIMS/tests/testthat/fixtures/test_evaluate_tics_file3.RData b/DIMS/tests/testthat/fixtures/test_evaluate_tics_file3.RData new file mode 100644 index 00000000..21fd5bca Binary files /dev/null and b/DIMS/tests/testthat/fixtures/test_evaluate_tics_file3.RData differ diff --git a/DIMS/tests/testthat/test_average_peaks.R b/DIMS/tests/testthat/test_average_peaks.R new file mode 100644 index 00000000..0a8d2345 --- /dev/null +++ b/DIMS/tests/testthat/test_average_peaks.R @@ -0,0 +1,30 @@ +# unit test for AveragePeaks + +# source all functions for PeakGrouping +source("../../preprocessing/average_peaks_functions.R") + +# test find_peak_groups +testthat::test_that("peaks are correctly averaged", { + # create peak list to test on: + samplenrs <- rep(c("repl_001", "repl_002", "repl_003"), 3) + test_peaklist_sorted_num <- matrix(0, nrow = 9, ncol = 5) + colnames(test_peaklist_sorted_num) <- c("mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + test_peaklist_sorted_num[, 1] <- 300 + (1:9) / 10000 + test_peaklist_sorted_num[, 5] <- 100 * (9:1) + test_peaklist_sorted <- as.data.frame(cbind(samplenr = samplenrs, test_peaklist_sorted_num)) + test_peaklist_sorted[, 2] <- as.numeric(test_peaklist_sorted[, 2]) + test_peaklist_sorted[, 6] <- as.numeric(test_peaklist_sorted[, 6]) + test_sample_name <- "P001" + test_empty_peaklist <- test_peaklist_sorted[0, ] + + # test that first peak is correctly averaged + expect_equal(as.numeric(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)[1, 6]), 600, tolerance = 0.001, TRUE) + # test number of rows + expect_equal(nrow(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)), 2) + # test column names + expect_equal(colnames(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)), + c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt"), TRUE) + # test what happens when peak list is empty + expect_error(average_peaks_per_sample(test_empty_peaklist, test_sample_name)) +}) + diff --git a/DIMS/tests/testthat/test_evaluate_tics.R b/DIMS/tests/testthat/test_evaluate_tics.R new file mode 100644 index 00000000..b474a81b --- /dev/null +++ b/DIMS/tests/testthat/test_evaluate_tics.R @@ -0,0 +1,66 @@ +# unit test for EvaluateTics + +# source all functions for PeakGrouping +source("../../preprocessing/evaluate_tics_functions.R") + +# test find_bad_replicates +testthat::test_that("TICS are correctly accepted or rejected", { + # It's necessary to copy/symlink the files to the current location for the combine_sum_adducts_parts function + # local: setwd("~/Development/DIMS_refactor_PeakFinding_codereview/CustomModules/DIMS/tests/testthat") + test_files <- list.files("fixtures/", "test_evaluate_tics", full.names = TRUE) + file.symlink(file.path(test_files), getwd()) + + # create replication pattern to test on: + technical_replicates <- paste0("test_evaluate_tics_file", 1:3) + test_repl_pattern <- list(technical_replicates) + names(test_repl_pattern) <- "sample1" + test_thresh2remove <- 10^9 + + # test that output has two entries + expect_equal(length(find_bad_replicates(test_repl_pattern, test_thresh2remove)), 2) + # test that first technical replicate is removed in positive scan mode + expect_equal(find_bad_replicates(test_repl_pattern, test_thresh2remove)$pos, "test_evaluate_tics_file1", TRUE) + # test that third technical replicate is removed in negative scan mode + expect_equal(find_bad_replicates(test_repl_pattern, test_thresh2remove)$neg, "test_evaluate_tics_file3", TRUE) + # test that output files are generated + expect_equal(sum(grepl("miss_infusions_", list.files("./"))), 2) + + # Remove symlinked files + files_remove <- list.files("./", "SummedAdducts_test.RData", full.names = TRUE) + file.remove(files_remove) + +}) + +# test remove_from_repl_pattern +testthat::test_that("technical replicates are correctly removed from replication pattern", { + # create replication pattern to test on: + technical_replicates <- paste0("test_evaluate_tics_file", 1:3) + test_repl_pattern <- list(technical_replicates) + names(test_repl_pattern) <- "sample1" + test_bad_samples <- "test_evaluate_tics_file2" + test_nr_replicates <- 3 + + # test that the output contains 1 sample + expect_equal(length(remove_from_repl_pattern(test_bad_samples, test_repl_pattern, test_nr_replicates)), 1) + # test that the output for the sample contains 2 technical replicates + expect_equal(length(remove_from_repl_pattern(test_bad_samples, test_repl_pattern, test_nr_replicates)$sample1), 2) + # test that the second technical replicate has been removed + expect_false(unique(grepl(test_bad_samples, remove_from_repl_pattern(test_bad_samples, test_repl_pattern, test_nr_replicates)$sample1))) +}) + +# test get_overview_tech_reps +testthat::test_that("overview of technical replicates is correctly created", { + # create replication pattern to test on: + technical_replicates <- paste0("test_evaluate_tics_file", 1:3) + test_repl_pattern_filtered <- list(technical_replicates) + names(test_repl_pattern_filtered) <- "sample1" + test_scanmode <- "positive" + + # test that overview is correctly created + expect_equal(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,1], "sample1") + expect_equal(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,3], "positive") + expect_true(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,2] == + paste0(technical_replicates, collapse = ";"), TRUE) + +}) + diff --git a/DIMS/tests/testthat/test_peak_finding_functions.R b/DIMS/tests/testthat/test_peak_finding_functions.R new file mode 100644 index 00000000..bc7f9d86 --- /dev/null +++ b/DIMS/tests/testthat/test_peak_finding_functions.R @@ -0,0 +1,87 @@ +# unit tests for PeakFinding functions: + +source("../../preprocessing/peak_finding_functions.R") + +# test search_regions_of_interest function: +test_that("regions of interest are correctly found for a single peak", { + # create intensities to test on + test_stdev <- 0.00006 + test_mass_vector <- rep(70, 10) + 1:10 * test_stdev + test_ints_range <- dnorm(test_mass_vector, mean = 70.0003, sd = test_stdev) + test_ints_fullrange <- as.data.frame(cbind(mz = test_mass_vector, + int = test_ints_range)) + + # expected output + expected_output <- matrix(c(1, 10, 10), nrow = 1, ncol = 3L) + colnames(expected_output) <- c("from", "to", "length") + rownames(expected_output) <- "to" + + # tests + expect_type(search_regions_of_interest(test_ints_fullrange), "double") + expect_equal(nrow(search_regions_of_interest(test_ints_fullrange)), 1) + expect_equal(search_regions_of_interest(test_ints_fullrange), expected_output, tolerance = 0.000001, TRUE) +}) +test_that("regions of interest are correctly found for two peaks", { + # create intensities to test on + test_stdev <- 0.00006 + test_mass_vector <- rep(70, 20) + 1:20 * test_stdev + test_ints_range <- dnorm(test_mass_vector, mean = 70.0003, sd = test_stdev) + + dnorm(test_mass_vector, mean = 70.0005, sd = test_stdev) + test_ints_fullrange <- as.data.frame(cbind(mz = test_mass_vector, + int = test_ints_range)) + + # expected output + expected_output <- as.data.frame(matrix(c(1, 8, 8, 20, 8, 13), nrow = 2, ncol = 3)) + colnames(expected_output) <- c("from", "to", "length") + rownames(expected_output) <- as.character(c(1, 2)) + + # tests + expect_type(search_regions_of_interest(test_ints_fullrange), "list") + expect_equal(nrow(search_regions_of_interest(test_ints_fullrange)), 2) + expect_equal(search_regions_of_interest(test_ints_fullrange)[, "length"], as.numeric(expected_output[, "length"])) + expect_equal(search_regions_of_interest(test_ints_fullrange), expected_output, check.attributes = FALSE) +}) + +# test integrate_peaks function: +test_that("peaks are correctly integrated", { + # create peak info to test on: + test_stdev <- 0.00006 + test_mass_vector <- rep(70, 20) + 1:20 * test_stdev + test_ints_range <- dnorm(test_mass_vector, mean = 70.0003, sd = test_stdev) + + dnorm(test_mass_vector, mean = 70.0005, sd = test_stdev) + test_ints_fullrange <- as.data.frame(cbind(mz = test_mass_vector, + int = test_ints_range)) + + # create regions of interest to test on: + test_regions_of_interest <- as.data.frame(matrix(c(1, 8, 8, 20, 8, 13), nrow = 2, ncol = 3L)) + colnames(test_regions_of_interest) <- c("from", "to", "length") + rownames(test_regions_of_interest) <- as.character(c(1, 2)) + + # set other parameters + test_resol <- 140000 + test_peak_thresh <- 2000 + + # expected output + expected_output <- as.data.frame(matrix(c(70.000357, 70.000521, 0.5148094, 0.3016569, 70.00006, 70.00048, + 70.00048, 70.00120, 15526.826791, 11950.90570), nrow = 2, ncol = 5)) + colnames(expected_output) <- c("mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + + expect_type(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh), "double") + expect_equal(nrow(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh)), 2) + expect_equal(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh)[, "mzmed.pkt"], + expected_output[, "mzmed.pkt"], tolerance = 0.0001) + expect_equal(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh)[, "height.pkt"], + expected_output[, "height.pkt"], tolerance = 0.0001) +}) + +# test get_fwhm function +test_that("fwhm is correctly calculated", { + # create peak info to test on: + test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 + test_mz_max <- max(test_mass_vector) + test_resol <- 140000 + + expect_type(get_fwhm(test_mz_max, test_resol), "double") + expect_equal(get_fwhm(test_mz_max, test_resol), 0.000295865, tolerance = 0.000001, TRUE) +}) +