From 0f724c4ccbe7fee6fa1e69d65c5566393d813530 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 31 Jan 2025 17:22:42 +0100 Subject: [PATCH 01/31] GenerateBreaks output split into 2 files --- DIMS/AssignToBins.R | 10 +++++----- DIMS/AssignToBins.nf | 4 ++-- DIMS/GenerateBreaks.R | 3 ++- DIMS/GenerateBreaks.nf | 1 + 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index e49d6c66..a25a5e70 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -1,5 +1,3 @@ -## adapted from 2-DIMS.R - # load required packages suppressPackageStartupMessages(library("xcms")) @@ -8,13 +6,15 @@ cmd_args <- commandArgs(trailingOnly = TRUE) mzml_filepath <- cmd_args[1] breaks_filepath <- cmd_args[2] -resol <- as.numeric(cmd_args[3]) +trimparams_filepath <- cmd_args[3] +resol <- as.numeric(cmd_args[4]) trim <- 0.1 dims_thresh <- 100 -# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, -# trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos +# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg load(breaks_filepath) +# load trim paramters: trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos +load(trimparams_filepath) # get sample name sample_name <- sub("\\..*$", "", basename(mzml_filepath)) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index d0a7098b..d3bc79ae 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -5,7 +5,7 @@ process AssignToBins { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(val(file_id), path(mzML_file), path(breaks_file)) + tuple(val(file_id), path(mzML_file), path(breaks_file), path(trim_params_file)) output: path("${file_id}.RData"), emit: rdata_file @@ -13,7 +13,7 @@ process AssignToBins { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $trim_params_file $params.resolution """ } diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index d07fd203..ddb68c56 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -57,5 +57,6 @@ for (i in 1:nr_segments) { } # generate output file -save(breaks_fwhm, breaks_fwhm_avg, trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "breaks.fwhm.RData") +save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData") +save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData") save(high_mz, file = "highest_mz.RData") diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index af617a8d..c486010d 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -10,6 +10,7 @@ process GenerateBreaks { output: path('breaks.fwhm.RData'), emit: breaks + path('trim_params.RData'), emit: trim_params path('highest_mz.RData'), emit: highest_mz script: From de495e3cd07216be24fd993abdfe7c75054da6be Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 3 Feb 2025 10:21:36 +0100 Subject: [PATCH 02/31] AverageTechReplicates replaced by EvaluateTics --- DIMS/EvaluateTics.R | 176 +++++++++++++++++++++++++++++++++++++++++++ DIMS/EvaluateTics.nf | 34 +++++++++ 2 files changed, 210 insertions(+) create mode 100644 DIMS/EvaluateTics.R create mode 100644 DIMS/EvaluateTics.nf diff --git a/DIMS/EvaluateTics.R b/DIMS/EvaluateTics.R new file mode 100644 index 00000000..5b8c1e30 --- /dev/null +++ b/DIMS/EvaluateTics.R @@ -0,0 +1,176 @@ +# load packages +library("ggplot2") +library("gridExtra") + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +init_file <- cmd_args[1] +nr_replicates <- as.numeric(cmd_args[2]) +run_name <- cmd_args[3] +dims_matrix <- cmd_args[4] +highest_mz_file <- cmd_args[5] +highest_mz <- get(load(highest_mz_file)) +trim_params_filepath <- cmd_args[6] +thresh2remove <- 1000000000 +dims_thresh <- 100 + +remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { + # collect list of samples to remove from replication pattern + remove_from_group <- NULL + for (sample_nr in 1:length(repl_pattern)){ + repl_pattern_1sample <- repl_pattern[[sample_nr]] + remove <- NULL + for (file_nr in 1:length(repl_pattern_1sample)) { + if (repl_pattern_1sample[file_nr] %in% bad_samples) { + remove <- c(remove, file_nr) + } + } + if (length(remove) == nr_replicates) { + remove_from_group <- c(remove_from_group, sample_nr) + } + if (!is.null(remove)) { + repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove] + } + } + if (length(remove_from_group) != 0) { + repl_pattern <- repl_pattern[-remove_from_group] + } + return(list("pattern" = repl_pattern)) +} + +# load init_file: contains repl_pattern +load(init_file) + +# load trim_params_file: contains trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos +load(trim_params_filepath) + +# lower the threshold below which a sample will be removed for DBS and for high m/z +if (dims_matrix == "DBS") { + thresh2remove <- 500000000 +} +if (highest_mz > 700) { + thresh2remove <- 1000000 +} + +# remove technical replicates which are below the threshold and average intensity over time +remove_neg <- NULL +remove_pos <- NULL +cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") +for (sample_nr in 1:length(repl_pattern)) { + tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) + tech_reps_array_pos <- NULL + tech_reps_array_neg <- NULL + for (file_nr in 1:length(tech_reps)) { + load(paste(tech_reps[file_nr], ".RData", sep = "")) + cat("\n\nParsing", tech_reps[file_nr]) + # positive scan mode: determine whether sum of intensities is above threshold + cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1])) + if (sum(peak_list$pos[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_pos <- c(remove_pos, tech_reps[file_nr]) + } + tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos) + # negative scan mode: determine whether sum of intensities is above threshold + cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) + if (sum(peak_list$neg[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_neg <- c(remove_neg, tech_reps[file_nr]) + } + tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg) + } +} + +pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) +repl_pattern_filtered <- pattern_list$pattern +save(repl_pattern_filtered, file = "negative_repl_pattern.RData") +write.table( + remove_neg, + file = "miss_infusions_negative.txt", + row.names = FALSE, + col.names = FALSE, + sep = "\t" +) + +pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) +repl_pattern_filtered <- pattern_list$pattern +save(repl_pattern_filtered, file = "positive_repl_pattern.RData") +write.table( + remove_pos, + file = "miss_infusions_positive.txt", + row.names = FALSE, + col.names = FALSE, + sep = "\t" +) + +## generate TIC plots +# get all txt files +tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt") +all_samps <- sub("_TIC\\..*$", "", basename(tic_files)) + +# determine maximum intensity +highest_tic_max <- 0 +for (file in tic_files) { + tic <- read.table(file) + this_tic_max <- max(tic$tic_intensity) + if (this_tic_max > highest_tic_max) { + highest_tic_max <- this_tic_max + max_sample <- sub("_TIC\\..*$", "", basename(file)) + } +} + +# create a list with information for all TIC plots +tic_plot_list <- list() +plot_nr <- 0 +for (sample_nr in c(1:length(repl_pattern))) { + tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) + sample_name <- names(repl_pattern)[sample_nr] + for (file_nr in 1:length(tech_reps)) { + plot_nr <- plot_nr + 1 + # repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = "")) + repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt")) + bad_color_pos <- tech_reps[file_nr] %in% remove_pos + bad_color_neg <- tech_reps[file_nr] %in% remove_neg + if (bad_color_neg & bad_color_pos) { + plot_color <- "#F8766D" + } else if (bad_color_pos) { + plot_color <- "#ED8141" + } else if (bad_color_neg) { + plot_color <- "#BF80FF" + } else { + plot_color <- "white" + } + tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) + + geom_line(linewidth = 0.3) + + geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_left_pos, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_right_pos, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_left_neg, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_right_neg, col = "red", linetype = 2, linewidth = 0.3) + + labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], " || ", sample_name)) + + theme(plot.background = element_rect(fill = plot_color), + axis.text = element_text(size = 4), + axis.title = element_text(size = 4), + plot.title = element_text(size = 6)) + tic_plot_list[[plot_nr]] <- tic_plot + } +} + +# create a layout matrix dependent on number of replicates +layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE) +# put TIC plots in matrix +tic_plot_pdf <- marrangeGrob( + grobs = tic_plot_list, + nrow = 10, ncol = nr_replicates, + layout_matrix = layout, + top = quote(paste( + "TICs of run", run_name, + " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", + g, "/", npages + )) +) + +# save to file +ggsave(filename = paste0(run_name, "_TICplots.pdf"), + tic_plot_pdf, width = 21, height = 29.7, units = "cm") + diff --git a/DIMS/EvaluateTics.nf b/DIMS/EvaluateTics.nf new file mode 100644 index 00000000..7ce58684 --- /dev/null +++ b/DIMS/EvaluateTics.nf @@ -0,0 +1,34 @@ +process EvaluateTics { + tag "DIMS EvaluateTics" + label 'EvaluateTics' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(rdata_file) + path(tic_txt_files) + path(init_file) + val(nr_replicates) + val(analysis_id) + val(matrix) + path(highest_mz_file) + path(trim_params_file) + + output: + path('*_repl_pattern.RData'), emit: pattern_files + path('miss_infusions_negative.txt') + path('miss_infusions_positive.txt') + path('*_TICplots.pdf') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/EvaluateTics.R $init_file \ + $params.nr_replicates \ + $analysis_id \ + $matrix \ + $highest_mz_file \ + $trim_params_file + """ +} + + From 434a41f7aca65177e226405058e30330207efed3 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 3 Feb 2025 10:22:30 +0100 Subject: [PATCH 03/31] refactored PeakFinding, peak finding funtions moved to folder preprocessing --- DIMS/PeakFinding.R | 87 ++- DIMS/PeakFinding.nf | 2 +- DIMS/preprocessing/peak_finding_functions.R | 779 ++++++++++++++++++++ 3 files changed, 831 insertions(+), 37 deletions(-) create mode 100644 DIMS/preprocessing/peak_finding_functions.R diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index 697f8f7e..738a0bc7 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -1,53 +1,68 @@ -## adapted from 4-peakFinding.R - # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -sample_file <- cmd_args[1] +replicate_mzmlfile <- cmd_args[1] breaks_file <- cmd_args[2] resol <- as.numeric(cmd_args[3]) scripts_dir <- cmd_args[4] -thresh <- 2000 -outdir <- "./" +peak_thresh <- 2000 # load in function scripts -source(paste0(scripts_dir, "do_peakfinding.R")) -source(paste0(scripts_dir, "check_overlap.R")) -source(paste0(scripts_dir, "search_mzrange.R")) -source(paste0(scripts_dir, "fit_optim.R")) -source(paste0(scripts_dir, "fit_gaussian.R")) -source(paste0(scripts_dir, "fit_init.R")) -source(paste0(scripts_dir, "get_fwhm.R")) -source(paste0(scripts_dir, "get_stdev.R")) -source(paste0(scripts_dir, "optimize_gaussfit.R")) -source(paste0(scripts_dir, "fit_peaks.R")) -source(paste0(scripts_dir, "fit_gaussians.R")) -source(paste0(scripts_dir, "estimate_area.R")) -source(paste0(scripts_dir, "get_fit_quality.R")) -source(paste0(scripts_dir, "check_overlap.R")) -source(paste0(scripts_dir, "sum_curves.R")) -source(paste0(scripts_dir, "within_ppm.R")) +preprocessing_scripts_dir <- gsub("Utils", "preprocessing", scripts_dir) +source(paste0(preprocessing_scripts_dir, "peak_finding_functions.R")) load(breaks_file) -# Load output of AverageTechReplicates for a sample -sample_avgtechrepl <- get(load(sample_file)) -if (grepl("_pos", sample_file)) { - scanmode <- "positive" -} else if (grepl("_neg", sample_file)) { - scanmode <- "negative" -} +# Load output of AssignToBins for a sample +sample_techrepl <- get(load(replicate_mzmlfile)) +scanmodes <- c("positive", "negative") +ints_pos <- peak_list$pos +ints_neg <- peak_list$neg # Initialize options(digits = 16) -# Number used to calculate area under Gaussian curve -int_factor <- 1 * 10^5 -# Initial value used to estimate scaling parameter -scale <- 2 -width <- 1024 -height <- 768 # run the findPeaks function +techrepl_name <- colnames(ints_pos)[1] + +for (scanmode in scanmodes) { + # turn dataframe with intensities into a named list + if (scanmode == "positive") { + ints_perscanmode <- peak_list$pos + } else if (scanmode == "negative") { + ints_perscanmode <- peak_list$neg + } + + ints_fullrange <- as.vector(ints_perscanmode) + names(ints_fullrange) <- rownames(ints_perscanmode) + + start.time <- Sys.time() + + # look for m/z range for all peaks + allpeaks_values <- search_mzrange(ints_fullrange, resol, techrepl_name, scanmode, peak_thresh) + + end.time <- Sys.time() + time.taken <- end.time - start.time + time.taken + + # turn the list into a dataframe + outlist_persample <- NULL + outlist_persample <- cbind("samplenr" = allpeaks_values$nr, + "mzmed.pkt" = allpeaks_values$mean, + "fq" = allpeaks_values$qual, + "mzmin.pkt" = allpeaks_values$min, + "mzmax.pkt" = allpeaks_values$max, + "height.pkt" = allpeaks_values$area) + + # remove peaks with height = 0 + outlist_persample <- outlist_persample[outlist_persample[, "height.pkt"] != 0, ] + + # save output to file + save(outlist_persample, file = paste0(techrepl_name, "_", scanmode, ".RData")) + + # generate text output to log file on number of spikes for this sample + # spikes are peaks that are too narrow, e.g. 1 data point + cat(paste("There were", allpeaks_values$spikes, "spikes")) + +} -# do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) -do_peakfinding(sample_avgtechrepl, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf index 0097d128..dad2a497 100644 --- a/DIMS/PeakFinding.nf +++ b/DIMS/PeakFinding.nf @@ -12,6 +12,6 @@ process PeakFinding { script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.preprocessing_scripts_dir """ } diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R new file mode 100644 index 00000000..331d563b --- /dev/null +++ b/DIMS/preprocessing/peak_finding_functions.R @@ -0,0 +1,779 @@ +# functions for peak finding +search_mzrange <- function(ints_fullrange, resol, sample_name, scanmode, peak_thresh) { + #' Divide the full m/z range into regions of interest with min, max and mean m/z + #' + #' @param ints_fullrange: Named list of intensities (float) + #' @param resol: Value for resolution (integer) + #' @param sample_name: Sample name (string) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param peak_thresh: Value for noise level threshold (integer) + #' + #' @return allpeaks_values: list of m/z regions of interest + + # Number used to calculate area under Gaussian curve + int_factor <- 1 * 10^5 + + # initialize list to store results for all peaks + allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL, + "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0) + + # find indices where intensity is not equal to zero + nonzero_positions <- as.vector(which(ints_fullrange != 0)) + + # initialize + # start position of the first peak + start_index <- nonzero_positions[1] + # maximum length of region of interest + max_roi_length <- 15 + + # find regions of interest + for (running_index in 1:length(nonzero_positions)) { + # find position of the end of a peak. + if (running_index < length(nonzero_positions) && (nonzero_positions[running_index + 1] - nonzero_positions[running_index]) > 1) { + end_index <- nonzero_positions[running_index] + # get m/z values and intensities for this region of interest + mass_vector <- as.numeric(names(ints_fullrange)[c(start_index:end_index)]) + int_vector <- as.vector(ints_fullrange[c(start_index:end_index)]) + # check if intensity is above threshold or the maximum intensity is NaN + if (max(int_vector) < peak_thresh || is.nan(max(int_vector))) { + # go to next region of interest + start_index <- nonzero_positions[running_index + 1] + next + } + # check if there are more intensities than maximum for region of interest + if (length(int_vector) > max_roi_length) { + print(length(int_vector)) + print(running_index) + # trim lowest intensities to zero + #int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0 + # split the range into multiple sub ranges + #sub_range <- int_vector + #names(sub_range) <- mass_vector + #allpeaks_values <- search_mzrange(sub_range, allpeaks_values, resol, + # sample_name, scanmode, peak_thresh) + # A proper peak needs to have at least 3 intensities above threshold + } else if (length(int_vector) > 3) { + # check if the sum of intensities is above zero. Why is this necessary? + #if (sum(int_vector) == 0) next + # define mass_diff as difference between last and first value of mass_vector + mass_diff <- mass_vector[length(mass_vector)] - mass_vector[1] + # generate a second mass_vector with equally spaced m/z values + mass_vector2 <- seq(mass_vector[1], mass_vector[length(mass_vector)], + length = mass_diff * int_factor) + + # Find the index in int_vector with the highest intensity + # max_index <- which(int_vector == max(int_vector)) + # get initial fit values + roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, # max_index, + resol, force = length(max_index), + use_bounds = FALSE, scanmode) + + if (roi_values$qual[1] == 1) { + # get optimized fit values + roi_values <- fit_optim(mass_vector, int_vector, resol, scanmode) + # add region of interest to list of all peaks + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) + allpeaks_values$qual <- c(allpeaks_values$qual, 0) + allpeaks_values$spikes <- allpeaks_values$spikes + 1 + + } else { + for (j in 1:length(roi_values$mean)){ + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean[j]) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area[j]) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min[1]) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max[1]) + allpeaks_values$qual <- c(allpeaks_values$qual, roi_values$qual[1]) + } + } + + } else { + + roi_values <- fit_optim(mass_vector, int_vector, resol, scanmode) + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) + allpeaks_values$qual <- c(allpeaks_values$qual, 0) + allpeaks_values$spikes <- allpeaks_values$spikes + 1 + } + } + start_index <- nonzero_positions[running_index + 1] + } + + return(allpeaks_values) +} + +# remove plot sections (commented out) +fit_gaussian <- function(mass_vector2, mass_vector, int_vector, + resol, force, use_bounds, scanmode) { + #' Fit 1, 2, 3 or 4 Gaussian peaks in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param resol: Value for resolution (integer) + #' @param force: Number of local maxima in int_vector (integer) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' @param scanmode: Scan mode, positive or negative (string) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + # Number used to calculate area under Gaussian curve + int_factor <- 1 * 10^5 + + # Find the index in int_vector with the highest intensity + max_index <- which(int_vector == max(int_vector)) + + # Initialise + peak_mean <- NULL + peak_area <- NULL + peak_qual <- NULL + peak_min <- NULL + peak_max <- NULL + fit_quality1 <- 0.15 + fit_quality <- 0.2 + + # One local maximum: + if (force == 1) { + # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual) + fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, resol, + fit_quality1, use_bounds) + + # set initial value for scale factor + scale <- 2 + # test if the mean is outside the m/z range + if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) { + # run this function again with fixed boundaries + return(fit_gaussian(mass_vector2, mass_vector, int_vector, resol, + force = 1, use_bounds = TRUE, scanmode)) + } else { + # test if the fit is bad + if (fit_values$qual > fit_quality1) { + # Try to fit two curves; find two local maxima. NB: max_index (now new_index) removed from fit_gaussian + new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 + # test if there are two indices in new_index + if (length(new_index) != 2) { + new_index <- round(length(mass_vector) / 3) + new_index <- c(new_index, 2 * new_index) + } + # run this function again with two local maxima + return(fit_gaussian(mass_vector2, mass_vector, int_vector, + resol, force = 2, use_bounds = FALSE, scanmode)) + # good fit + } else { + peak_mean <- c(peak_mean, fit_values$mean) + peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, + fit_values$sigma)) + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + } + + #### Two local maxima; need at least 6 data points for this #### + } else if (force == 2 && (length(mass_vector) > 6)) { + # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual) + fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, max_index, resol, + use_bounds, fit_quality) + # test if one of the means is outside the m/z range + if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || + fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) { + # check if fit quality is bad + if (fit_values$qual > fit_quality) { + # run this function again with fixed boundaries + return(fit_gaussian(mass_vector2, mass_vector, int_vector, resol, + force = 2, use_bounds = TRUE, scanmode)) + } else { + # check which mean is outside range and remove it from the list of means + # NB: peak_mean and other variables have not been given values from 2-peak fit yet! + for (i in 1:length(fit_values$mean)){ + if (fit_values$mean[i] < mass_vector[1] || fit_values$mean[i] > mass_vector[length(mass_vector)]) { + peak_mean <- c(peak_mean, -i) + peak_area <- c(peak_area, -i) + } else { + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + # if all means are within range + } else { + # check for bad fit + if (fit_values$qual > fit_quality) { + # Try to fit three curves; find three local maxima + new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 + # test if there are three indices in new_index + if (length(new_index) != 3) { + new_index <- round(length(mass_vector) / 4) + new_index <- c(new_index, 2 * new_index, 3 * new_index) + } + # run this function again with three local maxima + return(fit_gaussian(mass_vector2, mass_vector, int_vector, + resol, force = 3, use_bounds = FALSE, scanmode)) + # good fit, all means are within m/z range + } else { + # check if means are within 3 ppm and sum if so + tmp <- fit_values$qual + nr_means_new <- -1 + nr_means <- length(fit_values$mean) + while (nr_means != nr_means_new) { + nr_means <- length(fit_values$mean) + fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, + mass_vector2, mass_vector, ppm = 4, resol) + nr_means_new <- length(fit_values$mean) + } + # restore original quality score + fit_values$qual <- tmp + + for (i in 1:length(fit_values$mean)){ + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + } + + } else { # More than two local maxima; fit 1 peak. + scale <- 2 + fit_quality1 <- 0.40 + use_bounds <- TRUE + max_index <- which(int_vector == max(int_vector)) + fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, resol, + fit_quality1, use_bounds) + # check for bad fit + if (fit_values$qual > fit_quality1) { + # get fit values from fit_optim + fit_values <- fit_optim(mass_vector, int_vector, resol, scanmode) + peak_mean <- c(peak_mean, fit_values$mean) + peak_area <- c(peak_area, fit_values$area) + peak_min <- fit_values$min + peak_max <- fit_values$max + peak_qual <- 0 + } else { + peak_mean <- c(peak_mean, fit_values$mean) + peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma)) + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + } + + # put all values for this region of interest into a list + roi_value_list <- list("mean" = peak_mean, + "area" = peak_area, + "qual" = peak_qual, + "min" = peak_min, + "max" = peak_max) + return(roi_value_list) +} + + +fit_1peak <- function(mass_vector2, mass_vector, int_vector, max_index, + resol, fit_quality, use_bounds) { + #' Fit 1 Gaussian peak in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param resol: Value for resolution (integer) + #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + # set initial value for scale + scale <- 2 + + if (length(int_vector) < 3) { + message("Range too small, no fit possible!") + } else { + if ((length(int_vector) == 4)) { + # fit 1 peak + mu <- weighted.mean(mass_vector, int_vector) + sigma <- get_stdev(mass_vector, int_vector) + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds) + } else { + # set range vector + if ((length(mass_vector) - length(max_index)) < 2) { + range1 <- c((length(mass_vector) - 4) : length(mass_vector)) + } else if (length(max_index) < 2) { + range1 <- c(1:5) + } else { + range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) + } + if (range1[1] == 0) range1 <- range1[-1] + # remove NA + if (length(which(is.na(int_vector[range1]))) != 0) { + range1 <- range1[-which(is.na(int_vector[range1]))] + } + # fit 1 peak + mu <- weighted.mean(mass_vector[range1], int_vector[range1]) + sigma <- get_stdev(mass_vector[range1], int_vector[range1]) + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds) + } + + p1 <- fitted_peak$par + + # get new value for fit quality and scale + fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new + scale_new <- 1.2 * scale + + # bad fit + if (fq_new > fit_quality) { + # optimize scaling factor + fq <- 0 + scale <- 0 + if (sum(int_vector) > sum(p1[2] * dnorm(mass_vector, p1[1], sigma))) { + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) { + fq <- fq_new + scale <- scale_new + # fit 1 peak + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds) + p1 <- fitted_peak$par + # get new value for fit quality and scale + fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new + scale_new <- 1.2 * scale + } + } else { + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) { + fq <- fq_new + scale <- scale_new + # fit 1 peak + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds) + p1 <- fitted_peak$par + # get new value for fit quality and scale + fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new + scale_new <- 0.8 * scale + } + } + # use optimized scale factor to fit 1 peak + if (fq < fq_new) { + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds) + p1 <- fitted_peak$par + fq_new <- fq + } + } + } + + roi_value_list <- list("mean" = p1[1], "scale" = p1[2], "sigma" = sigma, "qual" = fq_new) + return(roi_value_list) +} + +fit_2peaks <- function(mass_vector2, mass_vector, int_vector, max_index, resol, use_bounds = FALSE, + fit_quality) { + #' Fit 2 Gaussian peaks in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param resol: Value for resolution (integer) + #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + peak_mean <- NULL + peak_area <- NULL + peak_scale <- NULL + peak_sigma <- NULL + + # set range vectors for 2 peaks + range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) + if (range1[1] == 0) range1 <- range1[-1] + range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2) + if (length(mass_vector) < range2[length(range2)]) range2 <- range2[-length(range2)] + range1 <- check_overlap(range1, range2)[[1]] + range2 <- check_overlap(range1, range2)[[2]] + # check for negative or 0 + remove <- which(range1 < 1) + if (length(remove) > 0) range1 <- range1[-remove] + remove <- which(range2 < 1) + if (length(remove) > 0) range2 <- range2[-remove] + # remove NA + if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))] + if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))] + + # fit 2 peaks, first separately, then together + mu1 <- weighted.mean(mass_vector[range1], int_vector[range1]) + sigma1 <- get_stdev(mass_vector[range1], int_vector[range1]) + fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds) + p1 <- fitted_peak$par + # second peak + mu2 <- weighted.mean(mass_vector[range2], int_vector[range2]) + sigma2 <- get_stdev(mass_vector[range2], int_vector[range2]) + fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds) + p2 <- fitted_peak$par + # combined + fitted_2peaks <- fit_2gaussians(mass_vector, int_vector, sigma1, sigma2, p1[1], p1[2], p2[1], p2[2], use_bounds) + pc <- fitted_2peaks$par + + # get fit quality + if (is.null(sigma2)) sigma2 <- sigma1 + sum_fit <- (pc[2] * dnorm(mass_vector, pc[1], sigma1)) + + (pc[4] * dnorm(mass_vector, pc[3], sigma2)) + fq <- get_fit_quality(mass_vector, int_vector, sort(c(pc[1], pc[3]))[1], sort(c(pc[1], pc[3]))[2], + resol, sum_fit = sum_fit)$fq_new + + # get parameter values + area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor) + area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor) + peak_area <- c(peak_area, area1) + peak_area <- c(peak_area, area2) + peak_mean <- c(peak_mean, pc[1]) + peak_mean <- c(peak_mean, pc[3]) + peak_scale <- c(peak_scale, pc[2]) + peak_scale <- c(peak_scale, pc[4]) + peak_sigma <- c(peak_sigma, sigma1) + peak_sigma <- c(peak_sigma, sigma2) + + roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq) + return(roi_value_list) +} + +fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, use_bounds) { + #' Fit a Gaussian curve for a peak with given parameters + #' + #' @param mass_vector: Vector of masses (float) + #' @param int_vector: Vector of intensities (float) + #' @param sigma: Value for width of the peak (float) + #' @param query_mass: Value for mass at center of peak (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return opt_fit: list of parameters and values describing the optimal fit + + # Initial value used to estimate scaling parameter + scale <- 2 + + # define optimization function for optim based on normal distribution + opt_f <- function(params) { + d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma) + sum((d - int_vector) ^ 2) + } + if (use_bounds) { + # determine lower and upper boundaries + lower <- c(mass_vector[1], 0, mass_vector[1], 0) + upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) + # get optimal value for fitted Gaussian curve + opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), + opt_f, control = list(maxit = 10000), method = "L-BFGS-B", + lower = lower, upper = upper) + } else { + opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), + opt_f, control = list(maxit = 10000)) + } + return(opt_fit) +} + +fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2, + query_mass1, scale1, + query_mass2, scale2, use_bounds) { + #' Fit two Gaussian curves for a peak with given parameters + #' + #' @param mass_vector: Vector of masses (float) + #' @param int_vector: Vector of intensities (float) + #' @param sigma1: Value for width of the first peak (float) + #' @param sigma2: Value for width of the second peak (float) + #' @param query_mass1: Value for mass at center of first peak (float) + #' @param scale1: Value for scaling intensities for first peak (float) + #' @param query_mass2: Value for mass at center of second peak (float) + #' @param scale2: Value for scaling intensities for second peak (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return opt_fit: list of parameters and values describing the optimal fit + + # define optimization function for optim based on normal distribution + opt_f <- function(params) { + d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) + + params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2) + sum((d - int_vector) ^ 2) + } + + if (use_bounds) { + # determine lower and upper boundaries + lower <- c(mass_vector[1], 0, mass_vector[1], 0) + upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) + # get optimal value for 2 fitted Gaussian curves + if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { + sigma2 <- sigma1 + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass1), as.numeric(scale1)), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper) + } else { + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass2), as.numeric(scale2)), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper) + } + } else { + if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { + sigma2 <- sigma1 + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass1), as.numeric(scale1)), + opt_f, control = list(maxit = 10000)) + } else { + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass2), as.numeric(scale2)), + opt_f, control = list(maxit = 10000)) + } + } + return(opt_fit) +} + +get_stdev <- function(mass_vector, int_vector, resol = 140000) { + #' Calculate standard deviation to determine width of a peak + #' + #' @param mass_vector: Vector of 3 mass values (float) + #' @param int_vector: Vector of 3 intensities (float) + #' @param resol: Value for resolution (integer) + #' + #' @return stdev: Value for standard deviation + # find maximum intensity in vector + max_index <- which(int_vector == max(int_vector)) + # find corresponding mass at maximum intensity + max_mass <- mass_vector[max_index] + # calculate resolution at given m/z value + resol_mz <- resol * (1 / sqrt(2) ^ (log2(max_mass / 200))) + # calculate full width at half maximum + fwhm <- max_mass / resol_mz + # calculate standard deviation + stdev <- (fwhm / 2) * 0.85 + return(stdev) +} + +## adapted from getFitQuality.R +# parameter not used: mu_last +get_fit_quality <- function(mass_vector, int_vector, mu_first, mu_last, resol, scale = NULL, sigma = NULL, sum_fit = NULL) { + #' Fit 1 Gaussian peak in small region of m/z + #' + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param mu_first: Value for first peak (float) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param sum_fit: Value indicating quality of fit of Gaussian curve (float) + #' + #' @return list_params: list of parameters indicating quality of fit (list) + if (is.null(sum_fit)) { + mass_vector_int <- mass_vector + int_vector_int <- int_vector + # get new fit quality + fq_new <- mean(abs((scale * dnorm(mass_vector_int, mu_first, sigma)) - int_vector_int) / + rep((max(scale * dnorm(mass_vector_int, mu_first, sigma)) / 2), length(mass_vector_int))) + } else { + sum_fit_int <- sum_fit + int_vector_int <- int_vector + mass_vector_int <- mass_vector + # get new fit quality + fq_new <- mean(abs(sum_fit_int - int_vector_int) / rep(max(sum_fit_int) /2, length(sum_fit_int))) + } + + # Prevent division by 0 + if (is.nan(fq_new)) fq_new <- 1 + + list_params <- list("fq_new" = fq_new, "x_int" = mass_vector_int, "y_int" = int_vector_int) + return(list_params) +} + +estimate_area <- function(mass_max, resol, scale, sigma) { + #' Estimate area of Gaussian curve + #' + #' @param mass_max: Value for m/z at maximum intensity of a peak (float) + #' @param resol: Value for resolution (integer) + #' @param scale: Value for peak width (float) + #' @param sigma: Value for standard deviation (float) + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' + #' @return area_curve: Value for area under the Gaussian curve (float) + + # Number used to calculate area under Gaussian curve + int_factor <- 1 * 10^5 + + # avoid vectors that are too big (cannot allocate vector of size ...) + if (mass_max > 1200) return(0) + + # generate a mass_vector with equally spaced m/z values + fwhm <- get_fwhm(mass_max, resol) + mz_min <- mass_max - 2 * fwhm + mz_max <- mass_max + 2 * fwhm + mz_range <- mz_max - mz_min + mass_vector2 <- seq(mz_min, mz_max, length = mz_range * int_factor) + + # estimate area under the curve + area_curve <- sum(scale * dnorm(mass_vector2, mass_max, sigma)) / 100 + + return(area_curve) +} + +get_fwhm <- function(query_mass, resol) { + #' Calculate fwhm (full width at half maximum intensity) for a peak + #' + #' @param query_mass: Value for mass (float) + #' @param resol: Value for resolution (integer) + #' + #' @return fwhm: Value for full width at half maximum (float) + + # set aberrant values of query_mass to zero + if (is.nan(query_mass)) query_mass <- 0 + if (is.na(query_mass)) query_mass <- 0 + if (is.null(query_mass)) query_mass <- 0 + if (query_mass < 0) query_mass <- 0 + # calculate resolution at given m/z value + resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200))) + # calculate full width at half maximum + fwhm <- query_mass / resol_mz + return(fwhm) +} + +fit_optim <- function(mass_vector, int_vector, resol, scanmode) { + #' Determine optimized fit of Gaussian curve to small region of m/z + #' + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Vector of intensities for a region of interest (float) + #' @param resol: Value for resolution (integer) + #' @param scanmode: Scan mode, positive or negative (string) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + # Number used to calculate area under Gaussian curve + int_factor <- 1 * 10^5 + factor <- 1.5 + # Find the index in int_vector with the highest intensity + max_index <- which(int_vector == max(int_vector))[1] + mass_max <- mass_vector[max_index] + int_max <- int_vector[max_index] + # get peak width + fwhm <- get_fwhm(mass_max, resol) + # simplify the peak shape: represent it by a triangle + mass_max_simple <- c(mass_max - factor * fwhm, mass_max, mass_max + factor * fwhm) + int_max_simple <- c(0, int_max, 0) + + # define mass_diff as difference between last and first value of mass_max_simple + mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1] + # generate a second mass_vector with equally spaced m/z values + mass_vector2 <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], + length = mass_diff * int_factor) + sigma <- get_stdev(mass_vector2, int_max_simple) + # define optimization function for optim based on normal distribution + opt_f <- function(p, mass_vector, int_vector, sigma, mass_max) { + curve <- p * dnorm(mass_vector, mass_max, sigma) + return((max(curve) - max(int_vector))^2) + } + + # get optimal value for fitted Gaussian curve + opt_fit <- optimize(opt_f, c(0, 100000), tol = 0.0001, mass_vector, int_vector, sigma, mass_max) + scale <- opt_fit$minimum + + # get an estimate of the area under the peak + area <- estimate_area(mass_max, resol, scale, sigma) + # put all values for this region of interest into a list + roi_value_list <- list("mean" = mass_max, + "area" = area, + "min" = mass_vector2[1], + "max" = mass_vector2[length(mass_vector2)]) + return(roi_value_list) +} + +within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm = 4, resol) { + #' Test whether two mass ranges are within ppm distance of each other + #' + #' @param mean: Value for mean m/z (float) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param sigma: Value for standard deviation (float) + #' @param area: Value for area under the curve (float) + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param ppm: Value for distance between two values of mass (integer) + #' @param resol: Value for resolution (integer) + #' + #' @return list_params: list of parameters indicating quality of fit (list) + + # sort + index <- order(mean) + mean <- mean[index] + scale <- scale[index] + sigma <- sigma[index] + area <- area[index] + + summed <- NULL + remove <- NULL + + if (length(mean) > 1) { + for (i in 2:length(mean)) { + if ((abs(mean[i - 1] - mean[i]) / mean[i - 1]) * 10^6 < ppm) { + + # avoid double occurrance in sum + if ((i - 1) %in% summed) next + + result_values <- sum_curves(mean[i - 1], mean[i], scale[i - 1], scale[i], sigma[i - 1], sigma[i], + mass_vector2, mass_vector, resol) + summed <- c(summed, i - 1, i) + if (is.nan(result_values$mean)) result_values$mean <- 0 + mean[i - 1] <- result_values$mean + mean[i] <- result_values$mean + area[i - 1] <- result_values$area + area[i] <- result_values$area + scale[i - 1] <- result_values$scale + scale[i] <- result_values$scale + sigma[i - 1] <- result_values$sigma + sigma[i] <- result_values$sigma + + remove <- c(remove, i) + } + } + } + + if (length(remove) != 0) { + mean <- mean[-c(remove)] + area <- area[-c(remove)] + scale <- scale[-c(remove)] + sigma <- sigma[-c(remove)] + } + + list_params <- list("mean" = mean, "area" = area, "scale" = scale, "sigma" = sigma, "qual" = NULL) + return(list_params) +} + +sum_curves <- function(mean1, mean2, scale1, scale2, sigma1, sigma2, mass_vector2, mass_vector, resol) { + #' Sum two curves + #' + #' @param mean1: Value for mean m/z of first peak (float) + #' @param mean2: Value for mean m/z of second peak (float) + #' @param scale1: Initial value used to estimate scaling parameter for first peak (integer) + #' @param scale2: Initial value used to estimate scaling parameter for second peak (integer) + #' @param sigma1: Value for standard deviation for first peak (float) + #' @param sigma2: Value for standard deviation for second peak (float) + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param resol: Value for resolution (integer) + #' + #' @return list_params: list of parameters indicating quality of fit (list) + + sum_fit <- (scale1 * dnorm(mass_vector2, mean1, sigma1)) + (scale2 * dnorm(mass_vector2, mean2, sigma2)) + + mean1_plus2 <- weighted.mean(c(mean1, mean2), c(max(scale1 * dnorm(mass_vector2, mean1, sigma1)), + max(scale2 * dnorm(mass_vector2, mean2, sigma2)))) + + # get new values for parameters + fwhm <- get_fwhm(mean1_plus2, resol) + area <- max(sum_fit) + scale <- scale1 + scale2 + sigma <- (fwhm / 2) * 0.85 + + list_params <- list("mean" = mean1_plus2, "area" = area, "scale" = scale, "sigma" = sigma) + return(list_params) +} + From 9d7a69fcb2a3330bc77e05bbc645008c7793627d Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 10 Feb 2025 10:03:21 +0100 Subject: [PATCH 04/31] refactor DIMS PeakFinding, flow between scripts --- DIMS/AveragePeaks.R | 72 ++++ DIMS/AveragePeaks.nf | 18 + DIMS/PeakFinding.R | 30 +- DIMS/PeakGrouping.R | 2 +- DIMS/preprocessing/peak_finding_functions.R | 430 +++++++++++--------- 5 files changed, 334 insertions(+), 218 deletions(-) create mode 100644 DIMS/AveragePeaks.R create mode 100644 DIMS/AveragePeaks.nf diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R new file mode 100644 index 00000000..695c08fa --- /dev/null +++ b/DIMS/AveragePeaks.R @@ -0,0 +1,72 @@ +# define parameters +# ppm as fixed value, not the same ppm as in peak grouping +ppm_peak <- 2 + +library(dplyr) + +scanmodes <- c("positive", "negative") + +for (scanmode in scanmodes){ + # get sample names + load(paste0(scanmode, "_repl_pattern.RData")) + sample_names <- names(repl_pattern_filtered) + # initialize + outlist_total <- NULL + # for each biological sample, average peaks in technical replicates + for (sample_name in sample_names) { + print(sample_name) + # Initialize per sample + peaklist_allrepl <- NULL + nr_repl_persample <- 0 + # averaged_peaks <- matrix(0, nrow = 10 ^ 7, ncol = 6) # how big does it need to be? + averaged_peaks <- matrix(0, nrow = 0, ncol = 6) # append + colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + # load RData files of technical replicates belonging to biological sample + sample_techreps_file <- repl_pattern_filtered[sample_name][[1]] + for (file_nr in 1:length(sample_techreps_file)) { + print(sample_techreps_file[file_nr]) + tech_repl_file <- paste0(sample_techreps_file[file_nr], "_positive.RData") + tech_repl <- get(load(tech_repl_file)) + # combine data for all technical replicates + peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl) + # count number of replicates for each biological sample + nr_repl_persample <- nr_repl_persample + 1 + } + # sort on mass + peaklist_allrepl_df <- as.data.frame(peaklist_allrepl) + peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt) + peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) + # peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(desc(height.pkt)) + peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt) + # average over technical replicates + while (nrow(peaklist_allrepl_sorted) > 1) { + # store row numbers + peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted) + # find the peaks in the dataset with corresponding m/z plus or minus tolerance + reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1] + mz_tolerance <- (reference_mass * ppm_peak) / 10^6 + minmz_ref <- reference_mass - mz_tolerance + maxmz_ref <- reference_mass + mz_tolerance + select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref)) + select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ] + nrsamples <- length(select_peak_indices) + # put averaged intensities into a new row and append to averaged_peaks + averaged_1peak <- matrix(0, nrow = 1, ncol = 6) + colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + # calculate m/z values for peak group + averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt) + averaged_1peak[1, "fq"] <- nrsamples + averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt) + # put intensities into proper columns + peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ] + averaged_peaks <- rbind(averaged_peaks, averaged_1peak) + } + # add sample name to first column and append to outlist_total for all samples + averaged_peaks[ , "samplenr"] <- sample_name + outlist_total <- rbind(outlist_total, averaged_peaks) + } + save(outlist_total, file = paste0("AvgPeaks_", scanmode, ".RData")) +} + diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf new file mode 100644 index 00000000..9d49f5f8 --- /dev/null +++ b/DIMS/AveragePeaks.nf @@ -0,0 +1,18 @@ +process AveragePeaks { + tag "DIMS AveragePeaks" + label 'AveragePeaks' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(rdata_files) + path(replication_pattern) + + output: + path 'AvgPeaks_*.RData' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R + """ +} diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index 738a0bc7..c54a64cf 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -1,30 +1,26 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -replicate_mzmlfile <- cmd_args[1] +replicate_rdatafile <- cmd_args[1] breaks_file <- cmd_args[2] resol <- as.numeric(cmd_args[3]) -scripts_dir <- cmd_args[4] +preprocessing_scripts_dir <- cmd_args[4] peak_thresh <- 2000 # load in function scripts -preprocessing_scripts_dir <- gsub("Utils", "preprocessing", scripts_dir) source(paste0(preprocessing_scripts_dir, "peak_finding_functions.R")) load(breaks_file) # Load output of AssignToBins for a sample -sample_techrepl <- get(load(replicate_mzmlfile)) -scanmodes <- c("positive", "negative") -ints_pos <- peak_list$pos -ints_neg <- peak_list$neg +sample_techrepl <- get(load(replicate_rdatafile)) +techrepl_name <- colnames(peak_list$pos)[1] # Initialize options(digits = 16) # run the findPeaks function -techrepl_name <- colnames(ints_pos)[1] - +scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { # turn dataframe with intensities into a named list if (scanmode == "positive") { @@ -36,18 +32,12 @@ for (scanmode in scanmodes) { ints_fullrange <- as.vector(ints_perscanmode) names(ints_fullrange) <- rownames(ints_perscanmode) - start.time <- Sys.time() - # look for m/z range for all peaks - allpeaks_values <- search_mzrange(ints_fullrange, resol, techrepl_name, scanmode, peak_thresh) - - end.time <- Sys.time() - time.taken <- end.time - start.time - time.taken + allpeaks_values <- search_mzrange(ints_fullrange, resol, techrepl_name, peak_thresh) # turn the list into a dataframe - outlist_persample <- NULL - outlist_persample <- cbind("samplenr" = allpeaks_values$nr, + outlist_techrep <- NULL + outlist_techrep <- cbind("samplenr" = allpeaks_values$nr, "mzmed.pkt" = allpeaks_values$mean, "fq" = allpeaks_values$qual, "mzmin.pkt" = allpeaks_values$min, @@ -55,10 +45,10 @@ for (scanmode in scanmodes) { "height.pkt" = allpeaks_values$area) # remove peaks with height = 0 - outlist_persample <- outlist_persample[outlist_persample[, "height.pkt"] != 0, ] + outlist_techrep <- outlist_techrep[outlist_techrep[, "height.pkt"] != 0, ] # save output to file - save(outlist_persample, file = paste0(techrepl_name, "_", scanmode, ".RData")) + save(outlist_techrep, file = paste0(techrepl_name, "_", scanmode, ".RData")) # generate text output to log file on number of spikes for this sample # spikes are peaks that are too narrow, e.g. 1 data point diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index 53e42de3..69fcde9c 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -22,7 +22,7 @@ if (grepl("negative", basename(hmdb_part_file))) { batch_number <- strsplit(basename(hmdb_part_file), ".", fixed = TRUE)[[1]][2] # load file with spectrum peaks -spec_peaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData") +spec_peaks_file <- paste0("AvgPeaks_", scanmode, ".RData") load(spec_peaks_file) outlist_copy <- outlist_total rm(outlist_total) diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R index 331d563b..947fea1f 100644 --- a/DIMS/preprocessing/peak_finding_functions.R +++ b/DIMS/preprocessing/peak_finding_functions.R @@ -1,18 +1,14 @@ # functions for peak finding -search_mzrange <- function(ints_fullrange, resol, sample_name, scanmode, peak_thresh) { +search_mzrange <- function(ints_fullrange, resol, sample_name, peak_thresh) { #' Divide the full m/z range into regions of interest with min, max and mean m/z #' #' @param ints_fullrange: Named list of intensities (float) #' @param resol: Value for resolution (integer) #' @param sample_name: Sample name (string) - #' @param scanmode: Scan mode, positive or negative (string) #' @param peak_thresh: Value for noise level threshold (integer) #' #' @return allpeaks_values: list of m/z regions of interest - # Number used to calculate area under Gaussian curve - int_factor <- 1 * 10^5 - # initialize list to store results for all peaks allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL, "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0) @@ -50,27 +46,27 @@ search_mzrange <- function(ints_fullrange, resol, sample_name, scanmode, peak_th #sub_range <- int_vector #names(sub_range) <- mass_vector #allpeaks_values <- search_mzrange(sub_range, allpeaks_values, resol, - # sample_name, scanmode, peak_thresh) + # sample_name, peak_thresh) # A proper peak needs to have at least 3 intensities above threshold } else if (length(int_vector) > 3) { # check if the sum of intensities is above zero. Why is this necessary? #if (sum(int_vector) == 0) next # define mass_diff as difference between last and first value of mass_vector - mass_diff <- mass_vector[length(mass_vector)] - mass_vector[1] + # mass_diff <- mass_vector[length(mass_vector)] - mass_vector[1] # generate a second mass_vector with equally spaced m/z values - mass_vector2 <- seq(mass_vector[1], mass_vector[length(mass_vector)], - length = mass_diff * int_factor) + mass_vector_eq <- seq(mass_vector[1], mass_vector[length(mass_vector)], + length = 10 * length(mass_vector)) # Find the index in int_vector with the highest intensity # max_index <- which(int_vector == max(int_vector)) # get initial fit values - roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, # max_index, - resol, force = length(max_index), - use_bounds = FALSE, scanmode) + roi_values <- fit_gaussian(mass_vector_eq, mass_vector, int_vector, + resol, force_nr = length(max_index), + use_bounds = FALSE) if (roi_values$qual[1] == 1) { # get optimized fit values - roi_values <- fit_optim(mass_vector, int_vector, resol, scanmode) + roi_values <- fit_optim(mass_vector, int_vector, resol) # add region of interest to list of all peaks allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) @@ -93,7 +89,7 @@ search_mzrange <- function(ints_fullrange, resol, sample_name, scanmode, peak_th } else { - roi_values <- fit_optim(mass_vector, int_vector, resol, scanmode) + roi_values <- fit_optim(mass_vector, int_vector, resol) allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) @@ -109,24 +105,19 @@ search_mzrange <- function(ints_fullrange, resol, sample_name, scanmode, peak_th return(allpeaks_values) } -# remove plot sections (commented out) -fit_gaussian <- function(mass_vector2, mass_vector, int_vector, - resol, force, use_bounds, scanmode) { +fit_gaussian <- function(mass_vector_eq, mass_vector, int_vector, + resol, force_nr, use_bounds) { #' Fit 1, 2, 3 or 4 Gaussian peaks in small region of m/z #' - #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector_eq: Vector of equally spaced m/z values (float) #' @param mass_vector: Vector of m/z values for a region of interest (float) #' @param int_vector: Value used to calculate area under Gaussian curve (integer) #' @param resol: Value for resolution (integer) - #' @param force: Number of local maxima in int_vector (integer) + #' @param force_nr: Number of local maxima in int_vector (integer) #' @param use_bounds: Boolean to indicate whether boundaries are to be used - #' @param scanmode: Scan mode, positive or negative (string) #' #' @return roi_value_list: list of fit values for region of interest (list) - # Number used to calculate area under Gaussian curve - int_factor <- 1 * 10^5 - # Find the index in int_vector with the highest intensity max_index <- which(int_vector == max(int_vector)) @@ -138,20 +129,20 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, peak_max <- NULL fit_quality1 <- 0.15 fit_quality <- 0.2 + # set initial value for scale factor + scale_factor <- 2 # One local maximum: - if (force == 1) { - # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual) - fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, resol, + if (force_nr == 1) { + # determine fit values for 1 Gaussian peak + fit_values <- fit_1peak(mass_vector_eq, mass_vector, int_vector, max_index, resol, fit_quality1, use_bounds) - # set initial value for scale factor - scale <- 2 # test if the mean is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) { # run this function again with fixed boundaries - return(fit_gaussian(mass_vector2, mass_vector, int_vector, resol, - force = 1, use_bounds = TRUE, scanmode)) + return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, resol, + force_nr = 1, use_bounds = TRUE)) } else { # test if the fit is bad if (fit_values$qual > fit_quality1) { @@ -163,12 +154,12 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, new_index <- c(new_index, 2 * new_index) } # run this function again with two local maxima - return(fit_gaussian(mass_vector2, mass_vector, int_vector, - resol, force = 2, use_bounds = FALSE, scanmode)) + return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, + resol, force_nr = 2, use_bounds = FALSE)) # good fit } else { peak_mean <- c(peak_mean, fit_values$mean) - peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, + peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale_factor, fit_values$sigma)) peak_qual <- fit_values$qual peak_min <- mass_vector[1] @@ -177,9 +168,9 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, } #### Two local maxima; need at least 6 data points for this #### - } else if (force == 2 && (length(mass_vector) > 6)) { - # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual) - fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, max_index, resol, + } else if (force_nr == 2 && (length(mass_vector) > 6)) { + # determine fit values for 2 Gaussian peaks + fit_values <- fit_2peaks(mass_vector_eq, mass_vector, int_vector, max_index, resol, use_bounds, fit_quality) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || @@ -187,8 +178,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, # check if fit quality is bad if (fit_values$qual > fit_quality) { # run this function again with fixed boundaries - return(fit_gaussian(mass_vector2, mass_vector, int_vector, resol, - force = 2, use_bounds = TRUE, scanmode)) + return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, resol, + force_nr = 2, use_bounds = TRUE)) } else { # check which mean is outside range and remove it from the list of means # NB: peak_mean and other variables have not been given values from 2-peak fit yet! @@ -217,9 +208,9 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, new_index <- c(new_index, 2 * new_index, 3 * new_index) } # run this function again with three local maxima - return(fit_gaussian(mass_vector2, mass_vector, int_vector, - resol, force = 3, use_bounds = FALSE, scanmode)) - # good fit, all means are within m/z range + return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, + resol, force_nr = 3, use_bounds = FALSE)) + # good fit, all means are within m/z range } else { # check if means are within 3 ppm and sum if so tmp <- fit_values$qual @@ -227,8 +218,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, nr_means <- length(fit_values$mean) while (nr_means != nr_means_new) { nr_means <- length(fit_values$mean) - fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, - mass_vector2, mass_vector, ppm = 4, resol) + fit_values <- within_ppm(fit_values$mean, fit_values$scale_factor, fit_values$sigma, fit_values$area, + mass_vector_eq, mass_vector, ppm = 4, resol) nr_means_new <- length(fit_values$mean) } # restore original quality score @@ -245,16 +236,15 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, } } else { # More than two local maxima; fit 1 peak. - scale <- 2 fit_quality1 <- 0.40 use_bounds <- TRUE max_index <- which(int_vector == max(int_vector)) - fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, resol, + fit_values <- fit_1peak(mass_vector_eq, mass_vector, int_vector, max_index, resol, fit_quality1, use_bounds) # check for bad fit if (fit_values$qual > fit_quality1) { # get fit values from fit_optim - fit_values <- fit_optim(mass_vector, int_vector, resol, scanmode) + fit_values <- fit_optim(mass_vector, int_vector, resol) peak_mean <- c(peak_mean, fit_values$mean) peak_area <- c(peak_area, fit_values$area) peak_min <- fit_values$min @@ -262,7 +252,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, peak_qual <- 0 } else { peak_mean <- c(peak_mean, fit_values$mean) - peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma)) + peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale_factor, fit_values$sigma)) peak_qual <- fit_values$qual peak_min <- mass_vector[1] peak_max <- mass_vector[length(mass_vector)] @@ -279,11 +269,11 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, } -fit_1peak <- function(mass_vector2, mass_vector, int_vector, max_index, +fit_1peak <- function(mass_vector_eq, mass_vector, int_vector, max_index, resol, fit_quality, use_bounds) { #' Fit 1 Gaussian peak in small region of m/z #' - #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector_eq: Vector of equally spaced m/z values (float) #' @param mass_vector: Vector of m/z values for a region of interest (float) #' @param int_vector: Value used to calculate area under Gaussian curve (integer) #' @param max_index: Index in int_vector with the highest intensity (integer) @@ -293,17 +283,17 @@ fit_1peak <- function(mass_vector2, mass_vector, int_vector, max_index, #' #' @return roi_value_list: list of fit values for region of interest (list) - # set initial value for scale - scale <- 2 + # set initial value for scale_factor + scale_factor <- 2 if (length(int_vector) < 3) { message("Range too small, no fit possible!") } else { - if ((length(int_vector) == 4)) { + if (length(int_vector) == 4) { # fit 1 peak - mu <- weighted.mean(mass_vector, int_vector) + weighted_mu <- weighted.mean(mass_vector, int_vector) sigma <- get_stdev(mass_vector, int_vector) - fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds) + fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) } else { # set range vector if ((length(mass_vector) - length(max_index)) < 2) { @@ -311,71 +301,79 @@ fit_1peak <- function(mass_vector2, mass_vector, int_vector, max_index, } else if (length(max_index) < 2) { range1 <- c(1:5) } else { - range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) + range1 <- seq(from = (max_index[1] - 2), to = (max_index[1] + 2)) + } + # remove zero at the beginning of range1 + if (range1[1] == 0) { + range1 <- range1[-1] } - if (range1[1] == 0) range1 <- range1[-1] # remove NA if (length(which(is.na(int_vector[range1]))) != 0) { range1 <- range1[-which(is.na(int_vector[range1]))] } # fit 1 peak - mu <- weighted.mean(mass_vector[range1], int_vector[range1]) + weighted_mu <- weighted.mean(mass_vector[range1], int_vector[range1]) sigma <- get_stdev(mass_vector[range1], int_vector[range1]) - fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds) + fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) } - p1 <- fitted_peak$par + fitted_mz <- fitted_peak$par[1] + fitted_nr <- fitted_peak$par[2] - # get new value for fit quality and scale - fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new - scale_new <- 1.2 * scale + # get new value for fit quality and scale_factor + fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new + scale_factor_new <- 1.2 * scale_factor # bad fit if (fq_new > fit_quality) { # optimize scaling factor fq <- 0 - scale <- 0 - if (sum(int_vector) > sum(p1[2] * dnorm(mass_vector, p1[1], sigma))) { - while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) { + scale_factor <- 0 + if (sum(int_vector) > sum(params1[2] * dnorm(mass_vector, fitted_mz, sigma))) { + # increase scale_factor until convergence + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_factor_new < 10000)) { fq <- fq_new - scale <- scale_new + scale_factor <- scale_factor_new # fit 1 peak - fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds) - p1 <- fitted_peak$par - # get new value for fit quality and scale - fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new - scale_new <- 1.2 * scale + fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) + params1 <- fitted_peak$par + # get new value for fit quality and scale_factor + fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new + scale_factor_new <- 1.2 * scale_factor } } else { - while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) { + # decrease scale_factor until convergence + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_factor_new < 10000)) { fq <- fq_new - scale <- scale_new + scale_factor <- scale_factor_new # fit 1 peak - fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds) - p1 <- fitted_peak$par - # get new value for fit quality and scale - fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new - scale_new <- 0.8 * scale + fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) + params1 <- fitted_peak$par + # get new value for fit quality and scale_factor + fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new + scale_factor_new <- 0.8 * scale_factor + print(scale_factor_new) } } - # use optimized scale factor to fit 1 peak + # use optimized scale_factor factor to fit 1 peak if (fq < fq_new) { - fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds) - p1 <- fitted_peak$par + fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) + fitted_mz <- fitted_peak$par[1] + fitted_nr <- fitted_peak$par[2] fq_new <- fq } } } - roi_value_list <- list("mean" = p1[1], "scale" = p1[2], "sigma" = sigma, "qual" = fq_new) + roi_value_list <- list("mean" = fitted_mz, "scale_factor" = fitted_nr, "sigma" = sigma, "qual" = fq_new) return(roi_value_list) } -fit_2peaks <- function(mass_vector2, mass_vector, int_vector, max_index, resol, use_bounds = FALSE, +fit_2peaks <- function(mass_vector_eq, mass_vector, int_vector, max_index, resol, use_bounds = FALSE, fit_quality) { - #' Fit 2 Gaussian peaks in small region of m/z + #' Fit 2 Gaussian peaks in a small region of m/z #' - #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector_eq: Vector of equally spaced m/z values (float) #' @param mass_vector: Vector of m/z values for a region of interest (float) #' @param int_vector: Value used to calculate area under Gaussian curve (integer) #' @param max_index: Index in int_vector with the highest intensity (integer) @@ -390,73 +388,93 @@ fit_2peaks <- function(mass_vector2, mass_vector, int_vector, max_index, resol, peak_scale <- NULL peak_sigma <- NULL - # set range vectors for 2 peaks - range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) - if (range1[1] == 0) range1 <- range1[-1] - range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2) - if (length(mass_vector) < range2[length(range2)]) range2 <- range2[-length(range2)] + # set range vectors for first peak + range1 <- seq(from = (max_index[1] - 2), to = (max_index[1] + 2)) + # remove zero at the beginning of range1 + if (range1[1] == 0) { + range1 <- range1[-1] + } + # set range vectors for second peak + range2 <- seq(from = (max_index[2] - 2), to = (max_index[2] + 2)) + # if range2 ends outside mass_vector, shorten it + if (length(mass_vector) < range2[length(range2)]) { + range2 <- range2[-length(range2)] + } + # check whether the two ranges overlap range1 <- check_overlap(range1, range2)[[1]] range2 <- check_overlap(range1, range2)[[2]] - # check for negative or 0 + # check for negative or 0 values in range1 or range2 remove <- which(range1 < 1) - if (length(remove) > 0) range1 <- range1[-remove] + if (length(remove) > 0) { + range1 <- range1[-remove] + } remove <- which(range2 < 1) - if (length(remove) > 0) range2 <- range2[-remove] + if (length(remove) > 0) { + range2 <- range2[-remove] + } # remove NA - if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))] - if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))] + if (length(which(is.na(int_vector[range1]))) != 0) { + range1 <- range1[-which(is.na(int_vector[range1]))] + } + if (length(which(is.na(int_vector[range2]))) != 0) { + range2 <- range2[-which(is.na(int_vector[range2]))] + } # fit 2 peaks, first separately, then together - mu1 <- weighted.mean(mass_vector[range1], int_vector[range1]) + weighted_mu1 <- weighted.mean(mass_vector[range1], int_vector[range1]) sigma1 <- get_stdev(mass_vector[range1], int_vector[range1]) - fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds) - p1 <- fitted_peak$par + fitted_peak1 <- optimize_1gaussian(mass_vector[range1], int_vector[range1], sigma1, weighted_mu1, scale_factor, use_bounds) + fitted_mz1 <- fitted_peak1$par[1] + fitted_nr1 <- fitted_peak1$par[2] # second peak - mu2 <- weighted.mean(mass_vector[range2], int_vector[range2]) + weighted_mu2 <- weighted.mean(mass_vector[range2], int_vector[range2]) sigma2 <- get_stdev(mass_vector[range2], int_vector[range2]) - fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds) - p2 <- fitted_peak$par + fitted_peak2 <- optimize_1gaussian(mass_vector[range2], int_vector[range2], sigma2, weighted_mu2, scale_factor, use_bounds) + fitted_mz2 <- fitted_peak2$par[1] + fitted_nr2 <- fitted_peak2$par[2] # combined - fitted_2peaks <- fit_2gaussians(mass_vector, int_vector, sigma1, sigma2, p1[1], p1[2], p2[1], p2[2], use_bounds) - pc <- fitted_2peaks$par + fitted_2peaks <- optimize_2gaussians(mass_vector, int_vector, sigma1, sigma2, + fitted_mz1, fitted_nr1, + fitted_mz2, fitted_nr2, use_bounds) + fitted_2peaks_mz1 <- fitted_2peaks$par[1] + fitted_2peaks_nr1 <- fitted_2peaks$par[2] + fitted_2peaks_mz2 <- fitted_2peaks$par[3] + fitted_2peaks_nr2 <- fitted_2peaks$par[4] # get fit quality - if (is.null(sigma2)) sigma2 <- sigma1 - sum_fit <- (pc[2] * dnorm(mass_vector, pc[1], sigma1)) + - (pc[4] * dnorm(mass_vector, pc[3], sigma2)) - fq <- get_fit_quality(mass_vector, int_vector, sort(c(pc[1], pc[3]))[1], sort(c(pc[1], pc[3]))[2], + if (is.null(sigma2)) { + sigma2 <- sigma1 + } + sum_fit <- (fitted_2peaks_nr1 * dnorm(mass_vector, fitted_2peaks_mz1, sigma1)) + + (fitted_2peaks_nr2 * dnorm(mass_vector, fitted_2peaks_mz2, sigma2)) + lowest_mz <- sort(c(fitted_2peaks_mz1, fitted_2peaks_mz2))[1] + fq_new <- get_fit_quality(mass_vector, int_vector, lowest_mz, resol, sum_fit = sum_fit)$fq_new # get parameter values - area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor) - area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor) - peak_area <- c(peak_area, area1) - peak_area <- c(peak_area, area2) - peak_mean <- c(peak_mean, pc[1]) - peak_mean <- c(peak_mean, pc[3]) - peak_scale <- c(peak_scale, pc[2]) - peak_scale <- c(peak_scale, pc[4]) - peak_sigma <- c(peak_sigma, sigma1) - peak_sigma <- c(peak_sigma, sigma2) - - roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq) + area1 <- estimate_area(fitted_2peaks_mz1, resol, fitted_2peaks_nr1, sigma1) + area2 <- estimate_area(fitted_2peaks_mz2, resol, fitted_2peaks_nr2, sigma2) + peak_area <- c(peak_area, area1, area2) + peak_mean <- c(peak_mean, fitted_2peaks_mz1, fitted_2peaks_mz2) + peak_scale <- c(peak_scale, fitted_2peaks_nr1, fitted_2peaks_nr2) + peak_sigma <- c(peak_sigma, sigma1, sigma2) + + roi_value_list <- list("mean" = peak_mean, "scale_factor" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq_new) return(roi_value_list) } -fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, use_bounds) { +optimize_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, scale_factor, use_bounds) { #' Fit a Gaussian curve for a peak with given parameters #' #' @param mass_vector: Vector of masses (float) #' @param int_vector: Vector of intensities (float) #' @param sigma: Value for width of the peak (float) #' @param query_mass: Value for mass at center of peak (float) + #' @param scale_factor: Value for scaling intensities (float) #' @param use_bounds: Boolean to indicate whether boundaries are to be used #' #' @return opt_fit: list of parameters and values describing the optimal fit - # Initial value used to estimate scaling parameter - scale <- 2 - # define optimization function for optim based on normal distribution opt_f <- function(params) { d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma) @@ -467,19 +485,19 @@ fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, use_bounds lower <- c(mass_vector[1], 0, mass_vector[1], 0) upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) # get optimal value for fitted Gaussian curve - opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), + opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale_factor)), opt_f, control = list(maxit = 10000), method = "L-BFGS-B", lower = lower, upper = upper) } else { - opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), + opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale_factor)), opt_f, control = list(maxit = 10000)) } return(opt_fit) } -fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2, - query_mass1, scale1, - query_mass2, scale2, use_bounds) { +optimize_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2, + query_mass1, scale_factor1, + query_mass2, scale_factor2, use_bounds) { #' Fit two Gaussian curves for a peak with given parameters #' #' @param mass_vector: Vector of masses (float) @@ -487,9 +505,9 @@ fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2, #' @param sigma1: Value for width of the first peak (float) #' @param sigma2: Value for width of the second peak (float) #' @param query_mass1: Value for mass at center of first peak (float) - #' @param scale1: Value for scaling intensities for first peak (float) + #' @param scale_factor1: Value for scaling intensities for first peak (float) #' @param query_mass2: Value for mass at center of second peak (float) - #' @param scale2: Value for scaling intensities for second peak (float) + #' @param scale_factor2: Value for scaling intensities for second peak (float) #' @param use_bounds: Boolean to indicate whether boundaries are to be used #' #' @return opt_fit: list of parameters and values describing the optimal fit @@ -506,27 +524,27 @@ fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2, lower <- c(mass_vector[1], 0, mass_vector[1], 0) upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) # get optimal value for 2 fitted Gaussian curves - if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { + if (is.null(query_mass2) && is.null(scale_factor2) && is.null(sigma2)) { sigma2 <- sigma1 - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), - as.numeric(query_mass1), as.numeric(scale1)), + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1), + as.numeric(query_mass1), as.numeric(scale_factor1)), opt_f, control = list(maxit = 10000), method = "L-BFGS-B", lower = lower, upper = upper) } else { - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), - as.numeric(query_mass2), as.numeric(scale2)), + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1), + as.numeric(query_mass2), as.numeric(scale_factor2)), opt_f, control = list(maxit = 10000), method = "L-BFGS-B", lower = lower, upper = upper) } } else { - if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { + if (is.null(query_mass2) && is.null(scale_factor2) && is.null(sigma2)) { sigma2 <- sigma1 - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), - as.numeric(query_mass1), as.numeric(scale1)), + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1), + as.numeric(query_mass1), as.numeric(scale_factor1)), opt_f, control = list(maxit = 10000)) } else { - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), - as.numeric(query_mass2), as.numeric(scale2)), + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1), + as.numeric(query_mass2), as.numeric(scale_factor2)), opt_f, control = list(maxit = 10000)) } } @@ -554,15 +572,13 @@ get_stdev <- function(mass_vector, int_vector, resol = 140000) { return(stdev) } -## adapted from getFitQuality.R -# parameter not used: mu_last -get_fit_quality <- function(mass_vector, int_vector, mu_first, mu_last, resol, scale = NULL, sigma = NULL, sum_fit = NULL) { - #' Fit 1 Gaussian peak in small region of m/z +get_fit_quality <- function(mass_vector, int_vector, mu_first, resol, scale_factor = NULL, sigma = NULL, sum_fit = NULL) { + #' Get fit quality for 1 Gaussian peak in small region of m/z #' #' @param mass_vector: Vector of m/z values for a region of interest (float) #' @param int_vector: Value used to calculate area under Gaussian curve (integer) #' @param mu_first: Value for first peak (float) - #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param scale_factor: Initial value used to estimate scaling parameter (integer) #' @param resol: Value for resolution (integer) #' @param sum_fit: Value indicating quality of fit of Gaussian curve (float) #' @@ -571,8 +587,8 @@ get_fit_quality <- function(mass_vector, int_vector, mu_first, mu_last, resol, s mass_vector_int <- mass_vector int_vector_int <- int_vector # get new fit quality - fq_new <- mean(abs((scale * dnorm(mass_vector_int, mu_first, sigma)) - int_vector_int) / - rep((max(scale * dnorm(mass_vector_int, mu_first, sigma)) / 2), length(mass_vector_int))) + fq_new <- mean(abs((scale_factor * dnorm(mass_vector_int, mu_first, sigma)) - int_vector_int) / + rep((max(scale_factor * dnorm(mass_vector_int, mu_first, sigma)) / 2), length(mass_vector_int))) } else { sum_fit_int <- sum_fit int_vector_int <- int_vector @@ -588,32 +604,27 @@ get_fit_quality <- function(mass_vector, int_vector, mu_first, mu_last, resol, s return(list_params) } -estimate_area <- function(mass_max, resol, scale, sigma) { +estimate_area <- function(mass_max, resol, scale_factor, sigma) { #' Estimate area of Gaussian curve #' #' @param mass_max: Value for m/z at maximum intensity of a peak (float) #' @param resol: Value for resolution (integer) - #' @param scale: Value for peak width (float) + #' @param scale_factor: Value for peak width (float) #' @param sigma: Value for standard deviation (float) - #' @param int_factor: Value used to calculate area under Gaussian curve (integer) #' #' @return area_curve: Value for area under the Gaussian curve (float) - # Number used to calculate area under Gaussian curve - int_factor <- 1 * 10^5 - - # avoid vectors that are too big (cannot allocate vector of size ...) - if (mass_max > 1200) return(0) + # calculate width of peak at half maximum + fwhm <- get_fwhm(mass_max, resol) # generate a mass_vector with equally spaced m/z values - fwhm <- get_fwhm(mass_max, resol) mz_min <- mass_max - 2 * fwhm mz_max <- mass_max + 2 * fwhm mz_range <- mz_max - mz_min - mass_vector2 <- seq(mz_min, mz_max, length = mz_range * int_factor) + mass_vector_eq <- seq(mz_min, mz_max, length = 100) # estimate area under the curve - area_curve <- sum(scale * dnorm(mass_vector2, mass_max, sigma)) / 100 + area_curve <- sum(scale_factor * dnorm(mass_vector_eq, mass_max, sigma)) / 100 return(area_curve) } @@ -625,32 +636,34 @@ get_fwhm <- function(query_mass, resol) { #' @param resol: Value for resolution (integer) #' #' @return fwhm: Value for full width at half maximum (float) - - # set aberrant values of query_mass to zero - if (is.nan(query_mass)) query_mass <- 0 - if (is.na(query_mass)) query_mass <- 0 - if (is.null(query_mass)) query_mass <- 0 - if (query_mass < 0) query_mass <- 0 + + # set aberrant values of query_mass to default value of 200 + if (is.na(query_mass)) { + query_mass <- 200 + } + if (query_mass < 0) { + query_mass <- 200 + } # calculate resolution at given m/z value resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200))) # calculate full width at half maximum fwhm <- query_mass / resol_mz + return(fwhm) } -fit_optim <- function(mass_vector, int_vector, resol, scanmode) { +fit_optim <- function(mass_vector, int_vector, resol) { #' Determine optimized fit of Gaussian curve to small region of m/z #' #' @param mass_vector: Vector of m/z values for a region of interest (float) #' @param int_vector: Vector of intensities for a region of interest (float) #' @param resol: Value for resolution (integer) - #' @param scanmode: Scan mode, positive or negative (string) #' #' @return roi_value_list: list of fit values for region of interest (list) - # Number used to calculate area under Gaussian curve - int_factor <- 1 * 10^5 - factor <- 1.5 + # initial value for scale_factor + scale_factor <- 1.5 + # Find the index in int_vector with the highest intensity max_index <- which(int_vector == max(int_vector))[1] mass_max <- mass_vector[max_index] @@ -658,15 +671,15 @@ fit_optim <- function(mass_vector, int_vector, resol, scanmode) { # get peak width fwhm <- get_fwhm(mass_max, resol) # simplify the peak shape: represent it by a triangle - mass_max_simple <- c(mass_max - factor * fwhm, mass_max, mass_max + factor * fwhm) + mass_max_simple <- c(mass_max - scale_factor * fwhm, mass_max, mass_max + scale_factor * fwhm) int_max_simple <- c(0, int_max, 0) # define mass_diff as difference between last and first value of mass_max_simple - mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1] + # mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1] # generate a second mass_vector with equally spaced m/z values - mass_vector2 <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], - length = mass_diff * int_factor) - sigma <- get_stdev(mass_vector2, int_max_simple) + mass_vector_eq <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], + length = 100 * length(mass_max_simple)) + sigma <- get_stdev(mass_vector_eq, int_max_simple) # define optimization function for optim based on normal distribution opt_f <- function(p, mass_vector, int_vector, sigma, mass_max) { curve <- p * dnorm(mass_vector, mass_max, sigma) @@ -675,26 +688,26 @@ fit_optim <- function(mass_vector, int_vector, resol, scanmode) { # get optimal value for fitted Gaussian curve opt_fit <- optimize(opt_f, c(0, 100000), tol = 0.0001, mass_vector, int_vector, sigma, mass_max) - scale <- opt_fit$minimum + scale_factor <- opt_fit$minimum # get an estimate of the area under the peak - area <- estimate_area(mass_max, resol, scale, sigma) + area <- estimate_area(mass_max, resol, scale_factor, sigma) # put all values for this region of interest into a list roi_value_list <- list("mean" = mass_max, "area" = area, - "min" = mass_vector2[1], - "max" = mass_vector2[length(mass_vector2)]) + "min" = mass_vector_eq[1], + "max" = mass_vector_eq[length(mass_vector_eq)]) return(roi_value_list) } -within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm = 4, resol) { +within_ppm <- function(mean, scale_factor, sigma, area, mass_vector_eq, mass_vector, ppm = 4, resol) { #' Test whether two mass ranges are within ppm distance of each other #' #' @param mean: Value for mean m/z (float) - #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param scale_factor: Initial value used to estimate scaling parameter (integer) #' @param sigma: Value for standard deviation (float) #' @param area: Value for area under the curve (float) - #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector_eq: Vector of equally spaced m/z values (float) #' @param mass_vector: Vector of m/z values for a region of interest (float) #' @param ppm: Value for distance between two values of mass (integer) #' @param resol: Value for resolution (integer) @@ -704,7 +717,7 @@ within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm # sort index <- order(mean) mean <- mean[index] - scale <- scale[index] + scale_factor <- scale_factor[index] sigma <- sigma[index] area <- area[index] @@ -718,16 +731,16 @@ within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm # avoid double occurrance in sum if ((i - 1) %in% summed) next - result_values <- sum_curves(mean[i - 1], mean[i], scale[i - 1], scale[i], sigma[i - 1], sigma[i], - mass_vector2, mass_vector, resol) + result_values <- sum_curves(mean[i - 1], mean[i], scale_factor[i - 1], scale_factor[i], sigma[i - 1], sigma[i], + mass_vector_eq, mass_vector, resol) summed <- c(summed, i - 1, i) if (is.nan(result_values$mean)) result_values$mean <- 0 mean[i - 1] <- result_values$mean mean[i] <- result_values$mean area[i - 1] <- result_values$area area[i] <- result_values$area - scale[i - 1] <- result_values$scale - scale[i] <- result_values$scale + scale_factor[i - 1] <- result_values$scale_factor + scale_factor[i] <- result_values$scale_factor sigma[i - 1] <- result_values$sigma sigma[i] <- result_values$sigma @@ -739,41 +752,64 @@ within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm if (length(remove) != 0) { mean <- mean[-c(remove)] area <- area[-c(remove)] - scale <- scale[-c(remove)] + scale_factor <- scale_factor[-c(remove)] sigma <- sigma[-c(remove)] } - list_params <- list("mean" = mean, "area" = area, "scale" = scale, "sigma" = sigma, "qual" = NULL) + list_params <- list("mean" = mean, "area" = area, "scale_factor" = scale_factor, "sigma" = sigma, "qual" = NULL) return(list_params) } -sum_curves <- function(mean1, mean2, scale1, scale2, sigma1, sigma2, mass_vector2, mass_vector, resol) { +sum_curves <- function(mean1, mean2, scale_factor1, scale_factor2, sigma1, sigma2, mass_vector_eq, mass_vector, resol) { #' Sum two curves #' #' @param mean1: Value for mean m/z of first peak (float) #' @param mean2: Value for mean m/z of second peak (float) - #' @param scale1: Initial value used to estimate scaling parameter for first peak (integer) - #' @param scale2: Initial value used to estimate scaling parameter for second peak (integer) + #' @param scale_factor1: Initial value used to estimate scaling parameter for first peak (integer) + #' @param scale_factor2: Initial value used to estimate scaling parameter for second peak (integer) #' @param sigma1: Value for standard deviation for first peak (float) #' @param sigma2: Value for standard deviation for second peak (float) - #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector_eq: Vector of equally spaced m/z values (float) #' @param mass_vector: Vector of m/z values for a region of interest (float) #' @param resol: Value for resolution (integer) #' #' @return list_params: list of parameters indicating quality of fit (list) - sum_fit <- (scale1 * dnorm(mass_vector2, mean1, sigma1)) + (scale2 * dnorm(mass_vector2, mean2, sigma2)) + sum_fit <- (scale_factor1 * dnorm(mass_vector_eq, mean1, sigma1)) + (scale_factor2 * dnorm(mass_vector_eq, mean2, sigma2)) - mean1_plus2 <- weighted.mean(c(mean1, mean2), c(max(scale1 * dnorm(mass_vector2, mean1, sigma1)), - max(scale2 * dnorm(mass_vector2, mean2, sigma2)))) + mean1_plus2 <- weighted.mean(c(mean1, mean2), c(max(scale_factor1 * dnorm(mass_vector_eq, mean1, sigma1)), + max(scale_factor2 * dnorm(mass_vector_eq, mean2, sigma2)))) # get new values for parameters fwhm <- get_fwhm(mean1_plus2, resol) area <- max(sum_fit) - scale <- scale1 + scale2 + scale_factor <- scale_factor1 + scale_factor2 sigma <- (fwhm / 2) * 0.85 - list_params <- list("mean" = mean1_plus2, "area" = area, "scale" = scale, "sigma" = sigma) + list_params <- list("mean" = mean1_plus2, "area" = area, "scale_factor" = scale_factor, "sigma" = sigma) return(list_params) } +check_overlap <- function(range1, range2) { + #' Modify range1 and range2 in case of overlap + #' + #' @param range1: Vector of m/z values for first peak (float) + #' @param range2: Vector of m/z values for second peak (float) + #' + #' @return new_ranges: list of two ranges (list) + + # Check for overlap + if (length(intersect(range1, range2)) == 2) { + if (length(range1) >= length(range2)) { + range1 <- range1[-length(range1)] + } else { + range2 <- range2[-1] + } + } else if (length(intersect(range1, range2)) == 3) { + range1 <- range1[-length(range1)] + range2 <- range2[-1] + } + new_ranges <- list("range1" = range1, "range2" = range2) + return(new_ranges) +} + From 62fc22503e990e70921ba3f41e827c9b874fa57c Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 11 Feb 2025 11:17:01 +0100 Subject: [PATCH 05/31] added unit tests for DIMS peak finding --- DIMS/preprocessing/peak_finding_functions.R | 13 +- DIMS/tests/testthat.R | 6 + .../testthat/test_peak_finding_functions.R | 218 ++++++++++++++++++ 3 files changed, 231 insertions(+), 6 deletions(-) create mode 100644 DIMS/tests/testthat.R create mode 100644 DIMS/tests/testthat/test_peak_finding_functions.R diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R index 947fea1f..1baef120 100644 --- a/DIMS/preprocessing/peak_finding_functions.R +++ b/DIMS/preprocessing/peak_finding_functions.R @@ -38,8 +38,8 @@ search_mzrange <- function(ints_fullrange, resol, sample_name, peak_thresh) { } # check if there are more intensities than maximum for region of interest if (length(int_vector) > max_roi_length) { + print("vector of intensities is longer than max") print(length(int_vector)) - print(running_index) # trim lowest intensities to zero #int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0 # split the range into multiple sub ranges @@ -107,7 +107,7 @@ search_mzrange <- function(ints_fullrange, resol, sample_name, peak_thresh) { fit_gaussian <- function(mass_vector_eq, mass_vector, int_vector, resol, force_nr, use_bounds) { - #' Fit 1, 2, 3 or 4 Gaussian peaks in small region of m/z + #' Fit 1 or 2 Gaussian peaks in small region of m/z #' #' @param mass_vector_eq: Vector of equally spaced m/z values (float) #' @param mass_vector: Vector of m/z values for a region of interest (float) @@ -329,14 +329,15 @@ fit_1peak <- function(mass_vector_eq, mass_vector, int_vector, max_index, # optimize scaling factor fq <- 0 scale_factor <- 0 - if (sum(int_vector) > sum(params1[2] * dnorm(mass_vector, fitted_mz, sigma))) { + if (sum(int_vector) > sum(fitted_nr * dnorm(mass_vector, fitted_mz, sigma))) { # increase scale_factor until convergence while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_factor_new < 10000)) { fq <- fq_new scale_factor <- scale_factor_new # fit 1 peak fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) - params1 <- fitted_peak$par + fitted_mz <- fitted_peak$par[1] + fitted_nr <- fitted_peak$par[2] # get new value for fit quality and scale_factor fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new scale_factor_new <- 1.2 * scale_factor @@ -348,11 +349,11 @@ fit_1peak <- function(mass_vector_eq, mass_vector, int_vector, max_index, scale_factor <- scale_factor_new # fit 1 peak fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) - params1 <- fitted_peak$par + fitted_mz <- fitted_peak$par[1] + fitted_nr <- fitted_peak$par[2] # get new value for fit quality and scale_factor fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new scale_factor_new <- 0.8 * scale_factor - print(scale_factor_new) } } # use optimized scale_factor factor to fit 1 peak diff --git a/DIMS/tests/testthat.R b/DIMS/tests/testthat.R new file mode 100644 index 00000000..a467339d --- /dev/null +++ b/DIMS/tests/testthat.R @@ -0,0 +1,6 @@ +# Run all unit tests +library(testthat) +# enable snapshots +local_edition(3) + +testthat::test_file("testthat/test_peak_finding_functions.R") diff --git a/DIMS/tests/testthat/test_peak_finding_functions.R b/DIMS/tests/testthat/test_peak_finding_functions.R new file mode 100644 index 00000000..a0db0ca9 --- /dev/null +++ b/DIMS/tests/testthat/test_peak_finding_functions.R @@ -0,0 +1,218 @@ +# unit tests for PeakFinding functions: + +source("../preprocessing/peak_finding_functions.R") + +# test fit_quality function: +test_that("fit quality is correctly calculated", { + # initialize + test_resol <- 140000 + test_scale_factor <- 2 + test_sigma <- 0.0001 + test_sum_fit <- NULL + + # create peak info to test on: + test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 + test_int_vector <- c(9000, 16000, 19000, 15000, 6000) + test_mz_max <- max(test_mass_vector) + + expect_type(get_fit_quality(test_mass_vector, test_int_vector, test_mz_max, test_resol, test_scale_factor, test_sigma, test_sum_fit), "list") + expect_equal(get_fit_quality(test_mass_vector, test_int_vector, test_mz_max, test_resol, test_scale_factor, test_sigma, test_sum_fit)$fq_new, 2.426, tolerance = 0.001, TRUE) +}) + +# test get_stdev: +test_that("standard deviation is correctly calculated", { + # create peak info to test on: + test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 + test_int_vector <- c(9000, 16000, 19000, 15000, 6000) + test_resol <- 140000 + + expect_type(get_stdev(test_mass_vector, test_int_vector, test_resol), "double") + expect_equal(get_stdev(test_mass_vector, test_int_vector, test_resol), 0.000126, tolerance = 0.00001, TRUE) +}) + +# test estimate_area +test_that("area under the curve is correctly calculated", { + # create peak info to test on: + test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 + test_mz_max <- max(test_mass_vector) + test_resol <- 140000 + test_scale_factor <- 2 + test_sigma <- 0.0001 + + expect_type(estimate_area(test_mz_max, test_resol, test_scale_factor, test_sigma), "double") + expect_equal(estimate_area(test_mz_max, test_resol, test_scale_factor, test_sigma), 1673.061, tolerance = 0.001, TRUE) +}) + +# test get_fwhm +test_that("fwhm is correctly calculated", { + # create peak info to test on: + test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 + test_mz_max <- max(test_mass_vector) + test_resol <- 140000 + + expect_type(get_fwhm(test_mz_max, test_resol), "double") + expect_equal(get_fwhm(test_mz_max, test_resol), 0.000295865, tolerance = 0.000001, TRUE) +}) + +# test get_fwhm for NA +test_that("fwhm for mass NA gives standard fwhm", { + test_resol <- 140000 + + expect_type(get_fwhm(NA, test_resol), "double") + expect_equal(get_fwhm(NA, test_resol), 0.001428571, tolerance = 0.000001, TRUE) +}) + +# test within_ppm +test_that("two peaks are within 5 ppm", { + # create peak info to test on: + test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 + test_mz_max <- max(test_mass_vector) + test_scale_factor <- 2 + test_sigma <- 0.0001 + test_area <- 20000 + test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100) + test_ppm <- 5 + test_resol <- 140000 + + expect_type(within_ppm(test_mz_max, test_scale_factor, test_sigma, test_area, test_mass_vector_eq, test_mass_vector, test_ppm, test_resol), "list") + expect_equal(within_ppm(test_mz_max, test_scale_factor, test_sigma, test_area, test_mass_vector_eq, test_mass_vector, test_ppm, test_resol)$mean, 70.00962, tolerance = 0.0001, TRUE) +}) + +# test sum_curves +test_that("two curves are correctly summed", { + # create peak info to test on: + test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 + test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100) + test_mean1 <- 70.00938 + test_mean2 <- 70.00962 + test_scale_factor1 <- 2 + test_scale_factor2 <- 4 + test_sigma1 <- 0.0001 + test_sigma2 <- 0.0002 + test_resol <- 140000 + + expect_type(sum_curves(test_mean1, test_mean2, test_scale_factor1, test_scale_factor2, test_sigma1, test_sigma2, test_mass_vector_eq, test_mass_vector, test_resol), "list") + expect_equal(sum_curves(test_mean1, test_mean2, test_scale_factor1, test_scale_factor2, test_sigma1, test_sigma2, test_mass_vector_eq, test_mass_vector, test_resol)$mean, 70.0095, tolerance = 0.0001, TRUE) +}) + +# test fit_optim +test_that("optimal peak fit can be found", { + # create peak info to test on: + test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 + test_int_vector <- c(9000, 16000, 19000, 15000, 6000) + test_resol <- 140000 + # create output to test on: + test_output <- list(mean = 70.0095, area = 5009.596, min = 70.00906, max = 70.00994) + + expect_type(fit_optim(test_mass_vector, test_int_vector, test_resol), "list") + for (key in names(test_output)) { + expect_equal(fit_optim(test_mass_vector, test_int_vector, test_resol)[[key]], test_output[[key]], tolerance = 0.0001, TRUE) + } +}) + +# test optimize_1gaussian: +test_that("optimal value for m/z and scale_factor can be found", { + # create peak info to test on: + test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 + test_int_vector <- c(9000, 16000, 19000, 15000, 6000) + test_sigma <- 0.0001 + test_query_mass <- median(test_mass_vector) + test_scale_factor <- 2 + test_use_bounds <- FALSE + + expect_type(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds), "list") + expect_equal(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds)$par[1], 70.00949, tolerance = 0.0001, TRUE) + expect_equal(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds)$par[2], 4.570024, tolerance = 0.0001, TRUE) +}) + +# test fit_1peak: +test_that("correct mean m/z and scale_factor can be found for 1 peak", { + # create peak info to test on: + test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 + test_int_vector <- c(9000, 16000, 19000, 15000, 6000) + test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100) + test_max_index <- which(test_int_vector == max(test_int_vector)) + test_resol <- 140000 + test_fit_quality <- 0.5 + test_use_bounds <- FALSE + # create output to test on: + test_output <- list(mean = 70.0095, scale_factor = 5.23334, sigma = 0.000126, qual = 0.24300) + + expect_type(fit_1peak(test_mass_vector_eq, test_mass_vector, test_int_vector, test_max_index, test_resol, test_fit_quality, test_use_bounds), "list") + for (key in names(test_output)) { + expect_equal(fit_1peak(test_mass_vector_eq, test_mass_vector, test_int_vector, test_max_index, test_resol, test_fit_quality, test_use_bounds)[[key]], test_output[[key]], tolerance = 0.0001, TRUE) + } +}) + +# test optimize_2gaussians: +test_that("optimal value for m/z and scale_factor can be found", { + # create info for two peaks in a small region + test_mass_vector_2peaks <- rep(70.00938, 14) + 1:14 * 0.00006 + test_int_vector_2peaks <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000), 2) + test_mass_vector_eq <- seq(min(test_mass_vector_2peaks), max(test_mass_vector_2peaks), length = 100) + test_max_index <- which(test_int_vector_2peaks == max(test_int_vector_2peaks)) + test_sigma1 <- 0.0001 + test_sigma2 <- 0.0002 + test_query_mass1 <- test_max_index[1] + test_query_mass2 <- test_max_index[2] + test_scale_factor1 <- 2 + test_scale_factor2 <- 3 + test_use_bounds <- FALSE + + expect_type(optimize_2gaussians(test_mass_vector_2peaks, test_int_vector_2peaks, test_sigma1, test_sigma2, test_query_mass1, test_scale_factor1, test_query_mass2, test_scale_factor2, test_use_bounds), "list") + expect_equal(optimize_2gaussians(test_mass_vector_2peaks, test_int_vector_2peaks, test_sigma1, test_sigma2, test_query_mass1, test_scale_factor1, test_query_mass2, test_scale_factor2, test_use_bounds)$par[1], 4, tolerance = 0.1, TRUE) +}) + +# test fit_2peaks: +test_that("correct mean m/z and scale_factor can be found for 2 peaks", { + # create info for two peaks in a small region + test_mass_vector_2peaks <- rep(70.00938, 14) + 1:14 * 0.00006 + test_int_vector_2peaks <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000), 2) + test_mass_vector_eq <- seq(min(test_mass_vector_2peaks), max(test_mass_vector_2peaks), length = 100) + test_max_index <- which(test_int_vector_2peaks == max(test_int_vector_2peaks)) + test_resol <- 140000 + test_fit_quality <- 0.5 + test_use_bounds <- FALSE + # create output to test on: + test_output <- list(mean = c(70.00954, 70.00998), scale_factor = c(4.823598, 4.828072), sigma = c(0.0001257425, 0.0001257436), qual = 0.38341) + + expect_type(fit_2peaks(test_mass_vector_eq, test_mass_vector_2peaks, test_int_vector_2peaks, test_max_index, + test_resol, test_use_bounds, test_fit_quality), "list") + for (key in names(test_output)) { + expect_equal(fit_2peaks(test_mass_vector_eq, test_mass_vector_2peaks, test_int_vector_2peaks, test_max_index, + test_resol, test_use_bounds, test_fit_quality)[[key]], test_output[[key]], tolerance = 0.0001, TRUE) + } +}) + +# test fit_gaussian(mass_vector2, mass_vector, int_vector, resol, force, use_bounds) +test_that("initial gaussian fit is done correctly", { + # create peak info to test on: + test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 + test_int_vector <- c(9000, 16000, 19000, 15000, 6000) + test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100) + test_resol <- 140000 + test_use_bounds <- FALSE + test_force_nr <- 1 + + expect_type(fit_gaussian(test_mass_vector_eq, test_mass_vector, test_int_vector, test_resol, test_force_nr, test_use_bounds), "list") + expect_equal(fit_gaussian(test_mass_vector_eq, test_mass_vector, test_int_vector, test_resol, test_force_nr, test_use_bounds)$mean, 70.00956, tolerance = 0.00001, TRUE) +}) + +# test search_mzrange(ints_fullrange, resol, sample_name, peak_thresh) +test_that("all peak finding functions work together", { + # enable snapshot + local_edition(3) + # create info for ten peaks separated by zero + test_large_mass_vector <- rep(70.00938, 80) + 1:80 * 0.00006 + test_large_int_vector <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000, 0), 10) + names(test_large_int_vector) <- test_large_mass_vector + test_resol <- 140000 + test_sample_name <- "C1.1" + test_peak_thresh <- 100 + + expect_type(search_mzrange(test_large_int_vector, test_resol, test_sample_name, test_peak_thresh), "list") + expect_snapshot(search_mzrange(test_large_int_vector, test_resol, test_sample_name, test_peak_thresh), error = FALSE) +}) + +# test fit_gaussian(mass_vector2, mass_vector, int_vector, resol, force, use_bounds) +test_that("fit gaussian") From 5d311a3203e0eee999781d9621c10f01c63508f9 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:27:24 +0200 Subject: [PATCH 06/31] changed variable name tmp to replicates_persample --- DIMS/MakeInit.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index 44d49962..9ff22623 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -1,5 +1,3 @@ -## adapted from makeInit in old pipeline - # define parameters args <- commandArgs(trailingOnly = TRUE) @@ -15,12 +13,12 @@ group_names_unique <- unique(group_names) # generate the replication pattern repl_pattern <- c() for (sample_group in 1:nr_sample_groups) { - tmp <- c() + replicates_persample <- c() for (repl in nr_replicates:1) { index <- ((sample_group * nr_replicates) - repl) + 1 - tmp <- c(tmp, sample_names[index]) + replicates_persample <- c(replicates_persample, sample_names[index]) } - repl_pattern <- c(repl_pattern, list(tmp)) + repl_pattern <- c(repl_pattern, list(replicates_persample)) } names(repl_pattern) <- group_names_unique From 708b872f548ffee4a4b114a2b2b1ba20c6cfdc0a Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:29:54 +0200 Subject: [PATCH 07/31] omitted obsolete lines --- DIMS/GenerateBreaks.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index ddb68c56..007d5ab7 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -1,5 +1,3 @@ -## adapted from 1-generateBreaksFwhm.HPC.R ## - # load required package suppressPackageStartupMessages(library("xcms")) From 0521390b07a4ea54810ea80abc8e9d54766dac19 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:33:21 +0200 Subject: [PATCH 08/31] added weighted mean for half-bad TICs --- DIMS/AssignToBins.R | 76 ++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index a25a5e70..8b31af7e 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -6,51 +6,63 @@ cmd_args <- commandArgs(trailingOnly = TRUE) mzml_filepath <- cmd_args[1] breaks_filepath <- cmd_args[2] -trimparams_filepath <- cmd_args[3] +trim_parameters_filepath <- cmd_args[3] resol <- as.numeric(cmd_args[4]) -trim <- 0.1 -dims_thresh <- 100 -# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg +# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, load(breaks_filepath) -# load trim paramters: trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos -load(trimparams_filepath) +# load trim parameters file: contains trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos +load(trim_parameters_filepath) # get sample name -sample_name <- sub("\\..*$", "", basename(mzml_filepath)) +techrep_name <- sub("\\..*$", "", basename(mzml_filepath)) options(digits = 16) # Initialize pos_results <- NULL neg_results <- NULL - -# read in the data for 1 sample -raw_data <- suppressMessages(xcms::xcmsRaw(mzml_filepath)) - -# for TIC plots: prepare txt files with data for plots -tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic) -colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity") -write.table(tic_intensity_persample, file = paste0(sample_name, "_TIC.txt")) - -# Create empty placeholders for later use bins <- rep(0, length(breaks_fwhm) - 1) pos_bins <- bins neg_bins <- bins +dims_thresh <- 100 -# Generate a matrix +# read in the data for 1 sample +raw_data <- suppressMessages(xcms::xcmsRaw(mzml_filepath)) + +# Generate a matrix with retention times and intensities raw_data_matrix <- xcms::rawMat(raw_data) # Get time values for positive and negative scans pos_times <- raw_data@scantime[raw_data@polarity == "positive"] neg_times <- raw_data@scantime[raw_data@polarity == "negative"] # Select scans between trim_left and trim_right -pos_times <- pos_times[pos_times > trim_left_pos & pos_times < trim_right_pos] -neg_times <- neg_times[neg_times > trim_left_neg & neg_times < trim_right_neg] +pos_times_trimmed <- pos_times[pos_times > trim_left_pos & pos_times < trim_right_pos] +neg_times_trimmed <- neg_times[neg_times > trim_left_neg & neg_times < trim_right_neg] +# get TIC intensities for areas between trim_left and trim_right +tic_intensity_persample <- cbind(raw_data@scantime, raw_data@tic) +colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity") +tic_intensity_pos <- tic_intensity_persample[tic_intensity_persample[ , "retention_time"] > min(pos_times_trimmed) & + tic_intensity_persample[ , "retention_time"] < max(pos_times_trimmed), ] +tic_intensity_neg <- tic_intensity_persample[tic_intensity_persample[ , "retention_time"] > min(neg_times_trimmed) & + tic_intensity_persample[ , "retention_time"] < max(neg_times_trimmed), ] +# calculate weighted mean of intensities for pos and neg separately +mean_pos <- weighted.mean(tic_intensity_pos[ , "tic_intensity"], tic_intensity_pos[ , "tic_intensity"]) +mean_neg <- weighted.mean(tic_intensity_neg[ , "tic_intensity"], tic_intensity_neg[ , "tic_intensity"]) +# intensity per scan should be at least 80% of weighted mean +dims_thresh_pos <- 0.8 * mean_pos +dims_thresh_neg <- 0.8 * mean_neg + +# Generate an index with which to select values for each mode +#pos_index <- which(raw_data_matrix[, "time"] %in% pos_times) +#neg_index <- which(raw_data_matrix[, "time"] %in% neg_times) +# select only data from scans which pass the dims_thresh_pos and *_neg filter +pos_times_pass <- tic_intensity_pos[which(tic_intensity_pos[ , "tic_intensity"] > dims_thresh_pos), "retention_time"] +neg_times_pass <- tic_intensity_neg[which(tic_intensity_neg[ , "tic_intensity"] > dims_thresh_neg), "retention_time"] # Generate an index with which to select values for each mode -pos_index <- which(raw_data_matrix[, "time"] %in% pos_times) -neg_index <- which(raw_data_matrix[, "time"] %in% neg_times) +pos_index <- which(raw_data_matrix[, "time"] %in% pos_times_pass) +neg_index <- which(raw_data_matrix[, "time"] %in% neg_times_pass) # Separate each mode into its own matrix pos_raw_data_matrix <- raw_data_matrix[pos_index, ] neg_raw_data_matrix <- raw_data_matrix[neg_index, ] @@ -76,10 +88,10 @@ bin_indices_neg <- cut( if (nrow(pos_raw_data_matrix) > 0) { # set NA in intensities to zero pos_raw_data_matrix[is.na(pos_raw_data_matrix[, "intensity"]), "intensity"] <- 0 - # aggregate intensities, calculate mean, use only values above dims_thresh + # aggregate intensities, calculate mean, use only values above dims_thresh_pos aggr_int_pos <- stats::aggregate(pos_raw_data_matrix[, "intensity"], list(bin_indices_pos), - FUN = function(x) { mean(x[which(x > dims_thresh)]) }) + FUN = function(x) { mean(x) }) # set NA to zero in second column aggr_int_pos[is.na(aggr_int_pos[, 2]), 2] <- 0 pos_bins[aggr_int_pos[, 1]] <- aggr_int_pos[, 2] @@ -87,10 +99,10 @@ if (nrow(pos_raw_data_matrix) > 0) { if (nrow(neg_raw_data_matrix) > 0) { # set NA in intensities to zero neg_raw_data_matrix[is.na(neg_raw_data_matrix[, "intensity"]), "intensity"] <- 0 - # aggregate intensities, calculate mean, use only values above dims_thresh + # aggregate intensities, calculate mean, use only values above dims_thresh_neg aggr_int_neg <- stats::aggregate(neg_raw_data_matrix[, "intensity"], list(bin_indices_neg), - FUN = function(x) { mean(x[which(x > dims_thresh)]) }) + FUN = function(x) { mean(x) }) # set NA to zero in second column aggr_int_neg[is.na(aggr_int_neg[, 2]), 2] <- 0 neg_bins[aggr_int_neg[, 1]] <- aggr_int_neg[, 2] @@ -108,8 +120,8 @@ pos_results_transpose <- t(pos_results) neg_results_transpose <- t(neg_results) # Add file names as row names -rownames(pos_results_transpose) <- sample_name -rownames(neg_results_transpose) <- sample_name +rownames(pos_results_transpose) <- techrep_name +rownames(neg_results_transpose) <- techrep_name # delete the last value of breaks_fwhm_avg to match dimensions of pos_results and neg_results breaks_fwhm_avg_minuslast <- breaks_fwhm_avg[-length(breaks_fwhm_avg)] @@ -126,4 +138,10 @@ neg_results_final <- t(neg_results_transpose) peak_list <- list("pos" = pos_results_final, "neg" = neg_results_final, "breaksFwhm" = breaks_fwhm) -save(peak_list, file = paste0(sample_name, ".RData")) +save(peak_list, file = paste0(techrep_name, ".RData")) + +# for TIC plots: write txt files with data including threshold values for plots +dims_thresh <- c(rep(dims_thresh_pos, length(pos_times)), rep(dims_thresh_neg, length(neg_times))) +tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic, round(dims_thresh, 0)) +colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity", "threshold") +write.table(tic_intensity_persample, file = paste0(techrep_name, "_TIC.txt")) From 15403cd559ab7f953858bfee430df3575a670dec Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:39:05 +0200 Subject: [PATCH 09/31] replaced AverageTechReplicates step by EvaluateTics --- DIMS/EvaluateTics.R | 65 ++++++++++++++++++++++++++++++++++---------- DIMS/EvaluateTics.nf | 3 +- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/DIMS/EvaluateTics.R b/DIMS/EvaluateTics.R index 5b8c1e30..bc66a34e 100644 --- a/DIMS/EvaluateTics.R +++ b/DIMS/EvaluateTics.R @@ -13,7 +13,6 @@ highest_mz_file <- cmd_args[5] highest_mz <- get(load(highest_mz_file)) trim_params_filepath <- cmd_args[6] thresh2remove <- 1000000000 -dims_thresh <- 100 remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { # collect list of samples to remove from replication pattern @@ -76,31 +75,62 @@ for (sample_nr in 1:length(repl_pattern)) { if (sum(peak_list$neg[, 1]) < thresh2remove) { cat(" ... Removed") remove_neg <- c(remove_neg, tech_reps[file_nr]) - } + } tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg) } } +# negative scan mode +print("neg") pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) repl_pattern_filtered <- pattern_list$pattern +print(repl_pattern_filtered) +print(length(repl_pattern_filtered)) save(repl_pattern_filtered, file = "negative_repl_pattern.RData") -write.table( - remove_neg, - file = "miss_infusions_negative.txt", - row.names = FALSE, - col.names = FALSE, - sep = "\t" +# save replication pattern in txt file for use in Nextflow +allsamples_techreps_neg <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered)) +for (sample_nr in 1:length(repl_pattern_filtered)) { + allsamples_techreps_neg[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr] + allsamples_techreps_neg[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";") +} +allsamples_techreps_neg[, 3] <- "negative" + +# write information on miss_infusions +write.table(remove_neg, + file = "miss_infusions_negative.txt", + row.names = FALSE, + col.names = FALSE, + sep = "\t" ) +# positive scan mode pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) +print(pattern_list) repl_pattern_filtered <- pattern_list$pattern save(repl_pattern_filtered, file = "positive_repl_pattern.RData") -write.table( - remove_pos, - file = "miss_infusions_positive.txt", - row.names = FALSE, - col.names = FALSE, - sep = "\t" +# save replication pattern in txt file for use in Nextflow +allsamples_techreps_pos <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered)) +for (sample_nr in 1:length(repl_pattern_filtered)) { + allsamples_techreps_pos[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr] + allsamples_techreps_pos[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";") +} +allsamples_techreps_pos[, 3] <- "positive" + +# combine information on samples and technical replicates for pos and neg +allsamples_techreps <- rbind(allsamples_techreps_pos, allsamples_techreps_neg) +write.table(allsamples_techreps, + file = paste0("replicates_per_sample.txt"), + col.names = FALSE, + row.names = FALSE, + sep = "," +) + +# write information on miss_infusions +write.table(remove_pos, + file = "miss_infusions_positive.txt", + row.names = FALSE, + col.names = FALSE, + sep = "\t" ) ## generate TIC plots @@ -127,8 +157,11 @@ for (sample_nr in c(1:length(repl_pattern))) { sample_name <- names(repl_pattern)[sample_nr] for (file_nr in 1:length(tech_reps)) { plot_nr <- plot_nr + 1 - # repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = "")) repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt")) + # get threshold values per technical replicate + dims_thresh_pos <- repl1_nr[1, "threshold"] + dims_thresh_neg <- repl1_nr[nrow(repl1_nr), "threshold"] + # for replicates with bad TIC, determine what color the border of the plot should be bad_color_pos <- tech_reps[file_nr] %in% remove_pos bad_color_neg <- tech_reps[file_nr] %in% remove_neg if (bad_color_neg & bad_color_pos) { @@ -147,6 +180,8 @@ for (sample_nr in c(1:length(repl_pattern))) { geom_vline(xintercept = trim_right_pos, col = "red", linetype = 2, linewidth = 0.3) + geom_vline(xintercept = trim_left_neg, col = "red", linetype = 2, linewidth = 0.3) + geom_vline(xintercept = trim_right_neg, col = "red", linetype = 2, linewidth = 0.3) + + geom_hline(yintercept = dims_thresh_pos, col = "green", linetype = 2, linewidth = 0.3) + + geom_hline(yintercept = dims_thresh_neg, col = "blue", linetype = 2, linewidth = 0.3) + labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], " || ", sample_name)) + theme(plot.background = element_rect(fill = plot_color), axis.text = element_text(size = 4), diff --git a/DIMS/EvaluateTics.nf b/DIMS/EvaluateTics.nf index 7ce58684..831c9500 100644 --- a/DIMS/EvaluateTics.nf +++ b/DIMS/EvaluateTics.nf @@ -15,7 +15,8 @@ process EvaluateTics { path(trim_params_file) output: - path('*_repl_pattern.RData'), emit: pattern_files + path('*_repl_pattern.RData'), emit: pattern_files + path('replicates_per_sample.txt'), emit: sample_techreps path('miss_infusions_negative.txt') path('miss_infusions_positive.txt') path('*_TICplots.pdf') From 6a7ada6d303af488141a77252b7b8487ad759b4c Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:39:59 +0200 Subject: [PATCH 10/31] replaced AverageTechReplicates step by EvaluateTics --- DIMS/AverageTechReplicates.R | 200 ---------------------------------- DIMS/AverageTechReplicates.nf | 35 ------ 2 files changed, 235 deletions(-) delete mode 100644 DIMS/AverageTechReplicates.R delete mode 100644 DIMS/AverageTechReplicates.nf diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R deleted file mode 100644 index 98a2d414..00000000 --- a/DIMS/AverageTechReplicates.R +++ /dev/null @@ -1,200 +0,0 @@ -# adapted from 3-AverageTechReplicates.R - -# load packages -library("ggplot2") -library("gridExtra") - -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) - -init_file <- cmd_args[1] -nr_replicates <- as.numeric(cmd_args[2]) -run_name <- cmd_args[3] -dims_matrix <- cmd_args[4] -highest_mz_file <- cmd_args[5] -highest_mz <- get(load(highest_mz_file)) -breaks_filepath <- cmd_args[6] -thresh2remove <- 1000000000 -dims_thresh <- 100 - -remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { - # collect list of samples to remove from replication pattern - remove_from_group <- NULL - for (sample_nr in 1:length(repl_pattern)){ - repl_pattern_1sample <- repl_pattern[[sample_nr]] - remove <- NULL - for (file_nr in 1:length(repl_pattern_1sample)) { - if (repl_pattern_1sample[file_nr] %in% bad_samples) { - remove <- c(remove, file_nr) - } - } - if (length(remove) == nr_replicates) { - remove_from_group <- c(remove_from_group, sample_nr) - } - if (!is.null(remove)) { - repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove] - } - } - if (length(remove_from_group) != 0) { - repl_pattern <- repl_pattern[-remove_from_group] - } - return(list("pattern" = repl_pattern)) -} - -# load init_file: contains repl_pattern -load(init_file) - -# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, -# trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos -load(breaks_filepath) - -# lower the threshold below which a sample will be removed for DBS and for high m/z -if (dims_matrix == "DBS") { - thresh2remove <- 500000000 -} -if (highest_mz > 700) { - thresh2remove <- 1000000 -} - - -# remove technical replicates which are below the threshold -remove_neg <- NULL -remove_pos <- NULL -cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") -for (sample_nr in 1:length(repl_pattern)) { - tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) - tech_reps_array_pos <- NULL - tech_reps_array_neg <- NULL - sum_neg <- 0 - sum_pos <- 0 - nr_pos <- 0 - nr_neg <- 0 - for (file_nr in 1:length(tech_reps)) { - load(paste(tech_reps[file_nr], ".RData", sep = "")) - cat("\n\nParsing", tech_reps[file_nr]) - # negative scanmode - cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) - if (sum(peak_list$neg[, 1]) < thresh2remove) { - cat(" ... Removed") - remove_neg <- c(remove_neg, tech_reps[file_nr]) - } else { - nr_neg <- nr_neg + 1 - sum_neg <- sum_neg + peak_list$neg - } - tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg) - # positive scanmode - cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1])) - if (sum(peak_list$pos[, 1]) < thresh2remove) { - cat(" ... Removed") - remove_pos <- c(remove_pos, tech_reps[file_nr]) - } else { - nr_pos <- nr_pos + 1 - sum_pos <- sum_pos + peak_list$pos - } - tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos) - } - # save to file - if (nr_neg != 0) { - sum_neg[, 1] <- sum_neg[, 1] / nr_neg - colnames(sum_neg) <- names(repl_pattern)[sample_nr] - save(sum_neg, file = paste0(names(repl_pattern)[sample_nr], "_neg_avg.RData")) - } - if (nr_pos != 0) { - sum_pos[, 1] <- sum_pos[, 1] / nr_pos - colnames(sum_pos) <- names(repl_pattern)[sample_nr] - save(sum_pos, file = paste0(names(repl_pattern)[sample_nr], "_pos_avg.RData")) - } -} - -pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) -repl_pattern_filtered <- pattern_list$pattern -save(repl_pattern_filtered, file = "negative_repl_pattern.RData") -write.table( - remove_neg, - file = "miss_infusions_negative.txt", - row.names = FALSE, - col.names = FALSE, - sep = "\t" -) - -pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) -repl_pattern_filtered <- pattern_list$pattern -save(repl_pattern_filtered, file = "positive_repl_pattern.RData") -write.table( - remove_pos, - file = "miss_infusions_positive.txt", - row.names = FALSE, - col.names = FALSE, - sep = "\t" -) - -## generate TIC plots -# get all txt files -tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt") -all_samps <- sub("_TIC\\..*$", "", basename(tic_files)) - -# determine maximum intensity -highest_tic_max <- 0 -for (file in tic_files) { - tic <- read.table(file) - this_tic_max <- max(tic$tic_intensity) - if (this_tic_max > highest_tic_max) { - highest_tic_max <- this_tic_max - max_sample <- sub("_TIC\\..*$", "", basename(file)) - } -} - -# create a list with information for all TIC plots -tic_plot_list <- list() -plot_nr <- 0 -for (sample_nr in c(1:length(repl_pattern))) { - tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) - sample_name <- names(repl_pattern)[sample_nr] - for (file_nr in 1:length(tech_reps)) { - plot_nr <- plot_nr + 1 - # repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = "")) - repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt")) - bad_color_pos <- tech_reps[file_nr] %in% remove_pos - bad_color_neg <- tech_reps[file_nr] %in% remove_neg - if (bad_color_neg & bad_color_pos) { - plot_color <- "#F8766D" - } else if (bad_color_pos) { - plot_color <- "#ED8141" - } else if (bad_color_neg) { - plot_color <- "#BF80FF" - } else { - plot_color <- "white" - } - tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) + - geom_line(linewidth = 0.3) + - geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) + - geom_vline(xintercept = trim_left_pos, col = "red", linetype = 2, linewidth = 0.3) + - geom_vline(xintercept = trim_right_pos, col = "red", linetype = 2, linewidth = 0.3) + - geom_vline(xintercept = trim_left_neg, col = "red", linetype = 2, linewidth = 0.3) + - geom_vline(xintercept = trim_right_neg, col = "red", linetype = 2, linewidth = 0.3) + - labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], " || ", sample_name)) + - theme(plot.background = element_rect(fill = plot_color), - axis.text = element_text(size = 4), - axis.title = element_text(size = 4), - plot.title = element_text(size = 6)) - tic_plot_list[[plot_nr]] <- tic_plot - } -} - -# create a layout matrix dependent on number of replicates -layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE) -# put TIC plots in matrix -tic_plot_pdf <- marrangeGrob( - grobs = tic_plot_list, - nrow = 10, ncol = nr_replicates, - layout_matrix = layout, - top = quote(paste( - "TICs of run", run_name, - " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", - g, "/", npages - )) -) - -# save to file -ggsave(filename = paste0(run_name, "_TICplots.pdf"), - tic_plot_pdf, width = 21, height = 29.7, units = "cm") diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf deleted file mode 100644 index f4b85b9a..00000000 --- a/DIMS/AverageTechReplicates.nf +++ /dev/null @@ -1,35 +0,0 @@ -process AverageTechReplicates { - tag "DIMS AverageTechReplicates" - label 'AverageTechReplicates' - container = 'docker://umcugenbioinf/dims:1.3' - shell = ['/bin/bash', '-euo', 'pipefail'] - - input: - path(rdata_file) - path(tic_txt_files) - path(init_file) - val(nr_replicates) - val(analysis_id) - val(matrix) - path(highest_mz_file) - path(breaks_file) - - output: - path('*_repl_pattern.RData'), emit: pattern_files - path('*_avg.RData'), emit: binned_files - path('miss_infusions_negative.txt') - path('miss_infusions_positive.txt') - path('*_TICplots.pdf') - - script: - """ - Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \ - $params.nr_replicates \ - $analysis_id \ - $matrix \ - $highest_mz_file \ - $breaks_file - """ -} - - From 6438e0e6cb0c24bf440b98b632b4d5abb46b9fb5 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:43:38 +0200 Subject: [PATCH 11/31] removed breaks as input for PeakFinding --- DIMS/PeakFinding.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf index dad2a497..04c82540 100644 --- a/DIMS/PeakFinding.nf +++ b/DIMS/PeakFinding.nf @@ -5,13 +5,14 @@ process PeakFinding { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(path(rdata_file), path(breaks_file)) + path(rdata_file) + each path(sample_techreps) output: - path '*tive.RData' + path '*tive.RData', optional: true script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.preprocessing_scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $params.resolution $params.preprocessing_scripts_dir """ } From 1ef197d7c52981cba19be936299c1a95c7492f3b Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:44:27 +0200 Subject: [PATCH 12/31] changed PeakFinding to new two-step method --- DIMS/PeakFinding.R | 66 +++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index c54a64cf..529370d4 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -2,57 +2,57 @@ cmd_args <- commandArgs(trailingOnly = TRUE) replicate_rdatafile <- cmd_args[1] -breaks_file <- cmd_args[2] -resol <- as.numeric(cmd_args[3]) -preprocessing_scripts_dir <- cmd_args[4] -peak_thresh <- 2000 +resol <- as.numeric(cmd_args[2]) +preprocessing_scripts_dir <- cmd_args[3] +# use fixed theshold between noise and signal for peak +peak_thresh <- 2000 -# load in function scripts +# source functions script source(paste0(preprocessing_scripts_dir, "peak_finding_functions.R")) -load(breaks_file) +library(dplyr) -# Load output of AssignToBins for a sample -sample_techrepl <- get(load(replicate_rdatafile)) +# Load output of AssignToBins (peak_list) for a technical replicate +load(replicate_rdatafile) techrepl_name <- colnames(peak_list$pos)[1] +# load list of technical replicates per sample that passed threshold filter +techreps_passed <- read.table("replicates_per_sample.txt", sep=",") + # Initialize options(digits = 16) # run the findPeaks function scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { - # turn dataframe with intensities into a named list + # get intensities for scan mode if (scanmode == "positive") { ints_perscanmode <- peak_list$pos } else if (scanmode == "negative") { ints_perscanmode <- peak_list$neg } - - ints_fullrange <- as.vector(ints_perscanmode) - names(ints_fullrange) <- rownames(ints_perscanmode) - + + # check whether technical replicate has passed threshold filter for this scanmode + techreps_scanmode <- techreps_passed[grep(scanmode, techreps_passed[, 3]), ] + # if techrep is ok, it will be found. If not, skip this techrep. + if (length(grep(techrepl_name, techreps_scanmode)) == 0) { + break + } + + # put mz and intensities into dataframe + ints_fullrange <- as.data.frame(cbind(mz = as.numeric(rownames(ints_perscanmode)), + int = as.numeric(ints_perscanmode))) + # look for m/z range for all peaks - allpeaks_values <- search_mzrange(ints_fullrange, resol, techrepl_name, peak_thresh) - - # turn the list into a dataframe - outlist_techrep <- NULL - outlist_techrep <- cbind("samplenr" = allpeaks_values$nr, - "mzmed.pkt" = allpeaks_values$mean, - "fq" = allpeaks_values$qual, - "mzmin.pkt" = allpeaks_values$min, - "mzmax.pkt" = allpeaks_values$max, - "height.pkt" = allpeaks_values$area) - - # remove peaks with height = 0 - outlist_techrep <- outlist_techrep[outlist_techrep[, "height.pkt"] != 0, ] - + regions_of_interest <- search_regions_of_interest(ints_fullrange) + + # fit Gaussian curve and calculate integrated area under curve + integrated_peak_df <- integrate_peaks(ints_fullrange, regions_of_interest, resol, peak_thresh) + + # add sample name to dataframe + integrated_peak_df <- as.data.frame(cbind(samplenr = techrepl_name, integrated_peak_df)) + # save output to file - save(outlist_techrep, file = paste0(techrepl_name, "_", scanmode, ".RData")) - - # generate text output to log file on number of spikes for this sample - # spikes are peaks that are too narrow, e.g. 1 data point - cat(paste("There were", allpeaks_values$spikes, "spikes")) - + save(integrated_peak_df, file = paste0(techrepl_name, "_", scanmode, ".RData")) } From 9272ec3f93884064b0949d06bcc14c523b65a5a8 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:46:17 +0200 Subject: [PATCH 13/31] functions for new two-step PeakFinding method --- DIMS/preprocessing/peak_finding_functions.R | 956 ++++---------------- 1 file changed, 194 insertions(+), 762 deletions(-) diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R index 1baef120..8c2f62d7 100644 --- a/DIMS/preprocessing/peak_finding_functions.R +++ b/DIMS/preprocessing/peak_finding_functions.R @@ -1,633 +1,204 @@ # functions for peak finding -search_mzrange <- function(ints_fullrange, resol, sample_name, peak_thresh) { - #' Divide the full m/z range into regions of interest with min, max and mean m/z + +search_regions_of_interest <- function(ints_fullrange) { + #' Divide the full m/z range into regions of interest with indices #' - #' @param ints_fullrange: Named list of intensities (float) - #' @param resol: Value for resolution (integer) - #' @param sample_name: Sample name (string) - #' @param peak_thresh: Value for noise level threshold (integer) + #' @param ints_fullrange: Matrix with m/z values and intensities (float) #' - #' @return allpeaks_values: list of m/z regions of interest + #' @return regions_of_interest: matrix of m/z regions of interest (integer) - # initialize list to store results for all peaks - allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL, - "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0) - # find indices where intensity is not equal to zero - nonzero_positions <- as.vector(which(ints_fullrange != 0)) - - # initialize - # start position of the first peak - start_index <- nonzero_positions[1] - # maximum length of region of interest - max_roi_length <- 15 - - # find regions of interest - for (running_index in 1:length(nonzero_positions)) { - # find position of the end of a peak. - if (running_index < length(nonzero_positions) && (nonzero_positions[running_index + 1] - nonzero_positions[running_index]) > 1) { - end_index <- nonzero_positions[running_index] - # get m/z values and intensities for this region of interest - mass_vector <- as.numeric(names(ints_fullrange)[c(start_index:end_index)]) - int_vector <- as.vector(ints_fullrange[c(start_index:end_index)]) - # check if intensity is above threshold or the maximum intensity is NaN - if (max(int_vector) < peak_thresh || is.nan(max(int_vector))) { - # go to next region of interest - start_index <- nonzero_positions[running_index + 1] - next - } - # check if there are more intensities than maximum for region of interest - if (length(int_vector) > max_roi_length) { - print("vector of intensities is longer than max") - print(length(int_vector)) - # trim lowest intensities to zero - #int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0 - # split the range into multiple sub ranges - #sub_range <- int_vector - #names(sub_range) <- mass_vector - #allpeaks_values <- search_mzrange(sub_range, allpeaks_values, resol, - # sample_name, peak_thresh) - # A proper peak needs to have at least 3 intensities above threshold - } else if (length(int_vector) > 3) { - # check if the sum of intensities is above zero. Why is this necessary? - #if (sum(int_vector) == 0) next - # define mass_diff as difference between last and first value of mass_vector - # mass_diff <- mass_vector[length(mass_vector)] - mass_vector[1] - # generate a second mass_vector with equally spaced m/z values - mass_vector_eq <- seq(mass_vector[1], mass_vector[length(mass_vector)], - length = 10 * length(mass_vector)) - - # Find the index in int_vector with the highest intensity - # max_index <- which(int_vector == max(int_vector)) - # get initial fit values - roi_values <- fit_gaussian(mass_vector_eq, mass_vector, int_vector, - resol, force_nr = length(max_index), - use_bounds = FALSE) - - if (roi_values$qual[1] == 1) { - # get optimized fit values - roi_values <- fit_optim(mass_vector, int_vector, resol) - # add region of interest to list of all peaks - allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) - allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) - allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) - allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) - allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) - allpeaks_values$qual <- c(allpeaks_values$qual, 0) - allpeaks_values$spikes <- allpeaks_values$spikes + 1 - - } else { - for (j in 1:length(roi_values$mean)){ - allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean[j]) - allpeaks_values$area <- c(allpeaks_values$area, roi_values$area[j]) - allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) - allpeaks_values$min <- c(allpeaks_values$min, roi_values$min[1]) - allpeaks_values$max <- c(allpeaks_values$max, roi_values$max[1]) - allpeaks_values$qual <- c(allpeaks_values$qual, roi_values$qual[1]) - } + nonzero_positions <- as.vector(which(ints_fullrange$int != 0)) + + # find regions of interest (look for consecutive numbers) + regions_of_interest_consec <- seqToIntervals(nonzero_positions) + # add length of regions of interest + regions_of_interest_diff <- regions_of_interest_consec[, 2] - regions_of_interest_consec[, 1] + 1 + regions_of_interest_length <- cbind(regions_of_interest_consec, length = regions_of_interest_diff) + # remove short lengths; a peak should have at least 3 data points + if (any(regions_of_interest_length[, "length"] < 3)) { + regions_of_interest_gte3 <- regions_of_interest_length[-which(regions_of_interest_length[, "length"] < 3), ] + } else { + regions_of_interest_gte3 <- regions_of_interest_length + } + # test for length of roi. If length is greater than 11, remove roi and break up into separate rois + remove_roi_index <- c() + new_rois_all <- regions_of_interest_gte3[0, ] + for (roi_nr in 1:nrow(regions_of_interest_gte3)) { + if (regions_of_interest_gte3[roi_nr, "length"] > 11) { + roi <- ints_fullrange[(regions_of_interest_gte3[roi_nr, "from"]:regions_of_interest_gte3[roi_nr, "to"]), ] + roi_intrange <- as.numeric(roi$int) + # look for local minima that separate the peaks + local_min_positions <- which(diff(sign(diff(roi_intrange))) == 2) + 1 + if (length(local_min_positions) > 0) { + remove_roi_index <- c(remove_roi_index, roi_nr) + # find new indices for rois after splitting + start_pos <- regions_of_interest_gte3[roi_nr, "from"] + new_rois <- data.frame(from = 0, to = 0, length = 0) + new_rois_splitroi <- regions_of_interest_gte3[0, ] + for (local_min_index in 1:length(local_min_positions)) { + new_rois[, 1] <- start_pos + new_rois[, 2] <- start_pos + local_min_positions[local_min_index] + new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1 + new_rois_splitroi <- rbind(new_rois_splitroi, new_rois) + start_pos <- new_rois[, 2] } - + # last part + new_rois[, 1] <- start_pos + new_rois[, 2] <- regions_of_interest_gte3[roi_nr, "to"] + new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1 + new_rois_splitroi <- rbind(new_rois_splitroi, new_rois) + # append + new_rois_all <- rbind(new_rois_all, new_rois_splitroi) } else { - - roi_values <- fit_optim(mass_vector, int_vector, resol) - allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) - allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) - allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) - allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) - allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) - allpeaks_values$qual <- c(allpeaks_values$qual, 0) - allpeaks_values$spikes <- allpeaks_values$spikes + 1 + # if there are no local minima, all intensities belong to one peak. + remove_roi_index <- c(remove_roi_index, roi_nr) + # look for maximum and take 5 intensities to the left and right + max_index <- which(roi_intrange == max(roi_intrange)) + max_pos <- as.numeric(rownames(roi)[max_index]) + new_roi <- data.frame(from = 0, to = 0, length = 0) + new_roi[, 1] <- max_pos - 5 + new_roi[, 2] <- max_pos + 5 + new_roi[, 3] <- new_roi[, 2] - new_roi[, 1] + 1 + # append + new_rois_all <- rbind(new_rois_all, new_roi) } } - start_index <- nonzero_positions[running_index + 1] } - return(allpeaks_values) -} - -fit_gaussian <- function(mass_vector_eq, mass_vector, int_vector, - resol, force_nr, use_bounds) { - #' Fit 1 or 2 Gaussian peaks in small region of m/z - #' - #' @param mass_vector_eq: Vector of equally spaced m/z values (float) - #' @param mass_vector: Vector of m/z values for a region of interest (float) - #' @param int_vector: Value used to calculate area under Gaussian curve (integer) - #' @param resol: Value for resolution (integer) - #' @param force_nr: Number of local maxima in int_vector (integer) - #' @param use_bounds: Boolean to indicate whether boundaries are to be used - #' - #' @return roi_value_list: list of fit values for region of interest (list) - - # Find the index in int_vector with the highest intensity - max_index <- which(int_vector == max(int_vector)) - - # Initialise - peak_mean <- NULL - peak_area <- NULL - peak_qual <- NULL - peak_min <- NULL - peak_max <- NULL - fit_quality1 <- 0.15 - fit_quality <- 0.2 - # set initial value for scale factor - scale_factor <- 2 - - # One local maximum: - if (force_nr == 1) { - # determine fit values for 1 Gaussian peak - fit_values <- fit_1peak(mass_vector_eq, mass_vector, int_vector, max_index, resol, - fit_quality1, use_bounds) - - # test if the mean is outside the m/z range - if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) { - # run this function again with fixed boundaries - return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, resol, - force_nr = 1, use_bounds = TRUE)) - } else { - # test if the fit is bad - if (fit_values$qual > fit_quality1) { - # Try to fit two curves; find two local maxima. NB: max_index (now new_index) removed from fit_gaussian - new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 - # test if there are two indices in new_index - if (length(new_index) != 2) { - new_index <- round(length(mass_vector) / 3) - new_index <- c(new_index, 2 * new_index) - } - # run this function again with two local maxima - return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, - resol, force_nr = 2, use_bounds = FALSE)) - # good fit - } else { - peak_mean <- c(peak_mean, fit_values$mean) - peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale_factor, - fit_values$sigma)) - peak_qual <- fit_values$qual - peak_min <- mass_vector[1] - peak_max <- mass_vector[length(mass_vector)] - } - } - - #### Two local maxima; need at least 6 data points for this #### - } else if (force_nr == 2 && (length(mass_vector) > 6)) { - # determine fit values for 2 Gaussian peaks - fit_values <- fit_2peaks(mass_vector_eq, mass_vector, int_vector, max_index, resol, - use_bounds, fit_quality) - # test if one of the means is outside the m/z range - if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || - fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) { - # check if fit quality is bad - if (fit_values$qual > fit_quality) { - # run this function again with fixed boundaries - return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, resol, - force_nr = 2, use_bounds = TRUE)) - } else { - # check which mean is outside range and remove it from the list of means - # NB: peak_mean and other variables have not been given values from 2-peak fit yet! - for (i in 1:length(fit_values$mean)){ - if (fit_values$mean[i] < mass_vector[1] || fit_values$mean[i] > mass_vector[length(mass_vector)]) { - peak_mean <- c(peak_mean, -i) - peak_area <- c(peak_area, -i) - } else { - peak_mean <- c(peak_mean, fit_values$mean[i]) - peak_area <- c(peak_area, fit_values$area[i]) - } - } - peak_qual <- fit_values$qual - peak_min <- mass_vector[1] - peak_max <- mass_vector[length(mass_vector)] - } - # if all means are within range - } else { - # check for bad fit - if (fit_values$qual > fit_quality) { - # Try to fit three curves; find three local maxima - new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 - # test if there are three indices in new_index - if (length(new_index) != 3) { - new_index <- round(length(mass_vector) / 4) - new_index <- c(new_index, 2 * new_index, 3 * new_index) - } - # run this function again with three local maxima - return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, - resol, force_nr = 3, use_bounds = FALSE)) - # good fit, all means are within m/z range - } else { - # check if means are within 3 ppm and sum if so - tmp <- fit_values$qual - nr_means_new <- -1 - nr_means <- length(fit_values$mean) - while (nr_means != nr_means_new) { - nr_means <- length(fit_values$mean) - fit_values <- within_ppm(fit_values$mean, fit_values$scale_factor, fit_values$sigma, fit_values$area, - mass_vector_eq, mass_vector, ppm = 4, resol) - nr_means_new <- length(fit_values$mean) - } - # restore original quality score - fit_values$qual <- tmp - - for (i in 1:length(fit_values$mean)){ - peak_mean <- c(peak_mean, fit_values$mean[i]) - peak_area <- c(peak_area, fit_values$area[i]) - } - peak_qual <- fit_values$qual - peak_min <- mass_vector[1] - peak_max <- mass_vector[length(mass_vector)] - } - } - - } else { # More than two local maxima; fit 1 peak. - fit_quality1 <- 0.40 - use_bounds <- TRUE - max_index <- which(int_vector == max(int_vector)) - fit_values <- fit_1peak(mass_vector_eq, mass_vector, int_vector, max_index, resol, - fit_quality1, use_bounds) - # check for bad fit - if (fit_values$qual > fit_quality1) { - # get fit values from fit_optim - fit_values <- fit_optim(mass_vector, int_vector, resol) - peak_mean <- c(peak_mean, fit_values$mean) - peak_area <- c(peak_area, fit_values$area) - peak_min <- fit_values$min - peak_max <- fit_values$max - peak_qual <- 0 - } else { - peak_mean <- c(peak_mean, fit_values$mean) - peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale_factor, fit_values$sigma)) - peak_qual <- fit_values$qual - peak_min <- mass_vector[1] - peak_max <- mass_vector[length(mass_vector)] - } + # remove rois that have been split into chunks or shortened + #print(dim(regions_of_interest_gte3)) + if (length(remove_roi_index) > 0) { + regions_of_interest_minus_short <- regions_of_interest_gte3[-remove_roi_index, ] + } else { + regions_of_interest_minus_short <- regions_of_interest_gte3 + } + #print(dim(regions_of_interest_minus_short)) + # combine remaining rois with info on chunks + regions_of_interest_split <- rbind(regions_of_interest_minus_short, new_rois_all) + # remove regions of interest with short lengths again + if (any(regions_of_interest_split[, "length"] < 3)) { + regions_of_interest_final <- regions_of_interest_split[-which(regions_of_interest_split[, "length"] < 3), ] + } else { + regions_of_interest_final <- regions_of_interest_split } - - # put all values for this region of interest into a list - roi_value_list <- list("mean" = peak_mean, - "area" = peak_area, - "qual" = peak_qual, - "min" = peak_min, - "max" = peak_max) - return(roi_value_list) -} - -fit_1peak <- function(mass_vector_eq, mass_vector, int_vector, max_index, - resol, fit_quality, use_bounds) { - #' Fit 1 Gaussian peak in small region of m/z - #' - #' @param mass_vector_eq: Vector of equally spaced m/z values (float) - #' @param mass_vector: Vector of m/z values for a region of interest (float) - #' @param int_vector: Value used to calculate area under Gaussian curve (integer) - #' @param max_index: Index in int_vector with the highest intensity (integer) - #' @param resol: Value for resolution (integer) - #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) - #' @param use_bounds: Boolean to indicate whether boundaries are to be used - #' - #' @return roi_value_list: list of fit values for region of interest (list) - - # set initial value for scale_factor - scale_factor <- 2 - - if (length(int_vector) < 3) { - message("Range too small, no fit possible!") + # sort on first index + if (nrow(regions_of_interest_final) > 1){ + regions_of_interest_sorted <- regions_of_interest_final %>% dplyr::arrange(from) } else { - if (length(int_vector) == 4) { - # fit 1 peak - weighted_mu <- weighted.mean(mass_vector, int_vector) - sigma <- get_stdev(mass_vector, int_vector) - fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) - } else { - # set range vector - if ((length(mass_vector) - length(max_index)) < 2) { - range1 <- c((length(mass_vector) - 4) : length(mass_vector)) - } else if (length(max_index) < 2) { - range1 <- c(1:5) - } else { - range1 <- seq(from = (max_index[1] - 2), to = (max_index[1] + 2)) - } - # remove zero at the beginning of range1 - if (range1[1] == 0) { - range1 <- range1[-1] - } - # remove NA - if (length(which(is.na(int_vector[range1]))) != 0) { - range1 <- range1[-which(is.na(int_vector[range1]))] - } - # fit 1 peak - weighted_mu <- weighted.mean(mass_vector[range1], int_vector[range1]) - sigma <- get_stdev(mass_vector[range1], int_vector[range1]) - fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) - } - - fitted_mz <- fitted_peak$par[1] - fitted_nr <- fitted_peak$par[2] - - # get new value for fit quality and scale_factor - fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new - scale_factor_new <- 1.2 * scale_factor - - # bad fit - if (fq_new > fit_quality) { - # optimize scaling factor - fq <- 0 - scale_factor <- 0 - if (sum(int_vector) > sum(fitted_nr * dnorm(mass_vector, fitted_mz, sigma))) { - # increase scale_factor until convergence - while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_factor_new < 10000)) { - fq <- fq_new - scale_factor <- scale_factor_new - # fit 1 peak - fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) - fitted_mz <- fitted_peak$par[1] - fitted_nr <- fitted_peak$par[2] - # get new value for fit quality and scale_factor - fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new - scale_factor_new <- 1.2 * scale_factor - } - } else { - # decrease scale_factor until convergence - while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_factor_new < 10000)) { - fq <- fq_new - scale_factor <- scale_factor_new - # fit 1 peak - fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) - fitted_mz <- fitted_peak$par[1] - fitted_nr <- fitted_peak$par[2] - # get new value for fit quality and scale_factor - fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new - scale_factor_new <- 0.8 * scale_factor - } - } - # use optimized scale_factor factor to fit 1 peak - if (fq < fq_new) { - fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds) - fitted_mz <- fitted_peak$par[1] - fitted_nr <- fitted_peak$par[2] - fq_new <- fq - } - } + regions_of_interest_sorted <- regions_of_interest_final } - - roi_value_list <- list("mean" = fitted_mz, "scale_factor" = fitted_nr, "sigma" = sigma, "qual" = fq_new) - return(roi_value_list) + + return(regions_of_interest_sorted) } -fit_2peaks <- function(mass_vector_eq, mass_vector, int_vector, max_index, resol, use_bounds = FALSE, - fit_quality) { - #' Fit 2 Gaussian peaks in a small region of m/z +integrate_peaks <- function(ints_fullrange, regions_of_interest, resol, peak_thresh) { + #' Fit Gaussian peak for each region of interest and integrate area under the curve #' - #' @param mass_vector_eq: Vector of equally spaced m/z values (float) - #' @param mass_vector: Vector of m/z values for a region of interest (float) - #' @param int_vector: Value used to calculate area under Gaussian curve (integer) - #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param ints_fullrange: Named list of intensities (float) + #' @param regions_of_interest: Named list of intensities (float) #' @param resol: Value for resolution (integer) - #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) - #' @param use_bounds: Boolean to indicate whether boundaries are to be used - #' - #' @return roi_value_list: list of fit values for region of interest (list) - - peak_mean <- NULL - peak_area <- NULL - peak_scale <- NULL - peak_sigma <- NULL - - # set range vectors for first peak - range1 <- seq(from = (max_index[1] - 2), to = (max_index[1] + 2)) - # remove zero at the beginning of range1 - if (range1[1] == 0) { - range1 <- range1[-1] - } - # set range vectors for second peak - range2 <- seq(from = (max_index[2] - 2), to = (max_index[2] + 2)) - # if range2 ends outside mass_vector, shorten it - if (length(mass_vector) < range2[length(range2)]) { - range2 <- range2[-length(range2)] - } - # check whether the two ranges overlap - range1 <- check_overlap(range1, range2)[[1]] - range2 <- check_overlap(range1, range2)[[2]] - # check for negative or 0 values in range1 or range2 - remove <- which(range1 < 1) - if (length(remove) > 0) { - range1 <- range1[-remove] - } - remove <- which(range2 < 1) - if (length(remove) > 0) { - range2 <- range2[-remove] - } - # remove NA - if (length(which(is.na(int_vector[range1]))) != 0) { - range1 <- range1[-which(is.na(int_vector[range1]))] - } - if (length(which(is.na(int_vector[range2]))) != 0) { - range2 <- range2[-which(is.na(int_vector[range2]))] - } - - # fit 2 peaks, first separately, then together - weighted_mu1 <- weighted.mean(mass_vector[range1], int_vector[range1]) - sigma1 <- get_stdev(mass_vector[range1], int_vector[range1]) - fitted_peak1 <- optimize_1gaussian(mass_vector[range1], int_vector[range1], sigma1, weighted_mu1, scale_factor, use_bounds) - fitted_mz1 <- fitted_peak1$par[1] - fitted_nr1 <- fitted_peak1$par[2] - # second peak - weighted_mu2 <- weighted.mean(mass_vector[range2], int_vector[range2]) - sigma2 <- get_stdev(mass_vector[range2], int_vector[range2]) - fitted_peak2 <- optimize_1gaussian(mass_vector[range2], int_vector[range2], sigma2, weighted_mu2, scale_factor, use_bounds) - fitted_mz2 <- fitted_peak2$par[1] - fitted_nr2 <- fitted_peak2$par[2] - # combined - fitted_2peaks <- optimize_2gaussians(mass_vector, int_vector, sigma1, sigma2, - fitted_mz1, fitted_nr1, - fitted_mz2, fitted_nr2, use_bounds) - fitted_2peaks_mz1 <- fitted_2peaks$par[1] - fitted_2peaks_nr1 <- fitted_2peaks$par[2] - fitted_2peaks_mz2 <- fitted_2peaks$par[3] - fitted_2peaks_nr2 <- fitted_2peaks$par[4] - - # get fit quality - if (is.null(sigma2)) { - sigma2 <- sigma1 + #' @param peak_thresh: Value for noise level threshold (integer) # NOT USED YET + #' + #' @return allpeaks_values: matrix of integrated peaks + + # initialize dataframe to store results for all peaks + allpeaks_values <- matrix(0, nrow = nrow(regions_of_interest), ncol = 5) + colnames(allpeaks_values) <- c("mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + + for (roi_nr in 1:nrow(regions_of_interest)) { + # find m/z values and intensities corresponding to the region of interest + index_range <- regions_of_interest[roi_nr, "from"] : regions_of_interest[roi_nr, "to"] + roi_mzrange <- as.numeric(ints_fullrange$mz[index_range]) + roi_intrange <- as.numeric(ints_fullrange$int[index_range]) + # find m/z value for maximum of peak (mu in Gaussian function) + weighted_mzmean <- weighted.mean(roi_mzrange, roi_intrange) + # find expected peak width at this m/z value + fwhm_mzmed <- get_fwhm(weighted_mzmean, resol) + # calculate sigma for Gaussian curve (https://en.wikipedia.org/wiki/Full_width_at_half_maximum) + sigma_peak <- fwhm_mzmed / 2.355 + # find scale factor for Gaussian. Initial estimate is maximum intensity in roi + # divided by 2 for better correlation with intensities from old PeakFinding method + scale_factor <- max(roi_intrange) / 2 + # fit Gaussian peak. Find intensities according to normal distribution + normal_ints <- scale_factor * gaussfunc(roi_mzrange, weighted_mzmean, sigma_peak) + # sum intensities in roi + sum_ints <- sum(roi_intrange) + sum_gauss <- sum(normal_ints) + # calculate quality of fit + quality_fit <- 1 - sum(abs(normal_ints - roi_intrange)) / sum_ints + # put all values into dataframe + allpeaks_values[roi_nr, "mzmed.pkt"] <- weighted_mzmean + allpeaks_values[roi_nr, "fq"] <- quality_fit + allpeaks_values[roi_nr, "mzmax.pkt"] <- max(roi_mzrange) + allpeaks_values[roi_nr, "mzmin.pkt"] <- min(roi_mzrange) + allpeaks_values[roi_nr, "height.pkt"] <- sum_gauss + } + + # remove peaks with height = 0, look for NA or NaN + remove_na <- which(is.na(as.numeric(allpeaks_values[, "height.pkt"]))) + if (length(remove_na) > 0) { + allpeaks_values <- allpeaks_values[-remove_na, ] } - sum_fit <- (fitted_2peaks_nr1 * dnorm(mass_vector, fitted_2peaks_mz1, sigma1)) + - (fitted_2peaks_nr2 * dnorm(mass_vector, fitted_2peaks_mz2, sigma2)) - lowest_mz <- sort(c(fitted_2peaks_mz1, fitted_2peaks_mz2))[1] - fq_new <- get_fit_quality(mass_vector, int_vector, lowest_mz, - resol, sum_fit = sum_fit)$fq_new - - # get parameter values - area1 <- estimate_area(fitted_2peaks_mz1, resol, fitted_2peaks_nr1, sigma1) - area2 <- estimate_area(fitted_2peaks_mz2, resol, fitted_2peaks_nr2, sigma2) - peak_area <- c(peak_area, area1, area2) - peak_mean <- c(peak_mean, fitted_2peaks_mz1, fitted_2peaks_mz2) - peak_scale <- c(peak_scale, fitted_2peaks_nr1, fitted_2peaks_nr2) - peak_sigma <- c(peak_sigma, sigma1, sigma2) - - roi_value_list <- list("mean" = peak_mean, "scale_factor" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq_new) - return(roi_value_list) -} -optimize_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, scale_factor, use_bounds) { - #' Fit a Gaussian curve for a peak with given parameters - #' - #' @param mass_vector: Vector of masses (float) - #' @param int_vector: Vector of intensities (float) - #' @param sigma: Value for width of the peak (float) - #' @param query_mass: Value for mass at center of peak (float) - #' @param scale_factor: Value for scaling intensities (float) - #' @param use_bounds: Boolean to indicate whether boundaries are to be used - #' - #' @return opt_fit: list of parameters and values describing the optimal fit - - # define optimization function for optim based on normal distribution - opt_f <- function(params) { - d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma) - sum((d - int_vector) ^ 2) - } - if (use_bounds) { - # determine lower and upper boundaries - lower <- c(mass_vector[1], 0, mass_vector[1], 0) - upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) - # get optimal value for fitted Gaussian curve - opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale_factor)), - opt_f, control = list(maxit = 10000), method = "L-BFGS-B", - lower = lower, upper = upper) - } else { - opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale_factor)), - opt_f, control = list(maxit = 10000)) - } - return(opt_fit) + return(allpeaks_values) } -optimize_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2, - query_mass1, scale_factor1, - query_mass2, scale_factor2, use_bounds) { - #' Fit two Gaussian curves for a peak with given parameters - #' - #' @param mass_vector: Vector of masses (float) - #' @param int_vector: Vector of intensities (float) - #' @param sigma1: Value for width of the first peak (float) - #' @param sigma2: Value for width of the second peak (float) - #' @param query_mass1: Value for mass at center of first peak (float) - #' @param scale_factor1: Value for scaling intensities for first peak (float) - #' @param query_mass2: Value for mass at center of second peak (float) - #' @param scale_factor2: Value for scaling intensities for second peak (float) - #' @param use_bounds: Boolean to indicate whether boundaries are to be used - #' - #' @return opt_fit: list of parameters and values describing the optimal fit - - # define optimization function for optim based on normal distribution - opt_f <- function(params) { - d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) + - params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2) - sum((d - int_vector) ^ 2) - } - - if (use_bounds) { - # determine lower and upper boundaries - lower <- c(mass_vector[1], 0, mass_vector[1], 0) - upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) - # get optimal value for 2 fitted Gaussian curves - if (is.null(query_mass2) && is.null(scale_factor2) && is.null(sigma2)) { - sigma2 <- sigma1 - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1), - as.numeric(query_mass1), as.numeric(scale_factor1)), - opt_f, control = list(maxit = 10000), - method = "L-BFGS-B", lower = lower, upper = upper) - } else { - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1), - as.numeric(query_mass2), as.numeric(scale_factor2)), - opt_f, control = list(maxit = 10000), - method = "L-BFGS-B", lower = lower, upper = upper) - } - } else { - if (is.null(query_mass2) && is.null(scale_factor2) && is.null(sigma2)) { - sigma2 <- sigma1 - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1), - as.numeric(query_mass1), as.numeric(scale_factor1)), - opt_f, control = list(maxit = 10000)) - } else { - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1), - as.numeric(query_mass2), as.numeric(scale_factor2)), - opt_f, control = list(maxit = 10000)) +# in the next version of the docker image, package R.utils will be included +seqToIntervals <- function(idx) { + #' Find consecutive stretches of numbers + #' function seqToIntervals copied from R.utils library + #' see https://rdrr.io/cran/R.utils/src/R/seqToIntervals.R + #' + #' @param idx: Sequence of indices (integers) + #' + #' @return res: Matrix of start and end positions of consecutive numbers (matrix) + + # Clean up sequence + idx <- as.integer(idx) + idx <- unique(idx) + idx <- sort(idx) + + len_idx <- length(idx) + if (len_idx == 0L) { + res <- matrix(NA_integer_, nrow = 0L, ncol = 2L) + colnames(res) <- c("from", "to") + return(res) + } + + # Identify end points of intervals + diff_idx <- diff(idx) + diff_idx <- (diff_idx > 1) + diff_idx <- which(diff_idx) + nr_intervals <- length(diff_idx) + 1 + + # Allocate return matrix + res <- matrix(NA_integer_, nrow = nr_intervals, ncol = 2L) + colnames(res) <- c("from", "to") + + from_value <- idx[1] + to_value <- from_value - 1 + last_value <- from_value + + count <- 1 + for (running_index in seq_along(idx)) { + value <- idx[running_index] + if (value - last_value > 1) { + to_value <- last_value + res[count, ] <- c(from_value, to_value) + from_value <- value + count <- count + 1 } + last_value <- value } - return(opt_fit) -} -get_stdev <- function(mass_vector, int_vector, resol = 140000) { - #' Calculate standard deviation to determine width of a peak - #' - #' @param mass_vector: Vector of 3 mass values (float) - #' @param int_vector: Vector of 3 intensities (float) - #' @param resol: Value for resolution (integer) - #' - #' @return stdev: Value for standard deviation - # find maximum intensity in vector - max_index <- which(int_vector == max(int_vector)) - # find corresponding mass at maximum intensity - max_mass <- mass_vector[max_index] - # calculate resolution at given m/z value - resol_mz <- resol * (1 / sqrt(2) ^ (log2(max_mass / 200))) - # calculate full width at half maximum - fwhm <- max_mass / resol_mz - # calculate standard deviation - stdev <- (fwhm / 2) * 0.85 - return(stdev) -} - -get_fit_quality <- function(mass_vector, int_vector, mu_first, resol, scale_factor = NULL, sigma = NULL, sum_fit = NULL) { - #' Get fit quality for 1 Gaussian peak in small region of m/z - #' - #' @param mass_vector: Vector of m/z values for a region of interest (float) - #' @param int_vector: Value used to calculate area under Gaussian curve (integer) - #' @param mu_first: Value for first peak (float) - #' @param scale_factor: Initial value used to estimate scaling parameter (integer) - #' @param resol: Value for resolution (integer) - #' @param sum_fit: Value indicating quality of fit of Gaussian curve (float) - #' - #' @return list_params: list of parameters indicating quality of fit (list) - if (is.null(sum_fit)) { - mass_vector_int <- mass_vector - int_vector_int <- int_vector - # get new fit quality - fq_new <- mean(abs((scale_factor * dnorm(mass_vector_int, mu_first, sigma)) - int_vector_int) / - rep((max(scale_factor * dnorm(mass_vector_int, mu_first, sigma)) / 2), length(mass_vector_int))) - } else { - sum_fit_int <- sum_fit - int_vector_int <- int_vector - mass_vector_int <- mass_vector - # get new fit quality - fq_new <- mean(abs(sum_fit_int - int_vector_int) / rep(max(sum_fit_int) /2, length(sum_fit_int))) + if (to_value < from_value) { + to_value <- last_value + res[count, ] <- c(from_value, to_value) } - - # Prevent division by 0 - if (is.nan(fq_new)) fq_new <- 1 - - list_params <- list("fq_new" = fq_new, "x_int" = mass_vector_int, "y_int" = int_vector_int) - return(list_params) -} -estimate_area <- function(mass_max, resol, scale_factor, sigma) { - #' Estimate area of Gaussian curve - #' - #' @param mass_max: Value for m/z at maximum intensity of a peak (float) - #' @param resol: Value for resolution (integer) - #' @param scale_factor: Value for peak width (float) - #' @param sigma: Value for standard deviation (float) - #' - #' @return area_curve: Value for area under the Gaussian curve (float) - - # calculate width of peak at half maximum - fwhm <- get_fwhm(mass_max, resol) - - # generate a mass_vector with equally spaced m/z values - mz_min <- mass_max - 2 * fwhm - mz_max <- mass_max + 2 * fwhm - mz_range <- mz_max - mz_min - mass_vector_eq <- seq(mz_min, mz_max, length = 100) - - # estimate area under the curve - area_curve <- sum(scale_factor * dnorm(mass_vector_eq, mass_max, sigma)) / 100 - - return(area_curve) + return(res) } get_fwhm <- function(query_mass, resol) { @@ -641,7 +212,7 @@ get_fwhm <- function(query_mass, resol) { # set aberrant values of query_mass to default value of 200 if (is.na(query_mass)) { query_mass <- 200 - } + } if (query_mass < 0) { query_mass <- 200 } @@ -649,168 +220,29 @@ get_fwhm <- function(query_mass, resol) { resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200))) # calculate full width at half maximum fwhm <- query_mass / resol_mz - - return(fwhm) -} - -fit_optim <- function(mass_vector, int_vector, resol) { - #' Determine optimized fit of Gaussian curve to small region of m/z - #' - #' @param mass_vector: Vector of m/z values for a region of interest (float) - #' @param int_vector: Vector of intensities for a region of interest (float) - #' @param resol: Value for resolution (integer) - #' - #' @return roi_value_list: list of fit values for region of interest (list) - - # initial value for scale_factor - scale_factor <- 1.5 - - # Find the index in int_vector with the highest intensity - max_index <- which(int_vector == max(int_vector))[1] - mass_max <- mass_vector[max_index] - int_max <- int_vector[max_index] - # get peak width - fwhm <- get_fwhm(mass_max, resol) - # simplify the peak shape: represent it by a triangle - mass_max_simple <- c(mass_max - scale_factor * fwhm, mass_max, mass_max + scale_factor * fwhm) - int_max_simple <- c(0, int_max, 0) - - # define mass_diff as difference between last and first value of mass_max_simple - # mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1] - # generate a second mass_vector with equally spaced m/z values - mass_vector_eq <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], - length = 100 * length(mass_max_simple)) - sigma <- get_stdev(mass_vector_eq, int_max_simple) - # define optimization function for optim based on normal distribution - opt_f <- function(p, mass_vector, int_vector, sigma, mass_max) { - curve <- p * dnorm(mass_vector, mass_max, sigma) - return((max(curve) - max(int_vector))^2) - } - - # get optimal value for fitted Gaussian curve - opt_fit <- optimize(opt_f, c(0, 100000), tol = 0.0001, mass_vector, int_vector, sigma, mass_max) - scale_factor <- opt_fit$minimum - - # get an estimate of the area under the peak - area <- estimate_area(mass_max, resol, scale_factor, sigma) - # put all values for this region of interest into a list - roi_value_list <- list("mean" = mass_max, - "area" = area, - "min" = mass_vector_eq[1], - "max" = mass_vector_eq[length(mass_vector_eq)]) - return(roi_value_list) -} -within_ppm <- function(mean, scale_factor, sigma, area, mass_vector_eq, mass_vector, ppm = 4, resol) { - #' Test whether two mass ranges are within ppm distance of each other - #' - #' @param mean: Value for mean m/z (float) - #' @param scale_factor: Initial value used to estimate scaling parameter (integer) - #' @param sigma: Value for standard deviation (float) - #' @param area: Value for area under the curve (float) - #' @param mass_vector_eq: Vector of equally spaced m/z values (float) - #' @param mass_vector: Vector of m/z values for a region of interest (float) - #' @param ppm: Value for distance between two values of mass (integer) - #' @param resol: Value for resolution (integer) - #' - #' @return list_params: list of parameters indicating quality of fit (list) - - # sort - index <- order(mean) - mean <- mean[index] - scale_factor <- scale_factor[index] - sigma <- sigma[index] - area <- area[index] - - summed <- NULL - remove <- NULL - - if (length(mean) > 1) { - for (i in 2:length(mean)) { - if ((abs(mean[i - 1] - mean[i]) / mean[i - 1]) * 10^6 < ppm) { - - # avoid double occurrance in sum - if ((i - 1) %in% summed) next - - result_values <- sum_curves(mean[i - 1], mean[i], scale_factor[i - 1], scale_factor[i], sigma[i - 1], sigma[i], - mass_vector_eq, mass_vector, resol) - summed <- c(summed, i - 1, i) - if (is.nan(result_values$mean)) result_values$mean <- 0 - mean[i - 1] <- result_values$mean - mean[i] <- result_values$mean - area[i - 1] <- result_values$area - area[i] <- result_values$area - scale_factor[i - 1] <- result_values$scale_factor - scale_factor[i] <- result_values$scale_factor - sigma[i - 1] <- result_values$sigma - sigma[i] <- result_values$sigma - - remove <- c(remove, i) - } - } - } - - if (length(remove) != 0) { - mean <- mean[-c(remove)] - area <- area[-c(remove)] - scale_factor <- scale_factor[-c(remove)] - sigma <- sigma[-c(remove)] - } - - list_params <- list("mean" = mean, "area" = area, "scale_factor" = scale_factor, "sigma" = sigma, "qual" = NULL) - return(list_params) + return(fwhm) } -sum_curves <- function(mean1, mean2, scale_factor1, scale_factor2, sigma1, sigma2, mass_vector_eq, mass_vector, resol) { - #' Sum two curves - #' - #' @param mean1: Value for mean m/z of first peak (float) - #' @param mean2: Value for mean m/z of second peak (float) - #' @param scale_factor1: Initial value used to estimate scaling parameter for first peak (integer) - #' @param scale_factor2: Initial value used to estimate scaling parameter for second peak (integer) - #' @param sigma1: Value for standard deviation for first peak (float) - #' @param sigma2: Value for standard deviation for second peak (float) - #' @param mass_vector_eq: Vector of equally spaced m/z values (float) - #' @param mass_vector: Vector of m/z values for a region of interest (float) - #' @param resol: Value for resolution (integer) - #' - #' @return list_params: list of parameters indicating quality of fit (list) - - sum_fit <- (scale_factor1 * dnorm(mass_vector_eq, mean1, sigma1)) + (scale_factor2 * dnorm(mass_vector_eq, mean2, sigma2)) - - mean1_plus2 <- weighted.mean(c(mean1, mean2), c(max(scale_factor1 * dnorm(mass_vector_eq, mean1, sigma1)), - max(scale_factor2 * dnorm(mass_vector_eq, mean2, sigma2)))) - - # get new values for parameters - fwhm <- get_fwhm(mean1_plus2, resol) - area <- max(sum_fit) - scale_factor <- scale_factor1 + scale_factor2 - sigma <- (fwhm / 2) * 0.85 - - list_params <- list("mean" = mean1_plus2, "area" = area, "scale_factor" = scale_factor, "sigma" = sigma) - return(list_params) -} -check_overlap <- function(range1, range2) { - #' Modify range1 and range2 in case of overlap - #' - #' @param range1: Vector of m/z values for first peak (float) - #' @param range2: Vector of m/z values for second peak (float) - #' - #' @return new_ranges: list of two ranges (list) - - # Check for overlap - if (length(intersect(range1, range2)) == 2) { - if (length(range1) >= length(range2)) { - range1 <- range1[-length(range1)] - } else { - range2 <- range2[-1] - } - } else if (length(intersect(range1, range2)) == 3) { - range1 <- range1[-length(range1)] - range2 <- range2[-1] - } - new_ranges <- list("range1" = range1, "range2" = range2) - return(new_ranges) +# from https://rdrr.io/cran/rvmethod/src/R/gaussfit.R +#' Gaussian Function from package rvmethod +#' +#' This function returns the unnormalized (height of 1.0) Gaussian curve with a +#' given center and spread. +#' +#' @param x the vector of values at which to evaluate the Gaussian +#' @param mu the center of the Gaussian +#' @param sigma the spread of the Gaussian (must be greater than 0) +#' @return vector of values of the Gaussian +#' @examples x = seq(-4, 4, length.out = 100) +#' y = gaussfunc(x, 0, 1) +#' plot(x, y) +#' +#' @import stats +#' +#' @export +gaussfunc <- function(x, mu, sigma) { + return(exp(-((x - mu) ^ 2) / (2 * (sigma ^ 2)))) } From e006160d51ab942a4da4f88173b0ba1906717624 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:47:10 +0200 Subject: [PATCH 14/31] unit tests for new two-step PeakFinding method --- .../testthat/test_peak_finding_functions.R | 261 +++++------------- 1 file changed, 65 insertions(+), 196 deletions(-) diff --git a/DIMS/tests/testthat/test_peak_finding_functions.R b/DIMS/tests/testthat/test_peak_finding_functions.R index a0db0ca9..1e001a0c 100644 --- a/DIMS/tests/testthat/test_peak_finding_functions.R +++ b/DIMS/tests/testthat/test_peak_finding_functions.R @@ -1,218 +1,87 @@ -# unit tests for PeakFinding functions: +# unit tests for PeakFinding functions: source("../preprocessing/peak_finding_functions.R") -# test fit_quality function: -test_that("fit quality is correctly calculated", { - # initialize - test_resol <- 140000 - test_scale_factor <- 2 - test_sigma <- 0.0001 - test_sum_fit <- NULL - - # create peak info to test on: - test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 - test_int_vector <- c(9000, 16000, 19000, 15000, 6000) - test_mz_max <- max(test_mass_vector) - - expect_type(get_fit_quality(test_mass_vector, test_int_vector, test_mz_max, test_resol, test_scale_factor, test_sigma, test_sum_fit), "list") - expect_equal(get_fit_quality(test_mass_vector, test_int_vector, test_mz_max, test_resol, test_scale_factor, test_sigma, test_sum_fit)$fq_new, 2.426, tolerance = 0.001, TRUE) -}) - -# test get_stdev: -test_that("standard deviation is correctly calculated", { - # create peak info to test on: - test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 - test_int_vector <- c(9000, 16000, 19000, 15000, 6000) - test_resol <- 140000 - - expect_type(get_stdev(test_mass_vector, test_int_vector, test_resol), "double") - expect_equal(get_stdev(test_mass_vector, test_int_vector, test_resol), 0.000126, tolerance = 0.00001, TRUE) -}) - -# test estimate_area -test_that("area under the curve is correctly calculated", { - # create peak info to test on: - test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 - test_mz_max <- max(test_mass_vector) - test_resol <- 140000 - test_scale_factor <- 2 - test_sigma <- 0.0001 - - expect_type(estimate_area(test_mz_max, test_resol, test_scale_factor, test_sigma), "double") - expect_equal(estimate_area(test_mz_max, test_resol, test_scale_factor, test_sigma), 1673.061, tolerance = 0.001, TRUE) -}) - -# test get_fwhm -test_that("fwhm is correctly calculated", { - # create peak info to test on: - test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 - test_mz_max <- max(test_mass_vector) - test_resol <- 140000 - - expect_type(get_fwhm(test_mz_max, test_resol), "double") - expect_equal(get_fwhm(test_mz_max, test_resol), 0.000295865, tolerance = 0.000001, TRUE) -}) - -# test get_fwhm for NA -test_that("fwhm for mass NA gives standard fwhm", { - test_resol <- 140000 - - expect_type(get_fwhm(NA, test_resol), "double") - expect_equal(get_fwhm(NA, test_resol), 0.001428571, tolerance = 0.000001, TRUE) -}) - -# test within_ppm -test_that("two peaks are within 5 ppm", { - # create peak info to test on: - test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 - test_mz_max <- max(test_mass_vector) - test_scale_factor <- 2 - test_sigma <- 0.0001 - test_area <- 20000 - test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100) - test_ppm <- 5 - test_resol <- 140000 - - expect_type(within_ppm(test_mz_max, test_scale_factor, test_sigma, test_area, test_mass_vector_eq, test_mass_vector, test_ppm, test_resol), "list") - expect_equal(within_ppm(test_mz_max, test_scale_factor, test_sigma, test_area, test_mass_vector_eq, test_mass_vector, test_ppm, test_resol)$mean, 70.00962, tolerance = 0.0001, TRUE) -}) - -# test sum_curves -test_that("two curves are correctly summed", { - # create peak info to test on: - test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 - test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100) - test_mean1 <- 70.00938 - test_mean2 <- 70.00962 - test_scale_factor1 <- 2 - test_scale_factor2 <- 4 - test_sigma1 <- 0.0001 - test_sigma2 <- 0.0002 - test_resol <- 140000 +# test search_regions_of_interest function: +test_that("regions of interest are correctly found for a single peak", { + # create intensities to test on + test_stdev <- 0.00006 + test_mass_vector <- rep(70, 10) + 1:10 * test_stdev + test_ints_range <- dnorm(test_mass_vector, mean = 70.0003, sd = test_stdev) + test_ints_fullrange <- as.data.frame(cbind(mz = test_mass_vector, + int = test_ints_range)) - expect_type(sum_curves(test_mean1, test_mean2, test_scale_factor1, test_scale_factor2, test_sigma1, test_sigma2, test_mass_vector_eq, test_mass_vector, test_resol), "list") - expect_equal(sum_curves(test_mean1, test_mean2, test_scale_factor1, test_scale_factor2, test_sigma1, test_sigma2, test_mass_vector_eq, test_mass_vector, test_resol)$mean, 70.0095, tolerance = 0.0001, TRUE) + # expected output + expected_output <- matrix(c(1, 10, 10), nrow = 1, ncol = 3L) + colnames(expected_output) <- c("from", "to", "length") + rownames(expected_output) <- "to" + + # tests + expect_type(search_regions_of_interest(test_ints_fullrange), "double") + expect_equal(nrow(search_regions_of_interest(test_ints_fullrange)), 1) + expect_equal(search_regions_of_interest(test_ints_fullrange), expected_output, tolerance = 0.000001, TRUE) }) - -# test fit_optim -test_that("optimal peak fit can be found", { - # create peak info to test on: - test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 - test_int_vector <- c(9000, 16000, 19000, 15000, 6000) - test_resol <- 140000 - # create output to test on: - test_output <- list(mean = 70.0095, area = 5009.596, min = 70.00906, max = 70.00994) +test_that("regions of interest are correctly found for two peaks", { + # create intensities to test on + test_stdev <- 0.00006 + test_mass_vector <- rep(70, 20) + 1:20 * test_stdev + test_ints_range <- dnorm(test_mass_vector, mean = 70.0003, sd = test_stdev) + + dnorm(test_mass_vector, mean = 70.0005, sd = test_stdev) + test_ints_fullrange <- as.data.frame(cbind(mz = test_mass_vector, + int = test_ints_range)) - expect_type(fit_optim(test_mass_vector, test_int_vector, test_resol), "list") - for (key in names(test_output)) { - expect_equal(fit_optim(test_mass_vector, test_int_vector, test_resol)[[key]], test_output[[key]], tolerance = 0.0001, TRUE) - } -}) - -# test optimize_1gaussian: -test_that("optimal value for m/z and scale_factor can be found", { - # create peak info to test on: - test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 - test_int_vector <- c(9000, 16000, 19000, 15000, 6000) - test_sigma <- 0.0001 - test_query_mass <- median(test_mass_vector) - test_scale_factor <- 2 - test_use_bounds <- FALSE + # expected output + expected_output <- as.data.frame(matrix(c(1, 8, 8, 20, 8, 13), nrow = 2, ncol = 3)) + colnames(expected_output) <- c("from", "to", "length") + rownames(expected_output) <- as.character(c(1, 2)) - expect_type(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds), "list") - expect_equal(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds)$par[1], 70.00949, tolerance = 0.0001, TRUE) - expect_equal(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds)$par[2], 4.570024, tolerance = 0.0001, TRUE) + # tests + expect_type(search_regions_of_interest(test_ints_fullrange), "list") + expect_equal(nrow(search_regions_of_interest(test_ints_fullrange)), 2) + expect_equal(search_regions_of_interest(test_ints_fullrange)[, "length"], as.numeric(expected_output[, "length"])) + expect_equal(search_regions_of_interest(test_ints_fullrange), expected_output, check.attributes = FALSE) }) -# test fit_1peak: -test_that("correct mean m/z and scale_factor can be found for 1 peak", { +# test integrate_peaks function: +test_that("peaks are correctly integrated", { # create peak info to test on: - test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 - test_int_vector <- c(9000, 16000, 19000, 15000, 6000) - test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100) - test_max_index <- which(test_int_vector == max(test_int_vector)) - test_resol <- 140000 - test_fit_quality <- 0.5 - test_use_bounds <- FALSE - # create output to test on: - test_output <- list(mean = 70.0095, scale_factor = 5.23334, sigma = 0.000126, qual = 0.24300) + test_stdev <- 0.00006 + test_mass_vector <- rep(70, 20) + 1:20 * test_stdev + test_ints_range <- dnorm(test_mass_vector, mean = 70.0003, sd = test_stdev) + + dnorm(test_mass_vector, mean = 70.0005, sd = test_stdev) + test_ints_fullrange <- as.data.frame(cbind(mz = test_mass_vector, + int = test_ints_range)) - expect_type(fit_1peak(test_mass_vector_eq, test_mass_vector, test_int_vector, test_max_index, test_resol, test_fit_quality, test_use_bounds), "list") - for (key in names(test_output)) { - expect_equal(fit_1peak(test_mass_vector_eq, test_mass_vector, test_int_vector, test_max_index, test_resol, test_fit_quality, test_use_bounds)[[key]], test_output[[key]], tolerance = 0.0001, TRUE) - } -}) - -# test optimize_2gaussians: -test_that("optimal value for m/z and scale_factor can be found", { - # create info for two peaks in a small region - test_mass_vector_2peaks <- rep(70.00938, 14) + 1:14 * 0.00006 - test_int_vector_2peaks <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000), 2) - test_mass_vector_eq <- seq(min(test_mass_vector_2peaks), max(test_mass_vector_2peaks), length = 100) - test_max_index <- which(test_int_vector_2peaks == max(test_int_vector_2peaks)) - test_sigma1 <- 0.0001 - test_sigma2 <- 0.0002 - test_query_mass1 <- test_max_index[1] - test_query_mass2 <- test_max_index[2] - test_scale_factor1 <- 2 - test_scale_factor2 <- 3 - test_use_bounds <- FALSE + # create regions of interest to test on: + test_regions_of_interest <- as.data.frame(matrix(c(1, 8, 8, 20, 8, 13), nrow = 2, ncol = 3L)) + colnames(test_regions_of_interest) <- c("from", "to", "length") + rownames(test_regions_of_interest) <- as.character(c(1, 2)) - expect_type(optimize_2gaussians(test_mass_vector_2peaks, test_int_vector_2peaks, test_sigma1, test_sigma2, test_query_mass1, test_scale_factor1, test_query_mass2, test_scale_factor2, test_use_bounds), "list") - expect_equal(optimize_2gaussians(test_mass_vector_2peaks, test_int_vector_2peaks, test_sigma1, test_sigma2, test_query_mass1, test_scale_factor1, test_query_mass2, test_scale_factor2, test_use_bounds)$par[1], 4, tolerance = 0.1, TRUE) -}) - -# test fit_2peaks: -test_that("correct mean m/z and scale_factor can be found for 2 peaks", { - # create info for two peaks in a small region - test_mass_vector_2peaks <- rep(70.00938, 14) + 1:14 * 0.00006 - test_int_vector_2peaks <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000), 2) - test_mass_vector_eq <- seq(min(test_mass_vector_2peaks), max(test_mass_vector_2peaks), length = 100) - test_max_index <- which(test_int_vector_2peaks == max(test_int_vector_2peaks)) + # set other parameters test_resol <- 140000 - test_fit_quality <- 0.5 - test_use_bounds <- FALSE - # create output to test on: - test_output <- list(mean = c(70.00954, 70.00998), scale_factor = c(4.823598, 4.828072), sigma = c(0.0001257425, 0.0001257436), qual = 0.38341) + test_peak_thresh <- 2000 + + # expected output + expected_output <- as.data.frame(matrix(c(70.000357, 70.000521, 0.5148094, 0.3016569, 70.00006, 70.00048, + 70.00048, 70.00120, 15526.826791, 11950.90570), nrow = 2, ncol = 5)) + colnames(expected_output) <- c("mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") - expect_type(fit_2peaks(test_mass_vector_eq, test_mass_vector_2peaks, test_int_vector_2peaks, test_max_index, - test_resol, test_use_bounds, test_fit_quality), "list") - for (key in names(test_output)) { - expect_equal(fit_2peaks(test_mass_vector_eq, test_mass_vector_2peaks, test_int_vector_2peaks, test_max_index, - test_resol, test_use_bounds, test_fit_quality)[[key]], test_output[[key]], tolerance = 0.0001, TRUE) - } + expect_type(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh), "double") + expect_equal(nrow(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh)), 2) + expect_equal(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh)[, "mzmed.pkt"], + expected_output[, "mzmed.pkt"], tolerance = 0.0001) + expect_equal(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh)[, "height.pkt"], + expected_output[, "height.pkt"], tolerance = 0.0001) }) -# test fit_gaussian(mass_vector2, mass_vector, int_vector, resol, force, use_bounds) -test_that("initial gaussian fit is done correctly", { +# test get_fwhm function +test_that("fwhm is correctly calculated", { # create peak info to test on: test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006 - test_int_vector <- c(9000, 16000, 19000, 15000, 6000) - test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100) + test_mz_max <- max(test_mass_vector) test_resol <- 140000 - test_use_bounds <- FALSE - test_force_nr <- 1 - - expect_type(fit_gaussian(test_mass_vector_eq, test_mass_vector, test_int_vector, test_resol, test_force_nr, test_use_bounds), "list") - expect_equal(fit_gaussian(test_mass_vector_eq, test_mass_vector, test_int_vector, test_resol, test_force_nr, test_use_bounds)$mean, 70.00956, tolerance = 0.00001, TRUE) -}) -# test search_mzrange(ints_fullrange, resol, sample_name, peak_thresh) -test_that("all peak finding functions work together", { - # enable snapshot - local_edition(3) - # create info for ten peaks separated by zero - test_large_mass_vector <- rep(70.00938, 80) + 1:80 * 0.00006 - test_large_int_vector <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000, 0), 10) - names(test_large_int_vector) <- test_large_mass_vector - test_resol <- 140000 - test_sample_name <- "C1.1" - test_peak_thresh <- 100 - - expect_type(search_mzrange(test_large_int_vector, test_resol, test_sample_name, test_peak_thresh), "list") - expect_snapshot(search_mzrange(test_large_int_vector, test_resol, test_sample_name, test_peak_thresh), error = FALSE) + expect_type(get_fwhm(test_mz_max, test_resol), "double") + expect_equal(get_fwhm(test_mz_max, test_resol), 0.000295865, tolerance = 0.000001, TRUE) }) -# test fit_gaussian(mass_vector2, mass_vector, int_vector, resol, force, use_bounds) -test_that("fit gaussian") From 8102bdb83853b7d02eef2a928543bad0d7dabc4c Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:54:33 +0200 Subject: [PATCH 15/31] information for averaging peaks for technical replicates based on txt file with scanmode --- DIMS/AveragePeaks.R | 126 +++++++++++++++++++++---------------------- DIMS/AveragePeaks.nf | 4 +- 2 files changed, 62 insertions(+), 68 deletions(-) diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R index 695c08fa..eef2d8a2 100644 --- a/DIMS/AveragePeaks.R +++ b/DIMS/AveragePeaks.R @@ -1,72 +1,66 @@ +library(dplyr) + # define parameters -# ppm as fixed value, not the same ppm as in peak grouping -ppm_peak <- 2 +cmd_args <- commandArgs(trailingOnly = TRUE) -library(dplyr) +sample_name <- cmd_args[1] +techreps <- cmd_args[2] +scanmode <- cmd_args[3] +tech_reps <- strsplit(techreps, ";")[[1]] +print(sample_name) +print(techreps) +print(scanmode) -scanmodes <- c("positive", "negative") +# set ppm as fixed value, not the same ppm as in peak grouping +ppm_peak <- 2 -for (scanmode in scanmodes){ - # get sample names - load(paste0(scanmode, "_repl_pattern.RData")) - sample_names <- names(repl_pattern_filtered) - # initialize - outlist_total <- NULL - # for each biological sample, average peaks in technical replicates - for (sample_name in sample_names) { - print(sample_name) - # Initialize per sample - peaklist_allrepl <- NULL - nr_repl_persample <- 0 - # averaged_peaks <- matrix(0, nrow = 10 ^ 7, ncol = 6) # how big does it need to be? - averaged_peaks <- matrix(0, nrow = 0, ncol = 6) # append - colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") - # load RData files of technical replicates belonging to biological sample - sample_techreps_file <- repl_pattern_filtered[sample_name][[1]] - for (file_nr in 1:length(sample_techreps_file)) { - print(sample_techreps_file[file_nr]) - tech_repl_file <- paste0(sample_techreps_file[file_nr], "_positive.RData") - tech_repl <- get(load(tech_repl_file)) - # combine data for all technical replicates - peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl) - # count number of replicates for each biological sample - nr_repl_persample <- nr_repl_persample + 1 - } - # sort on mass - peaklist_allrepl_df <- as.data.frame(peaklist_allrepl) - peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt) - peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) - # peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(desc(height.pkt)) - peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt) - # average over technical replicates - while (nrow(peaklist_allrepl_sorted) > 1) { - # store row numbers - peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted) - # find the peaks in the dataset with corresponding m/z plus or minus tolerance - reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1] - mz_tolerance <- (reference_mass * ppm_peak) / 10^6 - minmz_ref <- reference_mass - mz_tolerance - maxmz_ref <- reference_mass + mz_tolerance - select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref)) - select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ] - nrsamples <- length(select_peak_indices) - # put averaged intensities into a new row and append to averaged_peaks - averaged_1peak <- matrix(0, nrow = 1, ncol = 6) - colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") - # calculate m/z values for peak group - averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt) - averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt) - averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt) - averaged_1peak[1, "fq"] <- nrsamples - averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt) - # put intensities into proper columns - peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ] - averaged_peaks <- rbind(averaged_peaks, averaged_1peak) - } - # add sample name to first column and append to outlist_total for all samples - averaged_peaks[ , "samplenr"] <- sample_name - outlist_total <- rbind(outlist_total, averaged_peaks) - } - save(outlist_total, file = paste0("AvgPeaks_", scanmode, ".RData")) +# average peaks in technical replicates +# Initialize per sample +peaklist_allrepl <- NULL +nr_repl_persample <- 0 +averaged_peaks <- matrix(0, nrow = 0, ncol = 6) +colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") +# load RData files of technical replicates belonging to biological sample +for (file_nr in 1:length(tech_reps)) { + tech_repl_file <- paste0(tech_reps[file_nr], "_", scanmode, ".RData") + tech_repl <- get(load(tech_repl_file)) + # combine data for all technical replicates + peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl) + # count number of replicates for each biological sample + nr_repl_persample <- nr_repl_persample + 1 +} +# sort on mass +peaklist_allrepl_df <- as.data.frame(peaklist_allrepl) +peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt) +peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) +# peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(desc(height.pkt)) +peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt) +# average over technical replicates +while (nrow(peaklist_allrepl_sorted) > 1) { + # store row numbers + peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted) + # find the peaks in the dataset with corresponding m/z plus or minus tolerance + reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1] + mz_tolerance <- (reference_mass * ppm_peak) / 10^6 + minmz_ref <- reference_mass - mz_tolerance + maxmz_ref <- reference_mass + mz_tolerance + select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref)) + select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ] + nrsamples <- length(select_peak_indices) + # put averaged intensities into a new row and append to averaged_peaks + averaged_1peak <- matrix(0, nrow = 1, ncol = 6) + colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + # calculate m/z values for peak group + averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt) + averaged_1peak[1, "fq"] <- nrsamples + averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt) + # put intensities into proper columns + peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ] + averaged_peaks <- rbind(averaged_peaks, averaged_1peak) } +# add sample name to first column +averaged_peaks[ , "samplenr"] <- sample_name +save(averaged_peaks, file = paste0("AvgPeaks_", sample_name, "_", scanmode, ".RData")) diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf index 9d49f5f8..de4b21b0 100644 --- a/DIMS/AveragePeaks.nf +++ b/DIMS/AveragePeaks.nf @@ -6,13 +6,13 @@ process AveragePeaks { input: path(rdata_files) - path(replication_pattern) + tuple val(sample_id), val(tech_reps), val(scanmode) output: path 'AvgPeaks_*.RData' script: """ - Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R + Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R $sample_id $tech_reps $scanmode """ } From d0ed769ec7373ce3cd47243065e4743d99192de4 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:56:15 +0200 Subject: [PATCH 16/31] modified input for PeakGrouping corresponding to new PeakFinding method --- DIMS/PeakGrouping.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf index ae4382c7..0f20dc5b 100644 --- a/DIMS/PeakGrouping.nf +++ b/DIMS/PeakGrouping.nf @@ -6,7 +6,7 @@ process PeakGrouping { input: path(hmdbpart_file) - each path(spectrumpeak_file) + path(averagedpeaks_file) each path(pattern_file) output: From a35c4ca7decffc6e3379b44e220a0a47ec5392fb Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 8 Jul 2025 15:57:30 +0200 Subject: [PATCH 17/31] collect averaged peaks per biological sample, corresponding to new PeakFinding method --- DIMS/CollectAveraged.R | 18 ++++++++++++++++++ DIMS/CollectAveraged.nf | 17 +++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100755 DIMS/CollectAveraged.R create mode 100644 DIMS/CollectAveraged.nf diff --git a/DIMS/CollectAveraged.R b/DIMS/CollectAveraged.R new file mode 100755 index 00000000..e6466d9e --- /dev/null +++ b/DIMS/CollectAveraged.R @@ -0,0 +1,18 @@ +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +scripts_dir <- cmd_args[1] + +# for each scan mode, collect all averaged peak lists per biological sample +scanmodes <- c("positive", "negative") +for (scanmode in scanmodes) { + # get list of files + filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, ".RData")) + # load files and combine into one object + outlist_total <- NULL + for (file_nr in 1:length(filled_files)) { + peaklist_averaged <- get(load(filled_files[file_nr])) + outlist_total <- rbind(outlist_total, peaklist_averaged) + } + save(outlist_total, file = paste0("AvgPeaks_", scanmode, ".RData")) +} diff --git a/DIMS/CollectAveraged.nf b/DIMS/CollectAveraged.nf new file mode 100644 index 00000000..fc65bf21 --- /dev/null +++ b/DIMS/CollectAveraged.nf @@ -0,0 +1,17 @@ +process CollectAveraged { + tag "DIMS CollectAveraged" + label 'CollectAveraged' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(averaged_files) + + output: + path('AvgPeaks*.RData'), emit: averaged_peaks + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/CollectAveraged.R + """ +} From 295e4609abf81ee7384fa50fb5a4f4d958ffcb26 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 15 Jul 2025 14:39:38 +0200 Subject: [PATCH 18/31] fixed path to DIMS peak_finding_functions --- DIMS/tests/testthat/test_peak_finding_functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DIMS/tests/testthat/test_peak_finding_functions.R b/DIMS/tests/testthat/test_peak_finding_functions.R index 1e001a0c..bc7f9d86 100644 --- a/DIMS/tests/testthat/test_peak_finding_functions.R +++ b/DIMS/tests/testthat/test_peak_finding_functions.R @@ -1,6 +1,6 @@ # unit tests for PeakFinding functions: -source("../preprocessing/peak_finding_functions.R") +source("../../preprocessing/peak_finding_functions.R") # test search_regions_of_interest function: test_that("regions of interest are correctly found for a single peak", { From 109d664f1e1923a0b95e1cfadc67c78bd991310d Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 2 Oct 2025 11:48:14 +0200 Subject: [PATCH 19/31] created function for averaging peaks in DIMS/AveragePeaks.R --- DIMS/AveragePeaks.R | 40 ++++---------------- DIMS/AveragePeaks.nf | 2 +- DIMS/preprocessing/average_peaks_functions.R | 39 +++++++++++++++++++ 3 files changed, 47 insertions(+), 34 deletions(-) create mode 100644 DIMS/preprocessing/average_peaks_functions.R diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R index eef2d8a2..7e8d9fe1 100644 --- a/DIMS/AveragePeaks.R +++ b/DIMS/AveragePeaks.R @@ -6,61 +6,35 @@ cmd_args <- commandArgs(trailingOnly = TRUE) sample_name <- cmd_args[1] techreps <- cmd_args[2] scanmode <- cmd_args[3] +preprocessing_scripts_dir <- cmd_args[4] tech_reps <- strsplit(techreps, ";")[[1]] -print(sample_name) -print(techreps) -print(scanmode) + +# load in function scripts +source(paste0(preprocessing_scripts_dir, "average_peaks_functions.R")) # set ppm as fixed value, not the same ppm as in peak grouping ppm_peak <- 2 -# average peaks in technical replicates # Initialize per sample peaklist_allrepl <- NULL nr_repl_persample <- 0 averaged_peaks <- matrix(0, nrow = 0, ncol = 6) colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + # load RData files of technical replicates belonging to biological sample for (file_nr in 1:length(tech_reps)) { tech_repl_file <- paste0(tech_reps[file_nr], "_", scanmode, ".RData") tech_repl <- get(load(tech_repl_file)) # combine data for all technical replicates peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl) - # count number of replicates for each biological sample - nr_repl_persample <- nr_repl_persample + 1 } # sort on mass peaklist_allrepl_df <- as.data.frame(peaklist_allrepl) peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt) peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) -# peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(desc(height.pkt)) peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt) + # average over technical replicates -while (nrow(peaklist_allrepl_sorted) > 1) { - # store row numbers - peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted) - # find the peaks in the dataset with corresponding m/z plus or minus tolerance - reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1] - mz_tolerance <- (reference_mass * ppm_peak) / 10^6 - minmz_ref <- reference_mass - mz_tolerance - maxmz_ref <- reference_mass + mz_tolerance - select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref)) - select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ] - nrsamples <- length(select_peak_indices) - # put averaged intensities into a new row and append to averaged_peaks - averaged_1peak <- matrix(0, nrow = 1, ncol = 6) - colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") - # calculate m/z values for peak group - averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt) - averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt) - averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt) - averaged_1peak[1, "fq"] <- nrsamples - averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt) - # put intensities into proper columns - peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ] - averaged_peaks <- rbind(averaged_peaks, averaged_1peak) -} -# add sample name to first column -averaged_peaks[ , "samplenr"] <- sample_name +averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted) save(averaged_peaks, file = paste0("AvgPeaks_", sample_name, "_", scanmode, ".RData")) diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf index de4b21b0..f50bd874 100644 --- a/DIMS/AveragePeaks.nf +++ b/DIMS/AveragePeaks.nf @@ -13,6 +13,6 @@ process AveragePeaks { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R $sample_id $tech_reps $scanmode + Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R $sample_id $tech_reps $scanmode $params.preprocessing_scripts_dir """ } diff --git a/DIMS/preprocessing/average_peaks_functions.R b/DIMS/preprocessing/average_peaks_functions.R new file mode 100644 index 00000000..2545b757 --- /dev/null +++ b/DIMS/preprocessing/average_peaks_functions.R @@ -0,0 +1,39 @@ +average_peaks_per_sample <- function(peaklist_allrepl_sorted) { + #' Average the intensity of peaks that occur in different technical replicates of a biological sample + #' + #' @param peaklist_allrepl_sorted: Dataframe with peaks sorted on median m/z (float) + #' + #' @return averaged_peaks: matrix of averaged peaks (float) + + # initialize + averaged_peaks <- peaklist_allrepl_sorted[0, ] + + while (nrow(peaklist_allrepl_sorted) > 1) { + # store row numbers + peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted) + # find the peaks in the dataset with corresponding m/z plus or minus tolerance + reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1] + mz_tolerance <- (reference_mass * ppm_peak) / 10^6 + minmz_ref <- reference_mass - mz_tolerance + maxmz_ref <- reference_mass + mz_tolerance + select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref)) + select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ] + nrsamples <- length(select_peak_indices) + # put averaged intensities into a new row + averaged_1peak <- matrix(0, nrow = 1, ncol = 6) + colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + # calculate m/z values for peak group + averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt) + averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt) + averaged_1peak[1, "fq"] <- nrsamples + averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt) + # remove rownr column and append to averaged_peaks + peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ] + averaged_peaks <- rbind(averaged_peaks, averaged_1peak) + } + # add sample name to first column + averaged_peaks[ , "samplenr"] <- sample_name + + return(averaged_peaks) +} From cb33aba206ce326a79347404a6b76769af3e38d4 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 2 Oct 2025 11:49:30 +0200 Subject: [PATCH 20/31] added unit tests for average_peaks_functions --- DIMS/tests/testthat/test_average_peaks.R | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 DIMS/tests/testthat/test_average_peaks.R diff --git a/DIMS/tests/testthat/test_average_peaks.R b/DIMS/tests/testthat/test_average_peaks.R new file mode 100644 index 00000000..994de48c --- /dev/null +++ b/DIMS/tests/testthat/test_average_peaks.R @@ -0,0 +1,26 @@ +# unit test for AveragePeaks + +# source all functions for PeakGrouping +source("../../preprocessing/average_peaks_functions.R") + +# test find_peak_groups +testthat::test_that("peaks are correctly averaged", { + # create peak list to test on: + samplenrs <- rep(c("repl_001", "repl_002", "repl_003"), 3) + test_peaklist_sorted_num <- matrix(0, nrow = 9, ncol = 5) + colnames(test_peaklist_sorted_num) <- c("mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt") + test_peaklist_sorted_num[, 1] <- 300 + (1:9) / 10000 + test_peaklist_sorted_num[, 5] <- 100 * (9:1) + test_peaklist_sorted <- as.data.frame(cbind(samplenr = samplenrs, test_peaklist_sorted_num)) + test_peaklist_sorted[, 2] <- as.numeric(test_peaklist_sorted[, 2]) + test_peaklist_sorted[, 6] <- as.numeric(test_peaklist_sorted[, 6]) + + # test that first peak is correctly averaged + expect_equal(as.numeric(average_peaks_per_sample(test_peaklist_sorted)[1, 6]), 600, tolerance = 0.001, TRUE) + # test number of rows + expect_equal(nrow(average_peaks_per_sample(test_peaklist_sorted)), 2) + # test column names + expect_equal(colnames(average_peaks_per_sample(test_peaklist_sorted)), + c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt"), TRUE) +}) + From db8633e61e15b27c1c052ec4462f7e0b6cfa5520 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 3 Oct 2025 13:49:00 +0200 Subject: [PATCH 21/31] moved parameters matrix and nr_replicates from workflow into params --- DIMS/EvaluateTics.nf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/DIMS/EvaluateTics.nf b/DIMS/EvaluateTics.nf index 831c9500..fdf7f713 100644 --- a/DIMS/EvaluateTics.nf +++ b/DIMS/EvaluateTics.nf @@ -8,9 +8,7 @@ process EvaluateTics { path(rdata_file) path(tic_txt_files) path(init_file) - val(nr_replicates) val(analysis_id) - val(matrix) path(highest_mz_file) path(trim_params_file) @@ -19,14 +17,14 @@ process EvaluateTics { path('replicates_per_sample.txt'), emit: sample_techreps path('miss_infusions_negative.txt') path('miss_infusions_positive.txt') - path('*_TICplots.pdf') + path('*_TICplots.pdf'), emit: tic_plots_pdf script: """ Rscript ${baseDir}/CustomModules/DIMS/EvaluateTics.R $init_file \ $params.nr_replicates \ $analysis_id \ - $matrix \ + $params.matrix \ $highest_mz_file \ $trim_params_file """ From 004e3e9aad7b92472d52d4083bdf193ccefef11e Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 3 Oct 2025 13:51:15 +0200 Subject: [PATCH 22/31] refactored DIMS/EvaluateTics --- DIMS/EvaluateTics.R | 109 +++++++------------------------------------- 1 file changed, 16 insertions(+), 93 deletions(-) diff --git a/DIMS/EvaluateTics.R b/DIMS/EvaluateTics.R index fb73576a..a61e910f 100644 --- a/DIMS/EvaluateTics.R +++ b/DIMS/EvaluateTics.R @@ -14,29 +14,6 @@ highest_mz <- get(load(highest_mz_file)) trim_params_filepath <- cmd_args[6] thresh2remove <- 1000000000 -remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { - # collect list of samples to remove from replication pattern - remove_from_group <- NULL - for (sample_nr in 1:length(repl_pattern)){ - repl_pattern_1sample <- repl_pattern[[sample_nr]] - remove <- NULL - for (file_nr in 1:length(repl_pattern_1sample)) { - if (repl_pattern_1sample[file_nr] %in% bad_samples) { - remove <- c(remove, file_nr) - } - } - if (length(remove) == nr_replicates) { - remove_from_group <- c(remove_from_group, sample_nr) - } - if (!is.null(remove)) { - repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove] - } - } - if (length(remove_from_group) != 0) { - repl_pattern <- repl_pattern[-remove_from_group] - } - return(list("pattern" = repl_pattern)) -} # load init_file: contains repl_pattern load(init_file) @@ -52,86 +29,31 @@ if (highest_mz > 700) { thresh2remove <- 1000000 } -# remove technical replicates which are below the threshold and average intensity over time -remove_neg <- NULL -remove_pos <- NULL -cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") -for (sample_nr in 1:length(repl_pattern)) { - tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) - tech_reps_array_pos <- NULL - tech_reps_array_neg <- NULL - for (file_nr in 1:length(tech_reps)) { - load(paste(tech_reps[file_nr], ".RData", sep = "")) - cat("\n\nParsing", tech_reps[file_nr]) - # positive scan mode: determine whether sum of intensities is above threshold - cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1])) - if (sum(peak_list$pos[, 1]) < thresh2remove) { - cat(" ... Removed") - remove_pos <- c(remove_pos, tech_reps[file_nr]) - } - tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos) - # negative scan mode: determine whether sum of intensities is above threshold - cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) - if (sum(peak_list$neg[, 1]) < thresh2remove) { - cat(" ... Removed") - remove_neg <- c(remove_neg, tech_reps[file_nr]) - } - tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg) - } -} +# find out which technical replicates are below the threshold +remove_tech_reps <- find_bad_replicates(repl_pattern, thresh2remove) +print(remove_tech_reps) # negative scan mode -print("neg") -pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) -repl_pattern_filtered <- pattern_list$pattern -print(repl_pattern_filtered) -print(length(repl_pattern_filtered)) +remove_neg <- remove_tech_reps$neg +repl_pattern_filtered <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) save(repl_pattern_filtered, file = "negative_repl_pattern.RData") -# save replication pattern in txt file for use in Nextflow -allsamples_techreps_neg <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered)) -for (sample_nr in 1:length(repl_pattern_filtered)) { - allsamples_techreps_neg[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr] - allsamples_techreps_neg[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";") -} -allsamples_techreps_neg[, 3] <- "negative" - -# write information on miss_infusions -write.table(remove_neg, - file = "miss_infusions_negative.txt", - row.names = FALSE, - col.names = FALSE, - sep = "\t" -) # positive scan mode -pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) -print(pattern_list) -repl_pattern_filtered <- pattern_list$pattern +remove_pos <- remove_tech_reps$pos +repl_pattern_filtered <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) save(repl_pattern_filtered, file = "positive_repl_pattern.RData") -# save replication pattern in txt file for use in Nextflow -allsamples_techreps_pos <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered)) -for (sample_nr in 1:length(repl_pattern_filtered)) { - allsamples_techreps_pos[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr] - allsamples_techreps_pos[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";") -} -allsamples_techreps_pos[, 3] <- "positive" -# combine information on samples and technical replicates for pos and neg -allsamples_techreps <- rbind(allsamples_techreps_pos, allsamples_techreps_neg) -write.table(allsamples_techreps, - file = paste0("replicates_per_sample.txt"), +# get an overview of suitable technical replicates for both scan modes +allsamples_techreps_neg <- get_overview_tech_reps(repl_pattern_filtered, "negative") +allsamples_techreps_pos <- get_overview_tech_reps(repl_pattern_filtered, "positive") +allsamples_techreps_both_scanmodes <- rbind(allsamples_techreps_pos, allsamples_techreps_neg) +write.table(allsamples_techreps_both_scanmodes, + file = "replicates_per_sample.txt", col.names = FALSE, - row.names = FALSE, - sep = "," + row.names = FALSE, + sep = "," ) -# write information on miss_infusions -write.table(remove_pos, - file = "miss_infusions_positive.txt", - row.names = FALSE, - col.names = FALSE, - sep = "\t" -) ## generate TIC plots # get all txt files @@ -212,3 +134,4 @@ tic_plot_pdf <- marrangeGrob( ggsave(filename = paste0(run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm") + From 06e5e1a69b4e775bbffa615421715894c59d7389 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 3 Oct 2025 13:53:01 +0200 Subject: [PATCH 23/31] moved functions for DIMS/EvaluateTics to separate file --- DIMS/preprocessing/evaluate_tics_functions.R | 104 +++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 DIMS/preprocessing/evaluate_tics_functions.R diff --git a/DIMS/preprocessing/evaluate_tics_functions.R b/DIMS/preprocessing/evaluate_tics_functions.R new file mode 100644 index 00000000..057b07cd --- /dev/null +++ b/DIMS/preprocessing/evaluate_tics_functions.R @@ -0,0 +1,104 @@ +# EvaluateTics functions +find_bad_replicates <- function(repl_pattern, thresh2remove) { + #' Find technical replicates with a total intensity below a threshold + #' + #' @param repl_pattern: List of samples with corresponding technical replicates (strings) + #' @param thresh2remove: Threshold value for acceptance or rejection of total intensity (integer) + #' + #' @return remove_tech_reps: Array of rejected technical replicates (strings) + + remove_pos <- NULL + remove_neg <- NULL + cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") + for (sample_nr in 1:length(repl_pattern)) { + tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) + tech_reps_array_pos <- NULL + #tech_reps_array_neg <- NULL + for (file_nr in 1:length(tech_reps)) { + load(paste0(tech_reps[file_nr], ".RData")) + cat("\n\nParsing", tech_reps[file_nr]) + # positive scan mode: determine whether sum of intensities is above threshold + cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1])) + if (sum(peak_list$pos[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_pos <- c(remove_pos, tech_reps[file_nr]) + } + # negative scan mode: determine whether sum of intensities is above threshold + cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) + if (sum(peak_list$neg[, 1]) < thresh2remove) { + cat(" ... Removed") + remove_neg <- c(remove_neg, tech_reps[file_nr]) + } + } + } + cat("\n") + # write information on miss_infusions for both scan modes + write.table(remove_pos, + file = paste0("miss_infusions_positive.txt"), + row.names = FALSE, + col.names = FALSE, + sep = "\t" + ) + write.table(remove_neg, + file = paste0("miss_infusions_negative.txt"), + row.names = FALSE, + col.names = FALSE, + sep = "\t" + ) + + # combine removed technical replicates from pos and neg + remove_tech_reps <- list(pos = remove_pos, neg = remove_neg) + return(remove_tech_reps) +} + +remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { + #' Remove technical replicates with insufficient quality from a biological sample + #' + #' @param bad_samples: Array of technical replicates of insufficient quality (strings) + #' @param repl_pattern: List of samples with corresponding technical replicates (strings) + #' @param nr_replicates: Number of technical replicates per biological sample (integer) + #' + #' @return repl_pattern_filtered: list of technical replicates of sufficient quality (strings) + + # collect list of samples to remove from replication pattern + remove_from_group <- NULL + for (sample_nr in 1:length(repl_pattern)){ + repl_pattern_1sample <- repl_pattern[[sample_nr]] + remove <- NULL + for (file_nr in 1:length(repl_pattern_1sample)) { + if (repl_pattern_1sample[file_nr] %in% bad_samples) { + remove <- c(remove, file_nr) + } + } + if (length(remove) == nr_replicates) { + remove_from_group <- c(remove_from_group, sample_nr) + } + if (!is.null(remove)) { + repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove] + } + } + if (length(remove_from_group) != 0) { + repl_pattern_filtered <- repl_pattern[-remove_from_group] + } else { + repl_pattern_filtered <- repl_pattern + } + return(repl_pattern_filtered) +} + +get_overview_tech_reps <- function(repl_pattern_filtered, scanmode) { + #' Create an overview of technical replicates with sufficient quality from a biological sample + #' + #' @param repl_pattern_filtered: List of samples with corresponding technical replicates (strings) + #' @param scanmode: Scan mode "positive" or "negative" (string) + #' + #' @return allsamples_techreps_scanmode: Matrix of technical replicates of sufficient quality (strings) + + allsamples_techreps_scanmode <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered)) + for (sample_nr in 1:length(repl_pattern_filtered)) { + allsamples_techreps_scanmode[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr] + allsamples_techreps_scanmode[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";") + } + allsamples_techreps_scanmode[, 3] <- scanmode + return(allsamples_techreps_scanmode) +} + From 3f24b6dcaa615d0702ffdc634f583f8894ed25de Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 3 Oct 2025 13:54:54 +0200 Subject: [PATCH 24/31] added unit tests for DIMS/EvaluateTics --- DIMS/tests/testthat/test_evaluate_tics.R | 66 ++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 DIMS/tests/testthat/test_evaluate_tics.R diff --git a/DIMS/tests/testthat/test_evaluate_tics.R b/DIMS/tests/testthat/test_evaluate_tics.R new file mode 100644 index 00000000..b474a81b --- /dev/null +++ b/DIMS/tests/testthat/test_evaluate_tics.R @@ -0,0 +1,66 @@ +# unit test for EvaluateTics + +# source all functions for PeakGrouping +source("../../preprocessing/evaluate_tics_functions.R") + +# test find_bad_replicates +testthat::test_that("TICS are correctly accepted or rejected", { + # It's necessary to copy/symlink the files to the current location for the combine_sum_adducts_parts function + # local: setwd("~/Development/DIMS_refactor_PeakFinding_codereview/CustomModules/DIMS/tests/testthat") + test_files <- list.files("fixtures/", "test_evaluate_tics", full.names = TRUE) + file.symlink(file.path(test_files), getwd()) + + # create replication pattern to test on: + technical_replicates <- paste0("test_evaluate_tics_file", 1:3) + test_repl_pattern <- list(technical_replicates) + names(test_repl_pattern) <- "sample1" + test_thresh2remove <- 10^9 + + # test that output has two entries + expect_equal(length(find_bad_replicates(test_repl_pattern, test_thresh2remove)), 2) + # test that first technical replicate is removed in positive scan mode + expect_equal(find_bad_replicates(test_repl_pattern, test_thresh2remove)$pos, "test_evaluate_tics_file1", TRUE) + # test that third technical replicate is removed in negative scan mode + expect_equal(find_bad_replicates(test_repl_pattern, test_thresh2remove)$neg, "test_evaluate_tics_file3", TRUE) + # test that output files are generated + expect_equal(sum(grepl("miss_infusions_", list.files("./"))), 2) + + # Remove symlinked files + files_remove <- list.files("./", "SummedAdducts_test.RData", full.names = TRUE) + file.remove(files_remove) + +}) + +# test remove_from_repl_pattern +testthat::test_that("technical replicates are correctly removed from replication pattern", { + # create replication pattern to test on: + technical_replicates <- paste0("test_evaluate_tics_file", 1:3) + test_repl_pattern <- list(technical_replicates) + names(test_repl_pattern) <- "sample1" + test_bad_samples <- "test_evaluate_tics_file2" + test_nr_replicates <- 3 + + # test that the output contains 1 sample + expect_equal(length(remove_from_repl_pattern(test_bad_samples, test_repl_pattern, test_nr_replicates)), 1) + # test that the output for the sample contains 2 technical replicates + expect_equal(length(remove_from_repl_pattern(test_bad_samples, test_repl_pattern, test_nr_replicates)$sample1), 2) + # test that the second technical replicate has been removed + expect_false(unique(grepl(test_bad_samples, remove_from_repl_pattern(test_bad_samples, test_repl_pattern, test_nr_replicates)$sample1))) +}) + +# test get_overview_tech_reps +testthat::test_that("overview of technical replicates is correctly created", { + # create replication pattern to test on: + technical_replicates <- paste0("test_evaluate_tics_file", 1:3) + test_repl_pattern_filtered <- list(technical_replicates) + names(test_repl_pattern_filtered) <- "sample1" + test_scanmode <- "positive" + + # test that overview is correctly created + expect_equal(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,1], "sample1") + expect_equal(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,3], "positive") + expect_true(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,2] == + paste0(technical_replicates, collapse = ";"), TRUE) + +}) + From c2c65ddfe715881f3bf2277a28b1e6931880c844 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 3 Oct 2025 14:11:17 +0200 Subject: [PATCH 25/31] modifications suggested in code review DIMS/PeakFinding --- DIMS/PeakFinding.R | 6 +++--- DIMS/preprocessing/peak_finding_functions.R | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index 529370d4..2cf49d20 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -1,3 +1,5 @@ +library(dplyr) + # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) @@ -10,8 +12,6 @@ peak_thresh <- 2000 # source functions script source(paste0(preprocessing_scripts_dir, "peak_finding_functions.R")) -library(dplyr) - # Load output of AssignToBins (peak_list) for a technical replicate load(replicate_rdatafile) techrepl_name <- colnames(peak_list$pos)[1] @@ -22,7 +22,7 @@ techreps_passed <- read.table("replicates_per_sample.txt", sep=",") # Initialize options(digits = 16) -# run the findPeaks function +# do peak finding scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { # get intensities for scan mode diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R index 8c2f62d7..8539c03c 100644 --- a/DIMS/preprocessing/peak_finding_functions.R +++ b/DIMS/preprocessing/peak_finding_functions.R @@ -21,7 +21,7 @@ search_regions_of_interest <- function(ints_fullrange) { } else { regions_of_interest_gte3 <- regions_of_interest_length } - # test for length of roi. If length is greater than 11, remove roi and break up into separate rois + # test for length of roi (region of interest). If length is greater than 11, break up into separate rois remove_roi_index <- c() new_rois_all <- regions_of_interest_gte3[0, ] for (roi_nr in 1:nrow(regions_of_interest_gte3)) { @@ -43,7 +43,7 @@ search_regions_of_interest <- function(ints_fullrange) { new_rois_splitroi <- rbind(new_rois_splitroi, new_rois) start_pos <- new_rois[, 2] } - # last part + # intensities after last local minimum new_rois[, 1] <- start_pos new_rois[, 2] <- regions_of_interest_gte3[roi_nr, "to"] new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1 @@ -67,13 +67,11 @@ search_regions_of_interest <- function(ints_fullrange) { } # remove rois that have been split into chunks or shortened - #print(dim(regions_of_interest_gte3)) if (length(remove_roi_index) > 0) { regions_of_interest_minus_short <- regions_of_interest_gte3[-remove_roi_index, ] } else { regions_of_interest_minus_short <- regions_of_interest_gte3 } - #print(dim(regions_of_interest_minus_short)) # combine remaining rois with info on chunks regions_of_interest_split <- rbind(regions_of_interest_minus_short, new_rois_all) # remove regions of interest with short lengths again From 15a25ba489a52702b64122b2c2d0b2b9d4a3356f Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 16 Oct 2025 16:21:48 +0200 Subject: [PATCH 26/31] removed two obsolete lines --- DIMS/preprocessing/evaluate_tics_functions.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/DIMS/preprocessing/evaluate_tics_functions.R b/DIMS/preprocessing/evaluate_tics_functions.R index 057b07cd..318ce2b6 100644 --- a/DIMS/preprocessing/evaluate_tics_functions.R +++ b/DIMS/preprocessing/evaluate_tics_functions.R @@ -12,8 +12,6 @@ find_bad_replicates <- function(repl_pattern, thresh2remove) { cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") for (sample_nr in 1:length(repl_pattern)) { tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) - tech_reps_array_pos <- NULL - #tech_reps_array_neg <- NULL for (file_nr in 1:length(tech_reps)) { load(paste0(tech_reps[file_nr], ".RData")) cat("\n\nParsing", tech_reps[file_nr]) From 007bea441c48b9019b35e6d7ea5fdaa9190122ec Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 16 Oct 2025 17:14:26 +0200 Subject: [PATCH 27/31] moved parameter ppm_peak from DIMS/AveragePeaks.R to inside function --- DIMS/AveragePeaks.R | 3 --- DIMS/preprocessing/average_peaks_functions.R | 4 +++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R index 7e8d9fe1..997b1d3c 100644 --- a/DIMS/AveragePeaks.R +++ b/DIMS/AveragePeaks.R @@ -12,9 +12,6 @@ tech_reps <- strsplit(techreps, ";")[[1]] # load in function scripts source(paste0(preprocessing_scripts_dir, "average_peaks_functions.R")) -# set ppm as fixed value, not the same ppm as in peak grouping -ppm_peak <- 2 - # Initialize per sample peaklist_allrepl <- NULL nr_repl_persample <- 0 diff --git a/DIMS/preprocessing/average_peaks_functions.R b/DIMS/preprocessing/average_peaks_functions.R index 2545b757..77309acf 100644 --- a/DIMS/preprocessing/average_peaks_functions.R +++ b/DIMS/preprocessing/average_peaks_functions.R @@ -7,7 +7,9 @@ average_peaks_per_sample <- function(peaklist_allrepl_sorted) { # initialize averaged_peaks <- peaklist_allrepl_sorted[0, ] - + # set ppm as fixed value, not the same ppm as in peak grouping + ppm_peak <- 2 + while (nrow(peaklist_allrepl_sorted) > 1) { # store row numbers peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted) From e58640b703250c7c702c1f39a78606827854c890 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 16 Oct 2025 17:20:12 +0200 Subject: [PATCH 28/31] added parameter sample_name to DIMS/preprocessing/average_peaks_functions.R --- DIMS/AveragePeaks.R | 2 +- DIMS/preprocessing/average_peaks_functions.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R index 997b1d3c..7114e3c4 100644 --- a/DIMS/AveragePeaks.R +++ b/DIMS/AveragePeaks.R @@ -32,6 +32,6 @@ peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt) # average over technical replicates -averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted) +averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted, sample_name) save(averaged_peaks, file = paste0("AvgPeaks_", sample_name, "_", scanmode, ".RData")) diff --git a/DIMS/preprocessing/average_peaks_functions.R b/DIMS/preprocessing/average_peaks_functions.R index 77309acf..beed29bb 100644 --- a/DIMS/preprocessing/average_peaks_functions.R +++ b/DIMS/preprocessing/average_peaks_functions.R @@ -1,4 +1,4 @@ -average_peaks_per_sample <- function(peaklist_allrepl_sorted) { +average_peaks_per_sample <- function(peaklist_allrepl_sorted, sample_name) { #' Average the intensity of peaks that occur in different technical replicates of a biological sample #' #' @param peaklist_allrepl_sorted: Dataframe with peaks sorted on median m/z (float) From f0763a012907c191916b5d9fa6eed21be1386e42 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 16 Oct 2025 17:24:58 +0200 Subject: [PATCH 29/31] modified DIMS/tests/testthat/test_average_peaks.R for extra variable sample_name --- DIMS/tests/testthat/test_average_peaks.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/DIMS/tests/testthat/test_average_peaks.R b/DIMS/tests/testthat/test_average_peaks.R index 994de48c..bb945b71 100644 --- a/DIMS/tests/testthat/test_average_peaks.R +++ b/DIMS/tests/testthat/test_average_peaks.R @@ -14,13 +14,14 @@ testthat::test_that("peaks are correctly averaged", { test_peaklist_sorted <- as.data.frame(cbind(samplenr = samplenrs, test_peaklist_sorted_num)) test_peaklist_sorted[, 2] <- as.numeric(test_peaklist_sorted[, 2]) test_peaklist_sorted[, 6] <- as.numeric(test_peaklist_sorted[, 6]) + test_sample_name <- "P001" # test that first peak is correctly averaged - expect_equal(as.numeric(average_peaks_per_sample(test_peaklist_sorted)[1, 6]), 600, tolerance = 0.001, TRUE) + expect_equal(as.numeric(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)[1, 6]), 600, tolerance = 0.001, TRUE) # test number of rows - expect_equal(nrow(average_peaks_per_sample(test_peaklist_sorted)), 2) + expect_equal(nrow(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)), 2) # test column names - expect_equal(colnames(average_peaks_per_sample(test_peaklist_sorted)), + expect_equal(colnames(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)), c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt"), TRUE) }) From ac9f43fc6508e5db29beb9d5d10cb8dbaa337e3a Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 17 Oct 2025 14:50:10 +0200 Subject: [PATCH 30/31] added fixture files for unit test for DIMS/EvaluateTics --- .../fixtures/test_evaluate_tics_file1.RData | Bin 0 -> 307 bytes .../fixtures/test_evaluate_tics_file2.RData | Bin 0 -> 301 bytes .../fixtures/test_evaluate_tics_file3.RData | Bin 0 -> 309 bytes 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 DIMS/tests/testthat/fixtures/test_evaluate_tics_file1.RData create mode 100644 DIMS/tests/testthat/fixtures/test_evaluate_tics_file2.RData create mode 100644 DIMS/tests/testthat/fixtures/test_evaluate_tics_file3.RData diff --git a/DIMS/tests/testthat/fixtures/test_evaluate_tics_file1.RData b/DIMS/tests/testthat/fixtures/test_evaluate_tics_file1.RData new file mode 100644 index 0000000000000000000000000000000000000000..6d9f40d89645aeee7de35825602471c5252ff7a0 GIT binary patch literal 307 zcmV-30nGj%iwFP!000000}FDAFy@NjVqjokW?*3flB_@`18ZoAo2~@|0}B(7!^ptG zzzL)|3sMua<8v~LOBfiKguyx(nD~G+mt(Tf1qP}D6pNWtGIN0xFA#%-xxiAWN;rT@ z@)C1Xi-Bxms5$|#TBy94rJjL-fw2)9-vX6yVvNSOMCF^BqVmlQ(fH=5d~-DW%+c(# zFoC)g<`15<%$!t^N8$|(41xTAupo9~S`e>5^&n>Yk3C$O{|~=#WB`YwL!bl8KI1ee zAkDgwt7;pNW?THQUlT~PPusD0E|BKv?B5{>q&aKS&a?n&u6)}EKY%oMlE~K_Ak7o< zhVwp<=5;?T7mSt;Sz(C~|eRIZH! F003yQiBA9k literal 0 HcmV?d00001 diff --git a/DIMS/tests/testthat/fixtures/test_evaluate_tics_file2.RData b/DIMS/tests/testthat/fixtures/test_evaluate_tics_file2.RData new file mode 100644 index 0000000000000000000000000000000000000000..77d650f943d6314e2650f02db73963f0d1413f73 GIT binary patch literal 301 zcmV+|0n+{-iwFP!000000}FDAFy@NjVqjokW?*3flB_@`18ZoAo2~@|0}B(7!^ptG zzzL)|3sMua<8v~LOBfiKguyx(nD~G+mlM;1cm)P(0u+mxQ!;ab6fY2igt@>{s7g41 zO7aqOQ;UIYVW>I*uv)0RnWdhAfq}6R8s7qyZ(@wbw?yTenxgW}4AJ=JsC;uY`^?eo zvoL|W6Xp+|w9K4TkVoPT3=Dz%f3yu^rvKQ(mHGeh3r7ZUI64G6u*>%>mLpA#XVE18H9O z!*aoB>5vtc2tkfvhNa(v{9+h0FEt&?{s7g41 zO7aqOQ;UIYVW>I*uv)0RnWdhAfq}6R8s7qyZ(@wbw?yTenxgW}4AJ=JsC;uY`^?eo zvoL|W6Xp+|w9K4TkVoPT3=Dz%f3P5SOcuI8)er`UAJc#A;mZ7f_=O__I2;`U9a#1m zr#S&>){R_M+kiCN;)ng3K$?Bpj>U6 Date: Mon, 22 Dec 2025 11:55:20 +0100 Subject: [PATCH 31/31] added unit test for empty peaklist --- DIMS/tests/testthat/test_average_peaks.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/DIMS/tests/testthat/test_average_peaks.R b/DIMS/tests/testthat/test_average_peaks.R index bb945b71..0a8d2345 100644 --- a/DIMS/tests/testthat/test_average_peaks.R +++ b/DIMS/tests/testthat/test_average_peaks.R @@ -15,6 +15,7 @@ testthat::test_that("peaks are correctly averaged", { test_peaklist_sorted[, 2] <- as.numeric(test_peaklist_sorted[, 2]) test_peaklist_sorted[, 6] <- as.numeric(test_peaklist_sorted[, 6]) test_sample_name <- "P001" + test_empty_peaklist <- test_peaklist_sorted[0, ] # test that first peak is correctly averaged expect_equal(as.numeric(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)[1, 6]), 600, tolerance = 0.001, TRUE) @@ -23,5 +24,7 @@ testthat::test_that("peaks are correctly averaged", { # test column names expect_equal(colnames(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)), c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt"), TRUE) + # test what happens when peak list is empty + expect_error(average_peaks_per_sample(test_empty_peaklist, test_sample_name)) })