From 0f724c4ccbe7fee6fa1e69d65c5566393d813530 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 31 Jan 2025 17:22:42 +0100
Subject: [PATCH 01/32] GenerateBreaks output split into 2 files

---
 DIMS/AssignToBins.R    | 10 +++++-----
 DIMS/AssignToBins.nf   |  4 ++--
 DIMS/GenerateBreaks.R  |  3 ++-
 DIMS/GenerateBreaks.nf |  1 +
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index e49d6c66..a25a5e70 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -1,5 +1,3 @@
-## adapted from 2-DIMS.R
-
 # load required packages
 suppressPackageStartupMessages(library("xcms"))
 
@@ -8,13 +6,15 @@ cmd_args <- commandArgs(trailingOnly = TRUE)
 
 mzml_filepath <- cmd_args[1]
 breaks_filepath <- cmd_args[2]
-resol <- as.numeric(cmd_args[3])
+trimparams_filepath <- cmd_args[3]
+resol <- as.numeric(cmd_args[4])
 trim <- 0.1
 dims_thresh <- 100
 
-# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg,
-# trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos
+# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg
 load(breaks_filepath)
+# load trim paramters: trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos
+load(trimparams_filepath)
 
 # get sample name
 sample_name <- sub("\\..*$", "", basename(mzml_filepath))
diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf
index d0a7098b..d3bc79ae 100644
--- a/DIMS/AssignToBins.nf
+++ b/DIMS/AssignToBins.nf
@@ -5,7 +5,7 @@ process AssignToBins {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       tuple(val(file_id), path(mzML_file), path(breaks_file))
+       tuple(val(file_id), path(mzML_file), path(breaks_file), path(trim_params_file))
 
     output:
        path("${file_id}.RData"), emit: rdata_file
@@ -13,7 +13,7 @@ process AssignToBins {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $params.resolution
+        Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $trim_params_file $params.resolution
         """
 }
 
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index d07fd203..ddb68c56 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -57,5 +57,6 @@ for (i in 1:nr_segments) {
 }
 
 # generate output file
-save(breaks_fwhm, breaks_fwhm_avg, trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "breaks.fwhm.RData")
+save(breaks_fwhm, breaks_fwhm_avg, file = "breaks.fwhm.RData")
+save(trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "trim_params.RData")
 save(high_mz, file = "highest_mz.RData")
diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf
index af617a8d..c486010d 100644
--- a/DIMS/GenerateBreaks.nf
+++ b/DIMS/GenerateBreaks.nf
@@ -10,6 +10,7 @@ process GenerateBreaks {
 
     output:
        path('breaks.fwhm.RData'), emit: breaks
+       path('trim_params.RData'), emit: trim_params
        path('highest_mz.RData'), emit: highest_mz
 
     script:

From de495e3cd07216be24fd993abdfe7c75054da6be Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 3 Feb 2025 10:21:36 +0100
Subject: [PATCH 02/32] AverageTechReplicates replaced by EvaluateTics

---
 DIMS/EvaluateTics.R  | 176 +++++++++++++++++++++++++++++++++++++++++++
 DIMS/EvaluateTics.nf |  34 +++++++++
 2 files changed, 210 insertions(+)
 create mode 100644 DIMS/EvaluateTics.R
 create mode 100644 DIMS/EvaluateTics.nf

diff --git a/DIMS/EvaluateTics.R b/DIMS/EvaluateTics.R
new file mode 100644
index 00000000..5b8c1e30
--- /dev/null
+++ b/DIMS/EvaluateTics.R
@@ -0,0 +1,176 @@
+# load packages
+library("ggplot2")
+library("gridExtra")
+
+# define parameters
+cmd_args <- commandArgs(trailingOnly = TRUE)
+
+init_file <- cmd_args[1]
+nr_replicates <- as.numeric(cmd_args[2])
+run_name <- cmd_args[3]
+dims_matrix <- cmd_args[4]
+highest_mz_file <- cmd_args[5]
+highest_mz <- get(load(highest_mz_file))
+trim_params_filepath <- cmd_args[6]
+thresh2remove <- 1000000000
+dims_thresh <- 100
+
+remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) {
+  # collect list of samples to remove from replication pattern
+  remove_from_group <- NULL
+  for (sample_nr in 1:length(repl_pattern)){
+    repl_pattern_1sample <- repl_pattern[[sample_nr]]
+    remove <- NULL
+    for (file_nr in 1:length(repl_pattern_1sample)) {
+      if (repl_pattern_1sample[file_nr] %in% bad_samples) {
+        remove <- c(remove, file_nr)
+      }
+    }
+    if (length(remove) == nr_replicates) {
+      remove_from_group <- c(remove_from_group, sample_nr)
+    }
+    if (!is.null(remove)) {
+      repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove]
+    }
+  }
+  if (length(remove_from_group) != 0) {
+    repl_pattern <- repl_pattern[-remove_from_group]
+  }
+  return(list("pattern" = repl_pattern))
+}
+
+# load init_file: contains repl_pattern
+load(init_file)
+
+# load trim_params_file: contains trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos
+load(trim_params_filepath)
+
+# lower the threshold below which a sample will be removed for DBS and for high m/z
+if (dims_matrix == "DBS") {
+  thresh2remove <- 500000000
+}
+if (highest_mz > 700) {
+  thresh2remove <- 1000000
+}
+
+# remove technical replicates which are below the threshold and average intensity over time
+remove_neg <- NULL
+remove_pos <- NULL
+cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n")
+for (sample_nr in 1:length(repl_pattern)) {
+  tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
+  tech_reps_array_pos <- NULL
+  tech_reps_array_neg <- NULL
+  for (file_nr in 1:length(tech_reps)) {
+    load(paste(tech_reps[file_nr], ".RData", sep = ""))
+    cat("\n\nParsing", tech_reps[file_nr])
+    # positive scan mode: determine whether sum of intensities is above threshold
+    cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1]))
+    if (sum(peak_list$pos[, 1]) < thresh2remove) {
+      cat(" ... Removed")
+      remove_pos <- c(remove_pos, tech_reps[file_nr])
+    }
+    tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos)
+    # negative scan mode: determine whether sum of intensities is above threshold
+    cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1]))
+    if (sum(peak_list$neg[, 1]) < thresh2remove) {
+      cat(" ... Removed")
+      remove_neg <- c(remove_neg, tech_reps[file_nr])
+    } 
+    tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg)
+  }
+}
+
+pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates)
+repl_pattern_filtered <- pattern_list$pattern
+save(repl_pattern_filtered, file = "negative_repl_pattern.RData")
+write.table(
+  remove_neg, 
+  file = "miss_infusions_negative.txt", 
+  row.names = FALSE, 
+  col.names = FALSE, 
+  sep = "\t"
+)
+
+pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates)
+repl_pattern_filtered <- pattern_list$pattern
+save(repl_pattern_filtered, file = "positive_repl_pattern.RData")
+write.table(
+  remove_pos, 
+  file = "miss_infusions_positive.txt", 
+  row.names = FALSE, 
+  col.names = FALSE, 
+  sep = "\t"
+)
+
+## generate TIC plots
+# get all txt files
+tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt")
+all_samps <- sub("_TIC\\..*$", "", basename(tic_files))
+
+# determine maximum intensity
+highest_tic_max <- 0
+for (file in tic_files) {
+  tic <- read.table(file)
+  this_tic_max <- max(tic$tic_intensity)
+  if (this_tic_max > highest_tic_max) {
+    highest_tic_max <- this_tic_max
+    max_sample <- sub("_TIC\\..*$", "", basename(file))
+  }
+}
+
+# create a list with information for all TIC plots
+tic_plot_list <- list()
+plot_nr <-  0
+for (sample_nr in c(1:length(repl_pattern))) {
+  tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
+  sample_name <- names(repl_pattern)[sample_nr]
+  for (file_nr in 1:length(tech_reps)) {
+    plot_nr <- plot_nr + 1
+    # repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = ""))
+    repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt"))
+    bad_color_pos <- tech_reps[file_nr] %in% remove_pos
+    bad_color_neg <- tech_reps[file_nr] %in% remove_neg
+    if (bad_color_neg & bad_color_pos) {
+      plot_color <- "#F8766D"
+    } else if (bad_color_pos) {
+      plot_color <- "#ED8141"
+    } else if (bad_color_neg) {
+      plot_color <- "#BF80FF"
+    } else {
+      plot_color <- "white"
+    }
+    tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) +
+      geom_line(linewidth = 0.3) +
+      geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) +
+      geom_vline(xintercept = trim_left_pos, col = "red", linetype = 2, linewidth = 0.3) +
+      geom_vline(xintercept = trim_right_pos, col = "red", linetype = 2, linewidth = 0.3) +
+      geom_vline(xintercept = trim_left_neg, col = "red", linetype = 2, linewidth = 0.3) +
+      geom_vline(xintercept = trim_right_neg, col = "red", linetype = 2, linewidth = 0.3) +
+      labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], "  ||  ", sample_name)) +
+      theme(plot.background = element_rect(fill = plot_color),
+            axis.text = element_text(size = 4),
+            axis.title = element_text(size = 4),
+            plot.title = element_text(size = 6))
+    tic_plot_list[[plot_nr]] <- tic_plot
+  }
+}
+
+# create a layout matrix dependent on number of replicates
+layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE)
+# put TIC plots in matrix
+tic_plot_pdf <- marrangeGrob(
+  grobs = tic_plot_list,
+  nrow = 10, ncol = nr_replicates,
+  layout_matrix = layout,
+  top = quote(paste(
+    "TICs of run", run_name,
+    " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ",
+    g, "/", npages
+  ))
+)
+
+# save to file
+ggsave(filename = paste0(run_name, "_TICplots.pdf"),
+       tic_plot_pdf, width = 21, height = 29.7, units = "cm")
+
diff --git a/DIMS/EvaluateTics.nf b/DIMS/EvaluateTics.nf
new file mode 100644
index 00000000..7ce58684
--- /dev/null
+++ b/DIMS/EvaluateTics.nf
@@ -0,0 +1,34 @@
+process EvaluateTics {
+    tag "DIMS EvaluateTics"
+    label 'EvaluateTics'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(rdata_file)
+       path(tic_txt_files)
+       path(init_file)
+       val(nr_replicates)
+       val(analysis_id)
+       val(matrix)
+       path(highest_mz_file)
+       path(trim_params_file)
+
+    output:
+       path('*_repl_pattern.RData'), emit: pattern_files
+       path('miss_infusions_negative.txt')
+       path('miss_infusions_positive.txt')
+       path('*_TICplots.pdf')
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/EvaluateTics.R $init_file \
+                                                             $params.nr_replicates \
+                                                             $analysis_id \
+                                                             $matrix \
+                                                             $highest_mz_file \
+                                                             $trim_params_file
+        """
+}
+
+

From 434a41f7aca65177e226405058e30330207efed3 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 3 Feb 2025 10:22:30 +0100
Subject: [PATCH 03/32] refactored PeakFinding, peak finding funtions moved to
 folder preprocessing

---
 DIMS/PeakFinding.R                          |  87 ++-
 DIMS/PeakFinding.nf                         |   2 +-
 DIMS/preprocessing/peak_finding_functions.R | 779 ++++++++++++++++++++
 3 files changed, 831 insertions(+), 37 deletions(-)
 create mode 100644 DIMS/preprocessing/peak_finding_functions.R

diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index 697f8f7e..738a0bc7 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -1,53 +1,68 @@
-## adapted from 4-peakFinding.R
-
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-sample_file <- cmd_args[1]
+replicate_mzmlfile <- cmd_args[1]
 breaks_file <- cmd_args[2]
 resol <- as.numeric(cmd_args[3])
 scripts_dir <- cmd_args[4]
-thresh <- 2000
-outdir <- "./"
+peak_thresh <- 2000
 
 # load in function scripts
-source(paste0(scripts_dir, "do_peakfinding.R"))
-source(paste0(scripts_dir, "check_overlap.R"))
-source(paste0(scripts_dir, "search_mzrange.R"))
-source(paste0(scripts_dir, "fit_optim.R"))
-source(paste0(scripts_dir, "fit_gaussian.R"))
-source(paste0(scripts_dir, "fit_init.R"))
-source(paste0(scripts_dir, "get_fwhm.R"))
-source(paste0(scripts_dir, "get_stdev.R"))
-source(paste0(scripts_dir, "optimize_gaussfit.R"))
-source(paste0(scripts_dir, "fit_peaks.R"))
-source(paste0(scripts_dir, "fit_gaussians.R"))
-source(paste0(scripts_dir, "estimate_area.R"))
-source(paste0(scripts_dir, "get_fit_quality.R"))
-source(paste0(scripts_dir, "check_overlap.R"))
-source(paste0(scripts_dir, "sum_curves.R"))
-source(paste0(scripts_dir, "within_ppm.R"))
+preprocessing_scripts_dir <- gsub("Utils", "preprocessing", scripts_dir)
+source(paste0(preprocessing_scripts_dir, "peak_finding_functions.R"))
 
 load(breaks_file)
 
-# Load output of AverageTechReplicates for a sample
-sample_avgtechrepl <- get(load(sample_file))
-if (grepl("_pos", sample_file)) {
-  scanmode <- "positive"
-} else if (grepl("_neg", sample_file)) {
-  scanmode <- "negative"
-}
+# Load output of AssignToBins for a sample
+sample_techrepl <- get(load(replicate_mzmlfile))
+scanmodes <- c("positive", "negative")
+ints_pos <- peak_list$pos
+ints_neg <- peak_list$neg
 
 # Initialize
 options(digits = 16)
-# Number used to calculate area under Gaussian curve
-int_factor <- 1 * 10^5 
-# Initial value used to estimate scaling parameter
-scale <- 2
-width <- 1024
-height <- 768
 
 # run the findPeaks function
+techrepl_name <- colnames(ints_pos)[1]
+
+for (scanmode in scanmodes) {
+  # turn dataframe with intensities into a named list
+  if (scanmode == "positive") {
+    ints_perscanmode <- peak_list$pos
+  } else if (scanmode == "negative") {
+    ints_perscanmode <- peak_list$neg
+  }
+  
+  ints_fullrange <- as.vector(ints_perscanmode)
+  names(ints_fullrange) <- rownames(ints_perscanmode)
+  
+  start.time <- Sys.time()
+  
+  # look for m/z range for all peaks
+  allpeaks_values <- search_mzrange(ints_fullrange, resol, techrepl_name, scanmode, peak_thresh)
+  
+  end.time <- Sys.time()
+  time.taken <- end.time - start.time
+  time.taken
+  
+  # turn the list into a dataframe
+  outlist_persample <- NULL
+  outlist_persample <- cbind("samplenr" = allpeaks_values$nr,
+                             "mzmed.pkt" = allpeaks_values$mean,
+                             "fq" = allpeaks_values$qual,
+                             "mzmin.pkt" = allpeaks_values$min,
+                             "mzmax.pkt" = allpeaks_values$max,
+                             "height.pkt" = allpeaks_values$area)
+  
+  # remove peaks with height = 0
+  outlist_persample <- outlist_persample[outlist_persample[, "height.pkt"] != 0, ]
+  
+  # save output to file
+  save(outlist_persample, file = paste0(techrepl_name, "_", scanmode, ".RData"))
+  
+  # generate text output to log file on number of spikes for this sample
+  # spikes are peaks that are too narrow, e.g. 1 data point
+  cat(paste("There were", allpeaks_values$spikes, "spikes"))
+  
+}
 
-# do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
-do_peakfinding(sample_avgtechrepl, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf
index 0097d128..dad2a497 100644
--- a/DIMS/PeakFinding.nf
+++ b/DIMS/PeakFinding.nf
@@ -12,6 +12,6 @@ process PeakFinding {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.scripts_dir
+        Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.preprocessing_scripts_dir
         """
 }
diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R
new file mode 100644
index 00000000..331d563b
--- /dev/null
+++ b/DIMS/preprocessing/peak_finding_functions.R
@@ -0,0 +1,779 @@
+# functions for peak finding
+search_mzrange <- function(ints_fullrange, resol, sample_name, scanmode, peak_thresh) {
+  #' Divide the full m/z range into regions of interest with min, max and mean m/z
+  #'
+  #' @param ints_fullrange: Named list of intensities (float)
+  #' @param resol: Value for resolution (integer)
+  #' @param sample_name: Sample name (string)
+  #' @param scanmode: Scan mode, positive or negative (string)
+  #' @param peak_thresh: Value for noise level threshold (integer)
+  #'
+  #' @return allpeaks_values: list of m/z regions of interest
+
+  # Number used to calculate area under Gaussian curve
+  int_factor <- 1 * 10^5 
+  
+  # initialize list to store results for all peaks
+  allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL,
+                          "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0)
+  
+  # find indices where intensity is not equal to zero
+  nonzero_positions <- as.vector(which(ints_fullrange != 0))
+
+  # initialize
+  # start position of the first peak
+  start_index <- nonzero_positions[1]
+  # maximum length of region of interest
+  max_roi_length <- 15
+
+  # find regions of interest
+  for (running_index in 1:length(nonzero_positions)) {
+    # find position of the end of a peak.
+    if (running_index < length(nonzero_positions) && (nonzero_positions[running_index + 1] - nonzero_positions[running_index]) > 1) {
+      end_index <- nonzero_positions[running_index]
+      # get m/z values and intensities for this region of interest
+      mass_vector <- as.numeric(names(ints_fullrange)[c(start_index:end_index)])
+      int_vector <- as.vector(ints_fullrange[c(start_index:end_index)])
+      # check if intensity is above threshold or the maximum intensity is NaN
+      if (max(int_vector) < peak_thresh || is.nan(max(int_vector))) {
+        # go to next region of interest
+        start_index <- nonzero_positions[running_index + 1]
+        next
+      }
+      # check if there are more intensities than maximum for region of interest
+      if (length(int_vector) > max_roi_length) {
+        print(length(int_vector))
+        print(running_index)
+        # trim lowest intensities to zero
+        #int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0
+        # split the range into multiple sub ranges
+        #sub_range <- int_vector
+        #names(sub_range) <- mass_vector
+        #allpeaks_values <- search_mzrange(sub_range, allpeaks_values, resol, 
+        #                                  sample_name, scanmode, peak_thresh)
+      # A proper peak needs to have at least 3 intensities above threshold  
+      } else if (length(int_vector) > 3) {
+        # check if the sum of intensities is above zero. Why is this necessary?
+        #if (sum(int_vector) == 0) next
+       # define mass_diff as difference between last and first value of mass_vector
+        mass_diff <- mass_vector[length(mass_vector)] - mass_vector[1]
+        # generate a second mass_vector with equally spaced m/z values
+        mass_vector2 <- seq(mass_vector[1], mass_vector[length(mass_vector)],
+                            length = mass_diff * int_factor)
+        
+        # Find the index in int_vector with the highest intensity
+        # max_index <- which(int_vector == max(int_vector))
+        # get initial fit values
+        roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, # max_index,
+                                   resol, force = length(max_index),
+                                   use_bounds = FALSE, scanmode)
+        
+        if (roi_values$qual[1] == 1) {
+          # get optimized fit values
+          roi_values <- fit_optim(mass_vector, int_vector, resol, scanmode)
+          # add region of interest to list of all peaks
+          allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
+          allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
+          allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
+          allpeaks_values$min <- c(allpeaks_values$min, roi_values$min)
+          allpeaks_values$max <- c(allpeaks_values$max, roi_values$max)
+          allpeaks_values$qual <- c(allpeaks_values$qual, 0)
+          allpeaks_values$spikes <- allpeaks_values$spikes + 1
+          
+        } else {
+          for (j in 1:length(roi_values$mean)){
+            allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean[j])
+            allpeaks_values$area <- c(allpeaks_values$area, roi_values$area[j])
+            allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
+            allpeaks_values$min <- c(allpeaks_values$min, roi_values$min[1])
+            allpeaks_values$max <- c(allpeaks_values$max, roi_values$max[1])
+            allpeaks_values$qual <- c(allpeaks_values$qual, roi_values$qual[1])
+          }
+        }
+        
+      } else {
+        
+        roi_values <- fit_optim(mass_vector, int_vector, resol, scanmode)
+        allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
+        allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
+        allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
+        allpeaks_values$min <- c(allpeaks_values$min, roi_values$min)
+        allpeaks_values$max <- c(allpeaks_values$max, roi_values$max)
+        allpeaks_values$qual <- c(allpeaks_values$qual, 0)
+        allpeaks_values$spikes <- allpeaks_values$spikes + 1
+      }
+    }
+    start_index <- nonzero_positions[running_index + 1]
+  }
+
+  return(allpeaks_values)
+}
+
+# remove plot sections (commented out)
+fit_gaussian <- function(mass_vector2, mass_vector, int_vector,  
+                         resol, force, use_bounds, scanmode) {
+  #' Fit 1, 2, 3 or 4 Gaussian peaks in small region of m/z
+  #'
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param force: Number of local maxima in int_vector (integer)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #' @param scanmode: Scan mode, positive or negative (string)
+  #'
+  #' @return roi_value_list: list of fit values for region of interest (list)
+  
+  # Number used to calculate area under Gaussian curve
+  int_factor <- 1 * 10^5 
+  
+  # Find the index in int_vector with the highest intensity
+  max_index <- which(int_vector == max(int_vector))
+  
+  # Initialise
+  peak_mean <- NULL
+  peak_area <- NULL
+  peak_qual <- NULL
+  peak_min <- NULL
+  peak_max <- NULL
+  fit_quality1 <- 0.15
+  fit_quality <- 0.2
+  
+  # One local maximum:
+  if (force == 1) {
+    # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual)
+    fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, resol,
+                            fit_quality1, use_bounds)
+    
+    # set initial value for scale factor
+    scale <- 2
+    # test if the mean is outside the m/z range
+    if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) {
+      # run this function again with fixed boundaries
+      return(fit_gaussian(mass_vector2, mass_vector, int_vector, resol,
+                          force = 1, use_bounds = TRUE, scanmode))
+    } else {
+      # test if the fit is bad
+      if (fit_values$qual > fit_quality1) {
+        # Try to fit two curves; find two local maxima. NB: max_index (now new_index) removed from fit_gaussian
+        new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
+        # test if there are two indices in new_index
+        if (length(new_index) != 2) {
+          new_index <- round(length(mass_vector) / 3)
+          new_index <- c(new_index, 2 * new_index)
+        }
+        # run this function again with two local maxima
+        return(fit_gaussian(mass_vector2, mass_vector, int_vector, 
+                            resol, force = 2, use_bounds = FALSE, scanmode))
+        # good fit
+      } else {
+        peak_mean <- c(peak_mean, fit_values$mean)
+        peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale,
+                                                fit_values$sigma))
+        peak_qual <- fit_values$qual
+        peak_min <- mass_vector[1]
+        peak_max <- mass_vector[length(mass_vector)]
+      }
+    }
+    
+    #### Two local maxima; need at least 6 data points for this ####
+  } else if (force == 2 && (length(mass_vector) > 6)) {
+    # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual)
+    fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, max_index, resol,
+                             use_bounds, fit_quality)
+    # test if one of the means is outside the m/z range
+    if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
+        fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) {
+      # check if fit quality is bad
+      if (fit_values$qual > fit_quality) {
+        # run this function again with fixed boundaries
+        return(fit_gaussian(mass_vector2, mass_vector, int_vector, resol,
+                            force = 2, use_bounds = TRUE, scanmode))
+      } else {
+        # check which mean is outside range and remove it from the list of means
+        # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
+        for (i in 1:length(fit_values$mean)){
+          if (fit_values$mean[i] < mass_vector[1] || fit_values$mean[i] > mass_vector[length(mass_vector)]) {
+            peak_mean <- c(peak_mean, -i)
+            peak_area <- c(peak_area, -i)
+          } else {
+            peak_mean <- c(peak_mean, fit_values$mean[i])
+            peak_area <- c(peak_area, fit_values$area[i])
+          }
+        }
+        peak_qual <- fit_values$qual
+        peak_min <- mass_vector[1]
+        peak_max <- mass_vector[length(mass_vector)]
+      }
+      # if all means are within range
+    } else {
+      # check for bad fit
+      if (fit_values$qual > fit_quality) {
+        # Try to fit three curves; find three local maxima
+        new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
+        # test if there are three indices in new_index
+        if (length(new_index) != 3) {
+          new_index <- round(length(mass_vector) / 4)
+          new_index <- c(new_index, 2 * new_index, 3 * new_index)
+        }
+        # run this function again with three local maxima
+        return(fit_gaussian(mass_vector2, mass_vector, int_vector, 
+                            resol, force = 3, use_bounds = FALSE, scanmode))
+        # good fit, all means are within m/z range
+      } else {
+        # check if means are within 3 ppm and sum if so
+        tmp <- fit_values$qual
+        nr_means_new <- -1
+        nr_means <- length(fit_values$mean)
+        while (nr_means != nr_means_new) {
+          nr_means <- length(fit_values$mean)
+          fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, 
+                                   mass_vector2, mass_vector, ppm = 4, resol)
+          nr_means_new <- length(fit_values$mean)
+        }
+        # restore original quality score
+        fit_values$qual <- tmp
+        
+        for (i in 1:length(fit_values$mean)){
+          peak_mean <- c(peak_mean, fit_values$mean[i])
+          peak_area <- c(peak_area, fit_values$area[i])
+        }
+        peak_qual <- fit_values$qual
+        peak_min <- mass_vector[1]
+        peak_max <- mass_vector[length(mass_vector)]
+      }
+    }
+    
+  } else { # More than two local maxima; fit 1 peak.
+    scale <- 2
+    fit_quality1 <- 0.40
+    use_bounds <- TRUE
+    max_index <- which(int_vector == max(int_vector))
+    fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, resol,
+                            fit_quality1, use_bounds)
+    # check for bad fit
+    if (fit_values$qual > fit_quality1) {
+      # get fit values from fit_optim
+      fit_values <- fit_optim(mass_vector, int_vector, resol, scanmode)
+      peak_mean <- c(peak_mean, fit_values$mean)
+      peak_area <- c(peak_area, fit_values$area)
+      peak_min <- fit_values$min
+      peak_max <- fit_values$max
+      peak_qual <- 0
+    } else {
+      peak_mean <- c(peak_mean, fit_values$mean)
+      peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma))
+      peak_qual <- fit_values$qual
+      peak_min <- mass_vector[1]
+      peak_max <- mass_vector[length(mass_vector)]
+    }
+  }
+  
+  # put all values for this region of interest into a list
+  roi_value_list <- list("mean" = peak_mean,
+                         "area" = peak_area,
+                         "qual" = peak_qual,
+                         "min" = peak_min,
+                         "max" = peak_max)
+  return(roi_value_list)
+}
+
+
+fit_1peak <- function(mass_vector2, mass_vector, int_vector, max_index, 
+                      resol, fit_quality, use_bounds) {
+  #' Fit 1 Gaussian peak in small region of m/z
+  #'
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
+  #' @param max_index: Index in int_vector with the highest intensity (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #'
+  #' @return roi_value_list: list of fit values for region of interest (list)
+  
+  # set initial value for scale
+  scale <- 2
+  
+  if (length(int_vector) < 3) {
+    message("Range too small, no fit possible!")
+  } else {
+    if ((length(int_vector) == 4)) {
+      # fit 1 peak
+      mu <- weighted.mean(mass_vector, int_vector)
+      sigma <- get_stdev(mass_vector, int_vector)
+      fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds)
+    } else {
+      # set range vector
+      if ((length(mass_vector) - length(max_index)) < 2) {
+        range1 <- c((length(mass_vector) - 4) : length(mass_vector))
+      } else if (length(max_index) < 2) {
+        range1 <- c(1:5)
+      } else {
+        range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2)
+      }
+      if (range1[1] == 0) range1 <- range1[-1]
+      # remove NA
+      if (length(which(is.na(int_vector[range1]))) != 0) {
+        range1 <- range1[-which(is.na(int_vector[range1]))]
+      }
+      # fit 1 peak
+      mu <- weighted.mean(mass_vector[range1], int_vector[range1])
+      sigma <- get_stdev(mass_vector[range1], int_vector[range1])
+      fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds)
+    }
+    
+    p1 <- fitted_peak$par
+    
+    # get new value for fit quality and scale
+    fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new
+    scale_new <- 1.2 * scale
+    
+    # bad fit
+    if (fq_new > fit_quality) {
+      # optimize scaling factor
+      fq <- 0
+      scale <- 0
+      if (sum(int_vector) > sum(p1[2] * dnorm(mass_vector, p1[1], sigma))) {
+        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) {
+          fq <- fq_new
+          scale <- scale_new
+          # fit 1 peak
+          fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds)
+          p1 <- fitted_peak$par
+          # get new value for fit quality and scale
+          fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new
+          scale_new <- 1.2 * scale
+        }
+      } else {
+        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) {
+          fq <- fq_new
+          scale <- scale_new
+          # fit 1 peak
+          fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds)
+          p1 <- fitted_peak$par
+          # get new value for fit quality and scale
+          fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new
+          scale_new <- 0.8 * scale
+        }
+      }
+      # use optimized scale factor to fit 1 peak
+      if (fq < fq_new) {
+        fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds)
+        p1 <- fitted_peak$par
+        fq_new <- fq
+      }
+    }
+  }
+  
+  roi_value_list <- list("mean" = p1[1], "scale" = p1[2], "sigma" = sigma, "qual" = fq_new)
+  return(roi_value_list)
+}
+
+fit_2peaks <- function(mass_vector2, mass_vector, int_vector, max_index, resol, use_bounds = FALSE,
+                       fit_quality) {
+  #' Fit 2 Gaussian peaks in small region of m/z
+  #'
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
+  #' @param max_index: Index in int_vector with the highest intensity (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #'
+  #' @return roi_value_list: list of fit values for region of interest (list)
+  
+  peak_mean <- NULL
+  peak_area <- NULL
+  peak_scale <- NULL
+  peak_sigma <- NULL
+  
+  # set range vectors for 2 peaks
+  range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2)
+  if (range1[1] == 0) range1 <- range1[-1]
+  range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2)
+  if (length(mass_vector) < range2[length(range2)]) range2 <- range2[-length(range2)]
+  range1 <- check_overlap(range1, range2)[[1]]
+  range2 <- check_overlap(range1, range2)[[2]]
+  # check for negative or 0
+  remove <- which(range1 < 1)
+  if (length(remove) > 0) range1 <- range1[-remove]
+  remove <- which(range2 < 1)
+  if (length(remove) > 0) range2 <- range2[-remove]
+  # remove NA
+  if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))]
+  if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))]
+  
+  # fit 2 peaks, first separately, then together
+  mu1 <- weighted.mean(mass_vector[range1], int_vector[range1])
+  sigma1 <- get_stdev(mass_vector[range1], int_vector[range1])
+  fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds)
+  p1 <- fitted_peak$par
+  # second peak
+  mu2 <- weighted.mean(mass_vector[range2], int_vector[range2])
+  sigma2 <- get_stdev(mass_vector[range2], int_vector[range2])
+  fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds)
+  p2 <- fitted_peak$par
+  # combined
+  fitted_2peaks <- fit_2gaussians(mass_vector, int_vector, sigma1, sigma2, p1[1], p1[2], p2[1], p2[2], use_bounds)
+  pc <- fitted_2peaks$par
+  
+  # get fit quality
+  if (is.null(sigma2)) sigma2 <- sigma1
+  sum_fit <- (pc[2] * dnorm(mass_vector, pc[1], sigma1)) +
+    (pc[4] * dnorm(mass_vector, pc[3], sigma2))
+  fq <- get_fit_quality(mass_vector, int_vector, sort(c(pc[1], pc[3]))[1], sort(c(pc[1], pc[3]))[2],
+                        resol, sum_fit = sum_fit)$fq_new
+  
+  # get parameter values
+  area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor)
+  area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor)
+  peak_area <- c(peak_area, area1)
+  peak_area <- c(peak_area, area2)
+  peak_mean <- c(peak_mean, pc[1])
+  peak_mean <- c(peak_mean, pc[3])
+  peak_scale <- c(peak_scale, pc[2])
+  peak_scale <- c(peak_scale, pc[4])
+  peak_sigma <- c(peak_sigma, sigma1)
+  peak_sigma <- c(peak_sigma, sigma2)
+  
+  roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq)
+  return(roi_value_list)
+}
+
+fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, use_bounds) {
+  #' Fit a Gaussian curve for a peak with given parameters
+  #'
+  #' @param mass_vector: Vector of masses (float)
+  #' @param int_vector: Vector of intensities (float)
+  #' @param sigma: Value for width of the peak (float)
+  #' @param query_mass: Value for mass at center of peak (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #'
+  #' @return opt_fit: list of parameters and values describing the optimal fit
+  
+  # Initial value used to estimate scaling parameter
+  scale <- 2
+  
+  # define optimization function for optim based on normal distribution
+  opt_f <- function(params) {
+    d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma)
+    sum((d - int_vector) ^ 2)
+  }
+  if (use_bounds) {
+    # determine lower and upper boundaries
+    lower <- c(mass_vector[1], 0, mass_vector[1], 0)
+    upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf)
+    # get optimal value for fitted Gaussian curve
+    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
+                     opt_f, control = list(maxit = 10000), method = "L-BFGS-B",
+                     lower = lower, upper = upper)
+  } else {
+    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
+                     opt_f, control = list(maxit = 10000))
+  }
+  return(opt_fit)
+}
+
+fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2,
+                           query_mass1, scale1,
+                           query_mass2, scale2, use_bounds) {
+  #' Fit two Gaussian curves for a peak with given parameters
+  #'
+  #' @param mass_vector: Vector of masses (float)
+  #' @param int_vector: Vector of intensities (float)
+  #' @param sigma1: Value for width of the first peak (float)
+  #' @param sigma2: Value for width of the second peak (float)
+  #' @param query_mass1: Value for mass at center of first peak (float)
+  #' @param scale1: Value for scaling intensities for first peak (float)
+  #' @param query_mass2: Value for mass at center of second peak (float)
+  #' @param scale2: Value for scaling intensities for second peak (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #'
+  #' @return opt_fit: list of parameters and values describing the optimal fit
+  
+  # define optimization function for optim based on normal distribution
+  opt_f <- function(params) {
+    d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) +
+      params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2)
+    sum((d - int_vector) ^ 2)
+  }
+  
+  if (use_bounds) {
+    # determine lower and upper boundaries
+    lower <- c(mass_vector[1], 0, mass_vector[1], 0)
+    upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf)
+    # get optimal value for 2 fitted Gaussian curves
+    if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) {
+      sigma2 <- sigma1
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                         as.numeric(query_mass1), as.numeric(scale1)),
+                       opt_f, control = list(maxit = 10000),
+                       method = "L-BFGS-B", lower = lower, upper = upper)
+    } else {
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                         as.numeric(query_mass2), as.numeric(scale2)),
+                       opt_f, control = list(maxit = 10000),
+                       method = "L-BFGS-B", lower = lower, upper = upper)
+    }
+  } else {
+    if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) {
+      sigma2 <- sigma1
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                         as.numeric(query_mass1), as.numeric(scale1)),
+                       opt_f, control = list(maxit = 10000))
+    } else {
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                         as.numeric(query_mass2), as.numeric(scale2)),
+                       opt_f, control = list(maxit = 10000))
+    }
+  }
+  return(opt_fit)
+}
+
+get_stdev <- function(mass_vector, int_vector, resol = 140000) {
+  #' Calculate standard deviation to determine width of a peak
+  #'
+  #' @param mass_vector: Vector of 3 mass values (float)
+  #' @param int_vector: Vector of 3 intensities (float)
+  #' @param resol: Value for resolution (integer)
+  #'
+  #' @return stdev: Value for standard deviation
+  # find maximum intensity in vector
+  max_index <- which(int_vector == max(int_vector))
+  # find corresponding mass at maximum intensity
+  max_mass <- mass_vector[max_index]
+  # calculate resolution at given m/z value
+  resol_mz <- resol * (1 / sqrt(2) ^ (log2(max_mass / 200)))
+  # calculate full width at half maximum
+  fwhm <- max_mass / resol_mz
+  # calculate standard deviation
+  stdev <- (fwhm / 2) * 0.85
+  return(stdev)
+}
+
+## adapted from getFitQuality.R
+# parameter not used: mu_last
+get_fit_quality <- function(mass_vector, int_vector, mu_first, mu_last, resol, scale = NULL, sigma = NULL, sum_fit = NULL) {
+  #' Fit 1 Gaussian peak in small region of m/z
+  #'
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
+  #' @param mu_first: Value for first peak (float)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param sum_fit: Value indicating quality of fit of Gaussian curve (float)
+  #'
+  #' @return list_params: list of parameters indicating quality of fit (list)
+  if (is.null(sum_fit)) {
+    mass_vector_int <- mass_vector
+    int_vector_int <- int_vector
+    # get new fit quality
+    fq_new <- mean(abs((scale * dnorm(mass_vector_int, mu_first, sigma)) - int_vector_int) / 
+                     rep((max(scale * dnorm(mass_vector_int, mu_first, sigma)) / 2), length(mass_vector_int)))
+  } else {
+    sum_fit_int <- sum_fit
+    int_vector_int <- int_vector
+    mass_vector_int <- mass_vector
+    # get new fit quality
+    fq_new <- mean(abs(sum_fit_int - int_vector_int) / rep(max(sum_fit_int) /2, length(sum_fit_int)))
+  }
+  
+  # Prevent division by 0
+  if (is.nan(fq_new)) fq_new <- 1 
+  
+  list_params <- list("fq_new" = fq_new, "x_int" = mass_vector_int, "y_int" = int_vector_int)
+  return(list_params)
+}
+
+estimate_area <- function(mass_max, resol, scale, sigma) {
+  #' Estimate area of Gaussian curve
+  #'
+  #' @param mass_max: Value for m/z at maximum intensity of a peak (float)
+  #' @param resol: Value for resolution (integer)
+  #' @param scale: Value for peak width (float)
+  #' @param sigma: Value for standard deviation (float)
+  #' @param int_factor: Value used to calculate area under Gaussian curve (integer)
+  #'
+  #' @return area_curve: Value for area under the Gaussian curve (float)
+  
+  # Number used to calculate area under Gaussian curve
+  int_factor <- 1 * 10^5 
+  
+  # avoid vectors that are too big (cannot allocate vector of size ...)
+  if (mass_max > 1200) return(0)
+  
+  # generate a mass_vector with equally spaced m/z values
+  fwhm <- get_fwhm(mass_max, resol)
+  mz_min <- mass_max - 2 * fwhm
+  mz_max <- mass_max + 2 * fwhm
+  mz_range <- mz_max - mz_min 
+  mass_vector2 <- seq(mz_min, mz_max, length = mz_range * int_factor)
+  
+  # estimate area under the curve
+  area_curve <- sum(scale * dnorm(mass_vector2, mass_max, sigma)) / 100
+  
+  return(area_curve)  
+}
+
+get_fwhm <- function(query_mass, resol) {
+  #' Calculate fwhm (full width at half maximum intensity) for a peak
+  #'
+  #' @param query_mass: Value for mass (float)
+  #' @param resol: Value for resolution (integer)
+  #'
+  #' @return fwhm: Value for full width at half maximum (float)
+  
+  # set aberrant values of query_mass to zero
+  if (is.nan(query_mass)) query_mass <- 0
+  if (is.na(query_mass)) query_mass <- 0
+  if (is.null(query_mass)) query_mass <- 0
+  if (query_mass < 0) query_mass <- 0
+  # calculate resolution at given m/z value
+  resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200)))
+  # calculate full width at half maximum
+  fwhm <- query_mass / resol_mz
+  return(fwhm)
+}
+
+fit_optim <- function(mass_vector, int_vector, resol, scanmode) {
+  #' Determine optimized fit of Gaussian curve to small region of m/z
+  #'
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Vector of intensities for a region of interest (float)
+  #' @param resol: Value for resolution (integer)
+  #' @param scanmode: Scan mode, positive or negative (string)
+  #'
+  #' @return roi_value_list: list of fit values for region of interest (list)
+  
+  # Number used to calculate area under Gaussian curve
+  int_factor <- 1 * 10^5
+  factor <- 1.5
+  # Find the index in int_vector with the highest intensity
+  max_index <- which(int_vector == max(int_vector))[1]
+  mass_max <- mass_vector[max_index]
+  int_max <- int_vector[max_index]
+  # get peak width
+  fwhm <- get_fwhm(mass_max, resol)
+  # simplify the peak shape: represent it by a triangle
+  mass_max_simple <- c(mass_max - factor * fwhm, mass_max, mass_max + factor * fwhm)
+  int_max_simple <- c(0, int_max, 0)
+  
+  # define mass_diff as difference between last and first value of mass_max_simple
+  mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1]
+  # generate a second mass_vector with equally spaced m/z values
+  mass_vector2 <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], 
+                      length = mass_diff * int_factor)
+  sigma <- get_stdev(mass_vector2, int_max_simple)
+  # define optimization function for optim based on normal distribution
+  opt_f <- function(p, mass_vector, int_vector, sigma, mass_max) {
+    curve <- p * dnorm(mass_vector, mass_max, sigma)
+    return((max(curve) - max(int_vector))^2)
+  }
+  
+  # get optimal value for fitted Gaussian curve
+  opt_fit <- optimize(opt_f, c(0, 100000), tol = 0.0001, mass_vector, int_vector, sigma, mass_max)
+  scale <- opt_fit$minimum
+  
+  # get an estimate of the area under the peak
+  area <- estimate_area(mass_max, resol, scale, sigma)
+  # put all values for this region of interest into a list
+  roi_value_list <- list("mean" = mass_max,
+                         "area" = area,
+                         "min" = mass_vector2[1],
+                         "max" = mass_vector2[length(mass_vector2)])
+  return(roi_value_list)
+}
+
+within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm = 4, resol) {
+  #' Test whether two mass ranges are within ppm distance of each other
+  #'
+  #' @param mean: Value for mean m/z (float)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param sigma: Value for standard deviation (float)
+  #' @param area: Value for area under the curve (float)
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param ppm: Value for distance between two values of mass (integer)
+  #' @param resol: Value for resolution (integer)
+  #'
+  #' @return list_params: list of parameters indicating quality of fit (list)
+  
+  # sort
+  index <- order(mean)
+  mean <- mean[index]
+  scale <- scale[index]
+  sigma <- sigma[index]
+  area <- area[index]
+  
+  summed <- NULL
+  remove <- NULL
+  
+  if (length(mean) > 1) {
+    for (i in 2:length(mean)) {
+      if ((abs(mean[i - 1] - mean[i]) / mean[i - 1]) * 10^6 < ppm) {
+        
+        # avoid double occurrance in sum
+        if ((i - 1) %in% summed) next
+        
+        result_values <- sum_curves(mean[i - 1], mean[i], scale[i - 1], scale[i], sigma[i - 1], sigma[i],
+                                    mass_vector2, mass_vector, resol)
+        summed <- c(summed, i - 1, i)
+        if (is.nan(result_values$mean)) result_values$mean <- 0
+        mean[i - 1] <- result_values$mean
+        mean[i] <- result_values$mean
+        area[i - 1] <- result_values$area
+        area[i] <- result_values$area
+        scale[i - 1] <- result_values$scale
+        scale[i] <- result_values$scale
+        sigma[i - 1] <- result_values$sigma
+        sigma[i] <- result_values$sigma
+        
+        remove <- c(remove, i)
+      }
+    }
+  }
+  
+  if (length(remove) != 0) {
+    mean <- mean[-c(remove)]
+    area <- area[-c(remove)]
+    scale <- scale[-c(remove)]
+    sigma <- sigma[-c(remove)]
+  }
+  
+  list_params <- list("mean" = mean, "area" = area, "scale" = scale, "sigma" = sigma, "qual" = NULL)
+  return(list_params)
+}
+
+sum_curves <- function(mean1, mean2, scale1, scale2, sigma1, sigma2, mass_vector2, mass_vector, resol) {
+  #' Sum two curves
+  #'
+  #' @param mean1: Value for mean m/z of first peak (float)
+  #' @param mean2: Value for mean m/z of second peak (float)
+  #' @param scale1: Initial value used to estimate scaling parameter for first peak (integer)
+  #' @param scale2: Initial value used to estimate scaling parameter for second peak (integer)
+  #' @param sigma1: Value for standard deviation for first peak (float)
+  #' @param sigma2: Value for standard deviation for second peak (float)
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param resol: Value for resolution (integer)
+  #'
+  #' @return list_params: list of parameters indicating quality of fit (list)
+  
+  sum_fit <- (scale1 * dnorm(mass_vector2, mean1, sigma1)) + (scale2 * dnorm(mass_vector2, mean2, sigma2))
+  
+  mean1_plus2 <- weighted.mean(c(mean1, mean2), c(max(scale1 * dnorm(mass_vector2, mean1, sigma1)),
+                                                  max(scale2 * dnorm(mass_vector2, mean2, sigma2))))
+  
+  # get new values for parameters
+  fwhm <- get_fwhm(mean1_plus2, resol)
+  area <- max(sum_fit)
+  scale <- scale1 + scale2
+  sigma <- (fwhm / 2) * 0.85
+  
+  list_params <- list("mean" = mean1_plus2, "area" = area, "scale" = scale, "sigma" = sigma)
+  return(list_params)
+}
+

From 9d7a69fcb2a3330bc77e05bbc645008c7793627d Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 10 Feb 2025 10:03:21 +0100
Subject: [PATCH 04/32] refactor DIMS PeakFinding, flow between scripts

---
 DIMS/AveragePeaks.R                         |  72 ++++
 DIMS/AveragePeaks.nf                        |  18 +
 DIMS/PeakFinding.R                          |  30 +-
 DIMS/PeakGrouping.R                         |   2 +-
 DIMS/preprocessing/peak_finding_functions.R | 430 +++++++++++---------
 5 files changed, 334 insertions(+), 218 deletions(-)
 create mode 100644 DIMS/AveragePeaks.R
 create mode 100644 DIMS/AveragePeaks.nf

diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R
new file mode 100644
index 00000000..695c08fa
--- /dev/null
+++ b/DIMS/AveragePeaks.R
@@ -0,0 +1,72 @@
+# define parameters
+# ppm as fixed value, not the same ppm as in peak grouping
+ppm_peak <- 2
+
+library(dplyr)
+
+scanmodes <- c("positive", "negative")
+
+for (scanmode in scanmodes){
+  # get sample names
+  load(paste0(scanmode, "_repl_pattern.RData"))
+  sample_names <- names(repl_pattern_filtered)
+  # initialize
+  outlist_total <- NULL
+  # for each biological sample, average peaks in technical replicates
+  for (sample_name in sample_names) {
+    print(sample_name)
+    # Initialize per sample
+    peaklist_allrepl <- NULL
+    nr_repl_persample <- 0
+    # averaged_peaks <- matrix(0, nrow = 10 ^ 7, ncol = 6) # how big does it need to be?
+    averaged_peaks <- matrix(0, nrow = 0, ncol = 6) # append
+    colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
+    # load RData files of technical replicates belonging to biological sample
+    sample_techreps_file <- repl_pattern_filtered[sample_name][[1]]
+    for (file_nr in 1:length(sample_techreps_file)) {
+      print(sample_techreps_file[file_nr])
+      tech_repl_file <- paste0(sample_techreps_file[file_nr], "_positive.RData")
+      tech_repl <- get(load(tech_repl_file))
+      # combine data for all technical replicates
+      peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl)
+      # count number of replicates for each biological sample
+      nr_repl_persample <- nr_repl_persample + 1
+    }
+    # sort on mass
+    peaklist_allrepl_df <- as.data.frame(peaklist_allrepl)
+    peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt) 
+    peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) 
+    # peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(desc(height.pkt))
+    peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt)
+    # average over technical replicates
+    while (nrow(peaklist_allrepl_sorted) > 1) {
+      # store row numbers
+      peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted)
+      # find the peaks in the dataset with corresponding m/z plus or minus tolerance
+      reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1]
+      mz_tolerance <- (reference_mass * ppm_peak) / 10^6
+      minmz_ref <- reference_mass - mz_tolerance
+      maxmz_ref <- reference_mass + mz_tolerance
+      select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref))
+      select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ]
+      nrsamples <- length(select_peak_indices)
+      # put averaged intensities into a new row and append to averaged_peaks
+      averaged_1peak <- matrix(0, nrow = 1, ncol = 6) 
+      colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
+      # calculate m/z values for peak group
+      averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt)
+      averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt)
+      averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt)
+      averaged_1peak[1, "fq"] <- nrsamples
+      averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt)
+      # put intensities into proper columns
+      peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ]
+      averaged_peaks <- rbind(averaged_peaks, averaged_1peak)
+    }
+    # add sample name to first column and append to outlist_total for all samples
+    averaged_peaks[ , "samplenr"] <- sample_name
+    outlist_total <- rbind(outlist_total, averaged_peaks)
+  }
+  save(outlist_total, file = paste0("AvgPeaks_", scanmode, ".RData"))
+}
+
diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf
new file mode 100644
index 00000000..9d49f5f8
--- /dev/null
+++ b/DIMS/AveragePeaks.nf
@@ -0,0 +1,18 @@
+process AveragePeaks {
+    tag "DIMS AveragePeaks"
+    label 'AveragePeaks'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(rdata_files)
+       path(replication_pattern)
+
+    output:
+       path 'AvgPeaks_*.RData'
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R 
+        """
+}
diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index 738a0bc7..c54a64cf 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -1,30 +1,26 @@
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-replicate_mzmlfile <- cmd_args[1]
+replicate_rdatafile <- cmd_args[1]
 breaks_file <- cmd_args[2]
 resol <- as.numeric(cmd_args[3])
-scripts_dir <- cmd_args[4]
+preprocessing_scripts_dir <- cmd_args[4]
 peak_thresh <- 2000
 
 # load in function scripts
-preprocessing_scripts_dir <- gsub("Utils", "preprocessing", scripts_dir)
 source(paste0(preprocessing_scripts_dir, "peak_finding_functions.R"))
 
 load(breaks_file)
 
 # Load output of AssignToBins for a sample
-sample_techrepl <- get(load(replicate_mzmlfile))
-scanmodes <- c("positive", "negative")
-ints_pos <- peak_list$pos
-ints_neg <- peak_list$neg
+sample_techrepl <- get(load(replicate_rdatafile))
+techrepl_name <- colnames(peak_list$pos)[1]
 
 # Initialize
 options(digits = 16)
 
 # run the findPeaks function
-techrepl_name <- colnames(ints_pos)[1]
-
+scanmodes <- c("positive", "negative")
 for (scanmode in scanmodes) {
   # turn dataframe with intensities into a named list
   if (scanmode == "positive") {
@@ -36,18 +32,12 @@ for (scanmode in scanmodes) {
   ints_fullrange <- as.vector(ints_perscanmode)
   names(ints_fullrange) <- rownames(ints_perscanmode)
   
-  start.time <- Sys.time()
-  
   # look for m/z range for all peaks
-  allpeaks_values <- search_mzrange(ints_fullrange, resol, techrepl_name, scanmode, peak_thresh)
-  
-  end.time <- Sys.time()
-  time.taken <- end.time - start.time
-  time.taken
+  allpeaks_values <- search_mzrange(ints_fullrange, resol, techrepl_name, peak_thresh)
   
   # turn the list into a dataframe
-  outlist_persample <- NULL
-  outlist_persample <- cbind("samplenr" = allpeaks_values$nr,
+  outlist_techrep <- NULL
+  outlist_techrep <- cbind("samplenr" = allpeaks_values$nr,
                              "mzmed.pkt" = allpeaks_values$mean,
                              "fq" = allpeaks_values$qual,
                              "mzmin.pkt" = allpeaks_values$min,
@@ -55,10 +45,10 @@ for (scanmode in scanmodes) {
                              "height.pkt" = allpeaks_values$area)
   
   # remove peaks with height = 0
-  outlist_persample <- outlist_persample[outlist_persample[, "height.pkt"] != 0, ]
+  outlist_techrep <- outlist_techrep[outlist_techrep[, "height.pkt"] != 0, ]
   
   # save output to file
-  save(outlist_persample, file = paste0(techrepl_name, "_", scanmode, ".RData"))
+  save(outlist_techrep, file = paste0(techrepl_name, "_", scanmode, ".RData"))
   
   # generate text output to log file on number of spikes for this sample
   # spikes are peaks that are too narrow, e.g. 1 data point
diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R
index 53e42de3..69fcde9c 100644
--- a/DIMS/PeakGrouping.R
+++ b/DIMS/PeakGrouping.R
@@ -22,7 +22,7 @@ if (grepl("negative", basename(hmdb_part_file))) {
 batch_number <- strsplit(basename(hmdb_part_file), ".", fixed = TRUE)[[1]][2]
 
 # load file with spectrum peaks
-spec_peaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData")
+spec_peaks_file <- paste0("AvgPeaks_", scanmode, ".RData")
 load(spec_peaks_file)
 outlist_copy <- outlist_total
 rm(outlist_total)
diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R
index 331d563b..947fea1f 100644
--- a/DIMS/preprocessing/peak_finding_functions.R
+++ b/DIMS/preprocessing/peak_finding_functions.R
@@ -1,18 +1,14 @@
 # functions for peak finding
-search_mzrange <- function(ints_fullrange, resol, sample_name, scanmode, peak_thresh) {
+search_mzrange <- function(ints_fullrange, resol, sample_name, peak_thresh) {
   #' Divide the full m/z range into regions of interest with min, max and mean m/z
   #'
   #' @param ints_fullrange: Named list of intensities (float)
   #' @param resol: Value for resolution (integer)
   #' @param sample_name: Sample name (string)
-  #' @param scanmode: Scan mode, positive or negative (string)
   #' @param peak_thresh: Value for noise level threshold (integer)
   #'
   #' @return allpeaks_values: list of m/z regions of interest
 
-  # Number used to calculate area under Gaussian curve
-  int_factor <- 1 * 10^5 
-  
   # initialize list to store results for all peaks
   allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL,
                           "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0)
@@ -50,27 +46,27 @@ search_mzrange <- function(ints_fullrange, resol, sample_name, scanmode, peak_th
         #sub_range <- int_vector
         #names(sub_range) <- mass_vector
         #allpeaks_values <- search_mzrange(sub_range, allpeaks_values, resol, 
-        #                                  sample_name, scanmode, peak_thresh)
+        #                                  sample_name, peak_thresh)
       # A proper peak needs to have at least 3 intensities above threshold  
       } else if (length(int_vector) > 3) {
         # check if the sum of intensities is above zero. Why is this necessary?
         #if (sum(int_vector) == 0) next
        # define mass_diff as difference between last and first value of mass_vector
-        mass_diff <- mass_vector[length(mass_vector)] - mass_vector[1]
+        # mass_diff <- mass_vector[length(mass_vector)] - mass_vector[1]
         # generate a second mass_vector with equally spaced m/z values
-        mass_vector2 <- seq(mass_vector[1], mass_vector[length(mass_vector)],
-                            length = mass_diff * int_factor)
+        mass_vector_eq <- seq(mass_vector[1], mass_vector[length(mass_vector)],
+                            length = 10 * length(mass_vector))
         
         # Find the index in int_vector with the highest intensity
         # max_index <- which(int_vector == max(int_vector))
         # get initial fit values
-        roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, # max_index,
-                                   resol, force = length(max_index),
-                                   use_bounds = FALSE, scanmode)
+        roi_values <- fit_gaussian(mass_vector_eq, mass_vector, int_vector, 
+                                   resol, force_nr = length(max_index),
+                                   use_bounds = FALSE)
         
         if (roi_values$qual[1] == 1) {
           # get optimized fit values
-          roi_values <- fit_optim(mass_vector, int_vector, resol, scanmode)
+          roi_values <- fit_optim(mass_vector, int_vector, resol)
           # add region of interest to list of all peaks
           allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
           allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
@@ -93,7 +89,7 @@ search_mzrange <- function(ints_fullrange, resol, sample_name, scanmode, peak_th
         
       } else {
         
-        roi_values <- fit_optim(mass_vector, int_vector, resol, scanmode)
+        roi_values <- fit_optim(mass_vector, int_vector, resol)
         allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
         allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
         allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
@@ -109,24 +105,19 @@ search_mzrange <- function(ints_fullrange, resol, sample_name, scanmode, peak_th
   return(allpeaks_values)
 }
 
-# remove plot sections (commented out)
-fit_gaussian <- function(mass_vector2, mass_vector, int_vector,  
-                         resol, force, use_bounds, scanmode) {
+fit_gaussian <- function(mass_vector_eq, mass_vector, int_vector,  
+                         resol, force_nr, use_bounds) {
   #' Fit 1, 2, 3 or 4 Gaussian peaks in small region of m/z
   #'
-  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector_eq: Vector of equally spaced m/z values (float)
   #' @param mass_vector: Vector of m/z values for a region of interest (float)
   #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
   #' @param resol: Value for resolution (integer)
-  #' @param force: Number of local maxima in int_vector (integer)
+  #' @param force_nr: Number of local maxima in int_vector (integer)
   #' @param use_bounds: Boolean to indicate whether boundaries are to be used
-  #' @param scanmode: Scan mode, positive or negative (string)
   #'
   #' @return roi_value_list: list of fit values for region of interest (list)
   
-  # Number used to calculate area under Gaussian curve
-  int_factor <- 1 * 10^5 
-  
   # Find the index in int_vector with the highest intensity
   max_index <- which(int_vector == max(int_vector))
   
@@ -138,20 +129,20 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector,
   peak_max <- NULL
   fit_quality1 <- 0.15
   fit_quality <- 0.2
+  # set initial value for scale factor
+  scale_factor <- 2
   
   # One local maximum:
-  if (force == 1) {
-    # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual)
-    fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, resol,
+  if (force_nr == 1) {
+    # determine fit values for 1 Gaussian peak 
+    fit_values <- fit_1peak(mass_vector_eq, mass_vector, int_vector, max_index, resol,
                             fit_quality1, use_bounds)
     
-    # set initial value for scale factor
-    scale <- 2
     # test if the mean is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) {
       # run this function again with fixed boundaries
-      return(fit_gaussian(mass_vector2, mass_vector, int_vector, resol,
-                          force = 1, use_bounds = TRUE, scanmode))
+      return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, resol,
+                          force_nr = 1, use_bounds = TRUE))
     } else {
       # test if the fit is bad
       if (fit_values$qual > fit_quality1) {
@@ -163,12 +154,12 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector,
           new_index <- c(new_index, 2 * new_index)
         }
         # run this function again with two local maxima
-        return(fit_gaussian(mass_vector2, mass_vector, int_vector, 
-                            resol, force = 2, use_bounds = FALSE, scanmode))
+        return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, 
+                            resol, force_nr = 2, use_bounds = FALSE))
         # good fit
       } else {
         peak_mean <- c(peak_mean, fit_values$mean)
-        peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale,
+        peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale_factor,
                                                 fit_values$sigma))
         peak_qual <- fit_values$qual
         peak_min <- mass_vector[1]
@@ -177,9 +168,9 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector,
     }
     
     #### Two local maxima; need at least 6 data points for this ####
-  } else if (force == 2 && (length(mass_vector) > 6)) {
-    # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual)
-    fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, max_index, resol,
+  } else if (force_nr == 2 && (length(mass_vector) > 6)) {
+    # determine fit values for 2 Gaussian peaks
+    fit_values <- fit_2peaks(mass_vector_eq, mass_vector, int_vector, max_index, resol,
                              use_bounds, fit_quality)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
@@ -187,8 +178,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector,
       # check if fit quality is bad
       if (fit_values$qual > fit_quality) {
         # run this function again with fixed boundaries
-        return(fit_gaussian(mass_vector2, mass_vector, int_vector, resol,
-                            force = 2, use_bounds = TRUE, scanmode))
+        return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, resol,
+                            force_nr = 2, use_bounds = TRUE))
       } else {
         # check which mean is outside range and remove it from the list of means
         # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
@@ -217,9 +208,9 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector,
           new_index <- c(new_index, 2 * new_index, 3 * new_index)
         }
         # run this function again with three local maxima
-        return(fit_gaussian(mass_vector2, mass_vector, int_vector, 
-                            resol, force = 3, use_bounds = FALSE, scanmode))
-        # good fit, all means are within m/z range
+        return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, 
+                            resol, force_nr = 3, use_bounds = FALSE))
+      # good fit, all means are within m/z range
       } else {
         # check if means are within 3 ppm and sum if so
         tmp <- fit_values$qual
@@ -227,8 +218,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector,
         nr_means <- length(fit_values$mean)
         while (nr_means != nr_means_new) {
           nr_means <- length(fit_values$mean)
-          fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, 
-                                   mass_vector2, mass_vector, ppm = 4, resol)
+          fit_values <- within_ppm(fit_values$mean, fit_values$scale_factor, fit_values$sigma, fit_values$area, 
+                                   mass_vector_eq, mass_vector, ppm = 4, resol)
           nr_means_new <- length(fit_values$mean)
         }
         # restore original quality score
@@ -245,16 +236,15 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector,
     }
     
   } else { # More than two local maxima; fit 1 peak.
-    scale <- 2
     fit_quality1 <- 0.40
     use_bounds <- TRUE
     max_index <- which(int_vector == max(int_vector))
-    fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, resol,
+    fit_values <- fit_1peak(mass_vector_eq, mass_vector, int_vector, max_index, resol,
                             fit_quality1, use_bounds)
     # check for bad fit
     if (fit_values$qual > fit_quality1) {
       # get fit values from fit_optim
-      fit_values <- fit_optim(mass_vector, int_vector, resol, scanmode)
+      fit_values <- fit_optim(mass_vector, int_vector, resol)
       peak_mean <- c(peak_mean, fit_values$mean)
       peak_area <- c(peak_area, fit_values$area)
       peak_min <- fit_values$min
@@ -262,7 +252,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector,
       peak_qual <- 0
     } else {
       peak_mean <- c(peak_mean, fit_values$mean)
-      peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma))
+      peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale_factor, fit_values$sigma))
       peak_qual <- fit_values$qual
       peak_min <- mass_vector[1]
       peak_max <- mass_vector[length(mass_vector)]
@@ -279,11 +269,11 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector,
 }
 
 
-fit_1peak <- function(mass_vector2, mass_vector, int_vector, max_index, 
+fit_1peak <- function(mass_vector_eq, mass_vector, int_vector, max_index, 
                       resol, fit_quality, use_bounds) {
   #' Fit 1 Gaussian peak in small region of m/z
   #'
-  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector_eq: Vector of equally spaced m/z values (float)
   #' @param mass_vector: Vector of m/z values for a region of interest (float)
   #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
   #' @param max_index: Index in int_vector with the highest intensity (integer)
@@ -293,17 +283,17 @@ fit_1peak <- function(mass_vector2, mass_vector, int_vector, max_index,
   #'
   #' @return roi_value_list: list of fit values for region of interest (list)
   
-  # set initial value for scale
-  scale <- 2
+  # set initial value for scale_factor
+  scale_factor <- 2
   
   if (length(int_vector) < 3) {
     message("Range too small, no fit possible!")
   } else {
-    if ((length(int_vector) == 4)) {
+    if (length(int_vector) == 4) {
       # fit 1 peak
-      mu <- weighted.mean(mass_vector, int_vector)
+      weighted_mu <- weighted.mean(mass_vector, int_vector)
       sigma <- get_stdev(mass_vector, int_vector)
-      fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds)
+      fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
     } else {
       # set range vector
       if ((length(mass_vector) - length(max_index)) < 2) {
@@ -311,71 +301,79 @@ fit_1peak <- function(mass_vector2, mass_vector, int_vector, max_index,
       } else if (length(max_index) < 2) {
         range1 <- c(1:5)
       } else {
-        range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2)
+        range1 <- seq(from = (max_index[1] - 2), to = (max_index[1] + 2))
+      }
+      # remove zero at the beginning of range1
+      if (range1[1] == 0) {
+        range1 <- range1[-1]
       }
-      if (range1[1] == 0) range1 <- range1[-1]
       # remove NA
       if (length(which(is.na(int_vector[range1]))) != 0) {
         range1 <- range1[-which(is.na(int_vector[range1]))]
       }
       # fit 1 peak
-      mu <- weighted.mean(mass_vector[range1], int_vector[range1])
+      weighted_mu <- weighted.mean(mass_vector[range1], int_vector[range1])
       sigma <- get_stdev(mass_vector[range1], int_vector[range1])
-      fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds)
+      fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
     }
     
-    p1 <- fitted_peak$par
+    fitted_mz <- fitted_peak$par[1]
+    fitted_nr <- fitted_peak$par[2]
     
-    # get new value for fit quality and scale
-    fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new
-    scale_new <- 1.2 * scale
+    # get new value for fit quality and scale_factor
+    fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new
+    scale_factor_new <- 1.2 * scale_factor
     
     # bad fit
     if (fq_new > fit_quality) {
       # optimize scaling factor
       fq <- 0
-      scale <- 0
-      if (sum(int_vector) > sum(p1[2] * dnorm(mass_vector, p1[1], sigma))) {
-        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) {
+      scale_factor <- 0
+      if (sum(int_vector) > sum(params1[2] * dnorm(mass_vector, fitted_mz, sigma))) {
+        # increase scale_factor until convergence
+        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_factor_new < 10000)) {
           fq <- fq_new
-          scale <- scale_new
+          scale_factor <- scale_factor_new
           # fit 1 peak
-          fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds)
-          p1 <- fitted_peak$par
-          # get new value for fit quality and scale
-          fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new
-          scale_new <- 1.2 * scale
+          fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
+          params1 <- fitted_peak$par
+          # get new value for fit quality and scale_factor
+          fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new
+          scale_factor_new <- 1.2 * scale_factor
         }
       } else {
-        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) {
+        # decrease scale_factor until convergence
+        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_factor_new < 10000)) {
           fq <- fq_new
-          scale <- scale_new
+          scale_factor <- scale_factor_new
           # fit 1 peak
-          fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds)
-          p1 <- fitted_peak$par
-          # get new value for fit quality and scale
-          fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new
-          scale_new <- 0.8 * scale
+          fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
+          params1 <- fitted_peak$par
+          # get new value for fit quality and scale_factor
+          fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new
+          scale_factor_new <- 0.8 * scale_factor
+          print(scale_factor_new)
         }
       }
-      # use optimized scale factor to fit 1 peak
+      # use optimized scale_factor factor to fit 1 peak
       if (fq < fq_new) {
-        fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, use_bounds)
-        p1 <- fitted_peak$par
+        fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
+        fitted_mz <- fitted_peak$par[1]
+        fitted_nr <- fitted_peak$par[2]
         fq_new <- fq
       }
     }
   }
   
-  roi_value_list <- list("mean" = p1[1], "scale" = p1[2], "sigma" = sigma, "qual" = fq_new)
+  roi_value_list <- list("mean" = fitted_mz, "scale_factor" = fitted_nr, "sigma" = sigma, "qual" = fq_new)
   return(roi_value_list)
 }
 
-fit_2peaks <- function(mass_vector2, mass_vector, int_vector, max_index, resol, use_bounds = FALSE,
+fit_2peaks <- function(mass_vector_eq, mass_vector, int_vector, max_index, resol, use_bounds = FALSE,
                        fit_quality) {
-  #' Fit 2 Gaussian peaks in small region of m/z
+  #' Fit 2 Gaussian peaks in a small region of m/z
   #'
-  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector_eq: Vector of equally spaced m/z values (float)
   #' @param mass_vector: Vector of m/z values for a region of interest (float)
   #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
   #' @param max_index: Index in int_vector with the highest intensity (integer)
@@ -390,73 +388,93 @@ fit_2peaks <- function(mass_vector2, mass_vector, int_vector, max_index, resol,
   peak_scale <- NULL
   peak_sigma <- NULL
   
-  # set range vectors for 2 peaks
-  range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2)
-  if (range1[1] == 0) range1 <- range1[-1]
-  range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2)
-  if (length(mass_vector) < range2[length(range2)]) range2 <- range2[-length(range2)]
+  # set range vectors for first peak
+  range1 <- seq(from = (max_index[1] - 2), to = (max_index[1] + 2))
+  # remove zero at the beginning of range1
+  if (range1[1] == 0) {
+    range1 <- range1[-1]
+  }
+  # set range vectors for second peak
+  range2 <- seq(from = (max_index[2] - 2), to = (max_index[2] + 2))
+  # if range2 ends outside mass_vector, shorten it
+  if (length(mass_vector) < range2[length(range2)]) {
+    range2 <- range2[-length(range2)]
+  }
+  # check whether the two ranges overlap
   range1 <- check_overlap(range1, range2)[[1]]
   range2 <- check_overlap(range1, range2)[[2]]
-  # check for negative or 0
+  # check for negative or 0 values in range1 or range2
   remove <- which(range1 < 1)
-  if (length(remove) > 0) range1 <- range1[-remove]
+  if (length(remove) > 0) {
+    range1 <- range1[-remove]
+  }
   remove <- which(range2 < 1)
-  if (length(remove) > 0) range2 <- range2[-remove]
+  if (length(remove) > 0) {
+    range2 <- range2[-remove]
+  }
   # remove NA
-  if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))]
-  if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))]
+  if (length(which(is.na(int_vector[range1]))) != 0) {
+    range1 <- range1[-which(is.na(int_vector[range1]))]
+  }
+  if (length(which(is.na(int_vector[range2]))) != 0) {
+    range2 <- range2[-which(is.na(int_vector[range2]))]
+  }
   
   # fit 2 peaks, first separately, then together
-  mu1 <- weighted.mean(mass_vector[range1], int_vector[range1])
+  weighted_mu1 <- weighted.mean(mass_vector[range1], int_vector[range1])
   sigma1 <- get_stdev(mass_vector[range1], int_vector[range1])
-  fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds)
-  p1 <- fitted_peak$par
+  fitted_peak1 <- optimize_1gaussian(mass_vector[range1], int_vector[range1], sigma1, weighted_mu1, scale_factor, use_bounds)
+  fitted_mz1 <- fitted_peak1$par[1]
+  fitted_nr1 <- fitted_peak1$par[2]
   # second peak
-  mu2 <- weighted.mean(mass_vector[range2], int_vector[range2])
+  weighted_mu2 <- weighted.mean(mass_vector[range2], int_vector[range2])
   sigma2 <- get_stdev(mass_vector[range2], int_vector[range2])
-  fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds)
-  p2 <- fitted_peak$par
+  fitted_peak2 <- optimize_1gaussian(mass_vector[range2], int_vector[range2], sigma2, weighted_mu2, scale_factor, use_bounds)
+  fitted_mz2 <- fitted_peak2$par[1]
+  fitted_nr2 <- fitted_peak2$par[2]
   # combined
-  fitted_2peaks <- fit_2gaussians(mass_vector, int_vector, sigma1, sigma2, p1[1], p1[2], p2[1], p2[2], use_bounds)
-  pc <- fitted_2peaks$par
+  fitted_2peaks <- optimize_2gaussians(mass_vector, int_vector, sigma1, sigma2, 
+                                       fitted_mz1, fitted_nr1, 
+                                       fitted_mz2, fitted_nr2, use_bounds)
+  fitted_2peaks_mz1 <- fitted_2peaks$par[1]
+  fitted_2peaks_nr1 <- fitted_2peaks$par[2]
+  fitted_2peaks_mz2 <- fitted_2peaks$par[3]
+  fitted_2peaks_nr2 <- fitted_2peaks$par[4]
   
   # get fit quality
-  if (is.null(sigma2)) sigma2 <- sigma1
-  sum_fit <- (pc[2] * dnorm(mass_vector, pc[1], sigma1)) +
-    (pc[4] * dnorm(mass_vector, pc[3], sigma2))
-  fq <- get_fit_quality(mass_vector, int_vector, sort(c(pc[1], pc[3]))[1], sort(c(pc[1], pc[3]))[2],
+  if (is.null(sigma2)) {
+    sigma2 <- sigma1
+  }
+  sum_fit <- (fitted_2peaks_nr1 * dnorm(mass_vector, fitted_2peaks_mz1, sigma1)) +
+    (fitted_2peaks_nr2 * dnorm(mass_vector, fitted_2peaks_mz2, sigma2))
+  lowest_mz <- sort(c(fitted_2peaks_mz1, fitted_2peaks_mz2))[1]
+  fq_new <- get_fit_quality(mass_vector, int_vector, lowest_mz,
                         resol, sum_fit = sum_fit)$fq_new
   
   # get parameter values
-  area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor)
-  area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor)
-  peak_area <- c(peak_area, area1)
-  peak_area <- c(peak_area, area2)
-  peak_mean <- c(peak_mean, pc[1])
-  peak_mean <- c(peak_mean, pc[3])
-  peak_scale <- c(peak_scale, pc[2])
-  peak_scale <- c(peak_scale, pc[4])
-  peak_sigma <- c(peak_sigma, sigma1)
-  peak_sigma <- c(peak_sigma, sigma2)
-  
-  roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq)
+  area1 <- estimate_area(fitted_2peaks_mz1, resol, fitted_2peaks_nr1, sigma1)
+  area2 <- estimate_area(fitted_2peaks_mz2, resol, fitted_2peaks_nr2, sigma2)
+  peak_area <- c(peak_area, area1, area2)
+  peak_mean <- c(peak_mean, fitted_2peaks_mz1, fitted_2peaks_mz2)
+  peak_scale <- c(peak_scale, fitted_2peaks_nr1, fitted_2peaks_nr2)
+  peak_sigma <- c(peak_sigma, sigma1, sigma2)
+  
+  roi_value_list <- list("mean" = peak_mean, "scale_factor" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq_new)
   return(roi_value_list)
 }
 
-fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, use_bounds) {
+optimize_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, scale_factor, use_bounds) {
   #' Fit a Gaussian curve for a peak with given parameters
   #'
   #' @param mass_vector: Vector of masses (float)
   #' @param int_vector: Vector of intensities (float)
   #' @param sigma: Value for width of the peak (float)
   #' @param query_mass: Value for mass at center of peak (float)
+  #' @param scale_factor: Value for scaling intensities (float)
   #' @param use_bounds: Boolean to indicate whether boundaries are to be used
   #'
   #' @return opt_fit: list of parameters and values describing the optimal fit
   
-  # Initial value used to estimate scaling parameter
-  scale <- 2
-  
   # define optimization function for optim based on normal distribution
   opt_f <- function(params) {
     d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma)
@@ -467,19 +485,19 @@ fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, use_bounds
     lower <- c(mass_vector[1], 0, mass_vector[1], 0)
     upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf)
     # get optimal value for fitted Gaussian curve
-    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
+    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale_factor)),
                      opt_f, control = list(maxit = 10000), method = "L-BFGS-B",
                      lower = lower, upper = upper)
   } else {
-    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
+    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale_factor)),
                      opt_f, control = list(maxit = 10000))
   }
   return(opt_fit)
 }
 
-fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2,
-                           query_mass1, scale1,
-                           query_mass2, scale2, use_bounds) {
+optimize_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2,
+                           query_mass1, scale_factor1,
+                           query_mass2, scale_factor2, use_bounds) {
   #' Fit two Gaussian curves for a peak with given parameters
   #'
   #' @param mass_vector: Vector of masses (float)
@@ -487,9 +505,9 @@ fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2,
   #' @param sigma1: Value for width of the first peak (float)
   #' @param sigma2: Value for width of the second peak (float)
   #' @param query_mass1: Value for mass at center of first peak (float)
-  #' @param scale1: Value for scaling intensities for first peak (float)
+  #' @param scale_factor1: Value for scaling intensities for first peak (float)
   #' @param query_mass2: Value for mass at center of second peak (float)
-  #' @param scale2: Value for scaling intensities for second peak (float)
+  #' @param scale_factor2: Value for scaling intensities for second peak (float)
   #' @param use_bounds: Boolean to indicate whether boundaries are to be used
   #'
   #' @return opt_fit: list of parameters and values describing the optimal fit
@@ -506,27 +524,27 @@ fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2,
     lower <- c(mass_vector[1], 0, mass_vector[1], 0)
     upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf)
     # get optimal value for 2 fitted Gaussian curves
-    if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) {
+    if (is.null(query_mass2) && is.null(scale_factor2) && is.null(sigma2)) {
       sigma2 <- sigma1
-      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
-                         as.numeric(query_mass1), as.numeric(scale1)),
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1),
+                         as.numeric(query_mass1), as.numeric(scale_factor1)),
                        opt_f, control = list(maxit = 10000),
                        method = "L-BFGS-B", lower = lower, upper = upper)
     } else {
-      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
-                         as.numeric(query_mass2), as.numeric(scale2)),
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1),
+                         as.numeric(query_mass2), as.numeric(scale_factor2)),
                        opt_f, control = list(maxit = 10000),
                        method = "L-BFGS-B", lower = lower, upper = upper)
     }
   } else {
-    if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) {
+    if (is.null(query_mass2) && is.null(scale_factor2) && is.null(sigma2)) {
       sigma2 <- sigma1
-      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
-                         as.numeric(query_mass1), as.numeric(scale1)),
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1),
+                         as.numeric(query_mass1), as.numeric(scale_factor1)),
                        opt_f, control = list(maxit = 10000))
     } else {
-      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
-                         as.numeric(query_mass2), as.numeric(scale2)),
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1),
+                         as.numeric(query_mass2), as.numeric(scale_factor2)),
                        opt_f, control = list(maxit = 10000))
     }
   }
@@ -554,15 +572,13 @@ get_stdev <- function(mass_vector, int_vector, resol = 140000) {
   return(stdev)
 }
 
-## adapted from getFitQuality.R
-# parameter not used: mu_last
-get_fit_quality <- function(mass_vector, int_vector, mu_first, mu_last, resol, scale = NULL, sigma = NULL, sum_fit = NULL) {
-  #' Fit 1 Gaussian peak in small region of m/z
+get_fit_quality <- function(mass_vector, int_vector, mu_first, resol, scale_factor = NULL, sigma = NULL, sum_fit = NULL) {
+  #' Get fit quality for 1 Gaussian peak in small region of m/z
   #'
   #' @param mass_vector: Vector of m/z values for a region of interest (float)
   #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
   #' @param mu_first: Value for first peak (float)
-  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param scale_factor: Initial value used to estimate scaling parameter (integer)
   #' @param resol: Value for resolution (integer)
   #' @param sum_fit: Value indicating quality of fit of Gaussian curve (float)
   #'
@@ -571,8 +587,8 @@ get_fit_quality <- function(mass_vector, int_vector, mu_first, mu_last, resol, s
     mass_vector_int <- mass_vector
     int_vector_int <- int_vector
     # get new fit quality
-    fq_new <- mean(abs((scale * dnorm(mass_vector_int, mu_first, sigma)) - int_vector_int) / 
-                     rep((max(scale * dnorm(mass_vector_int, mu_first, sigma)) / 2), length(mass_vector_int)))
+    fq_new <- mean(abs((scale_factor * dnorm(mass_vector_int, mu_first, sigma)) - int_vector_int) / 
+                     rep((max(scale_factor * dnorm(mass_vector_int, mu_first, sigma)) / 2), length(mass_vector_int)))
   } else {
     sum_fit_int <- sum_fit
     int_vector_int <- int_vector
@@ -588,32 +604,27 @@ get_fit_quality <- function(mass_vector, int_vector, mu_first, mu_last, resol, s
   return(list_params)
 }
 
-estimate_area <- function(mass_max, resol, scale, sigma) {
+estimate_area <- function(mass_max, resol, scale_factor, sigma) {
   #' Estimate area of Gaussian curve
   #'
   #' @param mass_max: Value for m/z at maximum intensity of a peak (float)
   #' @param resol: Value for resolution (integer)
-  #' @param scale: Value for peak width (float)
+  #' @param scale_factor: Value for peak width (float)
   #' @param sigma: Value for standard deviation (float)
-  #' @param int_factor: Value used to calculate area under Gaussian curve (integer)
   #'
   #' @return area_curve: Value for area under the Gaussian curve (float)
   
-  # Number used to calculate area under Gaussian curve
-  int_factor <- 1 * 10^5 
-  
-  # avoid vectors that are too big (cannot allocate vector of size ...)
-  if (mass_max > 1200) return(0)
+  # calculate width of peak at half maximum
+  fwhm <- get_fwhm(mass_max, resol)
   
   # generate a mass_vector with equally spaced m/z values
-  fwhm <- get_fwhm(mass_max, resol)
   mz_min <- mass_max - 2 * fwhm
   mz_max <- mass_max + 2 * fwhm
   mz_range <- mz_max - mz_min 
-  mass_vector2 <- seq(mz_min, mz_max, length = mz_range * int_factor)
+  mass_vector_eq <- seq(mz_min, mz_max, length = 100)
   
   # estimate area under the curve
-  area_curve <- sum(scale * dnorm(mass_vector2, mass_max, sigma)) / 100
+  area_curve <- sum(scale_factor * dnorm(mass_vector_eq, mass_max, sigma)) / 100
   
   return(area_curve)  
 }
@@ -625,32 +636,34 @@ get_fwhm <- function(query_mass, resol) {
   #' @param resol: Value for resolution (integer)
   #'
   #' @return fwhm: Value for full width at half maximum (float)
-  
-  # set aberrant values of query_mass to zero
-  if (is.nan(query_mass)) query_mass <- 0
-  if (is.na(query_mass)) query_mass <- 0
-  if (is.null(query_mass)) query_mass <- 0
-  if (query_mass < 0) query_mass <- 0
+
+  # set aberrant values of query_mass to default value of 200
+  if (is.na(query_mass)) {
+    query_mass <- 200
+  } 
+  if (query_mass < 0) {
+    query_mass <- 200
+  }
   # calculate resolution at given m/z value
   resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200)))
   # calculate full width at half maximum
   fwhm <- query_mass / resol_mz
+  
   return(fwhm)
 }
 
-fit_optim <- function(mass_vector, int_vector, resol, scanmode) {
+fit_optim <- function(mass_vector, int_vector, resol) {
   #' Determine optimized fit of Gaussian curve to small region of m/z
   #'
   #' @param mass_vector: Vector of m/z values for a region of interest (float)
   #' @param int_vector: Vector of intensities for a region of interest (float)
   #' @param resol: Value for resolution (integer)
-  #' @param scanmode: Scan mode, positive or negative (string)
   #'
   #' @return roi_value_list: list of fit values for region of interest (list)
   
-  # Number used to calculate area under Gaussian curve
-  int_factor <- 1 * 10^5
-  factor <- 1.5
+  # initial value for scale_factor
+  scale_factor <- 1.5
+
   # Find the index in int_vector with the highest intensity
   max_index <- which(int_vector == max(int_vector))[1]
   mass_max <- mass_vector[max_index]
@@ -658,15 +671,15 @@ fit_optim <- function(mass_vector, int_vector, resol, scanmode) {
   # get peak width
   fwhm <- get_fwhm(mass_max, resol)
   # simplify the peak shape: represent it by a triangle
-  mass_max_simple <- c(mass_max - factor * fwhm, mass_max, mass_max + factor * fwhm)
+  mass_max_simple <- c(mass_max - scale_factor * fwhm, mass_max, mass_max + scale_factor * fwhm)
   int_max_simple <- c(0, int_max, 0)
   
   # define mass_diff as difference between last and first value of mass_max_simple
-  mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1]
+  # mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1]
   # generate a second mass_vector with equally spaced m/z values
-  mass_vector2 <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], 
-                      length = mass_diff * int_factor)
-  sigma <- get_stdev(mass_vector2, int_max_simple)
+  mass_vector_eq <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], 
+                      length = 100 * length(mass_max_simple))
+  sigma <- get_stdev(mass_vector_eq, int_max_simple)
   # define optimization function for optim based on normal distribution
   opt_f <- function(p, mass_vector, int_vector, sigma, mass_max) {
     curve <- p * dnorm(mass_vector, mass_max, sigma)
@@ -675,26 +688,26 @@ fit_optim <- function(mass_vector, int_vector, resol, scanmode) {
   
   # get optimal value for fitted Gaussian curve
   opt_fit <- optimize(opt_f, c(0, 100000), tol = 0.0001, mass_vector, int_vector, sigma, mass_max)
-  scale <- opt_fit$minimum
+  scale_factor <- opt_fit$minimum
   
   # get an estimate of the area under the peak
-  area <- estimate_area(mass_max, resol, scale, sigma)
+  area <- estimate_area(mass_max, resol, scale_factor, sigma)
   # put all values for this region of interest into a list
   roi_value_list <- list("mean" = mass_max,
                          "area" = area,
-                         "min" = mass_vector2[1],
-                         "max" = mass_vector2[length(mass_vector2)])
+                         "min" = mass_vector_eq[1],
+                         "max" = mass_vector_eq[length(mass_vector_eq)])
   return(roi_value_list)
 }
 
-within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm = 4, resol) {
+within_ppm <- function(mean, scale_factor, sigma, area, mass_vector_eq, mass_vector, ppm = 4, resol) {
   #' Test whether two mass ranges are within ppm distance of each other
   #'
   #' @param mean: Value for mean m/z (float)
-  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param scale_factor: Initial value used to estimate scaling parameter (integer)
   #' @param sigma: Value for standard deviation (float)
   #' @param area: Value for area under the curve (float)
-  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector_eq: Vector of equally spaced m/z values (float)
   #' @param mass_vector: Vector of m/z values for a region of interest (float)
   #' @param ppm: Value for distance between two values of mass (integer)
   #' @param resol: Value for resolution (integer)
@@ -704,7 +717,7 @@ within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm
   # sort
   index <- order(mean)
   mean <- mean[index]
-  scale <- scale[index]
+  scale_factor <- scale_factor[index]
   sigma <- sigma[index]
   area <- area[index]
   
@@ -718,16 +731,16 @@ within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm
         # avoid double occurrance in sum
         if ((i - 1) %in% summed) next
         
-        result_values <- sum_curves(mean[i - 1], mean[i], scale[i - 1], scale[i], sigma[i - 1], sigma[i],
-                                    mass_vector2, mass_vector, resol)
+        result_values <- sum_curves(mean[i - 1], mean[i], scale_factor[i - 1], scale_factor[i], sigma[i - 1], sigma[i],
+                                    mass_vector_eq, mass_vector, resol)
         summed <- c(summed, i - 1, i)
         if (is.nan(result_values$mean)) result_values$mean <- 0
         mean[i - 1] <- result_values$mean
         mean[i] <- result_values$mean
         area[i - 1] <- result_values$area
         area[i] <- result_values$area
-        scale[i - 1] <- result_values$scale
-        scale[i] <- result_values$scale
+        scale_factor[i - 1] <- result_values$scale_factor
+        scale_factor[i] <- result_values$scale_factor
         sigma[i - 1] <- result_values$sigma
         sigma[i] <- result_values$sigma
         
@@ -739,41 +752,64 @@ within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm
   if (length(remove) != 0) {
     mean <- mean[-c(remove)]
     area <- area[-c(remove)]
-    scale <- scale[-c(remove)]
+    scale_factor <- scale_factor[-c(remove)]
     sigma <- sigma[-c(remove)]
   }
   
-  list_params <- list("mean" = mean, "area" = area, "scale" = scale, "sigma" = sigma, "qual" = NULL)
+  list_params <- list("mean" = mean, "area" = area, "scale_factor" = scale_factor, "sigma" = sigma, "qual" = NULL)
   return(list_params)
 }
 
-sum_curves <- function(mean1, mean2, scale1, scale2, sigma1, sigma2, mass_vector2, mass_vector, resol) {
+sum_curves <- function(mean1, mean2, scale_factor1, scale_factor2, sigma1, sigma2, mass_vector_eq, mass_vector, resol) {
   #' Sum two curves
   #'
   #' @param mean1: Value for mean m/z of first peak (float)
   #' @param mean2: Value for mean m/z of second peak (float)
-  #' @param scale1: Initial value used to estimate scaling parameter for first peak (integer)
-  #' @param scale2: Initial value used to estimate scaling parameter for second peak (integer)
+  #' @param scale_factor1: Initial value used to estimate scaling parameter for first peak (integer)
+  #' @param scale_factor2: Initial value used to estimate scaling parameter for second peak (integer)
   #' @param sigma1: Value for standard deviation for first peak (float)
   #' @param sigma2: Value for standard deviation for second peak (float)
-  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector_eq: Vector of equally spaced m/z values (float)
   #' @param mass_vector: Vector of m/z values for a region of interest (float)
   #' @param resol: Value for resolution (integer)
   #'
   #' @return list_params: list of parameters indicating quality of fit (list)
   
-  sum_fit <- (scale1 * dnorm(mass_vector2, mean1, sigma1)) + (scale2 * dnorm(mass_vector2, mean2, sigma2))
+  sum_fit <- (scale_factor1 * dnorm(mass_vector_eq, mean1, sigma1)) + (scale_factor2 * dnorm(mass_vector_eq, mean2, sigma2))
   
-  mean1_plus2 <- weighted.mean(c(mean1, mean2), c(max(scale1 * dnorm(mass_vector2, mean1, sigma1)),
-                                                  max(scale2 * dnorm(mass_vector2, mean2, sigma2))))
+  mean1_plus2 <- weighted.mean(c(mean1, mean2), c(max(scale_factor1 * dnorm(mass_vector_eq, mean1, sigma1)),
+                                                  max(scale_factor2 * dnorm(mass_vector_eq, mean2, sigma2))))
   
   # get new values for parameters
   fwhm <- get_fwhm(mean1_plus2, resol)
   area <- max(sum_fit)
-  scale <- scale1 + scale2
+  scale_factor <- scale_factor1 + scale_factor2
   sigma <- (fwhm / 2) * 0.85
   
-  list_params <- list("mean" = mean1_plus2, "area" = area, "scale" = scale, "sigma" = sigma)
+  list_params <- list("mean" = mean1_plus2, "area" = area, "scale_factor" = scale_factor, "sigma" = sigma)
   return(list_params)
 }
 
+check_overlap <- function(range1, range2) {
+  #' Modify range1 and range2 in case of overlap
+  #'
+  #' @param range1: Vector of m/z values for first peak (float)
+  #' @param range2: Vector of m/z values for second peak (float)
+  #'
+  #' @return new_ranges: list of two ranges (list)
+  
+  # Check for overlap
+  if (length(intersect(range1, range2)) == 2) {
+    if (length(range1) >= length(range2)) {
+      range1 <- range1[-length(range1)]  
+    } else {
+      range2 <- range2[-1]
+    }
+  } else if (length(intersect(range1, range2)) == 3) {
+    range1 <- range1[-length(range1)]  
+    range2 <- range2[-1]
+  }
+  new_ranges <- list("range1" = range1, "range2" = range2)
+  return(new_ranges)
+}
+

From 62fc22503e990e70921ba3f41e827c9b874fa57c Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 11 Feb 2025 11:17:01 +0100
Subject: [PATCH 05/32] added unit tests for DIMS peak finding

---
 DIMS/preprocessing/peak_finding_functions.R   |  13 +-
 DIMS/tests/testthat.R                         |   6 +
 .../testthat/test_peak_finding_functions.R    | 218 ++++++++++++++++++
 3 files changed, 231 insertions(+), 6 deletions(-)
 create mode 100644 DIMS/tests/testthat.R
 create mode 100644 DIMS/tests/testthat/test_peak_finding_functions.R

diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R
index 947fea1f..1baef120 100644
--- a/DIMS/preprocessing/peak_finding_functions.R
+++ b/DIMS/preprocessing/peak_finding_functions.R
@@ -38,8 +38,8 @@ search_mzrange <- function(ints_fullrange, resol, sample_name, peak_thresh) {
       }
       # check if there are more intensities than maximum for region of interest
       if (length(int_vector) > max_roi_length) {
+        print("vector of intensities is longer than max")
         print(length(int_vector))
-        print(running_index)
         # trim lowest intensities to zero
         #int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0
         # split the range into multiple sub ranges
@@ -107,7 +107,7 @@ search_mzrange <- function(ints_fullrange, resol, sample_name, peak_thresh) {
 
 fit_gaussian <- function(mass_vector_eq, mass_vector, int_vector,  
                          resol, force_nr, use_bounds) {
-  #' Fit 1, 2, 3 or 4 Gaussian peaks in small region of m/z
+  #' Fit 1 or 2 Gaussian peaks in small region of m/z
   #'
   #' @param mass_vector_eq: Vector of equally spaced m/z values (float)
   #' @param mass_vector: Vector of m/z values for a region of interest (float)
@@ -329,14 +329,15 @@ fit_1peak <- function(mass_vector_eq, mass_vector, int_vector, max_index,
       # optimize scaling factor
       fq <- 0
       scale_factor <- 0
-      if (sum(int_vector) > sum(params1[2] * dnorm(mass_vector, fitted_mz, sigma))) {
+      if (sum(int_vector) > sum(fitted_nr * dnorm(mass_vector, fitted_mz, sigma))) {
         # increase scale_factor until convergence
         while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_factor_new < 10000)) {
           fq <- fq_new
           scale_factor <- scale_factor_new
           # fit 1 peak
           fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
-          params1 <- fitted_peak$par
+          fitted_mz <- fitted_peak$par[1]
+          fitted_nr <- fitted_peak$par[2]
           # get new value for fit quality and scale_factor
           fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new
           scale_factor_new <- 1.2 * scale_factor
@@ -348,11 +349,11 @@ fit_1peak <- function(mass_vector_eq, mass_vector, int_vector, max_index,
           scale_factor <- scale_factor_new
           # fit 1 peak
           fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
-          params1 <- fitted_peak$par
+          fitted_mz <- fitted_peak$par[1]
+          fitted_nr <- fitted_peak$par[2]
           # get new value for fit quality and scale_factor
           fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new
           scale_factor_new <- 0.8 * scale_factor
-          print(scale_factor_new)
         }
       }
       # use optimized scale_factor factor to fit 1 peak
diff --git a/DIMS/tests/testthat.R b/DIMS/tests/testthat.R
new file mode 100644
index 00000000..a467339d
--- /dev/null
+++ b/DIMS/tests/testthat.R
@@ -0,0 +1,6 @@
+# Run all unit tests
+library(testthat)
+# enable snapshots
+local_edition(3)
+
+testthat::test_file("testthat/test_peak_finding_functions.R")
diff --git a/DIMS/tests/testthat/test_peak_finding_functions.R b/DIMS/tests/testthat/test_peak_finding_functions.R
new file mode 100644
index 00000000..a0db0ca9
--- /dev/null
+++ b/DIMS/tests/testthat/test_peak_finding_functions.R
@@ -0,0 +1,218 @@
+# unit tests for PeakFinding functions: 
+
+source("../preprocessing/peak_finding_functions.R")
+
+# test fit_quality function:
+test_that("fit quality is correctly calculated", {
+  # initialize
+  test_resol <- 140000
+  test_scale_factor <- 2
+  test_sigma <- 0.0001
+  test_sum_fit <- NULL
+
+  # create peak info to test on:
+  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
+  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
+  test_mz_max <- max(test_mass_vector)
+
+  expect_type(get_fit_quality(test_mass_vector, test_int_vector, test_mz_max, test_resol, test_scale_factor, test_sigma, test_sum_fit), "list")
+  expect_equal(get_fit_quality(test_mass_vector, test_int_vector, test_mz_max, test_resol, test_scale_factor, test_sigma, test_sum_fit)$fq_new, 2.426, tolerance = 0.001, TRUE)
+})
+
+# test get_stdev:
+test_that("standard deviation is correctly calculated", {
+  # create peak info to test on:
+  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
+  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
+  test_resol <- 140000
+
+  expect_type(get_stdev(test_mass_vector, test_int_vector, test_resol), "double")
+  expect_equal(get_stdev(test_mass_vector, test_int_vector, test_resol), 0.000126, tolerance = 0.00001, TRUE)
+})
+
+# test estimate_area
+test_that("area under the curve is correctly calculated", {
+  # create peak info to test on:
+  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
+  test_mz_max <- max(test_mass_vector)
+  test_resol <- 140000
+  test_scale_factor <- 2
+  test_sigma <- 0.0001
+
+  expect_type(estimate_area(test_mz_max, test_resol, test_scale_factor, test_sigma), "double")
+  expect_equal(estimate_area(test_mz_max, test_resol, test_scale_factor, test_sigma), 1673.061, tolerance = 0.001, TRUE)
+})
+
+# test get_fwhm
+test_that("fwhm is correctly calculated", {
+  # create peak info to test on:
+  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
+  test_mz_max <- max(test_mass_vector)
+  test_resol <- 140000
+
+  expect_type(get_fwhm(test_mz_max, test_resol), "double")
+  expect_equal(get_fwhm(test_mz_max, test_resol), 0.000295865, tolerance = 0.000001, TRUE)
+})
+
+# test get_fwhm for NA
+test_that("fwhm for mass NA gives standard fwhm", {
+  test_resol <- 140000
+
+  expect_type(get_fwhm(NA, test_resol), "double")
+  expect_equal(get_fwhm(NA, test_resol), 0.001428571, tolerance = 0.000001, TRUE)
+})
+
+# test within_ppm
+test_that("two peaks are within 5 ppm", {
+  # create peak info to test on:
+  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
+  test_mz_max <- max(test_mass_vector)
+  test_scale_factor <- 2
+  test_sigma <- 0.0001
+  test_area <- 20000
+  test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100)
+  test_ppm <- 5
+  test_resol <- 140000
+
+  expect_type(within_ppm(test_mz_max, test_scale_factor, test_sigma, test_area, test_mass_vector_eq, test_mass_vector, test_ppm, test_resol), "list")
+  expect_equal(within_ppm(test_mz_max, test_scale_factor, test_sigma, test_area, test_mass_vector_eq, test_mass_vector, test_ppm, test_resol)$mean, 70.00962, tolerance = 0.0001, TRUE)
+})
+
+# test sum_curves
+test_that("two curves are correctly summed", {
+  # create peak info to test on:
+  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
+  test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100)
+  test_mean1 <- 70.00938
+  test_mean2 <- 70.00962
+  test_scale_factor1 <- 2
+  test_scale_factor2 <- 4
+  test_sigma1 <- 0.0001
+  test_sigma2 <- 0.0002
+  test_resol <- 140000
+  
+  expect_type(sum_curves(test_mean1, test_mean2, test_scale_factor1, test_scale_factor2, test_sigma1, test_sigma2, test_mass_vector_eq, test_mass_vector, test_resol), "list")
+  expect_equal(sum_curves(test_mean1, test_mean2, test_scale_factor1, test_scale_factor2, test_sigma1, test_sigma2, test_mass_vector_eq, test_mass_vector, test_resol)$mean, 70.0095, tolerance = 0.0001, TRUE)
+})
+
+# test fit_optim
+test_that("optimal peak fit can be found", {
+  # create peak info to test on:
+  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
+  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
+  test_resol <- 140000
+  # create output to test on:
+  test_output <- list(mean = 70.0095, area = 5009.596, min = 70.00906, max = 70.00994)
+  
+  expect_type(fit_optim(test_mass_vector, test_int_vector, test_resol), "list")
+  for (key in names(test_output)) {
+    expect_equal(fit_optim(test_mass_vector, test_int_vector, test_resol)[[key]], test_output[[key]], tolerance = 0.0001, TRUE)
+  }
+})
+
+# test optimize_1gaussian:
+test_that("optimal value for m/z and scale_factor can be found", {
+  # create peak info to test on:
+  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
+  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
+  test_sigma <- 0.0001
+  test_query_mass <- median(test_mass_vector)
+  test_scale_factor <- 2
+  test_use_bounds <- FALSE
+  
+  expect_type(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds), "list")
+  expect_equal(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds)$par[1], 70.00949, tolerance = 0.0001, TRUE)
+  expect_equal(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds)$par[2], 4.570024, tolerance = 0.0001, TRUE)
+})
+
+# test fit_1peak:
+test_that("correct mean m/z and scale_factor can be found for 1 peak", {
+  # create peak info to test on:
+  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
+  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
+  test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100)
+  test_max_index <- which(test_int_vector == max(test_int_vector))
+  test_resol <- 140000
+  test_fit_quality <- 0.5
+  test_use_bounds <- FALSE
+  # create output to test on:
+  test_output <- list(mean = 70.0095, scale_factor = 5.23334, sigma = 0.000126, qual = 0.24300)
+  
+  expect_type(fit_1peak(test_mass_vector_eq, test_mass_vector, test_int_vector, test_max_index, test_resol, test_fit_quality, test_use_bounds), "list")
+  for (key in names(test_output)) {
+    expect_equal(fit_1peak(test_mass_vector_eq, test_mass_vector, test_int_vector, test_max_index, test_resol, test_fit_quality, test_use_bounds)[[key]], test_output[[key]], tolerance = 0.0001, TRUE)
+  }
+})
+
+# test optimize_2gaussians: 
+test_that("optimal value for m/z and scale_factor can be found", {
+  # create info for two peaks in a small region
+  test_mass_vector_2peaks <- rep(70.00938, 14) + 1:14 * 0.00006
+  test_int_vector_2peaks <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000), 2)
+  test_mass_vector_eq <- seq(min(test_mass_vector_2peaks), max(test_mass_vector_2peaks), length = 100)
+  test_max_index <- which(test_int_vector_2peaks == max(test_int_vector_2peaks))
+  test_sigma1 <- 0.0001
+  test_sigma2 <- 0.0002
+  test_query_mass1 <- test_max_index[1]
+  test_query_mass2 <- test_max_index[2]
+  test_scale_factor1 <- 2
+  test_scale_factor2 <- 3
+  test_use_bounds <- FALSE
+  
+  expect_type(optimize_2gaussians(test_mass_vector_2peaks, test_int_vector_2peaks, test_sigma1, test_sigma2, test_query_mass1, test_scale_factor1, test_query_mass2, test_scale_factor2, test_use_bounds), "list")
+  expect_equal(optimize_2gaussians(test_mass_vector_2peaks, test_int_vector_2peaks, test_sigma1, test_sigma2, test_query_mass1, test_scale_factor1, test_query_mass2, test_scale_factor2, test_use_bounds)$par[1], 4, tolerance = 0.1, TRUE)
+})
+
+# test fit_2peaks:
+test_that("correct mean m/z and scale_factor can be found for 2 peaks", {
+  # create info for two peaks in a small region
+  test_mass_vector_2peaks <- rep(70.00938, 14) + 1:14 * 0.00006
+  test_int_vector_2peaks <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000), 2)
+  test_mass_vector_eq <- seq(min(test_mass_vector_2peaks), max(test_mass_vector_2peaks), length = 100)
+  test_max_index <- which(test_int_vector_2peaks == max(test_int_vector_2peaks))
+  test_resol <- 140000
+  test_fit_quality <- 0.5
+  test_use_bounds <- FALSE
+  # create output to test on:
+  test_output <- list(mean = c(70.00954, 70.00998), scale_factor = c(4.823598, 4.828072), sigma = c(0.0001257425, 0.0001257436), qual = 0.38341)
+  
+  expect_type(fit_2peaks(test_mass_vector_eq, test_mass_vector_2peaks, test_int_vector_2peaks, test_max_index, 
+                         test_resol, test_use_bounds, test_fit_quality), "list")
+  for (key in names(test_output)) {
+    expect_equal(fit_2peaks(test_mass_vector_eq, test_mass_vector_2peaks, test_int_vector_2peaks, test_max_index, 
+                            test_resol, test_use_bounds, test_fit_quality)[[key]], test_output[[key]], tolerance = 0.0001, TRUE)
+  }
+})
+
+# test fit_gaussian(mass_vector2, mass_vector, int_vector, resol, force, use_bounds)
+test_that("initial gaussian fit is done correctly", {
+  # create peak info to test on:
+  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
+  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
+  test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100)
+  test_resol <- 140000
+  test_use_bounds <- FALSE
+  test_force_nr <- 1
+  
+  expect_type(fit_gaussian(test_mass_vector_eq, test_mass_vector, test_int_vector, test_resol, test_force_nr, test_use_bounds), "list")
+  expect_equal(fit_gaussian(test_mass_vector_eq, test_mass_vector, test_int_vector, test_resol, test_force_nr, test_use_bounds)$mean, 70.00956, tolerance = 0.00001, TRUE)
+})
+
+# test search_mzrange(ints_fullrange, resol, sample_name, peak_thresh)
+test_that("all peak finding functions work together", {
+  # enable snapshot
+  local_edition(3)
+  # create info for ten peaks separated by zero
+  test_large_mass_vector <- rep(70.00938, 80) + 1:80 * 0.00006
+  test_large_int_vector <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000, 0), 10)
+  names(test_large_int_vector) <- test_large_mass_vector
+  test_resol <- 140000
+  test_sample_name <- "C1.1"
+  test_peak_thresh <- 100
+  
+  expect_type(search_mzrange(test_large_int_vector, test_resol, test_sample_name, test_peak_thresh), "list")
+  expect_snapshot(search_mzrange(test_large_int_vector, test_resol, test_sample_name, test_peak_thresh), error = FALSE)
+})
+
+# test fit_gaussian(mass_vector2, mass_vector, int_vector, resol, force, use_bounds)
+test_that("fit gaussian")

From 5d311a3203e0eee999781d9621c10f01c63508f9 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:27:24 +0200
Subject: [PATCH 06/32] changed variable name tmp to replicates_persample

---
 DIMS/MakeInit.R | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
index 44d49962..9ff22623 100644
--- a/DIMS/MakeInit.R
+++ b/DIMS/MakeInit.R
@@ -1,5 +1,3 @@
-## adapted from makeInit in old pipeline
-
 # define parameters
 args <- commandArgs(trailingOnly = TRUE)
 
@@ -15,12 +13,12 @@ group_names_unique <- unique(group_names)
 # generate the replication pattern
 repl_pattern <- c()
 for (sample_group in 1:nr_sample_groups) {
-  tmp <- c()
+  replicates_persample <- c()
   for (repl in nr_replicates:1) {
     index <- ((sample_group * nr_replicates) - repl) + 1
-    tmp <- c(tmp, sample_names[index])
+    replicates_persample <- c(replicates_persample, sample_names[index])
   }
-  repl_pattern <- c(repl_pattern, list(tmp))
+  repl_pattern <- c(repl_pattern, list(replicates_persample))
 }
 
 names(repl_pattern) <- group_names_unique

From 708b872f548ffee4a4b114a2b2b1ba20c6cfdc0a Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:29:54 +0200
Subject: [PATCH 07/32] omitted obsolete lines

---
 DIMS/GenerateBreaks.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index ddb68c56..007d5ab7 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -1,5 +1,3 @@
-## adapted from 1-generateBreaksFwhm.HPC.R ##
-
 # load required package
 suppressPackageStartupMessages(library("xcms"))
 

From 0521390b07a4ea54810ea80abc8e9d54766dac19 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:33:21 +0200
Subject: [PATCH 08/32] added weighted mean for half-bad TICs

---
 DIMS/AssignToBins.R | 76 ++++++++++++++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 29 deletions(-)

diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index a25a5e70..8b31af7e 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -6,51 +6,63 @@ cmd_args <- commandArgs(trailingOnly = TRUE)
 
 mzml_filepath <- cmd_args[1]
 breaks_filepath <- cmd_args[2]
-trimparams_filepath <- cmd_args[3]
+trim_parameters_filepath <- cmd_args[3]
 resol <- as.numeric(cmd_args[4])
-trim <- 0.1
-dims_thresh <- 100
 
-# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg
+# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg,
 load(breaks_filepath)
-# load trim paramters: trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos
-load(trimparams_filepath)
+# load trim parameters file: contains trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos
+load(trim_parameters_filepath)
 
 # get sample name
-sample_name <- sub("\\..*$", "", basename(mzml_filepath))
+techrep_name <- sub("\\..*$", "", basename(mzml_filepath))
 
 options(digits = 16)
 
 # Initialize
 pos_results <- NULL
 neg_results <- NULL
-
-# read in the data for 1 sample
-raw_data <- suppressMessages(xcms::xcmsRaw(mzml_filepath))
-
-# for TIC plots: prepare txt files with data for plots
-tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic)
-colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity")
-write.table(tic_intensity_persample, file = paste0(sample_name, "_TIC.txt"))
-
-# Create empty placeholders for later use
 bins <- rep(0, length(breaks_fwhm) - 1)
 pos_bins <- bins
 neg_bins <- bins
+dims_thresh <- 100
 
-# Generate a matrix
+# read in the data for 1 sample
+raw_data <- suppressMessages(xcms::xcmsRaw(mzml_filepath))
+
+# Generate a matrix with retention times and intensities
 raw_data_matrix <- xcms::rawMat(raw_data)
 
 # Get time values for positive and negative scans
 pos_times <- raw_data@scantime[raw_data@polarity == "positive"]
 neg_times <- raw_data@scantime[raw_data@polarity == "negative"]
 # Select scans between trim_left and trim_right
-pos_times <- pos_times[pos_times > trim_left_pos & pos_times < trim_right_pos]
-neg_times <- neg_times[neg_times > trim_left_neg & neg_times < trim_right_neg]
+pos_times_trimmed <- pos_times[pos_times > trim_left_pos & pos_times < trim_right_pos]
+neg_times_trimmed <- neg_times[neg_times > trim_left_neg & neg_times < trim_right_neg]
 
+# get TIC intensities for areas between trim_left and trim_right
+tic_intensity_persample <- cbind(raw_data@scantime, raw_data@tic)
+colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity")
+tic_intensity_pos <- tic_intensity_persample[tic_intensity_persample[ , "retention_time"] > min(pos_times_trimmed) &
+                                             tic_intensity_persample[ , "retention_time"] < max(pos_times_trimmed), ]
+tic_intensity_neg <- tic_intensity_persample[tic_intensity_persample[ , "retention_time"] > min(neg_times_trimmed) &
+                                             tic_intensity_persample[ , "retention_time"] < max(neg_times_trimmed), ]
+# calculate weighted mean of intensities for pos and neg separately
+mean_pos <- weighted.mean(tic_intensity_pos[ , "tic_intensity"], tic_intensity_pos[ , "tic_intensity"])
+mean_neg <- weighted.mean(tic_intensity_neg[ , "tic_intensity"], tic_intensity_neg[ , "tic_intensity"])
+# intensity per scan should be at least 80% of weighted mean
+dims_thresh_pos <- 0.8 * mean_pos
+dims_thresh_neg <- 0.8 * mean_neg
+
+# Generate an index with which to select values for each mode
+#pos_index <- which(raw_data_matrix[, "time"] %in% pos_times)
+#neg_index <- which(raw_data_matrix[, "time"] %in% neg_times)
+# select only data from scans which pass the dims_thresh_pos and *_neg filter
+pos_times_pass <- tic_intensity_pos[which(tic_intensity_pos[ , "tic_intensity"] > dims_thresh_pos), "retention_time"]
+neg_times_pass <- tic_intensity_neg[which(tic_intensity_neg[ , "tic_intensity"] > dims_thresh_neg), "retention_time"]
 # Generate an index with which to select values for each mode
-pos_index <- which(raw_data_matrix[, "time"] %in% pos_times)
-neg_index <- which(raw_data_matrix[, "time"] %in% neg_times)
+pos_index <- which(raw_data_matrix[, "time"] %in% pos_times_pass)
+neg_index <- which(raw_data_matrix[, "time"] %in% neg_times_pass)
 # Separate each mode into its own matrix
 pos_raw_data_matrix <- raw_data_matrix[pos_index, ]
 neg_raw_data_matrix <- raw_data_matrix[neg_index, ]
@@ -76,10 +88,10 @@ bin_indices_neg <- cut(
 if (nrow(pos_raw_data_matrix) > 0) {
   # set NA in intensities to zero
   pos_raw_data_matrix[is.na(pos_raw_data_matrix[, "intensity"]), "intensity"] <- 0
-  # aggregate intensities, calculate mean, use only values above dims_thresh
+  # aggregate intensities, calculate mean, use only values above dims_thresh_pos
   aggr_int_pos <- stats::aggregate(pos_raw_data_matrix[, "intensity"],
 				   list(bin_indices_pos),
-				   FUN = function(x) { mean(x[which(x > dims_thresh)]) })
+				   FUN = function(x) { mean(x) })
   # set NA to zero in second column
   aggr_int_pos[is.na(aggr_int_pos[, 2]), 2] <- 0
   pos_bins[aggr_int_pos[, 1]] <- aggr_int_pos[, 2]
@@ -87,10 +99,10 @@ if (nrow(pos_raw_data_matrix) > 0) {
 if (nrow(neg_raw_data_matrix) > 0) {
   # set NA in intensities to zero
   neg_raw_data_matrix[is.na(neg_raw_data_matrix[, "intensity"]), "intensity"] <- 0
-  # aggregate intensities, calculate mean, use only values above dims_thresh
+  # aggregate intensities, calculate mean, use only values above dims_thresh_neg
   aggr_int_neg <- stats::aggregate(neg_raw_data_matrix[, "intensity"],
 				   list(bin_indices_neg),
-				   FUN = function(x) { mean(x[which(x > dims_thresh)]) })
+				   FUN = function(x) { mean(x) })
   # set NA to zero in second column
   aggr_int_neg[is.na(aggr_int_neg[, 2]), 2] <- 0
   neg_bins[aggr_int_neg[, 1]] <- aggr_int_neg[, 2]
@@ -108,8 +120,8 @@ pos_results_transpose <- t(pos_results)
 neg_results_transpose <- t(neg_results)
 
 # Add file names as row names
-rownames(pos_results_transpose) <- sample_name
-rownames(neg_results_transpose) <- sample_name
+rownames(pos_results_transpose) <- techrep_name
+rownames(neg_results_transpose) <- techrep_name
 
 # delete the last value of breaks_fwhm_avg to match dimensions of pos_results and neg_results
 breaks_fwhm_avg_minuslast <- breaks_fwhm_avg[-length(breaks_fwhm_avg)]
@@ -126,4 +138,10 @@ neg_results_final <- t(neg_results_transpose)
 
 peak_list <- list("pos" = pos_results_final, "neg" = neg_results_final, "breaksFwhm" = breaks_fwhm)
 
-save(peak_list, file = paste0(sample_name, ".RData"))
+save(peak_list, file = paste0(techrep_name, ".RData"))
+
+# for TIC plots: write txt files with data including threshold values for plots
+dims_thresh <- c(rep(dims_thresh_pos, length(pos_times)), rep(dims_thresh_neg, length(neg_times)))
+tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic, round(dims_thresh, 0))
+colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity", "threshold")
+write.table(tic_intensity_persample, file = paste0(techrep_name, "_TIC.txt"))

From 15403cd559ab7f953858bfee430df3575a670dec Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:39:05 +0200
Subject: [PATCH 09/32] replaced AverageTechReplicates step by EvaluateTics

---
 DIMS/EvaluateTics.R  | 65 ++++++++++++++++++++++++++++++++++----------
 DIMS/EvaluateTics.nf |  3 +-
 2 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/DIMS/EvaluateTics.R b/DIMS/EvaluateTics.R
index 5b8c1e30..bc66a34e 100644
--- a/DIMS/EvaluateTics.R
+++ b/DIMS/EvaluateTics.R
@@ -13,7 +13,6 @@ highest_mz_file <- cmd_args[5]
 highest_mz <- get(load(highest_mz_file))
 trim_params_filepath <- cmd_args[6]
 thresh2remove <- 1000000000
-dims_thresh <- 100
 
 remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) {
   # collect list of samples to remove from replication pattern
@@ -76,31 +75,62 @@ for (sample_nr in 1:length(repl_pattern)) {
     if (sum(peak_list$neg[, 1]) < thresh2remove) {
       cat(" ... Removed")
       remove_neg <- c(remove_neg, tech_reps[file_nr])
-    } 
+    }
     tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg)
   }
 }
 
+# negative scan mode
+print("neg")
 pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates)
 repl_pattern_filtered <- pattern_list$pattern
+print(repl_pattern_filtered)
+print(length(repl_pattern_filtered))
 save(repl_pattern_filtered, file = "negative_repl_pattern.RData")
-write.table(
-  remove_neg, 
-  file = "miss_infusions_negative.txt", 
-  row.names = FALSE, 
-  col.names = FALSE, 
-  sep = "\t"
+# save replication pattern in txt file for use in Nextflow
+allsamples_techreps_neg <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered))
+for (sample_nr in 1:length(repl_pattern_filtered)) {
+  allsamples_techreps_neg[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr]
+  allsamples_techreps_neg[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";")
+}
+allsamples_techreps_neg[, 3] <- "negative"
+
+# write information on miss_infusions
+write.table(remove_neg,
+	    file = "miss_infusions_negative.txt",
+	    row.names = FALSE,
+	    col.names = FALSE,
+	    sep = "\t"
 )
 
+# positive scan mode
 pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates)
+print(pattern_list)
 repl_pattern_filtered <- pattern_list$pattern
 save(repl_pattern_filtered, file = "positive_repl_pattern.RData")
-write.table(
-  remove_pos, 
-  file = "miss_infusions_positive.txt", 
-  row.names = FALSE, 
-  col.names = FALSE, 
-  sep = "\t"
+# save replication pattern in txt file for use in Nextflow
+allsamples_techreps_pos <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered))
+for (sample_nr in 1:length(repl_pattern_filtered)) {
+  allsamples_techreps_pos[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr]
+  allsamples_techreps_pos[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";")
+}
+allsamples_techreps_pos[, 3] <- "positive"
+
+# combine information on samples and technical replicates for pos and neg
+allsamples_techreps <- rbind(allsamples_techreps_pos, allsamples_techreps_neg)
+write.table(allsamples_techreps,
+	    file = paste0("replicates_per_sample.txt"),
+            col.names = FALSE,
+	    row.names = FALSE,
+	    sep = ","
+)
+
+# write information on miss_infusions
+write.table(remove_pos,
+	    file = "miss_infusions_positive.txt",
+	    row.names = FALSE,
+	    col.names = FALSE,
+	    sep = "\t"
 )
 
 ## generate TIC plots
@@ -127,8 +157,11 @@ for (sample_nr in c(1:length(repl_pattern))) {
   sample_name <- names(repl_pattern)[sample_nr]
   for (file_nr in 1:length(tech_reps)) {
     plot_nr <- plot_nr + 1
-    # repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = ""))
     repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt"))
+    # get threshold values per technical replicate
+    dims_thresh_pos <- repl1_nr[1, "threshold"]
+    dims_thresh_neg <- repl1_nr[nrow(repl1_nr), "threshold"]
+    # for replicates with bad TIC, determine what color the border of the plot should be
     bad_color_pos <- tech_reps[file_nr] %in% remove_pos
     bad_color_neg <- tech_reps[file_nr] %in% remove_neg
     if (bad_color_neg & bad_color_pos) {
@@ -147,6 +180,8 @@ for (sample_nr in c(1:length(repl_pattern))) {
       geom_vline(xintercept = trim_right_pos, col = "red", linetype = 2, linewidth = 0.3) +
       geom_vline(xintercept = trim_left_neg, col = "red", linetype = 2, linewidth = 0.3) +
       geom_vline(xintercept = trim_right_neg, col = "red", linetype = 2, linewidth = 0.3) +
+      geom_hline(yintercept = dims_thresh_pos, col = "green", linetype = 2, linewidth = 0.3) +
+      geom_hline(yintercept = dims_thresh_neg, col = "blue", linetype = 2, linewidth = 0.3) +
       labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], "  ||  ", sample_name)) +
       theme(plot.background = element_rect(fill = plot_color),
             axis.text = element_text(size = 4),
diff --git a/DIMS/EvaluateTics.nf b/DIMS/EvaluateTics.nf
index 7ce58684..831c9500 100644
--- a/DIMS/EvaluateTics.nf
+++ b/DIMS/EvaluateTics.nf
@@ -15,7 +15,8 @@ process EvaluateTics {
        path(trim_params_file)
 
     output:
-       path('*_repl_pattern.RData'), emit: pattern_files
+       path('*_repl_pattern.RData'),      emit: pattern_files
+       path('replicates_per_sample.txt'), emit: sample_techreps
        path('miss_infusions_negative.txt')
        path('miss_infusions_positive.txt')
        path('*_TICplots.pdf')

From 6a7ada6d303af488141a77252b7b8487ad759b4c Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:39:59 +0200
Subject: [PATCH 10/32] replaced AverageTechReplicates step by EvaluateTics

---
 DIMS/AverageTechReplicates.R  | 200 ----------------------------------
 DIMS/AverageTechReplicates.nf |  35 ------
 2 files changed, 235 deletions(-)
 delete mode 100644 DIMS/AverageTechReplicates.R
 delete mode 100644 DIMS/AverageTechReplicates.nf

diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
deleted file mode 100644
index 98a2d414..00000000
--- a/DIMS/AverageTechReplicates.R
+++ /dev/null
@@ -1,200 +0,0 @@
-# adapted from 3-AverageTechReplicates.R
-
-# load packages
-library("ggplot2")
-library("gridExtra")
-
-# define parameters
-cmd_args <- commandArgs(trailingOnly = TRUE)
-
-init_file <- cmd_args[1]
-nr_replicates <- as.numeric(cmd_args[2])
-run_name <- cmd_args[3]
-dims_matrix <- cmd_args[4]
-highest_mz_file <- cmd_args[5]
-highest_mz <- get(load(highest_mz_file))
-breaks_filepath <- cmd_args[6]
-thresh2remove <- 1000000000
-dims_thresh <- 100
-
-remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) {
-  # collect list of samples to remove from replication pattern
-  remove_from_group <- NULL
-  for (sample_nr in 1:length(repl_pattern)){
-    repl_pattern_1sample <- repl_pattern[[sample_nr]]
-    remove <- NULL
-    for (file_nr in 1:length(repl_pattern_1sample)) {
-      if (repl_pattern_1sample[file_nr] %in% bad_samples) {
-        remove <- c(remove, file_nr)
-      }
-    }
-    if (length(remove) == nr_replicates) {
-      remove_from_group <- c(remove_from_group, sample_nr)
-    }
-    if (!is.null(remove)) {
-      repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove]
-    }
-  }
-  if (length(remove_from_group) != 0) {
-    repl_pattern <- repl_pattern[-remove_from_group]
-  }
-  return(list("pattern" = repl_pattern))
-}
-
-# load init_file: contains repl_pattern
-load(init_file)
-
-# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg,
-# trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos
-load(breaks_filepath)
-
-# lower the threshold below which a sample will be removed for DBS and for high m/z
-if (dims_matrix == "DBS") {
-  thresh2remove <- 500000000
-}
-if (highest_mz > 700) {
-  thresh2remove <- 1000000
-}
-
-
-# remove technical replicates which are below the threshold
-remove_neg <- NULL
-remove_pos <- NULL
-cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n")
-for (sample_nr in 1:length(repl_pattern)) {
-  tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
-  tech_reps_array_pos <- NULL
-  tech_reps_array_neg <- NULL
-  sum_neg <- 0
-  sum_pos <- 0
-  nr_pos <- 0
-  nr_neg <- 0
-  for (file_nr in 1:length(tech_reps)) {
-    load(paste(tech_reps[file_nr], ".RData", sep = ""))
-    cat("\n\nParsing", tech_reps[file_nr])
-    # negative scanmode
-    cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1]))
-    if (sum(peak_list$neg[, 1]) < thresh2remove) {
-      cat(" ... Removed")
-      remove_neg <- c(remove_neg, tech_reps[file_nr])
-    } else {
-      nr_neg <- nr_neg + 1
-      sum_neg <- sum_neg + peak_list$neg
-    }
-    tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg)
-    # positive scanmode
-    cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1]))
-    if (sum(peak_list$pos[, 1]) < thresh2remove) {
-      cat(" ... Removed")
-      remove_pos <- c(remove_pos, tech_reps[file_nr])
-    } else {
-      nr_pos <- nr_pos + 1
-      sum_pos <- sum_pos + peak_list$pos
-    }
-    tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos)
-  }
-  # save to file  
-  if (nr_neg != 0) {
-    sum_neg[, 1] <- sum_neg[, 1] / nr_neg
-    colnames(sum_neg) <- names(repl_pattern)[sample_nr]
-    save(sum_neg, file = paste0(names(repl_pattern)[sample_nr], "_neg_avg.RData"))
-  }
-  if (nr_pos != 0) {
-    sum_pos[, 1] <- sum_pos[, 1] / nr_pos
-    colnames(sum_pos) <- names(repl_pattern)[sample_nr]
-    save(sum_pos, file = paste0(names(repl_pattern)[sample_nr], "_pos_avg.RData"))
-  }
-}
-
-pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates)
-repl_pattern_filtered <- pattern_list$pattern
-save(repl_pattern_filtered, file = "negative_repl_pattern.RData")
-write.table(
-  remove_neg, 
-  file = "miss_infusions_negative.txt", 
-  row.names = FALSE, 
-  col.names = FALSE, 
-  sep = "\t"
-)
-
-pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates)
-repl_pattern_filtered <- pattern_list$pattern
-save(repl_pattern_filtered, file = "positive_repl_pattern.RData")
-write.table(
-  remove_pos, 
-  file = "miss_infusions_positive.txt", 
-  row.names = FALSE, 
-  col.names = FALSE, 
-  sep = "\t"
-)
-
-## generate TIC plots
-# get all txt files
-tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt")
-all_samps <- sub("_TIC\\..*$", "", basename(tic_files))
-
-# determine maximum intensity
-highest_tic_max <- 0
-for (file in tic_files) {
-  tic <- read.table(file)
-  this_tic_max <- max(tic$tic_intensity)
-  if (this_tic_max > highest_tic_max) {
-    highest_tic_max <- this_tic_max
-    max_sample <- sub("_TIC\\..*$", "", basename(file))
-  }
-}
-
-# create a list with information for all TIC plots
-tic_plot_list <- list()
-plot_nr <-  0
-for (sample_nr in c(1:length(repl_pattern))) {
-  tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
-  sample_name <- names(repl_pattern)[sample_nr]
-  for (file_nr in 1:length(tech_reps)) {
-    plot_nr <- plot_nr + 1
-    # repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = ""))
-    repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt"))
-    bad_color_pos <- tech_reps[file_nr] %in% remove_pos
-    bad_color_neg <- tech_reps[file_nr] %in% remove_neg
-    if (bad_color_neg & bad_color_pos) {
-      plot_color <- "#F8766D"
-    } else if (bad_color_pos) {
-      plot_color <- "#ED8141"
-    } else if (bad_color_neg) {
-      plot_color <- "#BF80FF"
-    } else {
-      plot_color <- "white"
-    }
-    tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) +
-      geom_line(linewidth = 0.3) +
-      geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) +
-      geom_vline(xintercept = trim_left_pos, col = "red", linetype = 2, linewidth = 0.3) +
-      geom_vline(xintercept = trim_right_pos, col = "red", linetype = 2, linewidth = 0.3) +
-      geom_vline(xintercept = trim_left_neg, col = "red", linetype = 2, linewidth = 0.3) +
-      geom_vline(xintercept = trim_right_neg, col = "red", linetype = 2, linewidth = 0.3) +
-      labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], "  ||  ", sample_name)) +
-      theme(plot.background = element_rect(fill = plot_color),
-            axis.text = element_text(size = 4),
-            axis.title = element_text(size = 4),
-            plot.title = element_text(size = 6))
-    tic_plot_list[[plot_nr]] <- tic_plot
-  }
-}
-
-# create a layout matrix dependent on number of replicates
-layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE)
-# put TIC plots in matrix
-tic_plot_pdf <- marrangeGrob(
-  grobs = tic_plot_list,
-  nrow = 10, ncol = nr_replicates,
-  layout_matrix = layout,
-  top = quote(paste(
-    "TICs of run", run_name,
-    " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ",
-    g, "/", npages
-  ))
-)
-
-# save to file
-ggsave(filename = paste0(run_name, "_TICplots.pdf"),
-       tic_plot_pdf, width = 21, height = 29.7, units = "cm")
diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf
deleted file mode 100644
index f4b85b9a..00000000
--- a/DIMS/AverageTechReplicates.nf
+++ /dev/null
@@ -1,35 +0,0 @@
-process AverageTechReplicates {
-    tag "DIMS AverageTechReplicates"
-    label 'AverageTechReplicates'
-    container = 'docker://umcugenbioinf/dims:1.3'
-    shell = ['/bin/bash', '-euo', 'pipefail']
-
-    input:
-       path(rdata_file)
-       path(tic_txt_files)
-       path(init_file)
-       val(nr_replicates)
-       val(analysis_id)
-       val(matrix)
-       path(highest_mz_file)
-       path(breaks_file)
-
-    output:
-       path('*_repl_pattern.RData'), emit: pattern_files
-       path('*_avg.RData'), emit: binned_files
-       path('miss_infusions_negative.txt')
-       path('miss_infusions_positive.txt')
-       path('*_TICplots.pdf')
-
-    script:
-        """
-        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \
-                                                                      $params.nr_replicates \
-                                                                      $analysis_id \
-                                                                      $matrix \
-                                                                      $highest_mz_file \
-                                                                      $breaks_file
-        """
-}
-
-

From 6438e0e6cb0c24bf440b98b632b4d5abb46b9fb5 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:43:38 +0200
Subject: [PATCH 11/32] removed breaks as input for PeakFinding

---
 DIMS/PeakFinding.nf | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf
index dad2a497..04c82540 100644
--- a/DIMS/PeakFinding.nf
+++ b/DIMS/PeakFinding.nf
@@ -5,13 +5,14 @@ process PeakFinding {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       tuple(path(rdata_file), path(breaks_file))
+       path(rdata_file)
+       each path(sample_techreps)
 
     output:
-       path '*tive.RData'
+       path '*tive.RData', optional: true
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.preprocessing_scripts_dir
+        Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $params.resolution $params.preprocessing_scripts_dir
         """
 }

From 1ef197d7c52981cba19be936299c1a95c7492f3b Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:44:27 +0200
Subject: [PATCH 12/32] changed PeakFinding to new two-step method

---
 DIMS/PeakFinding.R | 66 +++++++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index c54a64cf..529370d4 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -2,57 +2,57 @@
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 replicate_rdatafile <- cmd_args[1]
-breaks_file <- cmd_args[2]
-resol <- as.numeric(cmd_args[3])
-preprocessing_scripts_dir <- cmd_args[4]
-peak_thresh <- 2000
+resol <- as.numeric(cmd_args[2])
+preprocessing_scripts_dir <- cmd_args[3]
+# use fixed theshold between noise and signal for peak
+peak_thresh <- 2000 
 
-# load in function scripts
+# source functions script
 source(paste0(preprocessing_scripts_dir, "peak_finding_functions.R"))
 
-load(breaks_file)
+library(dplyr)
 
-# Load output of AssignToBins for a sample
-sample_techrepl <- get(load(replicate_rdatafile))
+# Load output of AssignToBins (peak_list) for a technical replicate
+load(replicate_rdatafile)
 techrepl_name <- colnames(peak_list$pos)[1]
 
+# load list of technical replicates per sample that passed threshold filter
+techreps_passed <- read.table("replicates_per_sample.txt", sep=",")
+
 # Initialize
 options(digits = 16)
 
 # run the findPeaks function
 scanmodes <- c("positive", "negative")
 for (scanmode in scanmodes) {
-  # turn dataframe with intensities into a named list
+  # get intensities for scan mode
   if (scanmode == "positive") {
     ints_perscanmode <- peak_list$pos
   } else if (scanmode == "negative") {
     ints_perscanmode <- peak_list$neg
   }
-  
-  ints_fullrange <- as.vector(ints_perscanmode)
-  names(ints_fullrange) <- rownames(ints_perscanmode)
-  
+ 
+  # check whether technical replicate has passed threshold filter for this scanmode
+  techreps_scanmode <- techreps_passed[grep(scanmode, techreps_passed[, 3]), ]
+  # if techrep is ok, it will be found. If not, skip this techrep.
+  if (length(grep(techrepl_name, techreps_scanmode)) == 0) {
+    break
+  }
+
+  # put mz and intensities into dataframe
+  ints_fullrange <- as.data.frame(cbind(mz = as.numeric(rownames(ints_perscanmode)), 
+                                        int = as.numeric(ints_perscanmode)))
+
   # look for m/z range for all peaks
-  allpeaks_values <- search_mzrange(ints_fullrange, resol, techrepl_name, peak_thresh)
-  
-  # turn the list into a dataframe
-  outlist_techrep <- NULL
-  outlist_techrep <- cbind("samplenr" = allpeaks_values$nr,
-                             "mzmed.pkt" = allpeaks_values$mean,
-                             "fq" = allpeaks_values$qual,
-                             "mzmin.pkt" = allpeaks_values$min,
-                             "mzmax.pkt" = allpeaks_values$max,
-                             "height.pkt" = allpeaks_values$area)
-  
-  # remove peaks with height = 0
-  outlist_techrep <- outlist_techrep[outlist_techrep[, "height.pkt"] != 0, ]
-  
+  regions_of_interest <- search_regions_of_interest(ints_fullrange)
+ 
+  # fit Gaussian curve and calculate integrated area under curve
+  integrated_peak_df <- integrate_peaks(ints_fullrange, regions_of_interest, resol, peak_thresh)
+ 
+  # add sample name to dataframe
+  integrated_peak_df <- as.data.frame(cbind(samplenr = techrepl_name, integrated_peak_df))
+ 
   # save output to file
-  save(outlist_techrep, file = paste0(techrepl_name, "_", scanmode, ".RData"))
-  
-  # generate text output to log file on number of spikes for this sample
-  # spikes are peaks that are too narrow, e.g. 1 data point
-  cat(paste("There were", allpeaks_values$spikes, "spikes"))
-  
+  save(integrated_peak_df, file = paste0(techrepl_name, "_", scanmode, ".RData"))
 }
 

From 9272ec3f93884064b0949d06bcc14c523b65a5a8 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:46:17 +0200
Subject: [PATCH 13/32] functions for new two-step PeakFinding method

---
 DIMS/preprocessing/peak_finding_functions.R | 956 ++++----------------
 1 file changed, 194 insertions(+), 762 deletions(-)

diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R
index 1baef120..8c2f62d7 100644
--- a/DIMS/preprocessing/peak_finding_functions.R
+++ b/DIMS/preprocessing/peak_finding_functions.R
@@ -1,633 +1,204 @@
 # functions for peak finding
-search_mzrange <- function(ints_fullrange, resol, sample_name, peak_thresh) {
-  #' Divide the full m/z range into regions of interest with min, max and mean m/z
+
+search_regions_of_interest <- function(ints_fullrange) {
+  #' Divide the full m/z range into regions of interest with indices
   #'
-  #' @param ints_fullrange: Named list of intensities (float)
-  #' @param resol: Value for resolution (integer)
-  #' @param sample_name: Sample name (string)
-  #' @param peak_thresh: Value for noise level threshold (integer)
+  #' @param ints_fullrange: Matrix with m/z values and intensities (float)
   #'
-  #' @return allpeaks_values: list of m/z regions of interest
+  #' @return regions_of_interest: matrix of m/z regions of interest (integer)
 
-  # initialize list to store results for all peaks
-  allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL,
-                          "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0)
-  
   # find indices where intensity is not equal to zero
-  nonzero_positions <- as.vector(which(ints_fullrange != 0))
-
-  # initialize
-  # start position of the first peak
-  start_index <- nonzero_positions[1]
-  # maximum length of region of interest
-  max_roi_length <- 15
-
-  # find regions of interest
-  for (running_index in 1:length(nonzero_positions)) {
-    # find position of the end of a peak.
-    if (running_index < length(nonzero_positions) && (nonzero_positions[running_index + 1] - nonzero_positions[running_index]) > 1) {
-      end_index <- nonzero_positions[running_index]
-      # get m/z values and intensities for this region of interest
-      mass_vector <- as.numeric(names(ints_fullrange)[c(start_index:end_index)])
-      int_vector <- as.vector(ints_fullrange[c(start_index:end_index)])
-      # check if intensity is above threshold or the maximum intensity is NaN
-      if (max(int_vector) < peak_thresh || is.nan(max(int_vector))) {
-        # go to next region of interest
-        start_index <- nonzero_positions[running_index + 1]
-        next
-      }
-      # check if there are more intensities than maximum for region of interest
-      if (length(int_vector) > max_roi_length) {
-        print("vector of intensities is longer than max")
-        print(length(int_vector))
-        # trim lowest intensities to zero
-        #int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0
-        # split the range into multiple sub ranges
-        #sub_range <- int_vector
-        #names(sub_range) <- mass_vector
-        #allpeaks_values <- search_mzrange(sub_range, allpeaks_values, resol, 
-        #                                  sample_name, peak_thresh)
-      # A proper peak needs to have at least 3 intensities above threshold  
-      } else if (length(int_vector) > 3) {
-        # check if the sum of intensities is above zero. Why is this necessary?
-        #if (sum(int_vector) == 0) next
-       # define mass_diff as difference between last and first value of mass_vector
-        # mass_diff <- mass_vector[length(mass_vector)] - mass_vector[1]
-        # generate a second mass_vector with equally spaced m/z values
-        mass_vector_eq <- seq(mass_vector[1], mass_vector[length(mass_vector)],
-                            length = 10 * length(mass_vector))
-        
-        # Find the index in int_vector with the highest intensity
-        # max_index <- which(int_vector == max(int_vector))
-        # get initial fit values
-        roi_values <- fit_gaussian(mass_vector_eq, mass_vector, int_vector, 
-                                   resol, force_nr = length(max_index),
-                                   use_bounds = FALSE)
-        
-        if (roi_values$qual[1] == 1) {
-          # get optimized fit values
-          roi_values <- fit_optim(mass_vector, int_vector, resol)
-          # add region of interest to list of all peaks
-          allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
-          allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
-          allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
-          allpeaks_values$min <- c(allpeaks_values$min, roi_values$min)
-          allpeaks_values$max <- c(allpeaks_values$max, roi_values$max)
-          allpeaks_values$qual <- c(allpeaks_values$qual, 0)
-          allpeaks_values$spikes <- allpeaks_values$spikes + 1
-          
-        } else {
-          for (j in 1:length(roi_values$mean)){
-            allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean[j])
-            allpeaks_values$area <- c(allpeaks_values$area, roi_values$area[j])
-            allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
-            allpeaks_values$min <- c(allpeaks_values$min, roi_values$min[1])
-            allpeaks_values$max <- c(allpeaks_values$max, roi_values$max[1])
-            allpeaks_values$qual <- c(allpeaks_values$qual, roi_values$qual[1])
-          }
+  nonzero_positions <- as.vector(which(ints_fullrange$int != 0))
+
+  # find regions of interest (look for consecutive numbers)
+  regions_of_interest_consec <- seqToIntervals(nonzero_positions)
+  # add length of regions of interest
+  regions_of_interest_diff <- regions_of_interest_consec[, 2] - regions_of_interest_consec[, 1] + 1
+  regions_of_interest_length <- cbind(regions_of_interest_consec, length = regions_of_interest_diff)
+  # remove short lengths; a peak should have at least 3 data points
+  if (any(regions_of_interest_length[, "length"] < 3)) {
+    regions_of_interest_gte3 <- regions_of_interest_length[-which(regions_of_interest_length[, "length"] < 3), ]
+  } else {
+    regions_of_interest_gte3 <- regions_of_interest_length
+  }
+  # test for length of roi. If length is greater than 11, remove roi and break up into separate rois
+  remove_roi_index <- c()
+  new_rois_all <- regions_of_interest_gte3[0, ]
+  for (roi_nr in 1:nrow(regions_of_interest_gte3)) {
+    if (regions_of_interest_gte3[roi_nr, "length"] > 11) {
+      roi <- ints_fullrange[(regions_of_interest_gte3[roi_nr, "from"]:regions_of_interest_gte3[roi_nr, "to"]), ]
+      roi_intrange <- as.numeric(roi$int)
+      # look for local minima that separate the peaks
+      local_min_positions <- which(diff(sign(diff(roi_intrange))) == 2) + 1
+      if (length(local_min_positions) > 0) {
+        remove_roi_index <- c(remove_roi_index, roi_nr)
+        # find new indices for rois after splitting
+        start_pos <- regions_of_interest_gte3[roi_nr, "from"]
+        new_rois <- data.frame(from = 0, to = 0, length = 0)
+        new_rois_splitroi <- regions_of_interest_gte3[0, ]
+        for (local_min_index in 1:length(local_min_positions)) {
+          new_rois[, 1] <- start_pos
+          new_rois[, 2] <- start_pos + local_min_positions[local_min_index]
+          new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1
+          new_rois_splitroi <- rbind(new_rois_splitroi, new_rois)
+          start_pos <- new_rois[, 2]
         }
-        
+        # last part
+        new_rois[, 1] <- start_pos
+        new_rois[, 2] <- regions_of_interest_gte3[roi_nr, "to"]
+        new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1
+        new_rois_splitroi <- rbind(new_rois_splitroi, new_rois)
+        # append
+        new_rois_all <- rbind(new_rois_all, new_rois_splitroi)
       } else {
-        
-        roi_values <- fit_optim(mass_vector, int_vector, resol)
-        allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
-        allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
-        allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
-        allpeaks_values$min <- c(allpeaks_values$min, roi_values$min)
-        allpeaks_values$max <- c(allpeaks_values$max, roi_values$max)
-        allpeaks_values$qual <- c(allpeaks_values$qual, 0)
-        allpeaks_values$spikes <- allpeaks_values$spikes + 1
+        # if there are no local minima, all intensities belong to one peak.
+        remove_roi_index <- c(remove_roi_index, roi_nr)
+        # look for maximum and take 5 intensities to the left and right
+        max_index <- which(roi_intrange == max(roi_intrange))
+        max_pos <- as.numeric(rownames(roi)[max_index])
+        new_roi <- data.frame(from = 0, to = 0, length = 0)
+        new_roi[, 1] <- max_pos - 5
+        new_roi[, 2] <- max_pos + 5
+        new_roi[, 3] <- new_roi[, 2] - new_roi[, 1] + 1
+        # append
+        new_rois_all <- rbind(new_rois_all, new_roi)
       }
     }
-    start_index <- nonzero_positions[running_index + 1]
   }
 
-  return(allpeaks_values)
-}
-
-fit_gaussian <- function(mass_vector_eq, mass_vector, int_vector,  
-                         resol, force_nr, use_bounds) {
-  #' Fit 1 or 2 Gaussian peaks in small region of m/z
-  #'
-  #' @param mass_vector_eq: Vector of equally spaced m/z values (float)
-  #' @param mass_vector: Vector of m/z values for a region of interest (float)
-  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
-  #' @param resol: Value for resolution (integer)
-  #' @param force_nr: Number of local maxima in int_vector (integer)
-  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
-  #'
-  #' @return roi_value_list: list of fit values for region of interest (list)
-  
-  # Find the index in int_vector with the highest intensity
-  max_index <- which(int_vector == max(int_vector))
-  
-  # Initialise
-  peak_mean <- NULL
-  peak_area <- NULL
-  peak_qual <- NULL
-  peak_min <- NULL
-  peak_max <- NULL
-  fit_quality1 <- 0.15
-  fit_quality <- 0.2
-  # set initial value for scale factor
-  scale_factor <- 2
-  
-  # One local maximum:
-  if (force_nr == 1) {
-    # determine fit values for 1 Gaussian peak 
-    fit_values <- fit_1peak(mass_vector_eq, mass_vector, int_vector, max_index, resol,
-                            fit_quality1, use_bounds)
-    
-    # test if the mean is outside the m/z range
-    if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) {
-      # run this function again with fixed boundaries
-      return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, resol,
-                          force_nr = 1, use_bounds = TRUE))
-    } else {
-      # test if the fit is bad
-      if (fit_values$qual > fit_quality1) {
-        # Try to fit two curves; find two local maxima. NB: max_index (now new_index) removed from fit_gaussian
-        new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
-        # test if there are two indices in new_index
-        if (length(new_index) != 2) {
-          new_index <- round(length(mass_vector) / 3)
-          new_index <- c(new_index, 2 * new_index)
-        }
-        # run this function again with two local maxima
-        return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, 
-                            resol, force_nr = 2, use_bounds = FALSE))
-        # good fit
-      } else {
-        peak_mean <- c(peak_mean, fit_values$mean)
-        peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale_factor,
-                                                fit_values$sigma))
-        peak_qual <- fit_values$qual
-        peak_min <- mass_vector[1]
-        peak_max <- mass_vector[length(mass_vector)]
-      }
-    }
-    
-    #### Two local maxima; need at least 6 data points for this ####
-  } else if (force_nr == 2 && (length(mass_vector) > 6)) {
-    # determine fit values for 2 Gaussian peaks
-    fit_values <- fit_2peaks(mass_vector_eq, mass_vector, int_vector, max_index, resol,
-                             use_bounds, fit_quality)
-    # test if one of the means is outside the m/z range
-    if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
-        fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) {
-      # check if fit quality is bad
-      if (fit_values$qual > fit_quality) {
-        # run this function again with fixed boundaries
-        return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, resol,
-                            force_nr = 2, use_bounds = TRUE))
-      } else {
-        # check which mean is outside range and remove it from the list of means
-        # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
-        for (i in 1:length(fit_values$mean)){
-          if (fit_values$mean[i] < mass_vector[1] || fit_values$mean[i] > mass_vector[length(mass_vector)]) {
-            peak_mean <- c(peak_mean, -i)
-            peak_area <- c(peak_area, -i)
-          } else {
-            peak_mean <- c(peak_mean, fit_values$mean[i])
-            peak_area <- c(peak_area, fit_values$area[i])
-          }
-        }
-        peak_qual <- fit_values$qual
-        peak_min <- mass_vector[1]
-        peak_max <- mass_vector[length(mass_vector)]
-      }
-      # if all means are within range
-    } else {
-      # check for bad fit
-      if (fit_values$qual > fit_quality) {
-        # Try to fit three curves; find three local maxima
-        new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
-        # test if there are three indices in new_index
-        if (length(new_index) != 3) {
-          new_index <- round(length(mass_vector) / 4)
-          new_index <- c(new_index, 2 * new_index, 3 * new_index)
-        }
-        # run this function again with three local maxima
-        return(fit_gaussian(mass_vector_eq, mass_vector, int_vector, 
-                            resol, force_nr = 3, use_bounds = FALSE))
-      # good fit, all means are within m/z range
-      } else {
-        # check if means are within 3 ppm and sum if so
-        tmp <- fit_values$qual
-        nr_means_new <- -1
-        nr_means <- length(fit_values$mean)
-        while (nr_means != nr_means_new) {
-          nr_means <- length(fit_values$mean)
-          fit_values <- within_ppm(fit_values$mean, fit_values$scale_factor, fit_values$sigma, fit_values$area, 
-                                   mass_vector_eq, mass_vector, ppm = 4, resol)
-          nr_means_new <- length(fit_values$mean)
-        }
-        # restore original quality score
-        fit_values$qual <- tmp
-        
-        for (i in 1:length(fit_values$mean)){
-          peak_mean <- c(peak_mean, fit_values$mean[i])
-          peak_area <- c(peak_area, fit_values$area[i])
-        }
-        peak_qual <- fit_values$qual
-        peak_min <- mass_vector[1]
-        peak_max <- mass_vector[length(mass_vector)]
-      }
-    }
-    
-  } else { # More than two local maxima; fit 1 peak.
-    fit_quality1 <- 0.40
-    use_bounds <- TRUE
-    max_index <- which(int_vector == max(int_vector))
-    fit_values <- fit_1peak(mass_vector_eq, mass_vector, int_vector, max_index, resol,
-                            fit_quality1, use_bounds)
-    # check for bad fit
-    if (fit_values$qual > fit_quality1) {
-      # get fit values from fit_optim
-      fit_values <- fit_optim(mass_vector, int_vector, resol)
-      peak_mean <- c(peak_mean, fit_values$mean)
-      peak_area <- c(peak_area, fit_values$area)
-      peak_min <- fit_values$min
-      peak_max <- fit_values$max
-      peak_qual <- 0
-    } else {
-      peak_mean <- c(peak_mean, fit_values$mean)
-      peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale_factor, fit_values$sigma))
-      peak_qual <- fit_values$qual
-      peak_min <- mass_vector[1]
-      peak_max <- mass_vector[length(mass_vector)]
-    }
+  # remove rois that have been split into chunks or shortened
+  #print(dim(regions_of_interest_gte3))
+  if (length(remove_roi_index) > 0) {
+    regions_of_interest_minus_short <- regions_of_interest_gte3[-remove_roi_index, ]
+  } else {
+    regions_of_interest_minus_short <- regions_of_interest_gte3
+  }
+  #print(dim(regions_of_interest_minus_short))
+  # combine remaining rois with info on chunks
+  regions_of_interest_split <- rbind(regions_of_interest_minus_short, new_rois_all)
+  # remove regions of interest with short lengths again
+  if (any(regions_of_interest_split[, "length"] < 3)) {
+    regions_of_interest_final <- regions_of_interest_split[-which(regions_of_interest_split[, "length"] < 3), ]
+  } else {
+    regions_of_interest_final <- regions_of_interest_split
   }
-  
-  # put all values for this region of interest into a list
-  roi_value_list <- list("mean" = peak_mean,
-                         "area" = peak_area,
-                         "qual" = peak_qual,
-                         "min" = peak_min,
-                         "max" = peak_max)
-  return(roi_value_list)
-}
-
 
-fit_1peak <- function(mass_vector_eq, mass_vector, int_vector, max_index, 
-                      resol, fit_quality, use_bounds) {
-  #' Fit 1 Gaussian peak in small region of m/z
-  #'
-  #' @param mass_vector_eq: Vector of equally spaced m/z values (float)
-  #' @param mass_vector: Vector of m/z values for a region of interest (float)
-  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
-  #' @param max_index: Index in int_vector with the highest intensity (integer)
-  #' @param resol: Value for resolution (integer)
-  #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float)
-  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
-  #'
-  #' @return roi_value_list: list of fit values for region of interest (list)
-  
-  # set initial value for scale_factor
-  scale_factor <- 2
-  
-  if (length(int_vector) < 3) {
-    message("Range too small, no fit possible!")
+  # sort on first index
+  if (nrow(regions_of_interest_final) > 1){
+    regions_of_interest_sorted <- regions_of_interest_final %>% dplyr::arrange(from)
   } else {
-    if (length(int_vector) == 4) {
-      # fit 1 peak
-      weighted_mu <- weighted.mean(mass_vector, int_vector)
-      sigma <- get_stdev(mass_vector, int_vector)
-      fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
-    } else {
-      # set range vector
-      if ((length(mass_vector) - length(max_index)) < 2) {
-        range1 <- c((length(mass_vector) - 4) : length(mass_vector))
-      } else if (length(max_index) < 2) {
-        range1 <- c(1:5)
-      } else {
-        range1 <- seq(from = (max_index[1] - 2), to = (max_index[1] + 2))
-      }
-      # remove zero at the beginning of range1
-      if (range1[1] == 0) {
-        range1 <- range1[-1]
-      }
-      # remove NA
-      if (length(which(is.na(int_vector[range1]))) != 0) {
-        range1 <- range1[-which(is.na(int_vector[range1]))]
-      }
-      # fit 1 peak
-      weighted_mu <- weighted.mean(mass_vector[range1], int_vector[range1])
-      sigma <- get_stdev(mass_vector[range1], int_vector[range1])
-      fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
-    }
-    
-    fitted_mz <- fitted_peak$par[1]
-    fitted_nr <- fitted_peak$par[2]
-    
-    # get new value for fit quality and scale_factor
-    fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new
-    scale_factor_new <- 1.2 * scale_factor
-    
-    # bad fit
-    if (fq_new > fit_quality) {
-      # optimize scaling factor
-      fq <- 0
-      scale_factor <- 0
-      if (sum(int_vector) > sum(fitted_nr * dnorm(mass_vector, fitted_mz, sigma))) {
-        # increase scale_factor until convergence
-        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_factor_new < 10000)) {
-          fq <- fq_new
-          scale_factor <- scale_factor_new
-          # fit 1 peak
-          fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
-          fitted_mz <- fitted_peak$par[1]
-          fitted_nr <- fitted_peak$par[2]
-          # get new value for fit quality and scale_factor
-          fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new
-          scale_factor_new <- 1.2 * scale_factor
-        }
-      } else {
-        # decrease scale_factor until convergence
-        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_factor_new < 10000)) {
-          fq <- fq_new
-          scale_factor <- scale_factor_new
-          # fit 1 peak
-          fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
-          fitted_mz <- fitted_peak$par[1]
-          fitted_nr <- fitted_peak$par[2]
-          # get new value for fit quality and scale_factor
-          fq_new <- get_fit_quality(mass_vector, int_vector, fitted_mz, resol, fitted_nr, sigma)$fq_new
-          scale_factor_new <- 0.8 * scale_factor
-        }
-      }
-      # use optimized scale_factor factor to fit 1 peak
-      if (fq < fq_new) {
-        fitted_peak <- optimize_1gaussian(mass_vector, int_vector, sigma, weighted_mu, scale_factor, use_bounds)
-        fitted_mz <- fitted_peak$par[1]
-        fitted_nr <- fitted_peak$par[2]
-        fq_new <- fq
-      }
-    }
+    regions_of_interest_sorted <- regions_of_interest_final
   }
-  
-  roi_value_list <- list("mean" = fitted_mz, "scale_factor" = fitted_nr, "sigma" = sigma, "qual" = fq_new)
-  return(roi_value_list)
+
+  return(regions_of_interest_sorted)
 }
 
-fit_2peaks <- function(mass_vector_eq, mass_vector, int_vector, max_index, resol, use_bounds = FALSE,
-                       fit_quality) {
-  #' Fit 2 Gaussian peaks in a small region of m/z
+integrate_peaks <- function(ints_fullrange, regions_of_interest, resol, peak_thresh) {
+  #' Fit Gaussian peak for each region of interest and integrate area under the curve
   #'
-  #' @param mass_vector_eq: Vector of equally spaced m/z values (float)
-  #' @param mass_vector: Vector of m/z values for a region of interest (float)
-  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
-  #' @param max_index: Index in int_vector with the highest intensity (integer)
+  #' @param ints_fullrange: Named list of intensities (float)
+  #' @param regions_of_interest: Named list of intensities (float)
   #' @param resol: Value for resolution (integer)
-  #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float)
-  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
-  #'
-  #' @return roi_value_list: list of fit values for region of interest (list)
-  
-  peak_mean <- NULL
-  peak_area <- NULL
-  peak_scale <- NULL
-  peak_sigma <- NULL
-  
-  # set range vectors for first peak
-  range1 <- seq(from = (max_index[1] - 2), to = (max_index[1] + 2))
-  # remove zero at the beginning of range1
-  if (range1[1] == 0) {
-    range1 <- range1[-1]
-  }
-  # set range vectors for second peak
-  range2 <- seq(from = (max_index[2] - 2), to = (max_index[2] + 2))
-  # if range2 ends outside mass_vector, shorten it
-  if (length(mass_vector) < range2[length(range2)]) {
-    range2 <- range2[-length(range2)]
-  }
-  # check whether the two ranges overlap
-  range1 <- check_overlap(range1, range2)[[1]]
-  range2 <- check_overlap(range1, range2)[[2]]
-  # check for negative or 0 values in range1 or range2
-  remove <- which(range1 < 1)
-  if (length(remove) > 0) {
-    range1 <- range1[-remove]
-  }
-  remove <- which(range2 < 1)
-  if (length(remove) > 0) {
-    range2 <- range2[-remove]
-  }
-  # remove NA
-  if (length(which(is.na(int_vector[range1]))) != 0) {
-    range1 <- range1[-which(is.na(int_vector[range1]))]
-  }
-  if (length(which(is.na(int_vector[range2]))) != 0) {
-    range2 <- range2[-which(is.na(int_vector[range2]))]
-  }
-  
-  # fit 2 peaks, first separately, then together
-  weighted_mu1 <- weighted.mean(mass_vector[range1], int_vector[range1])
-  sigma1 <- get_stdev(mass_vector[range1], int_vector[range1])
-  fitted_peak1 <- optimize_1gaussian(mass_vector[range1], int_vector[range1], sigma1, weighted_mu1, scale_factor, use_bounds)
-  fitted_mz1 <- fitted_peak1$par[1]
-  fitted_nr1 <- fitted_peak1$par[2]
-  # second peak
-  weighted_mu2 <- weighted.mean(mass_vector[range2], int_vector[range2])
-  sigma2 <- get_stdev(mass_vector[range2], int_vector[range2])
-  fitted_peak2 <- optimize_1gaussian(mass_vector[range2], int_vector[range2], sigma2, weighted_mu2, scale_factor, use_bounds)
-  fitted_mz2 <- fitted_peak2$par[1]
-  fitted_nr2 <- fitted_peak2$par[2]
-  # combined
-  fitted_2peaks <- optimize_2gaussians(mass_vector, int_vector, sigma1, sigma2, 
-                                       fitted_mz1, fitted_nr1, 
-                                       fitted_mz2, fitted_nr2, use_bounds)
-  fitted_2peaks_mz1 <- fitted_2peaks$par[1]
-  fitted_2peaks_nr1 <- fitted_2peaks$par[2]
-  fitted_2peaks_mz2 <- fitted_2peaks$par[3]
-  fitted_2peaks_nr2 <- fitted_2peaks$par[4]
-  
-  # get fit quality
-  if (is.null(sigma2)) {
-    sigma2 <- sigma1
+  #' @param peak_thresh: Value for noise level threshold (integer) # NOT USED YET
+  #'
+  #' @return allpeaks_values: matrix of integrated peaks
+
+  # initialize dataframe to store results for all peaks
+  allpeaks_values <- matrix(0, nrow = nrow(regions_of_interest), ncol = 5)
+  colnames(allpeaks_values) <- c("mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
+
+  for (roi_nr in 1:nrow(regions_of_interest)) {
+    # find m/z values and intensities corresponding to the region of interest
+    index_range <- regions_of_interest[roi_nr, "from"] : regions_of_interest[roi_nr, "to"]
+    roi_mzrange <- as.numeric(ints_fullrange$mz[index_range])
+    roi_intrange <- as.numeric(ints_fullrange$int[index_range])
+    # find m/z value for maximum of peak (mu in Gaussian function)
+    weighted_mzmean <- weighted.mean(roi_mzrange, roi_intrange)
+    # find expected peak width at this m/z value
+    fwhm_mzmed <- get_fwhm(weighted_mzmean, resol)
+    # calculate sigma for Gaussian curve (https://en.wikipedia.org/wiki/Full_width_at_half_maximum)
+    sigma_peak <- fwhm_mzmed / 2.355
+    # find scale factor for Gaussian. Initial estimate is maximum intensity in roi
+    # divided by 2 for better correlation with intensities from old PeakFinding method
+    scale_factor <- max(roi_intrange) / 2
+    # fit Gaussian peak. Find intensities according to normal distribution
+    normal_ints <- scale_factor * gaussfunc(roi_mzrange, weighted_mzmean, sigma_peak)
+    # sum intensities in roi
+    sum_ints <- sum(roi_intrange)
+    sum_gauss <- sum(normal_ints)
+    # calculate quality of fit
+    quality_fit <- 1 - sum(abs(normal_ints - roi_intrange)) / sum_ints
+    # put all values into dataframe
+    allpeaks_values[roi_nr, "mzmed.pkt"] <- weighted_mzmean
+    allpeaks_values[roi_nr, "fq"] <- quality_fit
+    allpeaks_values[roi_nr, "mzmax.pkt"] <- max(roi_mzrange)
+    allpeaks_values[roi_nr, "mzmin.pkt"] <- min(roi_mzrange)
+    allpeaks_values[roi_nr, "height.pkt"] <- sum_gauss
+  }
+
+  # remove peaks with height = 0, look for NA or NaN
+  remove_na <- which(is.na(as.numeric(allpeaks_values[, "height.pkt"])))
+  if (length(remove_na) > 0) {
+    allpeaks_values <- allpeaks_values[-remove_na, ]
   }
-  sum_fit <- (fitted_2peaks_nr1 * dnorm(mass_vector, fitted_2peaks_mz1, sigma1)) +
-    (fitted_2peaks_nr2 * dnorm(mass_vector, fitted_2peaks_mz2, sigma2))
-  lowest_mz <- sort(c(fitted_2peaks_mz1, fitted_2peaks_mz2))[1]
-  fq_new <- get_fit_quality(mass_vector, int_vector, lowest_mz,
-                        resol, sum_fit = sum_fit)$fq_new
-  
-  # get parameter values
-  area1 <- estimate_area(fitted_2peaks_mz1, resol, fitted_2peaks_nr1, sigma1)
-  area2 <- estimate_area(fitted_2peaks_mz2, resol, fitted_2peaks_nr2, sigma2)
-  peak_area <- c(peak_area, area1, area2)
-  peak_mean <- c(peak_mean, fitted_2peaks_mz1, fitted_2peaks_mz2)
-  peak_scale <- c(peak_scale, fitted_2peaks_nr1, fitted_2peaks_nr2)
-  peak_sigma <- c(peak_sigma, sigma1, sigma2)
-  
-  roi_value_list <- list("mean" = peak_mean, "scale_factor" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq_new)
-  return(roi_value_list)
-}
 
-optimize_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, scale_factor, use_bounds) {
-  #' Fit a Gaussian curve for a peak with given parameters
-  #'
-  #' @param mass_vector: Vector of masses (float)
-  #' @param int_vector: Vector of intensities (float)
-  #' @param sigma: Value for width of the peak (float)
-  #' @param query_mass: Value for mass at center of peak (float)
-  #' @param scale_factor: Value for scaling intensities (float)
-  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
-  #'
-  #' @return opt_fit: list of parameters and values describing the optimal fit
-  
-  # define optimization function for optim based on normal distribution
-  opt_f <- function(params) {
-    d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma)
-    sum((d - int_vector) ^ 2)
-  }
-  if (use_bounds) {
-    # determine lower and upper boundaries
-    lower <- c(mass_vector[1], 0, mass_vector[1], 0)
-    upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf)
-    # get optimal value for fitted Gaussian curve
-    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale_factor)),
-                     opt_f, control = list(maxit = 10000), method = "L-BFGS-B",
-                     lower = lower, upper = upper)
-  } else {
-    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale_factor)),
-                     opt_f, control = list(maxit = 10000))
-  }
-  return(opt_fit)
+  return(allpeaks_values)
 }
 
-optimize_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2,
-                           query_mass1, scale_factor1,
-                           query_mass2, scale_factor2, use_bounds) {
-  #' Fit two Gaussian curves for a peak with given parameters
-  #'
-  #' @param mass_vector: Vector of masses (float)
-  #' @param int_vector: Vector of intensities (float)
-  #' @param sigma1: Value for width of the first peak (float)
-  #' @param sigma2: Value for width of the second peak (float)
-  #' @param query_mass1: Value for mass at center of first peak (float)
-  #' @param scale_factor1: Value for scaling intensities for first peak (float)
-  #' @param query_mass2: Value for mass at center of second peak (float)
-  #' @param scale_factor2: Value for scaling intensities for second peak (float)
-  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
-  #'
-  #' @return opt_fit: list of parameters and values describing the optimal fit
-  
-  # define optimization function for optim based on normal distribution
-  opt_f <- function(params) {
-    d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) +
-      params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2)
-    sum((d - int_vector) ^ 2)
-  }
-  
-  if (use_bounds) {
-    # determine lower and upper boundaries
-    lower <- c(mass_vector[1], 0, mass_vector[1], 0)
-    upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf)
-    # get optimal value for 2 fitted Gaussian curves
-    if (is.null(query_mass2) && is.null(scale_factor2) && is.null(sigma2)) {
-      sigma2 <- sigma1
-      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1),
-                         as.numeric(query_mass1), as.numeric(scale_factor1)),
-                       opt_f, control = list(maxit = 10000),
-                       method = "L-BFGS-B", lower = lower, upper = upper)
-    } else {
-      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1),
-                         as.numeric(query_mass2), as.numeric(scale_factor2)),
-                       opt_f, control = list(maxit = 10000),
-                       method = "L-BFGS-B", lower = lower, upper = upper)
-    }
-  } else {
-    if (is.null(query_mass2) && is.null(scale_factor2) && is.null(sigma2)) {
-      sigma2 <- sigma1
-      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1),
-                         as.numeric(query_mass1), as.numeric(scale_factor1)),
-                       opt_f, control = list(maxit = 10000))
-    } else {
-      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale_factor1),
-                         as.numeric(query_mass2), as.numeric(scale_factor2)),
-                       opt_f, control = list(maxit = 10000))
+# in the next version of the docker image, package R.utils will be included
+seqToIntervals <- function(idx) {
+  #' Find consecutive stretches of numbers
+  #' function seqToIntervals copied from R.utils library
+  #' see https://rdrr.io/cran/R.utils/src/R/seqToIntervals.R
+  #'
+  #' @param idx: Sequence of indices (integers)
+  #'
+  #' @return res: Matrix of start and end positions of consecutive numbers (matrix)
+
+  # Clean up sequence
+  idx <- as.integer(idx)
+  idx <- unique(idx)
+  idx <- sort(idx)
+
+  len_idx <- length(idx)
+  if (len_idx == 0L) {
+    res <- matrix(NA_integer_, nrow = 0L, ncol = 2L)
+    colnames(res) <- c("from", "to")
+    return(res)
+  }
+
+  # Identify end points of intervals
+  diff_idx <- diff(idx)
+  diff_idx <- (diff_idx > 1)
+  diff_idx <- which(diff_idx)
+  nr_intervals <- length(diff_idx) + 1
+
+  # Allocate return matrix
+  res <- matrix(NA_integer_, nrow = nr_intervals, ncol = 2L)
+  colnames(res) <- c("from", "to")
+
+  from_value <- idx[1]
+  to_value <- from_value - 1
+  last_value <- from_value
+
+  count <- 1
+  for (running_index in seq_along(idx)) {
+    value <- idx[running_index]
+    if (value - last_value > 1) {
+      to_value <- last_value
+      res[count, ] <- c(from_value, to_value)
+      from_value <- value
+      count <- count + 1
     }
+    last_value <- value
   }
-  return(opt_fit)
-}
 
-get_stdev <- function(mass_vector, int_vector, resol = 140000) {
-  #' Calculate standard deviation to determine width of a peak
-  #'
-  #' @param mass_vector: Vector of 3 mass values (float)
-  #' @param int_vector: Vector of 3 intensities (float)
-  #' @param resol: Value for resolution (integer)
-  #'
-  #' @return stdev: Value for standard deviation
-  # find maximum intensity in vector
-  max_index <- which(int_vector == max(int_vector))
-  # find corresponding mass at maximum intensity
-  max_mass <- mass_vector[max_index]
-  # calculate resolution at given m/z value
-  resol_mz <- resol * (1 / sqrt(2) ^ (log2(max_mass / 200)))
-  # calculate full width at half maximum
-  fwhm <- max_mass / resol_mz
-  # calculate standard deviation
-  stdev <- (fwhm / 2) * 0.85
-  return(stdev)
-}
-
-get_fit_quality <- function(mass_vector, int_vector, mu_first, resol, scale_factor = NULL, sigma = NULL, sum_fit = NULL) {
-  #' Get fit quality for 1 Gaussian peak in small region of m/z
-  #'
-  #' @param mass_vector: Vector of m/z values for a region of interest (float)
-  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
-  #' @param mu_first: Value for first peak (float)
-  #' @param scale_factor: Initial value used to estimate scaling parameter (integer)
-  #' @param resol: Value for resolution (integer)
-  #' @param sum_fit: Value indicating quality of fit of Gaussian curve (float)
-  #'
-  #' @return list_params: list of parameters indicating quality of fit (list)
-  if (is.null(sum_fit)) {
-    mass_vector_int <- mass_vector
-    int_vector_int <- int_vector
-    # get new fit quality
-    fq_new <- mean(abs((scale_factor * dnorm(mass_vector_int, mu_first, sigma)) - int_vector_int) / 
-                     rep((max(scale_factor * dnorm(mass_vector_int, mu_first, sigma)) / 2), length(mass_vector_int)))
-  } else {
-    sum_fit_int <- sum_fit
-    int_vector_int <- int_vector
-    mass_vector_int <- mass_vector
-    # get new fit quality
-    fq_new <- mean(abs(sum_fit_int - int_vector_int) / rep(max(sum_fit_int) /2, length(sum_fit_int)))
+  if (to_value < from_value) {
+    to_value <- last_value
+    res[count, ] <- c(from_value, to_value)
   }
-  
-  # Prevent division by 0
-  if (is.nan(fq_new)) fq_new <- 1 
-  
-  list_params <- list("fq_new" = fq_new, "x_int" = mass_vector_int, "y_int" = int_vector_int)
-  return(list_params)
-}
 
-estimate_area <- function(mass_max, resol, scale_factor, sigma) {
-  #' Estimate area of Gaussian curve
-  #'
-  #' @param mass_max: Value for m/z at maximum intensity of a peak (float)
-  #' @param resol: Value for resolution (integer)
-  #' @param scale_factor: Value for peak width (float)
-  #' @param sigma: Value for standard deviation (float)
-  #'
-  #' @return area_curve: Value for area under the Gaussian curve (float)
-  
-  # calculate width of peak at half maximum
-  fwhm <- get_fwhm(mass_max, resol)
-  
-  # generate a mass_vector with equally spaced m/z values
-  mz_min <- mass_max - 2 * fwhm
-  mz_max <- mass_max + 2 * fwhm
-  mz_range <- mz_max - mz_min 
-  mass_vector_eq <- seq(mz_min, mz_max, length = 100)
-  
-  # estimate area under the curve
-  area_curve <- sum(scale_factor * dnorm(mass_vector_eq, mass_max, sigma)) / 100
-  
-  return(area_curve)  
+  return(res)
 }
 
 get_fwhm <- function(query_mass, resol) {
@@ -641,7 +212,7 @@ get_fwhm <- function(query_mass, resol) {
   # set aberrant values of query_mass to default value of 200
   if (is.na(query_mass)) {
     query_mass <- 200
-  } 
+  }
   if (query_mass < 0) {
     query_mass <- 200
   }
@@ -649,168 +220,29 @@ get_fwhm <- function(query_mass, resol) {
   resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200)))
   # calculate full width at half maximum
   fwhm <- query_mass / resol_mz
-  
-  return(fwhm)
-}
-
-fit_optim <- function(mass_vector, int_vector, resol) {
-  #' Determine optimized fit of Gaussian curve to small region of m/z
-  #'
-  #' @param mass_vector: Vector of m/z values for a region of interest (float)
-  #' @param int_vector: Vector of intensities for a region of interest (float)
-  #' @param resol: Value for resolution (integer)
-  #'
-  #' @return roi_value_list: list of fit values for region of interest (list)
-  
-  # initial value for scale_factor
-  scale_factor <- 1.5
-
-  # Find the index in int_vector with the highest intensity
-  max_index <- which(int_vector == max(int_vector))[1]
-  mass_max <- mass_vector[max_index]
-  int_max <- int_vector[max_index]
-  # get peak width
-  fwhm <- get_fwhm(mass_max, resol)
-  # simplify the peak shape: represent it by a triangle
-  mass_max_simple <- c(mass_max - scale_factor * fwhm, mass_max, mass_max + scale_factor * fwhm)
-  int_max_simple <- c(0, int_max, 0)
-  
-  # define mass_diff as difference between last and first value of mass_max_simple
-  # mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1]
-  # generate a second mass_vector with equally spaced m/z values
-  mass_vector_eq <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], 
-                      length = 100 * length(mass_max_simple))
-  sigma <- get_stdev(mass_vector_eq, int_max_simple)
-  # define optimization function for optim based on normal distribution
-  opt_f <- function(p, mass_vector, int_vector, sigma, mass_max) {
-    curve <- p * dnorm(mass_vector, mass_max, sigma)
-    return((max(curve) - max(int_vector))^2)
-  }
-  
-  # get optimal value for fitted Gaussian curve
-  opt_fit <- optimize(opt_f, c(0, 100000), tol = 0.0001, mass_vector, int_vector, sigma, mass_max)
-  scale_factor <- opt_fit$minimum
-  
-  # get an estimate of the area under the peak
-  area <- estimate_area(mass_max, resol, scale_factor, sigma)
-  # put all values for this region of interest into a list
-  roi_value_list <- list("mean" = mass_max,
-                         "area" = area,
-                         "min" = mass_vector_eq[1],
-                         "max" = mass_vector_eq[length(mass_vector_eq)])
-  return(roi_value_list)
-}
 
-within_ppm <- function(mean, scale_factor, sigma, area, mass_vector_eq, mass_vector, ppm = 4, resol) {
-  #' Test whether two mass ranges are within ppm distance of each other
-  #'
-  #' @param mean: Value for mean m/z (float)
-  #' @param scale_factor: Initial value used to estimate scaling parameter (integer)
-  #' @param sigma: Value for standard deviation (float)
-  #' @param area: Value for area under the curve (float)
-  #' @param mass_vector_eq: Vector of equally spaced m/z values (float)
-  #' @param mass_vector: Vector of m/z values for a region of interest (float)
-  #' @param ppm: Value for distance between two values of mass (integer)
-  #' @param resol: Value for resolution (integer)
-  #'
-  #' @return list_params: list of parameters indicating quality of fit (list)
-  
-  # sort
-  index <- order(mean)
-  mean <- mean[index]
-  scale_factor <- scale_factor[index]
-  sigma <- sigma[index]
-  area <- area[index]
-  
-  summed <- NULL
-  remove <- NULL
-  
-  if (length(mean) > 1) {
-    for (i in 2:length(mean)) {
-      if ((abs(mean[i - 1] - mean[i]) / mean[i - 1]) * 10^6 < ppm) {
-        
-        # avoid double occurrance in sum
-        if ((i - 1) %in% summed) next
-        
-        result_values <- sum_curves(mean[i - 1], mean[i], scale_factor[i - 1], scale_factor[i], sigma[i - 1], sigma[i],
-                                    mass_vector_eq, mass_vector, resol)
-        summed <- c(summed, i - 1, i)
-        if (is.nan(result_values$mean)) result_values$mean <- 0
-        mean[i - 1] <- result_values$mean
-        mean[i] <- result_values$mean
-        area[i - 1] <- result_values$area
-        area[i] <- result_values$area
-        scale_factor[i - 1] <- result_values$scale_factor
-        scale_factor[i] <- result_values$scale_factor
-        sigma[i - 1] <- result_values$sigma
-        sigma[i] <- result_values$sigma
-        
-        remove <- c(remove, i)
-      }
-    }
-  }
-  
-  if (length(remove) != 0) {
-    mean <- mean[-c(remove)]
-    area <- area[-c(remove)]
-    scale_factor <- scale_factor[-c(remove)]
-    sigma <- sigma[-c(remove)]
-  }
-  
-  list_params <- list("mean" = mean, "area" = area, "scale_factor" = scale_factor, "sigma" = sigma, "qual" = NULL)
-  return(list_params)
+  return(fwhm)
 }
 
-sum_curves <- function(mean1, mean2, scale_factor1, scale_factor2, sigma1, sigma2, mass_vector_eq, mass_vector, resol) {
-  #' Sum two curves
-  #'
-  #' @param mean1: Value for mean m/z of first peak (float)
-  #' @param mean2: Value for mean m/z of second peak (float)
-  #' @param scale_factor1: Initial value used to estimate scaling parameter for first peak (integer)
-  #' @param scale_factor2: Initial value used to estimate scaling parameter for second peak (integer)
-  #' @param sigma1: Value for standard deviation for first peak (float)
-  #' @param sigma2: Value for standard deviation for second peak (float)
-  #' @param mass_vector_eq: Vector of equally spaced m/z values (float)
-  #' @param mass_vector: Vector of m/z values for a region of interest (float)
-  #' @param resol: Value for resolution (integer)
-  #'
-  #' @return list_params: list of parameters indicating quality of fit (list)
-  
-  sum_fit <- (scale_factor1 * dnorm(mass_vector_eq, mean1, sigma1)) + (scale_factor2 * dnorm(mass_vector_eq, mean2, sigma2))
-  
-  mean1_plus2 <- weighted.mean(c(mean1, mean2), c(max(scale_factor1 * dnorm(mass_vector_eq, mean1, sigma1)),
-                                                  max(scale_factor2 * dnorm(mass_vector_eq, mean2, sigma2))))
-  
-  # get new values for parameters
-  fwhm <- get_fwhm(mean1_plus2, resol)
-  area <- max(sum_fit)
-  scale_factor <- scale_factor1 + scale_factor2
-  sigma <- (fwhm / 2) * 0.85
-  
-  list_params <- list("mean" = mean1_plus2, "area" = area, "scale_factor" = scale_factor, "sigma" = sigma)
-  return(list_params)
-}
 
-check_overlap <- function(range1, range2) {
-  #' Modify range1 and range2 in case of overlap
-  #'
-  #' @param range1: Vector of m/z values for first peak (float)
-  #' @param range2: Vector of m/z values for second peak (float)
-  #'
-  #' @return new_ranges: list of two ranges (list)
-  
-  # Check for overlap
-  if (length(intersect(range1, range2)) == 2) {
-    if (length(range1) >= length(range2)) {
-      range1 <- range1[-length(range1)]  
-    } else {
-      range2 <- range2[-1]
-    }
-  } else if (length(intersect(range1, range2)) == 3) {
-    range1 <- range1[-length(range1)]  
-    range2 <- range2[-1]
-  }
-  new_ranges <- list("range1" = range1, "range2" = range2)
-  return(new_ranges)
+# from https://rdrr.io/cran/rvmethod/src/R/gaussfit.R
+#' Gaussian Function from package rvmethod
+#'
+#' This function returns the unnormalized (height of 1.0) Gaussian curve with a
+#' given center and spread.
+#'
+#' @param x the vector of values at which to evaluate the Gaussian
+#' @param mu the center of the Gaussian
+#' @param sigma the spread of the Gaussian (must be greater than 0)
+#' @return vector of values of the Gaussian
+#' @examples x = seq(-4, 4, length.out = 100)
+#' y = gaussfunc(x, 0, 1)
+#' plot(x, y)
+#'
+#' @import stats
+#'
+#' @export
+gaussfunc <- function(x, mu, sigma) {
+  return(exp(-((x - mu) ^ 2) / (2 * (sigma ^ 2))))
 }
 

From e006160d51ab942a4da4f88173b0ba1906717624 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:47:10 +0200
Subject: [PATCH 14/32] unit tests for new two-step PeakFinding method

---
 .../testthat/test_peak_finding_functions.R    | 261 +++++-------------
 1 file changed, 65 insertions(+), 196 deletions(-)

diff --git a/DIMS/tests/testthat/test_peak_finding_functions.R b/DIMS/tests/testthat/test_peak_finding_functions.R
index a0db0ca9..1e001a0c 100644
--- a/DIMS/tests/testthat/test_peak_finding_functions.R
+++ b/DIMS/tests/testthat/test_peak_finding_functions.R
@@ -1,218 +1,87 @@
-# unit tests for PeakFinding functions: 
+# unit tests for PeakFinding functions:
 
 source("../preprocessing/peak_finding_functions.R")
 
-# test fit_quality function:
-test_that("fit quality is correctly calculated", {
-  # initialize
-  test_resol <- 140000
-  test_scale_factor <- 2
-  test_sigma <- 0.0001
-  test_sum_fit <- NULL
-
-  # create peak info to test on:
-  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
-  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
-  test_mz_max <- max(test_mass_vector)
-
-  expect_type(get_fit_quality(test_mass_vector, test_int_vector, test_mz_max, test_resol, test_scale_factor, test_sigma, test_sum_fit), "list")
-  expect_equal(get_fit_quality(test_mass_vector, test_int_vector, test_mz_max, test_resol, test_scale_factor, test_sigma, test_sum_fit)$fq_new, 2.426, tolerance = 0.001, TRUE)
-})
-
-# test get_stdev:
-test_that("standard deviation is correctly calculated", {
-  # create peak info to test on:
-  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
-  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
-  test_resol <- 140000
-
-  expect_type(get_stdev(test_mass_vector, test_int_vector, test_resol), "double")
-  expect_equal(get_stdev(test_mass_vector, test_int_vector, test_resol), 0.000126, tolerance = 0.00001, TRUE)
-})
-
-# test estimate_area
-test_that("area under the curve is correctly calculated", {
-  # create peak info to test on:
-  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
-  test_mz_max <- max(test_mass_vector)
-  test_resol <- 140000
-  test_scale_factor <- 2
-  test_sigma <- 0.0001
-
-  expect_type(estimate_area(test_mz_max, test_resol, test_scale_factor, test_sigma), "double")
-  expect_equal(estimate_area(test_mz_max, test_resol, test_scale_factor, test_sigma), 1673.061, tolerance = 0.001, TRUE)
-})
-
-# test get_fwhm
-test_that("fwhm is correctly calculated", {
-  # create peak info to test on:
-  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
-  test_mz_max <- max(test_mass_vector)
-  test_resol <- 140000
-
-  expect_type(get_fwhm(test_mz_max, test_resol), "double")
-  expect_equal(get_fwhm(test_mz_max, test_resol), 0.000295865, tolerance = 0.000001, TRUE)
-})
-
-# test get_fwhm for NA
-test_that("fwhm for mass NA gives standard fwhm", {
-  test_resol <- 140000
-
-  expect_type(get_fwhm(NA, test_resol), "double")
-  expect_equal(get_fwhm(NA, test_resol), 0.001428571, tolerance = 0.000001, TRUE)
-})
-
-# test within_ppm
-test_that("two peaks are within 5 ppm", {
-  # create peak info to test on:
-  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
-  test_mz_max <- max(test_mass_vector)
-  test_scale_factor <- 2
-  test_sigma <- 0.0001
-  test_area <- 20000
-  test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100)
-  test_ppm <- 5
-  test_resol <- 140000
-
-  expect_type(within_ppm(test_mz_max, test_scale_factor, test_sigma, test_area, test_mass_vector_eq, test_mass_vector, test_ppm, test_resol), "list")
-  expect_equal(within_ppm(test_mz_max, test_scale_factor, test_sigma, test_area, test_mass_vector_eq, test_mass_vector, test_ppm, test_resol)$mean, 70.00962, tolerance = 0.0001, TRUE)
-})
-
-# test sum_curves
-test_that("two curves are correctly summed", {
-  # create peak info to test on:
-  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
-  test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100)
-  test_mean1 <- 70.00938
-  test_mean2 <- 70.00962
-  test_scale_factor1 <- 2
-  test_scale_factor2 <- 4
-  test_sigma1 <- 0.0001
-  test_sigma2 <- 0.0002
-  test_resol <- 140000
+# test search_regions_of_interest function:
+test_that("regions of interest are correctly found for a single peak", {
+  # create intensities to test on
+  test_stdev <- 0.00006
+  test_mass_vector <- rep(70, 10) + 1:10 * test_stdev
+  test_ints_range <- dnorm(test_mass_vector, mean = 70.0003, sd = test_stdev)
+  test_ints_fullrange <- as.data.frame(cbind(mz = test_mass_vector, 
+                                             int = test_ints_range))
   
-  expect_type(sum_curves(test_mean1, test_mean2, test_scale_factor1, test_scale_factor2, test_sigma1, test_sigma2, test_mass_vector_eq, test_mass_vector, test_resol), "list")
-  expect_equal(sum_curves(test_mean1, test_mean2, test_scale_factor1, test_scale_factor2, test_sigma1, test_sigma2, test_mass_vector_eq, test_mass_vector, test_resol)$mean, 70.0095, tolerance = 0.0001, TRUE)
+  # expected output
+  expected_output <- matrix(c(1, 10, 10), nrow = 1, ncol = 3L)
+  colnames(expected_output) <- c("from", "to", "length")
+  rownames(expected_output) <- "to"
+
+  # tests
+  expect_type(search_regions_of_interest(test_ints_fullrange), "double")
+  expect_equal(nrow(search_regions_of_interest(test_ints_fullrange)), 1)
+  expect_equal(search_regions_of_interest(test_ints_fullrange), expected_output, tolerance = 0.000001, TRUE)
 })
-
-# test fit_optim
-test_that("optimal peak fit can be found", {
-  # create peak info to test on:
-  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
-  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
-  test_resol <- 140000
-  # create output to test on:
-  test_output <- list(mean = 70.0095, area = 5009.596, min = 70.00906, max = 70.00994)
+test_that("regions of interest are correctly found for two peaks", {
+  # create intensities to test on
+  test_stdev <- 0.00006
+  test_mass_vector <- rep(70, 20) + 1:20 * test_stdev
+  test_ints_range <- dnorm(test_mass_vector, mean = 70.0003, sd = test_stdev) + 
+                     dnorm(test_mass_vector, mean = 70.0005, sd = test_stdev)
+  test_ints_fullrange <- as.data.frame(cbind(mz = test_mass_vector, 
+                                             int = test_ints_range))
   
-  expect_type(fit_optim(test_mass_vector, test_int_vector, test_resol), "list")
-  for (key in names(test_output)) {
-    expect_equal(fit_optim(test_mass_vector, test_int_vector, test_resol)[[key]], test_output[[key]], tolerance = 0.0001, TRUE)
-  }
-})
-
-# test optimize_1gaussian:
-test_that("optimal value for m/z and scale_factor can be found", {
-  # create peak info to test on:
-  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
-  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
-  test_sigma <- 0.0001
-  test_query_mass <- median(test_mass_vector)
-  test_scale_factor <- 2
-  test_use_bounds <- FALSE
+  # expected output
+  expected_output <- as.data.frame(matrix(c(1, 8, 8, 20, 8, 13), nrow = 2, ncol = 3))
+  colnames(expected_output) <- c("from", "to", "length")
+  rownames(expected_output) <- as.character(c(1, 2))
   
-  expect_type(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds), "list")
-  expect_equal(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds)$par[1], 70.00949, tolerance = 0.0001, TRUE)
-  expect_equal(optimize_1gaussian(test_mass_vector, test_int_vector, test_sigma, test_query_mass, test_scale_factor, test_use_bounds)$par[2], 4.570024, tolerance = 0.0001, TRUE)
+  # tests
+  expect_type(search_regions_of_interest(test_ints_fullrange), "list")
+  expect_equal(nrow(search_regions_of_interest(test_ints_fullrange)), 2)
+  expect_equal(search_regions_of_interest(test_ints_fullrange)[, "length"], as.numeric(expected_output[, "length"]))
+  expect_equal(search_regions_of_interest(test_ints_fullrange), expected_output, check.attributes = FALSE)
 })
 
-# test fit_1peak:
-test_that("correct mean m/z and scale_factor can be found for 1 peak", {
+# test integrate_peaks function:
+test_that("peaks are correctly integrated", {
   # create peak info to test on:
-  test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
-  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
-  test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100)
-  test_max_index <- which(test_int_vector == max(test_int_vector))
-  test_resol <- 140000
-  test_fit_quality <- 0.5
-  test_use_bounds <- FALSE
-  # create output to test on:
-  test_output <- list(mean = 70.0095, scale_factor = 5.23334, sigma = 0.000126, qual = 0.24300)
+  test_stdev <- 0.00006
+  test_mass_vector <- rep(70, 20) + 1:20 * test_stdev
+  test_ints_range <- dnorm(test_mass_vector, mean = 70.0003, sd = test_stdev) + 
+                     dnorm(test_mass_vector, mean = 70.0005, sd = test_stdev)
+  test_ints_fullrange <- as.data.frame(cbind(mz = test_mass_vector, 
+                                             int = test_ints_range))
   
-  expect_type(fit_1peak(test_mass_vector_eq, test_mass_vector, test_int_vector, test_max_index, test_resol, test_fit_quality, test_use_bounds), "list")
-  for (key in names(test_output)) {
-    expect_equal(fit_1peak(test_mass_vector_eq, test_mass_vector, test_int_vector, test_max_index, test_resol, test_fit_quality, test_use_bounds)[[key]], test_output[[key]], tolerance = 0.0001, TRUE)
-  }
-})
-
-# test optimize_2gaussians: 
-test_that("optimal value for m/z and scale_factor can be found", {
-  # create info for two peaks in a small region
-  test_mass_vector_2peaks <- rep(70.00938, 14) + 1:14 * 0.00006
-  test_int_vector_2peaks <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000), 2)
-  test_mass_vector_eq <- seq(min(test_mass_vector_2peaks), max(test_mass_vector_2peaks), length = 100)
-  test_max_index <- which(test_int_vector_2peaks == max(test_int_vector_2peaks))
-  test_sigma1 <- 0.0001
-  test_sigma2 <- 0.0002
-  test_query_mass1 <- test_max_index[1]
-  test_query_mass2 <- test_max_index[2]
-  test_scale_factor1 <- 2
-  test_scale_factor2 <- 3
-  test_use_bounds <- FALSE
+  # create regions of interest to test on:
+  test_regions_of_interest <- as.data.frame(matrix(c(1, 8, 8, 20, 8, 13), nrow = 2, ncol = 3L))
+  colnames(test_regions_of_interest) <- c("from", "to", "length")
+  rownames(test_regions_of_interest) <- as.character(c(1, 2))
   
-  expect_type(optimize_2gaussians(test_mass_vector_2peaks, test_int_vector_2peaks, test_sigma1, test_sigma2, test_query_mass1, test_scale_factor1, test_query_mass2, test_scale_factor2, test_use_bounds), "list")
-  expect_equal(optimize_2gaussians(test_mass_vector_2peaks, test_int_vector_2peaks, test_sigma1, test_sigma2, test_query_mass1, test_scale_factor1, test_query_mass2, test_scale_factor2, test_use_bounds)$par[1], 4, tolerance = 0.1, TRUE)
-})
-
-# test fit_2peaks:
-test_that("correct mean m/z and scale_factor can be found for 2 peaks", {
-  # create info for two peaks in a small region
-  test_mass_vector_2peaks <- rep(70.00938, 14) + 1:14 * 0.00006
-  test_int_vector_2peaks <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000), 2)
-  test_mass_vector_eq <- seq(min(test_mass_vector_2peaks), max(test_mass_vector_2peaks), length = 100)
-  test_max_index <- which(test_int_vector_2peaks == max(test_int_vector_2peaks))
+  # set other parameters
   test_resol <- 140000
-  test_fit_quality <- 0.5
-  test_use_bounds <- FALSE
-  # create output to test on:
-  test_output <- list(mean = c(70.00954, 70.00998), scale_factor = c(4.823598, 4.828072), sigma = c(0.0001257425, 0.0001257436), qual = 0.38341)
+  test_peak_thresh <- 2000
+  
+  # expected output
+  expected_output <- as.data.frame(matrix(c(70.000357, 70.000521, 0.5148094, 0.3016569, 70.00006, 70.00048, 
+                                            70.00048, 70.00120, 15526.826791, 11950.90570), nrow = 2, ncol = 5))
+  colnames(expected_output) <- c("mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
   
-  expect_type(fit_2peaks(test_mass_vector_eq, test_mass_vector_2peaks, test_int_vector_2peaks, test_max_index, 
-                         test_resol, test_use_bounds, test_fit_quality), "list")
-  for (key in names(test_output)) {
-    expect_equal(fit_2peaks(test_mass_vector_eq, test_mass_vector_2peaks, test_int_vector_2peaks, test_max_index, 
-                            test_resol, test_use_bounds, test_fit_quality)[[key]], test_output[[key]], tolerance = 0.0001, TRUE)
-  }
+  expect_type(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh), "double")
+  expect_equal(nrow(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh)), 2)
+  expect_equal(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh)[, "mzmed.pkt"], 
+               expected_output[, "mzmed.pkt"], tolerance = 0.0001)
+  expect_equal(integrate_peaks(test_ints_fullrange, test_regions_of_interest, test_resol, test_peak_thresh)[, "height.pkt"], 
+               expected_output[, "height.pkt"], tolerance = 0.0001)
 })
 
-# test fit_gaussian(mass_vector2, mass_vector, int_vector, resol, force, use_bounds)
-test_that("initial gaussian fit is done correctly", {
+# test get_fwhm function
+test_that("fwhm is correctly calculated", {
   # create peak info to test on:
   test_mass_vector <- rep(70.00938, 5) + 1:5 * 0.00006
-  test_int_vector <- c(9000, 16000, 19000, 15000, 6000)
-  test_mass_vector_eq <- seq(min(test_mass_vector), max(test_mass_vector), length = 100)
+  test_mz_max <- max(test_mass_vector)
   test_resol <- 140000
-  test_use_bounds <- FALSE
-  test_force_nr <- 1
-  
-  expect_type(fit_gaussian(test_mass_vector_eq, test_mass_vector, test_int_vector, test_resol, test_force_nr, test_use_bounds), "list")
-  expect_equal(fit_gaussian(test_mass_vector_eq, test_mass_vector, test_int_vector, test_resol, test_force_nr, test_use_bounds)$mean, 70.00956, tolerance = 0.00001, TRUE)
-})
 
-# test search_mzrange(ints_fullrange, resol, sample_name, peak_thresh)
-test_that("all peak finding functions work together", {
-  # enable snapshot
-  local_edition(3)
-  # create info for ten peaks separated by zero
-  test_large_mass_vector <- rep(70.00938, 80) + 1:80 * 0.00006
-  test_large_int_vector <- rep(c(2000, 9000, 16000, 19000, 15000, 6000, 3000, 0), 10)
-  names(test_large_int_vector) <- test_large_mass_vector
-  test_resol <- 140000
-  test_sample_name <- "C1.1"
-  test_peak_thresh <- 100
-  
-  expect_type(search_mzrange(test_large_int_vector, test_resol, test_sample_name, test_peak_thresh), "list")
-  expect_snapshot(search_mzrange(test_large_int_vector, test_resol, test_sample_name, test_peak_thresh), error = FALSE)
+  expect_type(get_fwhm(test_mz_max, test_resol), "double")
+  expect_equal(get_fwhm(test_mz_max, test_resol), 0.000295865, tolerance = 0.000001, TRUE)
 })
 
-# test fit_gaussian(mass_vector2, mass_vector, int_vector, resol, force, use_bounds)
-test_that("fit gaussian")

From 8102bdb83853b7d02eef2a928543bad0d7dabc4c Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:54:33 +0200
Subject: [PATCH 15/32] information for averaging peaks for technical
 replicates based on txt file with scanmode

---
 DIMS/AveragePeaks.R  | 126 +++++++++++++++++++++----------------------
 DIMS/AveragePeaks.nf |   4 +-
 2 files changed, 62 insertions(+), 68 deletions(-)

diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R
index 695c08fa..eef2d8a2 100644
--- a/DIMS/AveragePeaks.R
+++ b/DIMS/AveragePeaks.R
@@ -1,72 +1,66 @@
+library(dplyr)
+
 # define parameters
-# ppm as fixed value, not the same ppm as in peak grouping
-ppm_peak <- 2
+cmd_args <- commandArgs(trailingOnly = TRUE)
 
-library(dplyr)
+sample_name <- cmd_args[1]
+techreps <- cmd_args[2]
+scanmode <- cmd_args[3]
+tech_reps <- strsplit(techreps, ";")[[1]]
+print(sample_name)
+print(techreps)
+print(scanmode)
 
-scanmodes <- c("positive", "negative")
+# set ppm as fixed value, not the same ppm as in peak grouping
+ppm_peak <- 2
 
-for (scanmode in scanmodes){
-  # get sample names
-  load(paste0(scanmode, "_repl_pattern.RData"))
-  sample_names <- names(repl_pattern_filtered)
-  # initialize
-  outlist_total <- NULL
-  # for each biological sample, average peaks in technical replicates
-  for (sample_name in sample_names) {
-    print(sample_name)
-    # Initialize per sample
-    peaklist_allrepl <- NULL
-    nr_repl_persample <- 0
-    # averaged_peaks <- matrix(0, nrow = 10 ^ 7, ncol = 6) # how big does it need to be?
-    averaged_peaks <- matrix(0, nrow = 0, ncol = 6) # append
-    colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
-    # load RData files of technical replicates belonging to biological sample
-    sample_techreps_file <- repl_pattern_filtered[sample_name][[1]]
-    for (file_nr in 1:length(sample_techreps_file)) {
-      print(sample_techreps_file[file_nr])
-      tech_repl_file <- paste0(sample_techreps_file[file_nr], "_positive.RData")
-      tech_repl <- get(load(tech_repl_file))
-      # combine data for all technical replicates
-      peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl)
-      # count number of replicates for each biological sample
-      nr_repl_persample <- nr_repl_persample + 1
-    }
-    # sort on mass
-    peaklist_allrepl_df <- as.data.frame(peaklist_allrepl)
-    peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt) 
-    peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) 
-    # peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(desc(height.pkt))
-    peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt)
-    # average over technical replicates
-    while (nrow(peaklist_allrepl_sorted) > 1) {
-      # store row numbers
-      peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted)
-      # find the peaks in the dataset with corresponding m/z plus or minus tolerance
-      reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1]
-      mz_tolerance <- (reference_mass * ppm_peak) / 10^6
-      minmz_ref <- reference_mass - mz_tolerance
-      maxmz_ref <- reference_mass + mz_tolerance
-      select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref))
-      select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ]
-      nrsamples <- length(select_peak_indices)
-      # put averaged intensities into a new row and append to averaged_peaks
-      averaged_1peak <- matrix(0, nrow = 1, ncol = 6) 
-      colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
-      # calculate m/z values for peak group
-      averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt)
-      averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt)
-      averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt)
-      averaged_1peak[1, "fq"] <- nrsamples
-      averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt)
-      # put intensities into proper columns
-      peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ]
-      averaged_peaks <- rbind(averaged_peaks, averaged_1peak)
-    }
-    # add sample name to first column and append to outlist_total for all samples
-    averaged_peaks[ , "samplenr"] <- sample_name
-    outlist_total <- rbind(outlist_total, averaged_peaks)
-  }
-  save(outlist_total, file = paste0("AvgPeaks_", scanmode, ".RData"))
+# average peaks in technical replicates
+# Initialize per sample
+peaklist_allrepl <- NULL
+nr_repl_persample <- 0
+averaged_peaks <- matrix(0, nrow = 0, ncol = 6) 
+colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
+# load RData files of technical replicates belonging to biological sample
+for (file_nr in 1:length(tech_reps)) {
+  tech_repl_file <- paste0(tech_reps[file_nr], "_", scanmode, ".RData")
+  tech_repl <- get(load(tech_repl_file))
+  # combine data for all technical replicates
+  peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl)
+  # count number of replicates for each biological sample
+  nr_repl_persample <- nr_repl_persample + 1
+}
+# sort on mass
+peaklist_allrepl_df <- as.data.frame(peaklist_allrepl)
+peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt) 
+peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) 
+# peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(desc(height.pkt))
+peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt)
+# average over technical replicates
+while (nrow(peaklist_allrepl_sorted) > 1) {
+  # store row numbers
+  peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted)
+  # find the peaks in the dataset with corresponding m/z plus or minus tolerance
+  reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1]
+  mz_tolerance <- (reference_mass * ppm_peak) / 10^6
+  minmz_ref <- reference_mass - mz_tolerance
+  maxmz_ref <- reference_mass + mz_tolerance
+  select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref))
+  select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ]
+  nrsamples <- length(select_peak_indices)
+  # put averaged intensities into a new row and append to averaged_peaks
+  averaged_1peak <- matrix(0, nrow = 1, ncol = 6) 
+  colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
+  # calculate m/z values for peak group
+  averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt)
+  averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt)
+  averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt)
+  averaged_1peak[1, "fq"] <- nrsamples
+  averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt)
+  # put intensities into proper columns
+  peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ]
+  averaged_peaks <- rbind(averaged_peaks, averaged_1peak)
 }
+# add sample name to first column
+averaged_peaks[ , "samplenr"] <- sample_name
+save(averaged_peaks, file = paste0("AvgPeaks_", sample_name, "_", scanmode, ".RData"))
 
diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf
index 9d49f5f8..de4b21b0 100644
--- a/DIMS/AveragePeaks.nf
+++ b/DIMS/AveragePeaks.nf
@@ -6,13 +6,13 @@ process AveragePeaks {
 
     input:
        path(rdata_files)
-       path(replication_pattern)
+       tuple val(sample_id), val(tech_reps), val(scanmode)
 
     output:
        path 'AvgPeaks_*.RData'
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R 
+        Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R $sample_id $tech_reps $scanmode
         """
 }

From d0ed769ec7373ce3cd47243065e4743d99192de4 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:56:15 +0200
Subject: [PATCH 16/32] modified input for PeakGrouping corresponding to new
 PeakFinding method

---
 DIMS/PeakGrouping.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf
index ae4382c7..0f20dc5b 100644
--- a/DIMS/PeakGrouping.nf
+++ b/DIMS/PeakGrouping.nf
@@ -6,7 +6,7 @@ process PeakGrouping {
 
     input:
        path(hmdbpart_file)
-       each path(spectrumpeak_file)
+       path(averagedpeaks_file)
        each path(pattern_file)
 
     output:

From a35c4ca7decffc6e3379b44e220a0a47ec5392fb Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 8 Jul 2025 15:57:30 +0200
Subject: [PATCH 17/32] collect averaged peaks per biological sample,
 corresponding to new PeakFinding method

---
 DIMS/CollectAveraged.R  | 18 ++++++++++++++++++
 DIMS/CollectAveraged.nf | 17 +++++++++++++++++
 2 files changed, 35 insertions(+)
 create mode 100755 DIMS/CollectAveraged.R
 create mode 100644 DIMS/CollectAveraged.nf

diff --git a/DIMS/CollectAveraged.R b/DIMS/CollectAveraged.R
new file mode 100755
index 00000000..e6466d9e
--- /dev/null
+++ b/DIMS/CollectAveraged.R
@@ -0,0 +1,18 @@
+# define parameters
+cmd_args <- commandArgs(trailingOnly = TRUE)
+
+scripts_dir <- cmd_args[1]
+
+# for each scan mode, collect all averaged peak lists per biological sample
+scanmodes <- c("positive", "negative")
+for (scanmode in scanmodes) {
+  # get list of files
+  filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, ".RData"))
+  # load files and combine into one object
+  outlist_total <- NULL
+  for (file_nr in 1:length(filled_files)) {
+    peaklist_averaged <- get(load(filled_files[file_nr]))
+    outlist_total <- rbind(outlist_total, peaklist_averaged)
+  }
+  save(outlist_total, file = paste0("AvgPeaks_", scanmode, ".RData"))
+}
diff --git a/DIMS/CollectAveraged.nf b/DIMS/CollectAveraged.nf
new file mode 100644
index 00000000..fc65bf21
--- /dev/null
+++ b/DIMS/CollectAveraged.nf
@@ -0,0 +1,17 @@
+process CollectAveraged {
+    tag "DIMS CollectAveraged"
+    label 'CollectAveraged'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(averaged_files)
+
+    output:
+       path('AvgPeaks*.RData'), emit: averaged_peaks
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/CollectAveraged.R
+        """
+}

From 295e4609abf81ee7384fa50fb5a4f4d958ffcb26 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 15 Jul 2025 14:39:38 +0200
Subject: [PATCH 18/32] fixed path to DIMS peak_finding_functions

---
 DIMS/tests/testthat/test_peak_finding_functions.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DIMS/tests/testthat/test_peak_finding_functions.R b/DIMS/tests/testthat/test_peak_finding_functions.R
index 1e001a0c..bc7f9d86 100644
--- a/DIMS/tests/testthat/test_peak_finding_functions.R
+++ b/DIMS/tests/testthat/test_peak_finding_functions.R
@@ -1,6 +1,6 @@
 # unit tests for PeakFinding functions:
 
-source("../preprocessing/peak_finding_functions.R")
+source("../../preprocessing/peak_finding_functions.R")
 
 # test search_regions_of_interest function:
 test_that("regions of interest are correctly found for a single peak", {

From 109d664f1e1923a0b95e1cfadc67c78bd991310d Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Thu, 2 Oct 2025 11:48:14 +0200
Subject: [PATCH 19/32] created function for averaging peaks in
 DIMS/AveragePeaks.R

---
 DIMS/AveragePeaks.R                          | 40 ++++----------------
 DIMS/AveragePeaks.nf                         |  2 +-
 DIMS/preprocessing/average_peaks_functions.R | 39 +++++++++++++++++++
 3 files changed, 47 insertions(+), 34 deletions(-)
 create mode 100644 DIMS/preprocessing/average_peaks_functions.R

diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R
index eef2d8a2..7e8d9fe1 100644
--- a/DIMS/AveragePeaks.R
+++ b/DIMS/AveragePeaks.R
@@ -6,61 +6,35 @@ cmd_args <- commandArgs(trailingOnly = TRUE)
 sample_name <- cmd_args[1]
 techreps <- cmd_args[2]
 scanmode <- cmd_args[3]
+preprocessing_scripts_dir <- cmd_args[4]
 tech_reps <- strsplit(techreps, ";")[[1]]
-print(sample_name)
-print(techreps)
-print(scanmode)
+
+# load in function scripts
+source(paste0(preprocessing_scripts_dir, "average_peaks_functions.R"))
 
 # set ppm as fixed value, not the same ppm as in peak grouping
 ppm_peak <- 2
 
-# average peaks in technical replicates
 # Initialize per sample
 peaklist_allrepl <- NULL
 nr_repl_persample <- 0
 averaged_peaks <- matrix(0, nrow = 0, ncol = 6) 
 colnames(averaged_peaks) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
+
 # load RData files of technical replicates belonging to biological sample
 for (file_nr in 1:length(tech_reps)) {
   tech_repl_file <- paste0(tech_reps[file_nr], "_", scanmode, ".RData")
   tech_repl <- get(load(tech_repl_file))
   # combine data for all technical replicates
   peaklist_allrepl <- rbind(peaklist_allrepl, tech_repl)
-  # count number of replicates for each biological sample
-  nr_repl_persample <- nr_repl_persample + 1
 }
 # sort on mass
 peaklist_allrepl_df <- as.data.frame(peaklist_allrepl)
 peaklist_allrepl_df$mzmed.pkt <- as.numeric(peaklist_allrepl_df$mzmed.pkt) 
 peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt) 
-# peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(desc(height.pkt))
 peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt)
+
 # average over technical replicates
-while (nrow(peaklist_allrepl_sorted) > 1) {
-  # store row numbers
-  peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted)
-  # find the peaks in the dataset with corresponding m/z plus or minus tolerance
-  reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1]
-  mz_tolerance <- (reference_mass * ppm_peak) / 10^6
-  minmz_ref <- reference_mass - mz_tolerance
-  maxmz_ref <- reference_mass + mz_tolerance
-  select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref))
-  select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ]
-  nrsamples <- length(select_peak_indices)
-  # put averaged intensities into a new row and append to averaged_peaks
-  averaged_1peak <- matrix(0, nrow = 1, ncol = 6) 
-  colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
-  # calculate m/z values for peak group
-  averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt)
-  averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt)
-  averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt)
-  averaged_1peak[1, "fq"] <- nrsamples
-  averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt)
-  # put intensities into proper columns
-  peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ]
-  averaged_peaks <- rbind(averaged_peaks, averaged_1peak)
-}
-# add sample name to first column
-averaged_peaks[ , "samplenr"] <- sample_name
+averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted)
 save(averaged_peaks, file = paste0("AvgPeaks_", sample_name, "_", scanmode, ".RData"))
 
diff --git a/DIMS/AveragePeaks.nf b/DIMS/AveragePeaks.nf
index de4b21b0..f50bd874 100644
--- a/DIMS/AveragePeaks.nf
+++ b/DIMS/AveragePeaks.nf
@@ -13,6 +13,6 @@ process AveragePeaks {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R $sample_id $tech_reps $scanmode
+        Rscript ${baseDir}/CustomModules/DIMS/AveragePeaks.R $sample_id $tech_reps $scanmode $params.preprocessing_scripts_dir
         """
 }
diff --git a/DIMS/preprocessing/average_peaks_functions.R b/DIMS/preprocessing/average_peaks_functions.R
new file mode 100644
index 00000000..2545b757
--- /dev/null
+++ b/DIMS/preprocessing/average_peaks_functions.R
@@ -0,0 +1,39 @@
+average_peaks_per_sample <- function(peaklist_allrepl_sorted) {
+  #' Average the intensity of peaks that occur in different technical replicates of a biological sample
+  #'
+  #' @param peaklist_allrepl_sorted: Dataframe with peaks sorted on median m/z (float)
+  #'
+  #' @return averaged_peaks: matrix of averaged peaks (float)
+  
+  # initialize
+  averaged_peaks <- peaklist_allrepl_sorted[0, ]
+  
+  while (nrow(peaklist_allrepl_sorted) > 1) {
+    # store row numbers
+    peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted)
+    # find the peaks in the dataset with corresponding m/z plus or minus tolerance
+    reference_mass <- peaklist_allrepl_sorted$mzmed.pkt[1]
+    mz_tolerance <- (reference_mass * ppm_peak) / 10^6
+    minmz_ref <- reference_mass - mz_tolerance
+    maxmz_ref <- reference_mass + mz_tolerance
+    select_peak_indices <- which((peaklist_allrepl_sorted$mzmed.pkt > minmz_ref) & (peaklist_allrepl_sorted$mzmed.pkt < maxmz_ref))
+    select_peaks <- peaklist_allrepl_sorted[select_peak_indices, ]
+    nrsamples <- length(select_peak_indices)
+    # put averaged intensities into a new row
+    averaged_1peak <- matrix(0, nrow = 1, ncol = 6) 
+    colnames(averaged_1peak) <- c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
+    # calculate m/z values for peak group
+    averaged_1peak[1, "mzmed.pkt"] <- mean(select_peaks$mzmed.pkt)
+    averaged_1peak[1, "mzmin.pkt"] <- min(select_peaks$mzmed.pkt)
+    averaged_1peak[1, "mzmax.pkt"] <- max(select_peaks$mzmed.pkt)
+    averaged_1peak[1, "fq"] <- nrsamples
+    averaged_1peak[1, "height.pkt"] <- mean(select_peaks$height.pkt)
+    # remove rownr column and append to averaged_peaks
+    peaklist_allrepl_sorted <- peaklist_allrepl_sorted[-select_peaks$rownr, ]
+    averaged_peaks <- rbind(averaged_peaks, averaged_1peak)
+  }
+  # add sample name to first column
+  averaged_peaks[ , "samplenr"] <- sample_name
+  
+  return(averaged_peaks)
+}

From cb33aba206ce326a79347404a6b76769af3e38d4 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Thu, 2 Oct 2025 11:49:30 +0200
Subject: [PATCH 20/32] added unit tests for average_peaks_functions

---
 DIMS/tests/testthat/test_average_peaks.R | 26 ++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 DIMS/tests/testthat/test_average_peaks.R

diff --git a/DIMS/tests/testthat/test_average_peaks.R b/DIMS/tests/testthat/test_average_peaks.R
new file mode 100644
index 00000000..994de48c
--- /dev/null
+++ b/DIMS/tests/testthat/test_average_peaks.R
@@ -0,0 +1,26 @@
+# unit test for AveragePeaks
+
+# source all functions for PeakGrouping
+source("../../preprocessing/average_peaks_functions.R")
+
+# test find_peak_groups
+testthat::test_that("peaks are correctly averaged", {
+  # create peak list to test on:
+  samplenrs <- rep(c("repl_001", "repl_002", "repl_003"), 3)
+  test_peaklist_sorted_num <- matrix(0, nrow = 9, ncol = 5)
+  colnames(test_peaklist_sorted_num) <- c("mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt")
+  test_peaklist_sorted_num[, 1] <- 300 + (1:9) / 10000
+  test_peaklist_sorted_num[, 5] <- 100 * (9:1)
+  test_peaklist_sorted <- as.data.frame(cbind(samplenr = samplenrs, test_peaklist_sorted_num))
+  test_peaklist_sorted[, 2] <- as.numeric(test_peaklist_sorted[, 2])
+  test_peaklist_sorted[, 6] <- as.numeric(test_peaklist_sorted[, 6])
+
+  # test that first peak is correctly averaged
+  expect_equal(as.numeric(average_peaks_per_sample(test_peaklist_sorted)[1, 6]), 600, tolerance = 0.001, TRUE)
+  # test number of rows
+  expect_equal(nrow(average_peaks_per_sample(test_peaklist_sorted)), 2)
+  # test column names
+  expect_equal(colnames(average_peaks_per_sample(test_peaklist_sorted)),
+               c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt"), TRUE)
+})
+

From db8633e61e15b27c1c052ec4462f7e0b6cfa5520 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 3 Oct 2025 13:49:00 +0200
Subject: [PATCH 21/32] moved parameters matrix and nr_replicates from workflow
 into params

---
 DIMS/EvaluateTics.nf | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/DIMS/EvaluateTics.nf b/DIMS/EvaluateTics.nf
index 831c9500..fdf7f713 100644
--- a/DIMS/EvaluateTics.nf
+++ b/DIMS/EvaluateTics.nf
@@ -8,9 +8,7 @@ process EvaluateTics {
        path(rdata_file)
        path(tic_txt_files)
        path(init_file)
-       val(nr_replicates)
        val(analysis_id)
-       val(matrix)
        path(highest_mz_file)
        path(trim_params_file)
 
@@ -19,14 +17,14 @@ process EvaluateTics {
        path('replicates_per_sample.txt'), emit: sample_techreps
        path('miss_infusions_negative.txt')
        path('miss_infusions_positive.txt')
-       path('*_TICplots.pdf')
+       path('*_TICplots.pdf'),            emit: tic_plots_pdf
 
     script:
         """
         Rscript ${baseDir}/CustomModules/DIMS/EvaluateTics.R $init_file \
                                                              $params.nr_replicates \
                                                              $analysis_id \
-                                                             $matrix \
+                                                             $params.matrix \
                                                              $highest_mz_file \
                                                              $trim_params_file
         """

From 004e3e9aad7b92472d52d4083bdf193ccefef11e Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 3 Oct 2025 13:51:15 +0200
Subject: [PATCH 22/32] refactored DIMS/EvaluateTics

---
 DIMS/EvaluateTics.R | 109 +++++++-------------------------------------
 1 file changed, 16 insertions(+), 93 deletions(-)

diff --git a/DIMS/EvaluateTics.R b/DIMS/EvaluateTics.R
index fb73576a..a61e910f 100644
--- a/DIMS/EvaluateTics.R
+++ b/DIMS/EvaluateTics.R
@@ -14,29 +14,6 @@ highest_mz <- get(load(highest_mz_file))
 trim_params_filepath <- cmd_args[6]
 thresh2remove <- 1000000000
 
-remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) {
-  # collect list of samples to remove from replication pattern
-  remove_from_group <- NULL
-  for (sample_nr in 1:length(repl_pattern)){
-    repl_pattern_1sample <- repl_pattern[[sample_nr]]
-    remove <- NULL
-    for (file_nr in 1:length(repl_pattern_1sample)) {
-      if (repl_pattern_1sample[file_nr] %in% bad_samples) {
-        remove <- c(remove, file_nr)
-      }
-    }
-    if (length(remove) == nr_replicates) {
-      remove_from_group <- c(remove_from_group, sample_nr)
-    }
-    if (!is.null(remove)) {
-      repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove]
-    }
-  }
-  if (length(remove_from_group) != 0) {
-    repl_pattern <- repl_pattern[-remove_from_group]
-  }
-  return(list("pattern" = repl_pattern))
-}
 
 # load init_file: contains repl_pattern
 load(init_file)
@@ -52,86 +29,31 @@ if (highest_mz > 700) {
   thresh2remove <- 1000000
 }
 
-# remove technical replicates which are below the threshold and average intensity over time
-remove_neg <- NULL
-remove_pos <- NULL
-cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n")
-for (sample_nr in 1:length(repl_pattern)) {
-  tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
-  tech_reps_array_pos <- NULL
-  tech_reps_array_neg <- NULL
-  for (file_nr in 1:length(tech_reps)) {
-    load(paste(tech_reps[file_nr], ".RData", sep = ""))
-    cat("\n\nParsing", tech_reps[file_nr])
-    # positive scan mode: determine whether sum of intensities is above threshold
-    cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1]))
-    if (sum(peak_list$pos[, 1]) < thresh2remove) {
-      cat(" ... Removed")
-      remove_pos <- c(remove_pos, tech_reps[file_nr])
-    }
-    tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos)
-    # negative scan mode: determine whether sum of intensities is above threshold
-    cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1]))
-    if (sum(peak_list$neg[, 1]) < thresh2remove) {
-      cat(" ... Removed")
-      remove_neg <- c(remove_neg, tech_reps[file_nr])
-    }
-    tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg)
-  }
-}
+# find out which technical replicates are below the threshold
+remove_tech_reps <- find_bad_replicates(repl_pattern, thresh2remove)
+print(remove_tech_reps)
 
 # negative scan mode
-print("neg")
-pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates)
-repl_pattern_filtered <- pattern_list$pattern
-print(repl_pattern_filtered)
-print(length(repl_pattern_filtered))
+remove_neg <- remove_tech_reps$neg
+repl_pattern_filtered <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates)
 save(repl_pattern_filtered, file = "negative_repl_pattern.RData")
-# save replication pattern in txt file for use in Nextflow
-allsamples_techreps_neg <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered))
-for (sample_nr in 1:length(repl_pattern_filtered)) {
-  allsamples_techreps_neg[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr]
-  allsamples_techreps_neg[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";")
-}
-allsamples_techreps_neg[, 3] <- "negative"
-
-# write information on miss_infusions
-write.table(remove_neg,
-	    file = "miss_infusions_negative.txt",
-	    row.names = FALSE,
-	    col.names = FALSE,
-	    sep = "\t"
-)
 
 # positive scan mode
-pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates)
-print(pattern_list)
-repl_pattern_filtered <- pattern_list$pattern
+remove_pos <- remove_tech_reps$pos
+repl_pattern_filtered <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates)
 save(repl_pattern_filtered, file = "positive_repl_pattern.RData")
-# save replication pattern in txt file for use in Nextflow
-allsamples_techreps_pos <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered))
-for (sample_nr in 1:length(repl_pattern_filtered)) {
-  allsamples_techreps_pos[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr]
-  allsamples_techreps_pos[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";")
-}
-allsamples_techreps_pos[, 3] <- "positive"
 
-# combine information on samples and technical replicates for pos and neg
-allsamples_techreps <- rbind(allsamples_techreps_pos, allsamples_techreps_neg)
-write.table(allsamples_techreps,
-	    file = paste0("replicates_per_sample.txt"),
+# get an overview of suitable technical replicates for both scan modes
+allsamples_techreps_neg <- get_overview_tech_reps(repl_pattern_filtered, "negative")
+allsamples_techreps_pos <- get_overview_tech_reps(repl_pattern_filtered, "positive")
+allsamples_techreps_both_scanmodes <- rbind(allsamples_techreps_pos, allsamples_techreps_neg)
+write.table(allsamples_techreps_both_scanmodes,
+            file = "replicates_per_sample.txt",
             col.names = FALSE,
-	    row.names = FALSE,
-	    sep = ","
+            row.names = FALSE,
+            sep = ","
 )
 
-# write information on miss_infusions
-write.table(remove_pos,
-	    file = "miss_infusions_positive.txt",
-	    row.names = FALSE,
-	    col.names = FALSE,
-	    sep = "\t"
-)
 
 ## generate TIC plots
 # get all txt files
@@ -212,3 +134,4 @@ tic_plot_pdf <- marrangeGrob(
 ggsave(filename = paste0(run_name, "_TICplots.pdf"),
        tic_plot_pdf, width = 21, height = 29.7, units = "cm")
 
+

From 06e5e1a69b4e775bbffa615421715894c59d7389 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 3 Oct 2025 13:53:01 +0200
Subject: [PATCH 23/32] moved functions for DIMS/EvaluateTics to separate file

---
 DIMS/preprocessing/evaluate_tics_functions.R | 104 +++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 DIMS/preprocessing/evaluate_tics_functions.R

diff --git a/DIMS/preprocessing/evaluate_tics_functions.R b/DIMS/preprocessing/evaluate_tics_functions.R
new file mode 100644
index 00000000..057b07cd
--- /dev/null
+++ b/DIMS/preprocessing/evaluate_tics_functions.R
@@ -0,0 +1,104 @@
+# EvaluateTics functions
+find_bad_replicates <- function(repl_pattern, thresh2remove) {
+  #' Find technical replicates with a total intensity below a threshold
+  #'
+  #' @param repl_pattern: List of samples with corresponding technical replicates (strings)
+  #' @param thresh2remove: Threshold value for acceptance or rejection of total intensity (integer)
+  #'
+  #' @return remove_tech_reps: Array of rejected technical replicates (strings)
+  
+  remove_pos <- NULL
+  remove_neg <- NULL
+  cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n")
+  for (sample_nr in 1:length(repl_pattern)) {
+    tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
+    tech_reps_array_pos <- NULL
+    #tech_reps_array_neg <- NULL
+    for (file_nr in 1:length(tech_reps)) {
+      load(paste0(tech_reps[file_nr], ".RData"))
+      cat("\n\nParsing", tech_reps[file_nr])
+      # positive scan mode: determine whether sum of intensities is above threshold
+      cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1]))
+      if (sum(peak_list$pos[, 1]) < thresh2remove) {
+        cat(" ... Removed")
+        remove_pos <- c(remove_pos, tech_reps[file_nr])
+      }
+      # negative scan mode: determine whether sum of intensities is above threshold
+      cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1]))
+      if (sum(peak_list$neg[, 1]) < thresh2remove) {
+        cat(" ... Removed")
+        remove_neg <- c(remove_neg, tech_reps[file_nr])
+      }
+    }
+  }
+  cat("\n")
+  # write information on miss_infusions for both scan modes
+  write.table(remove_pos,
+              file = paste0("miss_infusions_positive.txt"),
+              row.names = FALSE,
+              col.names = FALSE,
+              sep = "\t"
+  )
+  write.table(remove_neg,
+              file = paste0("miss_infusions_negative.txt"),
+              row.names = FALSE,
+              col.names = FALSE,
+              sep = "\t"
+  )
+  
+  # combine removed technical replicates from pos and neg
+  remove_tech_reps <- list(pos = remove_pos, neg = remove_neg)
+  return(remove_tech_reps)
+}
+
+remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) {
+  #' Remove technical replicates with insufficient quality from a biological sample
+  #'
+  #' @param bad_samples: Array of technical replicates of insufficient quality (strings)
+  #' @param repl_pattern: List of samples with corresponding technical replicates (strings)
+  #' @param nr_replicates: Number of technical replicates per biological sample (integer)
+  #'
+  #' @return repl_pattern_filtered: list of technical replicates of sufficient quality (strings)
+  
+  # collect list of samples to remove from replication pattern
+  remove_from_group <- NULL
+  for (sample_nr in 1:length(repl_pattern)){
+    repl_pattern_1sample <- repl_pattern[[sample_nr]]
+    remove <- NULL
+    for (file_nr in 1:length(repl_pattern_1sample)) {
+      if (repl_pattern_1sample[file_nr] %in% bad_samples) {
+        remove <- c(remove, file_nr)
+      }
+    }
+    if (length(remove) == nr_replicates) {
+      remove_from_group <- c(remove_from_group, sample_nr)
+    }
+    if (!is.null(remove)) {
+      repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove]
+    }
+  }
+  if (length(remove_from_group) != 0) {
+    repl_pattern_filtered <- repl_pattern[-remove_from_group]
+  } else {
+    repl_pattern_filtered <- repl_pattern
+  }
+  return(repl_pattern_filtered)
+}
+
+get_overview_tech_reps <- function(repl_pattern_filtered, scanmode) {
+  #' Create an overview of technical replicates with sufficient quality from a biological sample
+  #'
+  #' @param repl_pattern_filtered: List of samples with corresponding technical replicates (strings)
+  #' @param scanmode: Scan mode "positive" or "negative" (string)
+  #'
+  #' @return allsamples_techreps_scanmode: Matrix of technical replicates of sufficient quality (strings)
+  
+  allsamples_techreps_scanmode <- matrix("", ncol = 3, nrow = length(repl_pattern_filtered))
+  for (sample_nr in 1:length(repl_pattern_filtered)) {
+    allsamples_techreps_scanmode[sample_nr, 1] <- names(repl_pattern_filtered)[sample_nr]
+    allsamples_techreps_scanmode[sample_nr, 2] <- paste0(repl_pattern_filtered[[sample_nr]], collapse = ";")
+  }
+  allsamples_techreps_scanmode[, 3] <- scanmode
+  return(allsamples_techreps_scanmode)
+}
+

From 3f24b6dcaa615d0702ffdc634f583f8894ed25de Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 3 Oct 2025 13:54:54 +0200
Subject: [PATCH 24/32] added unit tests for DIMS/EvaluateTics

---
 DIMS/tests/testthat/test_evaluate_tics.R | 66 ++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 DIMS/tests/testthat/test_evaluate_tics.R

diff --git a/DIMS/tests/testthat/test_evaluate_tics.R b/DIMS/tests/testthat/test_evaluate_tics.R
new file mode 100644
index 00000000..b474a81b
--- /dev/null
+++ b/DIMS/tests/testthat/test_evaluate_tics.R
@@ -0,0 +1,66 @@
+# unit test for EvaluateTics
+
+# source all functions for PeakGrouping
+source("../../preprocessing/evaluate_tics_functions.R")
+
+# test find_bad_replicates
+testthat::test_that("TICS are correctly accepted or rejected", {
+  # It's necessary to copy/symlink the files to the current location for the combine_sum_adducts_parts function
+  # local: setwd("~/Development/DIMS_refactor_PeakFinding_codereview/CustomModules/DIMS/tests/testthat")
+  test_files <- list.files("fixtures/", "test_evaluate_tics", full.names = TRUE)
+  file.symlink(file.path(test_files), getwd())
+  
+  # create replication pattern to test on:
+  technical_replicates <- paste0("test_evaluate_tics_file", 1:3)
+  test_repl_pattern <- list(technical_replicates)
+  names(test_repl_pattern) <- "sample1"
+  test_thresh2remove <- 10^9
+  
+  # test that output has two entries
+  expect_equal(length(find_bad_replicates(test_repl_pattern, test_thresh2remove)), 2)
+  # test that first technical replicate is removed in positive scan mode
+  expect_equal(find_bad_replicates(test_repl_pattern, test_thresh2remove)$pos, "test_evaluate_tics_file1", TRUE)
+  # test that third technical replicate is removed in negative scan mode
+  expect_equal(find_bad_replicates(test_repl_pattern, test_thresh2remove)$neg, "test_evaluate_tics_file3", TRUE)
+  # test that output files are generated
+  expect_equal(sum(grepl("miss_infusions_", list.files("./"))), 2)
+  
+  # Remove symlinked files
+  files_remove <- list.files("./", "SummedAdducts_test.RData", full.names = TRUE)
+  file.remove(files_remove)
+  
+})
+
+# test remove_from_repl_pattern
+testthat::test_that("technical replicates are correctly removed from replication pattern", {
+  # create replication pattern to test on:
+  technical_replicates <- paste0("test_evaluate_tics_file", 1:3)
+  test_repl_pattern <- list(technical_replicates)
+  names(test_repl_pattern) <- "sample1"
+  test_bad_samples <- "test_evaluate_tics_file2"
+  test_nr_replicates <- 3
+  
+  # test that the output contains 1 sample
+  expect_equal(length(remove_from_repl_pattern(test_bad_samples, test_repl_pattern, test_nr_replicates)), 1)
+  # test that the output for the sample contains 2 technical replicates
+  expect_equal(length(remove_from_repl_pattern(test_bad_samples, test_repl_pattern, test_nr_replicates)$sample1), 2)
+  # test that the second technical replicate has been removed
+  expect_false(unique(grepl(test_bad_samples, remove_from_repl_pattern(test_bad_samples, test_repl_pattern, test_nr_replicates)$sample1)))
+})
+
+# test get_overview_tech_reps
+testthat::test_that("overview of technical replicates is correctly created", {
+  # create replication pattern to test on:
+  technical_replicates <- paste0("test_evaluate_tics_file", 1:3)
+  test_repl_pattern_filtered <- list(technical_replicates)
+  names(test_repl_pattern_filtered) <- "sample1"
+  test_scanmode <- "positive"
+
+  # test that overview is correctly created
+  expect_equal(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,1], "sample1")
+  expect_equal(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,3], "positive")
+  expect_true(get_overview_tech_reps(test_repl_pattern_filtered, test_scanmode)[ ,2] == 
+               paste0(technical_replicates, collapse = ";"), TRUE)
+  
+})
+

From c2c65ddfe715881f3bf2277a28b1e6931880c844 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 3 Oct 2025 14:11:17 +0200
Subject: [PATCH 25/32] modifications suggested in code review DIMS/PeakFinding

---
 DIMS/PeakFinding.R                          | 6 +++---
 DIMS/preprocessing/peak_finding_functions.R | 6 ++----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index 529370d4..2cf49d20 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -1,3 +1,5 @@
+library(dplyr)
+
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
@@ -10,8 +12,6 @@ peak_thresh <- 2000
 # source functions script
 source(paste0(preprocessing_scripts_dir, "peak_finding_functions.R"))
 
-library(dplyr)
-
 # Load output of AssignToBins (peak_list) for a technical replicate
 load(replicate_rdatafile)
 techrepl_name <- colnames(peak_list$pos)[1]
@@ -22,7 +22,7 @@ techreps_passed <- read.table("replicates_per_sample.txt", sep=",")
 # Initialize
 options(digits = 16)
 
-# run the findPeaks function
+# do peak finding
 scanmodes <- c("positive", "negative")
 for (scanmode in scanmodes) {
   # get intensities for scan mode
diff --git a/DIMS/preprocessing/peak_finding_functions.R b/DIMS/preprocessing/peak_finding_functions.R
index 8c2f62d7..8539c03c 100644
--- a/DIMS/preprocessing/peak_finding_functions.R
+++ b/DIMS/preprocessing/peak_finding_functions.R
@@ -21,7 +21,7 @@ search_regions_of_interest <- function(ints_fullrange) {
   } else {
     regions_of_interest_gte3 <- regions_of_interest_length
   }
-  # test for length of roi. If length is greater than 11, remove roi and break up into separate rois
+  # test for length of roi (region of interest). If length is greater than 11, break up into separate rois
   remove_roi_index <- c()
   new_rois_all <- regions_of_interest_gte3[0, ]
   for (roi_nr in 1:nrow(regions_of_interest_gte3)) {
@@ -43,7 +43,7 @@ search_regions_of_interest <- function(ints_fullrange) {
           new_rois_splitroi <- rbind(new_rois_splitroi, new_rois)
           start_pos <- new_rois[, 2]
         }
-        # last part
+        # intensities after last local minimum
         new_rois[, 1] <- start_pos
         new_rois[, 2] <- regions_of_interest_gte3[roi_nr, "to"]
         new_rois[, 3] <- new_rois[, 2] - new_rois[, 1] + 1
@@ -67,13 +67,11 @@ search_regions_of_interest <- function(ints_fullrange) {
   }
 
   # remove rois that have been split into chunks or shortened
-  #print(dim(regions_of_interest_gte3))
   if (length(remove_roi_index) > 0) {
     regions_of_interest_minus_short <- regions_of_interest_gte3[-remove_roi_index, ]
   } else {
     regions_of_interest_minus_short <- regions_of_interest_gte3
   }
-  #print(dim(regions_of_interest_minus_short))
   # combine remaining rois with info on chunks
   regions_of_interest_split <- rbind(regions_of_interest_minus_short, new_rois_all)
   # remove regions of interest with short lengths again

From 15a25ba489a52702b64122b2c2d0b2b9d4a3356f Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Thu, 16 Oct 2025 16:21:48 +0200
Subject: [PATCH 26/32] removed two obsolete lines

---
 DIMS/preprocessing/evaluate_tics_functions.R | 2 --
 1 file changed, 2 deletions(-)

diff --git a/DIMS/preprocessing/evaluate_tics_functions.R b/DIMS/preprocessing/evaluate_tics_functions.R
index 057b07cd..318ce2b6 100644
--- a/DIMS/preprocessing/evaluate_tics_functions.R
+++ b/DIMS/preprocessing/evaluate_tics_functions.R
@@ -12,8 +12,6 @@ find_bad_replicates <- function(repl_pattern, thresh2remove) {
   cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n")
   for (sample_nr in 1:length(repl_pattern)) {
     tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
-    tech_reps_array_pos <- NULL
-    #tech_reps_array_neg <- NULL
     for (file_nr in 1:length(tech_reps)) {
       load(paste0(tech_reps[file_nr], ".RData"))
       cat("\n\nParsing", tech_reps[file_nr])

From 007bea441c48b9019b35e6d7ea5fdaa9190122ec Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Thu, 16 Oct 2025 17:14:26 +0200
Subject: [PATCH 27/32] moved parameter ppm_peak from DIMS/AveragePeaks.R to
 inside function

---
 DIMS/AveragePeaks.R                          | 3 ---
 DIMS/preprocessing/average_peaks_functions.R | 4 +++-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R
index 7e8d9fe1..997b1d3c 100644
--- a/DIMS/AveragePeaks.R
+++ b/DIMS/AveragePeaks.R
@@ -12,9 +12,6 @@ tech_reps <- strsplit(techreps, ";")[[1]]
 # load in function scripts
 source(paste0(preprocessing_scripts_dir, "average_peaks_functions.R"))
 
-# set ppm as fixed value, not the same ppm as in peak grouping
-ppm_peak <- 2
-
 # Initialize per sample
 peaklist_allrepl <- NULL
 nr_repl_persample <- 0
diff --git a/DIMS/preprocessing/average_peaks_functions.R b/DIMS/preprocessing/average_peaks_functions.R
index 2545b757..77309acf 100644
--- a/DIMS/preprocessing/average_peaks_functions.R
+++ b/DIMS/preprocessing/average_peaks_functions.R
@@ -7,7 +7,9 @@ average_peaks_per_sample <- function(peaklist_allrepl_sorted) {
   
   # initialize
   averaged_peaks <- peaklist_allrepl_sorted[0, ]
-  
+  # set ppm as fixed value, not the same ppm as in peak grouping
+  ppm_peak <- 2
+
   while (nrow(peaklist_allrepl_sorted) > 1) {
     # store row numbers
     peaklist_allrepl_sorted$rownr <- 1:nrow(peaklist_allrepl_sorted)

From e58640b703250c7c702c1f39a78606827854c890 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Thu, 16 Oct 2025 17:20:12 +0200
Subject: [PATCH 28/32] added parameter sample_name to
 DIMS/preprocessing/average_peaks_functions.R

---
 DIMS/AveragePeaks.R                          | 2 +-
 DIMS/preprocessing/average_peaks_functions.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/DIMS/AveragePeaks.R b/DIMS/AveragePeaks.R
index 997b1d3c..7114e3c4 100644
--- a/DIMS/AveragePeaks.R
+++ b/DIMS/AveragePeaks.R
@@ -32,6 +32,6 @@ peaklist_allrepl_df$height.pkt <- as.numeric(peaklist_allrepl_df$height.pkt)
 peaklist_allrepl_sorted <- peaklist_allrepl_df %>% arrange(mzmed.pkt)
 
 # average over technical replicates
-averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted)
+averaged_peaks <- average_peaks_per_sample(peaklist_allrepl_sorted, sample_name)
 save(averaged_peaks, file = paste0("AvgPeaks_", sample_name, "_", scanmode, ".RData"))
 
diff --git a/DIMS/preprocessing/average_peaks_functions.R b/DIMS/preprocessing/average_peaks_functions.R
index 77309acf..beed29bb 100644
--- a/DIMS/preprocessing/average_peaks_functions.R
+++ b/DIMS/preprocessing/average_peaks_functions.R
@@ -1,4 +1,4 @@
-average_peaks_per_sample <- function(peaklist_allrepl_sorted) {
+average_peaks_per_sample <- function(peaklist_allrepl_sorted, sample_name) {
   #' Average the intensity of peaks that occur in different technical replicates of a biological sample
   #'
   #' @param peaklist_allrepl_sorted: Dataframe with peaks sorted on median m/z (float)

From f0763a012907c191916b5d9fa6eed21be1386e42 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Thu, 16 Oct 2025 17:24:58 +0200
Subject: [PATCH 29/32] modified DIMS/tests/testthat/test_average_peaks.R for
 extra variable sample_name

---
 DIMS/tests/testthat/test_average_peaks.R | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/DIMS/tests/testthat/test_average_peaks.R b/DIMS/tests/testthat/test_average_peaks.R
index 994de48c..bb945b71 100644
--- a/DIMS/tests/testthat/test_average_peaks.R
+++ b/DIMS/tests/testthat/test_average_peaks.R
@@ -14,13 +14,14 @@ testthat::test_that("peaks are correctly averaged", {
   test_peaklist_sorted <- as.data.frame(cbind(samplenr = samplenrs, test_peaklist_sorted_num))
   test_peaklist_sorted[, 2] <- as.numeric(test_peaklist_sorted[, 2])
   test_peaklist_sorted[, 6] <- as.numeric(test_peaklist_sorted[, 6])
+  test_sample_name <- "P001"
 
   # test that first peak is correctly averaged
-  expect_equal(as.numeric(average_peaks_per_sample(test_peaklist_sorted)[1, 6]), 600, tolerance = 0.001, TRUE)
+  expect_equal(as.numeric(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)[1, 6]), 600, tolerance = 0.001, TRUE)
   # test number of rows
-  expect_equal(nrow(average_peaks_per_sample(test_peaklist_sorted)), 2)
+  expect_equal(nrow(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)), 2)
   # test column names
-  expect_equal(colnames(average_peaks_per_sample(test_peaklist_sorted)),
+  expect_equal(colnames(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)),
                c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt"), TRUE)
 })
 

From ac9f43fc6508e5db29beb9d5d10cb8dbaa337e3a Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 17 Oct 2025 14:50:10 +0200
Subject: [PATCH 30/32] added fixture files for unit test for DIMS/EvaluateTics

---
 .../fixtures/test_evaluate_tics_file1.RData       | Bin 0 -> 307 bytes
 .../fixtures/test_evaluate_tics_file2.RData       | Bin 0 -> 301 bytes
 .../fixtures/test_evaluate_tics_file3.RData       | Bin 0 -> 309 bytes
 3 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 DIMS/tests/testthat/fixtures/test_evaluate_tics_file1.RData
 create mode 100644 DIMS/tests/testthat/fixtures/test_evaluate_tics_file2.RData
 create mode 100644 DIMS/tests/testthat/fixtures/test_evaluate_tics_file3.RData

diff --git a/DIMS/tests/testthat/fixtures/test_evaluate_tics_file1.RData b/DIMS/tests/testthat/fixtures/test_evaluate_tics_file1.RData
new file mode 100644
index 0000000000000000000000000000000000000000..6d9f40d89645aeee7de35825602471c5252ff7a0
GIT binary patch
literal 307
zcmV-30nGj%iwFP!000000}FDAFy@NjVqjokW?*3flB_@`18ZoAo2~@|0}B(7!^ptG
zzzL)|3sMua<8v~LOBfiKguyx(nD~G+mt(Tf1qP}D6pNWtGIN0xFA#%-xxiAWN;rT@
z@)C1Xi-Bxms5$|#TBy94rJjL-fw2)9-vX6yVvNSOMCF^BqVmlQ(fH=5d~-DW%+c(#
zFoC)g<`15<%$!t^N8$|(41xTAupo9~S`e>5^&n>Yk3C$O{|~=#WB`YwL!bl8KI1ee
zAkDgwt7;pNW?THQUlT~PPusD0E|BKv?B5{>q&aKS&a?n&u6)}EKY%oMlE~K_Ak7o<
zhVwp<=5;?T7mSt;Sz(C~<QQgH`Yp&WhB5O})1geRq#|H`Dt0T+$b~xa9{>|eRIZH!
F003yQiBA9k

literal 0
HcmV?d00001

diff --git a/DIMS/tests/testthat/fixtures/test_evaluate_tics_file2.RData b/DIMS/tests/testthat/fixtures/test_evaluate_tics_file2.RData
new file mode 100644
index 0000000000000000000000000000000000000000..77d650f943d6314e2650f02db73963f0d1413f73
GIT binary patch
literal 301
zcmV+|0n+{-iwFP!000000}FDAFy@NjVqjokW?*3flB_@`18ZoAo2~@|0}B(7!^ptG
zzzL)|3sMua<8v~LOBfiKguyx(nD~G+mlM;1cm)P(0u+mxQ!;ab6fY2igt@>{s7g41
zO7aqOQ;UIYVW>I*uv)0RnWdhAfq}6R8s7qyZ(@wbw?yTenxgW}4AJ=JsC;uY`^?eo
zvoL|W6Xp+|w9K4TkVoPT3=Dz%f3yu^rvKQ(mHGeh3r7ZUI64G6u<SEVa{|(=8@Z~s
z0cp0y5BoKNH2bt2i{}Dqj?Vraf<T(HChbfMkmky_eeeTFb0>*>%>mLpA#XVE18H9O
z!*aoB>5vtc2tkfvhNa(v{9+h0FEt&?<Vq?6=BHw}@{C-l1OEX4Rpk5RjRODxGf;@;

literal 0
HcmV?d00001

diff --git a/DIMS/tests/testthat/fixtures/test_evaluate_tics_file3.RData b/DIMS/tests/testthat/fixtures/test_evaluate_tics_file3.RData
new file mode 100644
index 0000000000000000000000000000000000000000..21fd5bca99acaa8d9d45524f0c2c149ca2082e5c
GIT binary patch
literal 309
zcmV-50m}X#iwFP!000000}FDAFy@NjVqjokW?*3flB_@`18ZoAo2~@|0}B(7!^ptG
zzzL)|3sMua<8v~LOBfiKguyx(nD~G+mlM;1cm)P(0u+mxQ!;ab6fY2igt@>{s7g41
zO7aqOQ;UIYVW>I*uv)0RnWdhAfq}6R8s7qyZ(@wbw?yTenxgW}4AJ=JsC;uY`^?eo
zvoL|W6Xp+|w9K4TkVoPT3=Dz%f3P5SOcuI8)er`UAJc#A;mZ7f_=O__I2;`U9a#1m
zr#S&>){R_M+kiCN;)ng3K$?Bpj>U6<G)HIu4nZKzS(A3A1xRz{+dlXKq`8wszUBaF
zo{%@3_klF8`(e3Yv~<V{ON1cDFvHSsL4GlenU|UlWpX7I0rOL_TX{w<)Pes16O~Tc
HjRODx(W;7S

literal 0
HcmV?d00001


From 083aeac9cb67609e0a3d9c5db7a484fd92e02e4c Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 22 Dec 2025 11:55:20 +0100
Subject: [PATCH 31/32] added unit test for empty peaklist

---
 DIMS/tests/testthat/test_average_peaks.R | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/DIMS/tests/testthat/test_average_peaks.R b/DIMS/tests/testthat/test_average_peaks.R
index bb945b71..0a8d2345 100644
--- a/DIMS/tests/testthat/test_average_peaks.R
+++ b/DIMS/tests/testthat/test_average_peaks.R
@@ -15,6 +15,7 @@ testthat::test_that("peaks are correctly averaged", {
   test_peaklist_sorted[, 2] <- as.numeric(test_peaklist_sorted[, 2])
   test_peaklist_sorted[, 6] <- as.numeric(test_peaklist_sorted[, 6])
   test_sample_name <- "P001"
+  test_empty_peaklist <- test_peaklist_sorted[0, ]
 
   # test that first peak is correctly averaged
   expect_equal(as.numeric(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)[1, 6]), 600, tolerance = 0.001, TRUE)
@@ -23,5 +24,7 @@ testthat::test_that("peaks are correctly averaged", {
   # test column names
   expect_equal(colnames(average_peaks_per_sample(test_peaklist_sorted, test_sample_name)),
                c("samplenr", "mzmed.pkt", "fq", "mzmin.pkt", "mzmax.pkt", "height.pkt"), TRUE)
+  # test what happens when peak list is empty
+  expect_error(average_peaks_per_sample(test_empty_peaklist, test_sample_name))
 })
 

From e813fe716ed8eae56fcb71c7e6fe45f931b4da7e Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 6 Jan 2026 18:45:16 +0100
Subject: [PATCH 32/32] extra output peak list per technical replicate

---
 DIMS/PeakFinding.R  | 1 +
 DIMS/PeakFinding.nf | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index 2cf49d20..ab62a880 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -54,5 +54,6 @@ for (scanmode in scanmodes) {
  
   # save output to file
   save(integrated_peak_df, file = paste0(techrepl_name, "_", scanmode, ".RData"))
+  write.table(integrated_peak_df, file = paste0(techrepl_name, "_", scanmode, ".txt"), sep = "\t", row.names = FALSE)
 }
 
diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf
index 04c82540..1d02e505 100644
--- a/DIMS/PeakFinding.nf
+++ b/DIMS/PeakFinding.nf
@@ -9,7 +9,8 @@ process PeakFinding {
        each path(sample_techreps)
 
     output:
-       path '*tive.RData', optional: true
+       path '*tive.RData', emit: peaklist_rdata, optional: true
+       path '*tive.txt', optional: true
 
     script:
         """