From 6f8fc0cb57b9399fd34e2f8267d5058f56397caf Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 15 Jul 2025 17:28:18 +0200 Subject: [PATCH 1/3] refactored HMDBparts, segmentation by mz rather than by nr of lines --- DIMS/HMDBparts.R | 100 +++++++++++++++-------------------------------- 1 file changed, 31 insertions(+), 69 deletions(-) diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R index 232028e6..a7768c72 100755 --- a/DIMS/HMDBparts.R +++ b/DIMS/HMDBparts.R @@ -1,11 +1,9 @@ -# adapted from hmdb_parts.R - # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) db_file <- cmd_args[1] breaks_file <- cmd_args[2] -standard_run <- cmd_args[4] +standard_run <- cmd_args[3] # load file with binning breaks load(breaks_file) @@ -15,8 +13,8 @@ max_mz <- round(breaks_fwhm[length(breaks_fwhm)]) # In case of a standard run use external HMDB parts # m/z is approximately 70 to 600: set limits between 68-71 for min and 599-610 for max if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) { - # skip generating HMDB parts - hmdb_parts_path <- cmd_args[3] + # skip generating HMDB parts; copy them from hmdb_parts_path + hmdb_parts_path <- cmd_args[4] # find all files containing hmdb in file name hmdb_parts <- list.files(hmdb_parts_path, pattern = "hmdb") for (hmdb_file in hmdb_parts) { @@ -25,8 +23,25 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < } else { # generate HMDB parts in case of non-standard mz range load(db_file) - ppm <- as.numeric(cmd_args[5]) + # determine segments of m/z for HMDB parts; smaller parts for m/z < 100 + mz_segments <- c() + segment_start <- min_mz + segment_end <- min_mz + 5 + while (segment_end < max_mz) { + if (segment_start < 100) { + mz_segments <- c(mz_segments, segment_start) + segment_start <- segment_start + 5 + segment_end <- segment_end + 5 + } else { + mz_segments <- c(mz_segments, segment_start) + segment_start <- segment_start + 10 + segment_end <- segment_end + 10 + } + } + #last segment + mz_segments <- c(mz_segments, max_mz) + scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { if (scanmode == "negative") { @@ -38,73 +53,20 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < } # filter mass range meassured - hmdb_add_iso = hmdb_add_iso[which(hmdb_add_iso[ , column_label] >= breaks_fwhm[1] & - hmdb_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + hmdb_add_iso = hmdb_add_iso[which(hmdb_add_iso[ , column_label] >= min_mz & + hmdb_add_iso[ , column_label] <= max_mz), ] # sort on mass - outlist <- hmdb_add_iso[order(as.numeric(hmdb_add_iso[ , column_label])),] - nr_rows <- dim(outlist)[1] - - # maximum number of rows per file - sub <- 20000 - end <- 0 - last_line <- sub - check <- 0 - outlist_part <- NULL + sorted_hmdb_add_iso <- hmdb_add_iso[order(as.numeric(hmdb_add_iso[ , column_label])),] + nr_rows <- dim(sorted_hmdb_add_iso)[1] # create parts and save to file - if (nr_rows < sub) { - outlist_part <- outlist - save(outlist_part, file = paste0(scanmode, "_hmdb.1.RData")) - } else if (nr_rows >= sub & (floor(nr_rows / sub) - 1) >= 2) { - for (i in 2:floor(nr_rows / sub) - 1) { - start <- -(sub - 1) + i * sub - end <- i * sub - - if (i > 1){ - outlist_i = outlist[c(start:end),] - nr_moved = 0 - # Use ppm to replace border to avoid cut within peak group - while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / - as.numeric(outlist_i[1, column_label]) < ppm) { - outlist_part <- rbind(outlist_part, outlist_i[1, ]) - outlist_i <- outlist_i[-1, ] - nr_moved <- nr_moved + 1 - } - - save(outlist_part, file = paste(scanmode, "_", paste("hmdb", i-1, "RData", sep = "."), sep = "")) - check <- check + dim(outlist_part)[1] - - outlist_part <- outlist_i - last_line <- dim(outlist_part)[1] - - } else { - outlist_part <- outlist[c(start:end),] - } - } - - start <- end + 1 - end <- nr_rows - outlist_i <- outlist[c(start:end), ] - nr_moved <- 0 - - if (!is.null(outlist_part)) { - # Calculate ppm and replace border, avoid cut within peak group - while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / - as.numeric(outlist_i[1, column_label]) < ppm) { - outlist_part <- rbind(outlist_part, outlist_i[1, ]) - outlist_i <- outlist_i[-1, ] - nr_moved <- nr_moved + 1 - } - - save(outlist_part, file = paste0(scanmode, "_hmdb.", i, ".RData")) - check <- check + dim(outlist_part)[1] - } - - outlist_part <- outlist_i - save(outlist_part, file = paste0(scanmode, "_hmdb.", i + 1, ".RData")) - check <- check + dim(outlist_part)[1] + for (mz_part_index in 1:(length(mz_segments) - 1)) { + mz_start <- mz_segments[mz_part_index] + mz_end <- mz_segments[mz_part_index + 1] + outlist_part <- sorted_hmdb_add_iso[sorted_hmdb_add_iso[ , column_label] > mz_start & + sorted_hmdb_add_iso[ , column_label] <= mz_end, ] + save(outlist_part, file = paste0(scanmode, "_hmdb.", mz_part_index, ".RData")) } } } - From da004675870b7f30805629183459d1bcb9c58801 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 15 Jul 2025 17:29:08 +0200 Subject: [PATCH 2/3] obsolete parameter ppm removed --- DIMS/HMDBparts.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf index 5f19f750..760b28d0 100644 --- a/DIMS/HMDBparts.nf +++ b/DIMS/HMDBparts.nf @@ -13,6 +13,6 @@ process HMDBparts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.standard_run $params.hmdb_parts_files """ } From c1e79a557eeae1f81c240569ac3721188cd049f7 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Tue, 30 Sep 2025 12:57:27 +0200 Subject: [PATCH 3/3] rewritten selection of segment size for HMDB parts --- DIMS/HMDBparts.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R index a7768c72..8c2234f4 100755 --- a/DIMS/HMDBparts.R +++ b/DIMS/HMDBparts.R @@ -30,14 +30,13 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < segment_end <- min_mz + 5 while (segment_end < max_mz) { if (segment_start < 100) { - mz_segments <- c(mz_segments, segment_start) - segment_start <- segment_start + 5 - segment_end <- segment_end + 5 + segment_size = 5 } else { - mz_segments <- c(mz_segments, segment_start) - segment_start <- segment_start + 10 - segment_end <- segment_end + 10 + segment_size = 10 } + mz_segments <- c(mz_segments, segment_start) + segment_start <- segment_start + segment_size + segment_end <- segment_end + segment_size } #last segment mz_segments <- c(mz_segments, max_mz)