From dde03e91b150a2d7f6804eda4765503677c05f4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Todor=20Kondi=C4=87?= <todor.kondic@uni.lu>
Date: Tue, 15 Dec 2020 15:09:49 +0100
Subject: [PATCH] feature: enable processing of mzMLs with missing precursors

The extraction routine was modified to first scan for missing
precursors. If there are such, then assign as precursors the nearest
preceeding MS1 signature (in exactly the same way RMassBank is doing
it). Another option is to just omit MS2 with no precursors.
---
 R/api.R        |  8 ++++--
 R/extraction.R | 23 +++++++++++++--
 R/mix.R        | 77 +++++++++++++++++++++++++++++++++++++++++++++++++-
 R/resources.R  |  2 ++
 4 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/R/api.R b/R/api.R
index 57b3416..d5e807e 100644
--- a/R/api.R
+++ b/R/api.R
@@ -317,6 +317,8 @@ extr_data <- function(m) {
     file <- m$out$tab$data[,unique(file)]
     allCEs <- do.call(c,args=lapply(file,function(fn) {
         z <- MSnbase::readMSData(files=fn,msLevel = c(1,2),mode="onDisk")
+
+        
         unique(MSnbase::collisionEnergy(z),fromLast=T)
         
     }))
@@ -348,14 +350,16 @@ extr_data <- function(m) {
         
         
         err_rt <- m$conf$tolerance$rt
-        
+
+        missing_precursor_info <- m$conf$extract$missing_precursor_info
         x <- futuref(extract(fn=fn,
                              tag=the_tag,
                              tab=tab,
                              err_ms1_eic=err_ms1_eic,
                              err_coarse = err_coarse,
                              err_fine= err_fine,
-                             err_rt= err_rt),
+                             err_rt= err_rt,
+                             missing_precursors = missing_precursor_info),
                      lazy = F)
 
         x
diff --git a/R/extraction.R b/R/extraction.R
index a1624e3..6e90f73 100644
--- a/R/extraction.R
+++ b/R/extraction.R
@@ -490,7 +490,7 @@ extr_eic_ms1 <- function(tab,err) {
 }
 
 ##' @export
-extract <- function(fn,tag,tab,err_ms1_eic.,err_coarse,err_fine,err_rt.) {
+extract <- function(fn,tag,tab,err_ms1_eic.,err_coarse,err_fine,err_rt.,missing_precursors) {
     ## Extracts MS1 and MS2 EICs, as well as MS2 spectra, subject to
     ## tolerance specifications.
 
@@ -530,6 +530,9 @@ extract <- function(fn,tag,tab,err_ms1_eic.,err_coarse,err_fine,err_rt.) {
         ms2 <- MSnbase::readMSData(file=fn,msLevel=2,mode="onDisk")
         ms2
     }
+    read_all <- function() {
+        MSnbase::readMSData(file=fn,msLevel. = c(1,2),mode="onDisk")
+    }
     extr_ms1_eic <- function(ms1) {
         eic <- MSnbase::chromatogram(ms1,mz=mzrng,msLevel=1,missing=0.0,rt=rtrng)
         bits <- dtable(N=sapply(eic,NROW))
@@ -555,8 +558,22 @@ extract <- function(fn,tag,tab,err_ms1_eic.,err_coarse,err_fine,err_rt.) {
         data.table::setkeyv(res,BASE_KEY)
         res
     }
-    ms1 <- read_ms1()
-    ms2 <- read_ms2()
+
+    
+    if (missing_precursors != "do_nothing") {
+        ms <- clean_na_precs(read_all(),missing_precursors = missing_precursors)
+        fdta <- as.data.table(MSnbase::fData(ms),keep.rownames = "rn")
+        fdta[,idx := .I]
+        idx_ms1 <- fdta[msLevel == 1,idx]
+        idx_ms2 <- fdta[msLevel == 2,idx]
+        
+        ms1 <- ms1[idx_ms1]
+        ms2 <- ms2[idx_ms2]
+    
+    } else {
+        ms1 <- read_ms1()
+        ms2 <- read_ms2()
+    }
     res_ms1 <- extr_ms1_eic(ms1)
     res_ms2 <- extr_ms2(ms1=ms1,
                         ms2=ms2,
diff --git a/R/mix.R b/R/mix.R
index 5825378..20b5531 100644
--- a/R/mix.R
+++ b/R/mix.R
@@ -793,7 +793,6 @@ base_conf <- function () {
                    compounds=list(lists=list(),
                                   sets="",
                                   data=""),
-                   extr=list(fn=""),
                    debug = F)
     m
 }
@@ -803,6 +802,8 @@ extr_conf <- function(m) {
                              "ms1 fine"=MS1_ERR_FINE,
                              "eic"=EIC_ERR,
                              "rt"=RT_EXTR_ERR)
+    m$conf$extract <- list(missing_precursor_info=DEF_CONF_MISSING_PCS)
+
     m
 }
 
@@ -1614,3 +1615,77 @@ gen_key_plot_tab <- function(m) {
             by=plot_key]
 
 }
+
+enum_na_scans <- function(vec) {
+    ## Canibalised .locf function from RMassBank.
+    tmp <- !is.na(vec)
+    na_idx <- cumsum(tmp)
+    
+    ## The construct below deals with the fact that vec[0] ==
+    ## integer(0) and not NA...  Therefore we retrieve indexes to
+    ## c(NA,vec) from c(0, which(vec.isna)) + 1 (the +1 shifts all
+    ## values so the 0s point to NA).
+
+    c(NA,vec)[c(0,which(tmp))[na_idx+1]+1]
+    
+}
+
+fix_na_precs <- function(ms) {
+    fd_df <- MSnbase::fData(ms)
+    fd <- data.table::as.data.table(fd_df, keep.rownames = "rn")
+    ## Reset precursor numbers.
+    fd[,precursorScanNum := NA_real_]
+    ## Temporarily label MS1 precursors using their acquisition
+    ## numbers.
+    fd[msLevel == 1, precursorScanNum := acquisitionNum]
+    ## Now populate the MS2 precursors based on MS1 scan numbers.
+    fd[,precursorScanNum := enum_na_scans(precursorScanNum)]
+    ## Reset MS1 precursors.
+    fd[msLevel == 1, precursorScanNum := 0]
+    ## Drop remaining MS2 NAs.
+    notna_idx <- fd[!is.na(precursorScanNum),.I]
+    fd <- fd[notna_idx]
+    notna_rns <- fd[,rn]
+    ms <- ms[notna_idx]
+    fd_res <- as.data.frame(fd)
+    ## Fix rownames.
+    rownames(fd_res) <- notna_rns
+    ## Modify header data for `ms'.
+    MSnbase::fData(ms) <- fd_res
+    ms
+    
+}
+
+omit_na_precs <- function(ms) {
+    fd_df <- MSnbase::fData(ms)
+    fd <- data.table::as.data.table(fd_df, keep.rownames = "rn")
+    fd[,idx:=.I]
+    idx <- fd[msLevel == 1 | msLevel == 2 & !is.na(precursorScanNum),idx]
+    ms[idx]
+    
+}
+
+clean_na_precs <- function(ms,missing_precursors) {
+    fd <- data.table::as.data.table(MSnbase::fData(ms))
+    na_pcs <- fd[msLevel==2 & is.na(precursorScanNum),.I]
+    pc_na_info <- missing_precursors
+
+    if (length(na_pcs)>0) {
+            warning("There are MS2 spectra with missing precursor entries in your data. Consider setting missing_precursor_info to `fill', or `omit'.")
+            message("Current missing_precursor_info value is: ", pc_na_info)
+        }
+        
+        
+    ms <- if (pc_na_info == "fill") {
+             fix_na_precs(ms)
+         } else if (pc_na_info == "omit") {
+             omit_na_precs(ms)
+         } else if (pc_na_info == "do_nothing") {
+             ms
+         } else {
+             stop("Fatal: m$conf$missing_precursor_info value ",pc_na_info, "is not one of `fill', `omit', or `do_nothing`.")
+         }
+
+    ms
+    
+}
diff --git a/R/resources.R b/R/resources.R
index 965bf02..0ef98f8 100644
--- a/R/resources.R
+++ b/R/resources.R
@@ -262,3 +262,5 @@ EMPTY_COMP_TAB <- dtable(ID=character(),
                          Formula=character(),
                          file=character())
 
+
+DEF_CONF_MISSING_PCS <- "do_nothing"
-- 
GitLab