From 2386b50422d17c815b1745f5cef863f141e230dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Todor=20Kondi=C4=87?= <kontrapunkt@uclmail.net>
Date: Wed, 29 Mar 2023 23:07:20 +0200
Subject: [PATCH] extr$ms? -> db$extr$cgm$ms?

---
 R/api.R                       |  10 +-
 R/mix.R                       | 254 ----------------------------------
 tests/testthat/helper-state.R |   2 +-
 tests/testthat/test-metfrag.R |   2 +-
 tests/testthat/test-state.R   |   2 +-
 5 files changed, 8 insertions(+), 262 deletions(-)

diff --git a/R/api.R b/R/api.R
index ac87c6a..617efbb 100644
--- a/R/api.R
+++ b/R/api.R
@@ -464,8 +464,8 @@ create_plots <- function(m) {
 
     ## Select the data nedeed for plotting.
     flt_summ <- m$out$tab$flt_summ
-    ms2 <- m$extr$ms2
-    ms1 <- m$extr$ms1
+    ms2 <- m$db$extr$cgm$ms2
+    ms1 <- m$db$extr$cgm$ms1
     rt_min <- rt_in_min(m$conf$figures$rt_min)
     rt_max <- rt_in_min(m$conf$figures$rt_max)
     keytab <- flt_summ[,unique(.SD),.SDcol=c("adduct","ID")]
@@ -606,8 +606,8 @@ report <- function(m) {
     dir.create(REP_TOPDIR,recursive = T,showWarnings = F)
     header <- knitr::knit_expand(fn_header)
     flt_summ <- m$out$tab$reptab
-    ms2 <- m$extr$ms2
-    ms1 <- m$extr$ms1
+    ms2 <- m$db$extr$cgm$ms2
+    ms1 <- m$db$extr$cgm$ms1
     rt_min <- rt_in_min(m$conf$figures$rt_min)
     rt_max <- rt_in_min(m$conf$figures$rt_max)
     keytab <- flt_summ[,unique(.SD),.SDcol=c("adduct","ID")]
@@ -781,7 +781,7 @@ metfrag <- function(m) {
                        path = m$run$metfrag$path,
                        subpaths = m$run$metfrag$subpaths,
                        db_file = m$run$metfrag$db_file,
-                       stag_tab = stagtab, ms2 = m$extr$ms2,
+                       stag_tab = stagtab, ms2 = m$db$extr$cgm$ms2,
                        runtime=m$run$metfrag$runtime,
                        java_bin=m$run$metfrag$java_bin,
                        nproc = m$conf$metfrag$nproc)
diff --git a/R/mix.R b/R/mix.R
index 3846cff..4aa85fe 100644
--- a/R/mix.R
+++ b/R/mix.R
@@ -547,260 +547,6 @@ conf_trans_pres <- function(pres_list) {
     pres_list
 }
 
-## PRESCREENING
-
-create_qa_table <- function(extr,conf_presc) {
-    ## The first input argument is the extracted `ms`, table
-    ## containing MS1 and MS2 spectra. The argument `conf_presc` is
-    ## m$conf$prescreen, the prescreening parameters given in the conf
-    ## file.
-
-    ## The qa table is just a copy of ms with added quality control
-    ## columns QA_COLS.
-
-    ## The QA_FLAGS columns are flags specifying which properties of
-    ## compounds are known well, or not.
-
-    ## For each compound (mass) we ask the following questions:
-    ## qa_ms1_exists -- does the MS1 spectrum exist at all?
-    ## qa_ms2_exists -- do we have any MS2 spectra at all?
-    ## qa_ms1_above_noise -- is MS1 above the noise treshold?
-    ## qa_ms2_near -- is there any MS2 spectrum inside the tolerated
-    ## retention time window around the MS1 peak? That is, are we
-    ## non-RT-shifted?
-    ## qa_ms2_good_int -- Is there any MS2 spectral intensity greater
-    ## than the MS2 threshold and less than the MS1 peak?
-    ## qa_pass -- did the spectrum pass all the checks?
-    
-
-    ## The columns in QA_NUM_REAL are:
-    ## 
-    ## ms1_int -- the maximum intensity of MS1 spectrum over the
-    ## entire run;
-    ##
-    ## ms1_rt -- the retention time of the peak MS1.
-
-    ## The columns in QA_NUM_INT are:
-    ##
-    ## ms2_sel -- index of the selected MS2 spectrum; if not NA, the
-    ## associated spectrum passed all the checks (qa_pass == T); the
-    ## spectrum itself is in one of the member sublists of the `spec'
-    ## column. The integer `ms2_sel' is then the index of the spectrum
-    ## in that sublist.
-    ##
-    ## ms1_rt_ind -- TODO (but not important to end users).
-    
-    
-    qa <- list(prescreen=conf_presc)
-    
-    ## checks <- extr$ms2[,{
-    ## },keyby=BASE_KEY_MS2]
-    checks <- extr$ms2
-    checks[,(QA_FLAGS):=F]
-    ## message("checks:")
-    ## print(checks)
-    ## message("done checks")
-    checks[,(QA_NUM_INT):=NA_integer_]
-    checks[,(QA_NUM_REAL):=NA_real_]
-    setkeyv(checks,BASE_KEY_MS2)
-    qa$checks <- checks
-    qa
-}
-
-assess_ms1 <- function(m) {
-    qa <- m$qa
-    ms1 <- m$extr$ms1
-    ## Calculate auxiliary variables and indices.
-    qa_ms1 <- ms1[,.(ms1_rt_ind=which.max(intensity)),keyby=BASE_KEY]
-    qa_ms1 <- ms1[qa_ms1,.(ms1_rt_ind=ms1_rt_ind,
-                           ms1_int=intensity[[ms1_rt_ind]],
-                           ms1_rt=rt[[ms1_rt_ind]],
-                           ms1_mean=mean(intensity)),on=BASE_KEY,by=.EACHI]
-
-    
-    qa_ms1[,qa_ms1_good_int := ms1_int > qa$prescreen$ms1_int_thresh]
-    qa_ms1[,qa_ms1_above_noise := F]
-    qa_ms1[qa_ms1_good_int==T,qa_ms1_above_noise := .(ms1_int > qa$prescreen$s2n*ms1_mean)]
-    
-    
-    
-
-    
-    ## checks[(!qa_ms1_above_noise),c("qa_ms2_good_int","qa_ms2_near","qa_ms2_exists","qa_pass"):=F]
-    ## qa_ms1 <- check_ms1_noise(check_ms1(qa_ms1))
-    m$qa$ms1 <- qa_ms1
-    m
-}
-
-assess_ms2 <- function(m) {
-
-    presconf <- conf_trans_pres(m$conf$prescreen)
-
-    ms1 <- m$extr$ms1
-    ms2 <- m$extr$ms2
-    qa_ms1 <- m$qa$ms1
-    qa_ms2 <- ms2[qa_ms1[qa_ms1_above_noise==T],.(CE=unique(CE),
-                                                  pc_rt=i.ms1_rt,
-                                                  pc_int=i.ms1_int,
-                                                  scan=unique(scan)),on=BASE_KEY,by=.EACHI,nomatch=NULL]
-
-    rt_win2 <- presconf$ret_time_shift_tol
-    qa_ms2 <- ms2[qa_ms2,.(pc_rt=pc_rt,
-                           pc_int=pc_int,
-                           ms2_int=max(intensity),
-                           ms2_rt=unique(rt),
-                           qa_ms2_near=head(rt,1) < pc_rt + rt_win2 & head(rt,1) > pc_rt - rt_win2),
-                  by=.EACHI,on=c(BASE_KEY_MS2,"scan")]
-
-    qa_ms2$qa_ms2_good_int <-F
-    qa_ms2[qa_ms2_near==T,
-           qa_ms2_good_int := ms2_int > presconf$ms2_int_thresh & ms2_int < pc_int,
-           by=c(BASE_KEY_MS2,"scan")]
-
-
-    ## qa_ms2$qa_pass <- F
-    ## qa_ms2[qa_ms2_good_int==T,qa_pass:=T]
-    qa_ms2$ms2_sel <- F
-    qa_ms2[qa_ms2_good_int==T,ms2_sel:={
-        ind<-which.min(abs(ms2_rt-pc_rt))
-        z<-ms2_sel
-        z[[ind]]<-T
-        z
-    },by=BASE_KEY_MS2]
-    setkeyv(qa_ms2,BASE_KEY_MS2)
-    m$qa$ms2 <- qa_ms2
-    m    
-}
-
-## Analyze extracted data.
-analyse_extracted_data_old <- function(extr,prescreen_param) {
-    ms1 <- extr$ms1
-    ms2 <- extr$ms2
-    ## Parameters.
-    presconf <- conf_trans_pres(prescreen_param)
-    rt_shift <- presconf$ret_time_shift_tol
-    det_ms2_noise <- presconf$det_ms2_noise
-    ms2_int_thresh <- presconf$ms2_int_thresh
-    ms1_int_thresh <- presconf$ms1_int_thresh
-
-    ## Detect MS2 noise.
-    ms2_clc_ns <- if (det_ms2_noise) {
-                      if (ms2_int_thresh>0) {
-                          warning("Ignore user specified ms2_int_thresh value and autodetect noise instead.")
-                      }
-                      ms2[,ms2_thr:=0.33333333*mean(intensity),by="tag"]
-
-                          
-                  } else {
-                      ms2[,ms2_thr:=ms2_int_thresh]
-                      
-                  }
-
-    ## message('ms2_clc_ns:',key(ms2_clc_ns))
-    ## ms2_clc_ns <- ms2_clc_ns[intensity>ms2_thr]
-
-
-    ## We drop mz info.
-    tab_ms2 <- ms2_clc_ns[,.(ms2_rt=first(rt),ms2_int=max(intensity),ms2_thr=first(ms2_thr)),by=c(BASE_KEY_MS2,'an')]
-    tab_ms2[,qa_ms2_good_int:=ms2_int>ms2_thr,by="scan"]
-    data.table::setkeyv(tab_ms2,BASE_KEY_MS2)
-    tab_ms2[,`:=`(rt_left = ms2_rt - rt_shift,rt_right = ms2_rt + rt_shift)]
-
-    ## Get mean ms1 value.
-    tab_ms1 <- extr$ms1
-    tab_ms1_mean <- tab_ms1[,.(ms1_mean=mean(intensity)),keyby=BASE_KEY]
-
-    ## This function extracts intensity maxima on intervals given by
-    ## RT vectors rt_1 and rt_2.
-    find_ms1_max <- function(rt,intensity,rt_1,rt_2)
-    {
-        mapply(function (rt_1,rt_2) {
-            rt_ival <- c(rt_1,rt_2)
-            intv <- findInterval(rt,rt_ival)
-            lintv = length(intv)
-            if (intv[1]==0L && intv[lintv] == 2L) {
-                pos = match(c(1L,2L),intv)
-            } else if (intv[1]==1L && intv[lintv]!=1L) {
-                pos = c(1L,match(2L,intv))
-            } else if (intv[1]==0L && intv[lintv]!=0L) {
-                pos = c(match(1L,intv),lintv)
-            } else {
-                pos = c(1L,lintv)
-            }
-            pmax = pos[[1]] + which.max(intensity[pos[[1]]:pos[[2]]]) - 1L
-            c(rt[pmax],intensity[pmax])
-        }, rt_1, rt_2, USE.NAMES=F)
-        
-    }
-
-    ## Perform MS1 maxima calculation in the neighbourhood of each
-    ## MS2 result.
-    tmp = tab_ms1[tab_ms2,{
-        xx = find_ms1_max(rt,intensity,i.rt_left,i.rt_right)
-        .(scan=i.scan,
-          ms1_rt = xx[1,],
-          ms1_int = xx[2,])
-    },by=.EACHI, nomatch=NULL]
-    ## Calculate QA values.
-    tab_ms2[tmp,c("ms1_rt","ms1_int"):=.(i.ms1_rt,i.ms1_int),on=c(BASE_KEY,'an')]
-    tab_ms2[,c("rt_left","rt_right"):=c(NULL,NULL)]
-    tab_ms2[tab_ms1_mean,ms1_mean:=i.ms1_mean]
-    tab_ms2[,`:=`(qa_ms1_good_int=fifelse(ms1_int>ms1_int_thresh,T,F),
-               qa_ms1_above_noise=F,
-               qa_ms2_near=F)]
-
-    ## TODO: I wonder if so stupidly auto-calculated ms1 noise should
-    ## be taken into account at all? My recollection from earlier
-    ## times was that it was helpful, at least sometimes.
-    tab_ms2[qa_ms1_good_int==T,qa_ms1_above_noise:=fifelse(ms1_int>ms1_mean/3.,T,F)]
-    tab_ms2[qa_ms1_good_int==T & qa_ms1_above_noise==T & qa_ms2_good_int==T,qa_ms2_near:=T]
-    tab_ms2$qa_ms2_exists=T
-
-
-    ## Find MS1 with no corresponding MS2.
-    ms1key <- tab_ms1[,unique(.SD),.SDcol=BASE_KEY]
-    ms2key <- tab_ms2[,unique(.SD),.SDcol=BASE_KEY]
-    ms2key$mch=T
-    tabmatches <- ms2key[ms1key]
-    ms1woms2 <- tabmatches[is.na(mch)][,mch:=NULL]
-
-    ## calculate the most intense peak, its location and the mean for
-    ## childless MS1.
-    tab_noms2 <- tab_ms1[ms1woms2,.(ms1_mean=mean(intensity),ms1_rt=rt[which.max(intensity)],ms1_int=max(intensity)),by=.EACHI,nomatch=NULL]
-
-    ## QA for the above (lazy qa ... take only the max peak into account).
-    tab_noms2[,c("qa_ms1_good_int","qa_ms1_above_noise"):=.(ms1_int>ms1_int_thresh,ms1_int>ms1_mean/3.)]
-
-    ## MS2 QA criteria all fail.
-    tab_noms2[,c("qa_ms2_exists","qa_ms2_good_int","qa_ms2_near"):=.(F,F,F)]
-
-    ## Bind MS1-only and MS1/MS2 entries together.
-    res <- rbind(tab_ms2,tab_noms2,fill=T,use.names=T)
-
-    ## Every single entry which was extracted has at least MS1.
-    res[,qa_ms1_exists:=T]
-    data.table::setkeyv(res,BASE_KEY)
-    
-    qflg <- QA_FLAGS[!(QA_FLAGS %in% "qa_pass")]
-    res[,qa_pass:=apply(.SD,1,all),.SDcols=qflg]
-    res[.(T),del_rt:=abs(ms2_rt - ms1_rt),on="qa_pass",by='an']
-    resby <- BASE_KEY_MS2[! (BASE_KEY_MS2 %in% 'an')]
-    res[.(T),qa_tmp_ms1_max:= ms1_int==max(ms1_int),on="qa_pass",by=resby]
-    res[,ms2_sel:=F]
-    res[.(T,T),ms2_sel:= del_rt == del_rt[which.min(del_rt)],on=c("qa_pass","qa_tmp_ms1_max"),by=resby]
-    res[,qlt_ms1:=apply(.SD,1,function(rw) sum(c(5L,3L,2L)*rw)),.SDcol=c("qa_ms1_exists",
-                                                                 "qa_ms1_above_noise",
-                                                                 "qa_ms1_good_int")]
-    res[,qlt_ms2:=apply(.SD,1,function(rw) sum(c(5L,3L,2L)*rw)),.SDcol=c("qa_ms2_exists",
-                                                                 "qa_ms2_near",
-                                                                 "qa_ms2_good_int")]
-    res
-}
-
-
-
-
-
 gen_mz_err_f <- function(entry,msg) {
     eppm <- grab_unit(entry,"ppm")
     eda <- grab_unit(entry,"Da")
diff --git a/tests/testthat/helper-state.R b/tests/testthat/helper-state.R
index 7585ac0..48c9abf 100644
--- a/tests/testthat/helper-state.R
+++ b/tests/testthat/helper-state.R
@@ -28,7 +28,7 @@ make_dummy_mf_project <- function()  {
     kkk = STATE_DATA$out$tab$summ[,.(ii=.GRP),by=key(STATE_DATA$out$tab$summ)][,ii:=NULL][1:4]
     
     m$out$tab$summ = STATE_DATA$out$tab$summ[kkk,on=key(STATE_DATA$out$tab$summ),nomatch=NULL]
-    m$extr = STATE_DATA$extr
+    m$db$extr = STATE_DATA$db$extr
     m
     
 }
diff --git a/tests/testthat/test-metfrag.R b/tests/testthat/test-metfrag.R
index adf762e..1e1d89f 100644
--- a/tests/testthat/test-metfrag.R
+++ b/tests/testthat/test-metfrag.R
@@ -68,7 +68,7 @@ ok_return_val("metfrag_run",{
                                path = m$run$metfrag$path,
                                subpaths = m$run$metfrag$subpaths,
                                db_file = m$run$metfrag$db_file,
-                               stag_tab = stagtab, ms2 = m$extr$ms2,
+                               stag_tab = stagtab, ms2 = m$db$extr$cgm$ms2,
                                runtime=m$run$metfrag$runtime,
                                java_bin=m$run$metfrag$java_bin,
                                nproc = 2)
diff --git a/tests/testthat/test-state.R b/tests/testthat/test-state.R
index 26f6146..1d5f58f 100644
--- a/tests/testthat/test-state.R
+++ b/tests/testthat/test-state.R
@@ -12,7 +12,7 @@ test_that("Test empty project creation.",{
 
 test_that("pack_ms2_w_summ",{
     summ = STATE_DATA$out$tab$summ
-    ms2 = STATE_DATA$extr$ms2
+    ms2 = STATE_DATA$db$extr$cgm$ms2
     res = pack_ms2_w_summ(summ,ms2)
     expect_snapshot(res[1])
     expect_snapshot(res[2])
-- 
GitLab