From 2386b50422d17c815b1745f5cef863f141e230dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Todor=20Kondi=C4=87?= <kontrapunkt@uclmail.net> Date: Wed, 29 Mar 2023 23:07:20 +0200 Subject: [PATCH] extr$ms? -> db$extr$cgm$ms? --- R/api.R | 10 +- R/mix.R | 254 ---------------------------------- tests/testthat/helper-state.R | 2 +- tests/testthat/test-metfrag.R | 2 +- tests/testthat/test-state.R | 2 +- 5 files changed, 8 insertions(+), 262 deletions(-) diff --git a/R/api.R b/R/api.R index ac87c6a..617efbb 100644 --- a/R/api.R +++ b/R/api.R @@ -464,8 +464,8 @@ create_plots <- function(m) { ## Select the data nedeed for plotting. flt_summ <- m$out$tab$flt_summ - ms2 <- m$extr$ms2 - ms1 <- m$extr$ms1 + ms2 <- m$db$extr$cgm$ms2 + ms1 <- m$db$extr$cgm$ms1 rt_min <- rt_in_min(m$conf$figures$rt_min) rt_max <- rt_in_min(m$conf$figures$rt_max) keytab <- flt_summ[,unique(.SD),.SDcol=c("adduct","ID")] @@ -606,8 +606,8 @@ report <- function(m) { dir.create(REP_TOPDIR,recursive = T,showWarnings = F) header <- knitr::knit_expand(fn_header) flt_summ <- m$out$tab$reptab - ms2 <- m$extr$ms2 - ms1 <- m$extr$ms1 + ms2 <- m$db$extr$cgm$ms2 + ms1 <- m$db$extr$cgm$ms1 rt_min <- rt_in_min(m$conf$figures$rt_min) rt_max <- rt_in_min(m$conf$figures$rt_max) keytab <- flt_summ[,unique(.SD),.SDcol=c("adduct","ID")] @@ -781,7 +781,7 @@ metfrag <- function(m) { path = m$run$metfrag$path, subpaths = m$run$metfrag$subpaths, db_file = m$run$metfrag$db_file, - stag_tab = stagtab, ms2 = m$extr$ms2, + stag_tab = stagtab, ms2 = m$db$extr$cgm$ms2, runtime=m$run$metfrag$runtime, java_bin=m$run$metfrag$java_bin, nproc = m$conf$metfrag$nproc) diff --git a/R/mix.R b/R/mix.R index 3846cff..4aa85fe 100644 --- a/R/mix.R +++ b/R/mix.R @@ -547,260 +547,6 @@ conf_trans_pres <- function(pres_list) { pres_list } -## PRESCREENING - -create_qa_table <- function(extr,conf_presc) { - ## The first input argument is the extracted `ms`, table - ## containing MS1 and MS2 spectra. The argument `conf_presc` is - ## m$conf$prescreen, the prescreening parameters given in the conf - ## file. - - ## The qa table is just a copy of ms with added quality control - ## columns QA_COLS. - - ## The QA_FLAGS columns are flags specifying which properties of - ## compounds are known well, or not. - - ## For each compound (mass) we ask the following questions: - ## qa_ms1_exists -- does the MS1 spectrum exist at all? - ## qa_ms2_exists -- do we have any MS2 spectra at all? - ## qa_ms1_above_noise -- is MS1 above the noise treshold? - ## qa_ms2_near -- is there any MS2 spectrum inside the tolerated - ## retention time window around the MS1 peak? That is, are we - ## non-RT-shifted? - ## qa_ms2_good_int -- Is there any MS2 spectral intensity greater - ## than the MS2 threshold and less than the MS1 peak? - ## qa_pass -- did the spectrum pass all the checks? - - - ## The columns in QA_NUM_REAL are: - ## - ## ms1_int -- the maximum intensity of MS1 spectrum over the - ## entire run; - ## - ## ms1_rt -- the retention time of the peak MS1. - - ## The columns in QA_NUM_INT are: - ## - ## ms2_sel -- index of the selected MS2 spectrum; if not NA, the - ## associated spectrum passed all the checks (qa_pass == T); the - ## spectrum itself is in one of the member sublists of the `spec' - ## column. The integer `ms2_sel' is then the index of the spectrum - ## in that sublist. - ## - ## ms1_rt_ind -- TODO (but not important to end users). - - - qa <- list(prescreen=conf_presc) - - ## checks <- extr$ms2[,{ - ## },keyby=BASE_KEY_MS2] - checks <- extr$ms2 - checks[,(QA_FLAGS):=F] - ## message("checks:") - ## print(checks) - ## message("done checks") - checks[,(QA_NUM_INT):=NA_integer_] - checks[,(QA_NUM_REAL):=NA_real_] - setkeyv(checks,BASE_KEY_MS2) - qa$checks <- checks - qa -} - -assess_ms1 <- function(m) { - qa <- m$qa - ms1 <- m$extr$ms1 - ## Calculate auxiliary variables and indices. - qa_ms1 <- ms1[,.(ms1_rt_ind=which.max(intensity)),keyby=BASE_KEY] - qa_ms1 <- ms1[qa_ms1,.(ms1_rt_ind=ms1_rt_ind, - ms1_int=intensity[[ms1_rt_ind]], - ms1_rt=rt[[ms1_rt_ind]], - ms1_mean=mean(intensity)),on=BASE_KEY,by=.EACHI] - - - qa_ms1[,qa_ms1_good_int := ms1_int > qa$prescreen$ms1_int_thresh] - qa_ms1[,qa_ms1_above_noise := F] - qa_ms1[qa_ms1_good_int==T,qa_ms1_above_noise := .(ms1_int > qa$prescreen$s2n*ms1_mean)] - - - - - - ## checks[(!qa_ms1_above_noise),c("qa_ms2_good_int","qa_ms2_near","qa_ms2_exists","qa_pass"):=F] - ## qa_ms1 <- check_ms1_noise(check_ms1(qa_ms1)) - m$qa$ms1 <- qa_ms1 - m -} - -assess_ms2 <- function(m) { - - presconf <- conf_trans_pres(m$conf$prescreen) - - ms1 <- m$extr$ms1 - ms2 <- m$extr$ms2 - qa_ms1 <- m$qa$ms1 - qa_ms2 <- ms2[qa_ms1[qa_ms1_above_noise==T],.(CE=unique(CE), - pc_rt=i.ms1_rt, - pc_int=i.ms1_int, - scan=unique(scan)),on=BASE_KEY,by=.EACHI,nomatch=NULL] - - rt_win2 <- presconf$ret_time_shift_tol - qa_ms2 <- ms2[qa_ms2,.(pc_rt=pc_rt, - pc_int=pc_int, - ms2_int=max(intensity), - ms2_rt=unique(rt), - qa_ms2_near=head(rt,1) < pc_rt + rt_win2 & head(rt,1) > pc_rt - rt_win2), - by=.EACHI,on=c(BASE_KEY_MS2,"scan")] - - qa_ms2$qa_ms2_good_int <-F - qa_ms2[qa_ms2_near==T, - qa_ms2_good_int := ms2_int > presconf$ms2_int_thresh & ms2_int < pc_int, - by=c(BASE_KEY_MS2,"scan")] - - - ## qa_ms2$qa_pass <- F - ## qa_ms2[qa_ms2_good_int==T,qa_pass:=T] - qa_ms2$ms2_sel <- F - qa_ms2[qa_ms2_good_int==T,ms2_sel:={ - ind<-which.min(abs(ms2_rt-pc_rt)) - z<-ms2_sel - z[[ind]]<-T - z - },by=BASE_KEY_MS2] - setkeyv(qa_ms2,BASE_KEY_MS2) - m$qa$ms2 <- qa_ms2 - m -} - -## Analyze extracted data. -analyse_extracted_data_old <- function(extr,prescreen_param) { - ms1 <- extr$ms1 - ms2 <- extr$ms2 - ## Parameters. - presconf <- conf_trans_pres(prescreen_param) - rt_shift <- presconf$ret_time_shift_tol - det_ms2_noise <- presconf$det_ms2_noise - ms2_int_thresh <- presconf$ms2_int_thresh - ms1_int_thresh <- presconf$ms1_int_thresh - - ## Detect MS2 noise. - ms2_clc_ns <- if (det_ms2_noise) { - if (ms2_int_thresh>0) { - warning("Ignore user specified ms2_int_thresh value and autodetect noise instead.") - } - ms2[,ms2_thr:=0.33333333*mean(intensity),by="tag"] - - - } else { - ms2[,ms2_thr:=ms2_int_thresh] - - } - - ## message('ms2_clc_ns:',key(ms2_clc_ns)) - ## ms2_clc_ns <- ms2_clc_ns[intensity>ms2_thr] - - - ## We drop mz info. - tab_ms2 <- ms2_clc_ns[,.(ms2_rt=first(rt),ms2_int=max(intensity),ms2_thr=first(ms2_thr)),by=c(BASE_KEY_MS2,'an')] - tab_ms2[,qa_ms2_good_int:=ms2_int>ms2_thr,by="scan"] - data.table::setkeyv(tab_ms2,BASE_KEY_MS2) - tab_ms2[,`:=`(rt_left = ms2_rt - rt_shift,rt_right = ms2_rt + rt_shift)] - - ## Get mean ms1 value. - tab_ms1 <- extr$ms1 - tab_ms1_mean <- tab_ms1[,.(ms1_mean=mean(intensity)),keyby=BASE_KEY] - - ## This function extracts intensity maxima on intervals given by - ## RT vectors rt_1 and rt_2. - find_ms1_max <- function(rt,intensity,rt_1,rt_2) - { - mapply(function (rt_1,rt_2) { - rt_ival <- c(rt_1,rt_2) - intv <- findInterval(rt,rt_ival) - lintv = length(intv) - if (intv[1]==0L && intv[lintv] == 2L) { - pos = match(c(1L,2L),intv) - } else if (intv[1]==1L && intv[lintv]!=1L) { - pos = c(1L,match(2L,intv)) - } else if (intv[1]==0L && intv[lintv]!=0L) { - pos = c(match(1L,intv),lintv) - } else { - pos = c(1L,lintv) - } - pmax = pos[[1]] + which.max(intensity[pos[[1]]:pos[[2]]]) - 1L - c(rt[pmax],intensity[pmax]) - }, rt_1, rt_2, USE.NAMES=F) - - } - - ## Perform MS1 maxima calculation in the neighbourhood of each - ## MS2 result. - tmp = tab_ms1[tab_ms2,{ - xx = find_ms1_max(rt,intensity,i.rt_left,i.rt_right) - .(scan=i.scan, - ms1_rt = xx[1,], - ms1_int = xx[2,]) - },by=.EACHI, nomatch=NULL] - ## Calculate QA values. - tab_ms2[tmp,c("ms1_rt","ms1_int"):=.(i.ms1_rt,i.ms1_int),on=c(BASE_KEY,'an')] - tab_ms2[,c("rt_left","rt_right"):=c(NULL,NULL)] - tab_ms2[tab_ms1_mean,ms1_mean:=i.ms1_mean] - tab_ms2[,`:=`(qa_ms1_good_int=fifelse(ms1_int>ms1_int_thresh,T,F), - qa_ms1_above_noise=F, - qa_ms2_near=F)] - - ## TODO: I wonder if so stupidly auto-calculated ms1 noise should - ## be taken into account at all? My recollection from earlier - ## times was that it was helpful, at least sometimes. - tab_ms2[qa_ms1_good_int==T,qa_ms1_above_noise:=fifelse(ms1_int>ms1_mean/3.,T,F)] - tab_ms2[qa_ms1_good_int==T & qa_ms1_above_noise==T & qa_ms2_good_int==T,qa_ms2_near:=T] - tab_ms2$qa_ms2_exists=T - - - ## Find MS1 with no corresponding MS2. - ms1key <- tab_ms1[,unique(.SD),.SDcol=BASE_KEY] - ms2key <- tab_ms2[,unique(.SD),.SDcol=BASE_KEY] - ms2key$mch=T - tabmatches <- ms2key[ms1key] - ms1woms2 <- tabmatches[is.na(mch)][,mch:=NULL] - - ## calculate the most intense peak, its location and the mean for - ## childless MS1. - tab_noms2 <- tab_ms1[ms1woms2,.(ms1_mean=mean(intensity),ms1_rt=rt[which.max(intensity)],ms1_int=max(intensity)),by=.EACHI,nomatch=NULL] - - ## QA for the above (lazy qa ... take only the max peak into account). - tab_noms2[,c("qa_ms1_good_int","qa_ms1_above_noise"):=.(ms1_int>ms1_int_thresh,ms1_int>ms1_mean/3.)] - - ## MS2 QA criteria all fail. - tab_noms2[,c("qa_ms2_exists","qa_ms2_good_int","qa_ms2_near"):=.(F,F,F)] - - ## Bind MS1-only and MS1/MS2 entries together. - res <- rbind(tab_ms2,tab_noms2,fill=T,use.names=T) - - ## Every single entry which was extracted has at least MS1. - res[,qa_ms1_exists:=T] - data.table::setkeyv(res,BASE_KEY) - - qflg <- QA_FLAGS[!(QA_FLAGS %in% "qa_pass")] - res[,qa_pass:=apply(.SD,1,all),.SDcols=qflg] - res[.(T),del_rt:=abs(ms2_rt - ms1_rt),on="qa_pass",by='an'] - resby <- BASE_KEY_MS2[! (BASE_KEY_MS2 %in% 'an')] - res[.(T),qa_tmp_ms1_max:= ms1_int==max(ms1_int),on="qa_pass",by=resby] - res[,ms2_sel:=F] - res[.(T,T),ms2_sel:= del_rt == del_rt[which.min(del_rt)],on=c("qa_pass","qa_tmp_ms1_max"),by=resby] - res[,qlt_ms1:=apply(.SD,1,function(rw) sum(c(5L,3L,2L)*rw)),.SDcol=c("qa_ms1_exists", - "qa_ms1_above_noise", - "qa_ms1_good_int")] - res[,qlt_ms2:=apply(.SD,1,function(rw) sum(c(5L,3L,2L)*rw)),.SDcol=c("qa_ms2_exists", - "qa_ms2_near", - "qa_ms2_good_int")] - res -} - - - - - gen_mz_err_f <- function(entry,msg) { eppm <- grab_unit(entry,"ppm") eda <- grab_unit(entry,"Da") diff --git a/tests/testthat/helper-state.R b/tests/testthat/helper-state.R index 7585ac0..48c9abf 100644 --- a/tests/testthat/helper-state.R +++ b/tests/testthat/helper-state.R @@ -28,7 +28,7 @@ make_dummy_mf_project <- function() { kkk = STATE_DATA$out$tab$summ[,.(ii=.GRP),by=key(STATE_DATA$out$tab$summ)][,ii:=NULL][1:4] m$out$tab$summ = STATE_DATA$out$tab$summ[kkk,on=key(STATE_DATA$out$tab$summ),nomatch=NULL] - m$extr = STATE_DATA$extr + m$db$extr = STATE_DATA$db$extr m } diff --git a/tests/testthat/test-metfrag.R b/tests/testthat/test-metfrag.R index adf762e..1e1d89f 100644 --- a/tests/testthat/test-metfrag.R +++ b/tests/testthat/test-metfrag.R @@ -68,7 +68,7 @@ ok_return_val("metfrag_run",{ path = m$run$metfrag$path, subpaths = m$run$metfrag$subpaths, db_file = m$run$metfrag$db_file, - stag_tab = stagtab, ms2 = m$extr$ms2, + stag_tab = stagtab, ms2 = m$db$extr$cgm$ms2, runtime=m$run$metfrag$runtime, java_bin=m$run$metfrag$java_bin, nproc = 2) diff --git a/tests/testthat/test-state.R b/tests/testthat/test-state.R index 26f6146..1d5f58f 100644 --- a/tests/testthat/test-state.R +++ b/tests/testthat/test-state.R @@ -12,7 +12,7 @@ test_that("Test empty project creation.",{ test_that("pack_ms2_w_summ",{ summ = STATE_DATA$out$tab$summ - ms2 = STATE_DATA$extr$ms2 + ms2 = STATE_DATA$db$extr$cgm$ms2 res = pack_ms2_w_summ(summ,ms2) expect_snapshot(res[1]) expect_snapshot(res[2]) -- GitLab