diff --git a/R/api.R b/R/api.R index 38f2c863ad8d1b45024f1b582ddb19ba28b3a46b..0ebf88ad382ba9a9756d6d557b7bf11fb7167f74 100644 --- a/R/api.R +++ b/R/api.R @@ -383,11 +383,10 @@ conf_trans <- function(conf) { prescreen <- function(m) { ## Top-level auto prescreening function. message("(prescreen): Start.") - ## confpres <- conf_trans_pres(m$conf$prescreen) - m$qa <- NULL - m$out$tab$summ <- NULL - m$qa <- analyse_extracted_data(m$extr,m$conf$prescreen) - m$out$tab$summ <- gen_summ(m$out$tab$comp,m$qa) + m$qa = NULL + m$out$tab$summ = NULL + m$qa = analyse_extracted_data(m$db,m$conf$prescreen) + m$out$tab$summ = gen_summ(m$out$tab$comp,m$qa) message("(prescreen): End.") m } diff --git a/R/extraction.R b/R/extraction.R index 1d12554986812a94278e63c6d9091f0274ca9814..845999d01ed18ed175b40ec1951bc0865c9daf9f 100644 --- a/R/extraction.R +++ b/R/extraction.R @@ -88,7 +88,7 @@ get_fdata <- function(ms) { res$ms2 = fdata[msLevel==2L,.(scan, idx=spIdx, an=acquisitionNum, - rt=retentionTime, + rt=retentionTime/60., intensity=basePeakIntensity, ce=collisionEnergy, prec_mz=precursorMZ, @@ -138,3 +138,155 @@ extract_spectra <- function(ms,cgram_ms2) { res[indices,on=.(scan),precid:=i.precid] } + + +## PRESCREENING + + +## This function extracts intensity maxima on intervals given by +## RT vectors rt_1 and rt_2. +find_ms1_max <- function(rt,intensity,rt_1,rt_2) +{ + mapply(function (rt_1,rt_2) { + rt_ival <- c(rt_1,rt_2) + intv <- findInterval(rt,rt_ival) + lintv = length(intv) + if (intv[1]==0L && intv[lintv] == 2L) { + pos = match(c(1L,2L),intv) + } else if (intv[1]==1L && intv[lintv]!=1L) { + pos = c(1L,match(2L,intv)) + } else if (intv[1]==0L && intv[lintv]!=0L) { + pos = c(match(1L,intv),lintv) + } else { + pos = c(1L,lintv) + } + pmax = pos[[1]] + which.max(intensity[pos[[1]]:pos[[2]]]) - 1L + if (length(pmax)==0L) pmax = pos[[1]] + c(rt[pmax],intensity[pmax]) + }, rt_1, rt_2, USE.NAMES=F) + +} + +analyse_extracted_data <- function(db,prescreen_param) { + ## Note + ## + ## I am working on this two days before the group meeting. The + ## point of a meeting is to have something to show, so I will just + ## minimally adapt the old `analyse_extracted_data' to the new + ## `db' entries in the state. I suspect, even this is not going to + ## be very easy. + ## + ## If no meeting was happening, then I'd create a nice, sleek + ## function that fully adheres to the new `data model' + ## philosophy. Alas, ... + + ms1 = db$extr$cgm$ms1 + ms2 = db$extr$cgm$ms2 + spectra = db$extr$spectra + precursors = db$precursors + + ## Get file info. + ms2_noise_table = precursors[spectra,.(file,intensity),on="precid",by=.EACHI,nomatch=NULL] + + ## Calculate threshold. + ms2_noise_table[,threshold:=0.33333333*mean(intensity),by="file"] + + ## Reduce table. + ms2_noise_table = ms2_noise_table[,.(threshold=first(threshold,1L)),keyby="precid"] + + + + ## Parameters. + presconf = conf_trans_pres(prescreen_param) + rt_shift = presconf$ret_time_shift_tol + det_ms2_noise = presconf$det_ms2_noise + ms2_int_thresh = presconf$ms2_int_thresh + ms1_int_thresh = presconf$ms1_int_thresh + + ## We start populating the ms2 qa table. + tab_ms2 = copy(ms2) + + ## Calculate noise. + tab_ms2[ms2_noise_table,qa_ms2_good_int:=intensity>threshold,on=.(precid)] + + ## Rename as downstream wants it. + setnames(tab_ms2,c("rt","intensity"),c("ms2_rt","ms2_int")) + tab_ms2[,`:=`(rt_left = ms2_rt - rt_shift,rt_right = ms2_rt + rt_shift)] + + ## Get mean ms1 value. + tab_ms1 <- copy(ms1) + tab_ms1_mean <- tab_ms1[,.(ms1_mean=mean(intensity,na.rm=T)),keyby="precid"] + + + + browser() + + ## Perform MS1 maxima calculation in the neighbourhood of each + ## MS2 result. + + tmp = tab_ms1[tab_ms2, + { + xx = find_ms1_max(rt,intensity,i.rt_left,i.rt_right) + .(scan=i.scan, + ms1_rt = xx[1,], + ms1_int = xx[2,]) + },on=.(precid), + by=.EACHI, + nomatch=NULL] + ## Calculate QA values. + tab_ms2[tmp,c("ms1_rt","ms1_int"):=.(i.ms1_rt,i.ms1_int),on=c(BASE_KEY,'an')] + tab_ms2[,c("rt_left","rt_right"):=c(NULL,NULL)] + tab_ms2[tab_ms1_mean,ms1_mean:=i.ms1_mean] + tab_ms2[,`:=`(qa_ms1_good_int=fifelse(ms1_int>ms1_int_thresh,T,F), + qa_ms1_above_noise=F, + qa_ms2_near=F)] + + ## TODO: I wonder if so stupidly auto-calculated ms1 noise should + ## be taken into account at all? My recollection from earlier + ## times was that it was helpful, at least sometimes. + tab_ms2[qa_ms1_good_int==T,qa_ms1_above_noise:=fifelse(ms1_int>ms1_mean/3.,T,F)] + tab_ms2[qa_ms1_good_int==T & qa_ms1_above_noise==T & qa_ms2_good_int==T,qa_ms2_near:=T] + tab_ms2$qa_ms2_exists=T + + + ## Find MS1 with no corresponding MS2. + ms1key <- tab_ms1[,unique(.SD),.SDcol=BASE_KEY] + ms2key <- tab_ms2[,unique(.SD),.SDcol=BASE_KEY] + ms2key$mch=T + tabmatches <- ms2key[ms1key] + ms1woms2 <- tabmatches[is.na(mch)][,mch:=NULL] + + ## calculate the most intense peak, its location and the mean for + ## childless MS1. + tab_noms2 <- tab_ms1[ms1woms2,.(ms1_mean=mean(intensity),ms1_rt=rt[which.max(intensity)],ms1_int=max(intensity)),by=.EACHI,nomatch=NULL] + + ## QA for the above (lazy qa ... take only the max peak into account). + tab_noms2[,c("qa_ms1_good_int","qa_ms1_above_noise"):=.(ms1_int>ms1_int_thresh,ms1_int>ms1_mean/3.)] + + ## MS2 QA criteria all fail. + tab_noms2[,c("qa_ms2_exists","qa_ms2_good_int","qa_ms2_near"):=.(F,F,F)] + + ## Bind MS1-only and MS1/MS2 entries together. + res <- rbind(tab_ms2,tab_noms2,fill=T,use.names=T) + + ## Every single entry which was extracted has at least MS1. + res[,qa_ms1_exists:=T] + data.table::setkeyv(res,BASE_KEY) + + qflg <- QA_FLAGS[!(QA_FLAGS %in% "qa_pass")] + res[,qa_pass:=apply(.SD,1,all),.SDcols=qflg] + res[.(T),del_rt:=abs(ms2_rt - ms1_rt),on="qa_pass",by='an'] + resby <- BASE_KEY_MS2[! (BASE_KEY_MS2 %in% 'an')] + res[.(T),qa_tmp_ms1_max:= ms1_int==max(ms1_int),on="qa_pass",by=resby] + res[,ms2_sel:=F] + res[.(T,T),ms2_sel:= del_rt == del_rt[which.min(del_rt)],on=c("qa_pass","qa_tmp_ms1_max"),by=resby] + res[,qlt_ms1:=apply(.SD,1,function(rw) sum(c(5L,3L,2L)*rw)),.SDcol=c("qa_ms1_exists", + "qa_ms1_above_noise", + "qa_ms1_good_int")] + res[,qlt_ms2:=apply(.SD,1,function(rw) sum(c(5L,3L,2L)*rw)),.SDcol=c("qa_ms2_exists", + "qa_ms2_near", + "qa_ms2_good_int")] + res +} + + diff --git a/R/mix.R b/R/mix.R index 5adef4589d9ad608d3fc04fb4b7a03d7bd9be975..d0dc25558df078b53ad93f2c652d65a9f864abb2 100644 --- a/R/mix.R +++ b/R/mix.R @@ -673,7 +673,7 @@ assess_ms2 <- function(m) { } ## Analyze extracted data. -analyse_extracted_data <- function(extr,prescreen_param) { +analyse_extracted_data_old <- function(extr,prescreen_param) { ms1 <- extr$ms1 ms2 <- extr$ms2 ## Parameters. diff --git a/tests/testthat/test-extraction.R b/tests/testthat/test-extraction.R new file mode 100644 index 0000000000000000000000000000000000000000..28113373829b6ede814f1bb695c9009b83d107a7 --- /dev/null +++ b/tests/testthat/test-extraction.R @@ -0,0 +1,17 @@ +test_that("Test find_ms1_max behaviour.",{ + rt = c(1.,1.5,1.7,3.,5.6,7.,7.1,8.,9.5,10.) + intensity= c(NA,NA,NA,0.1,0.2,0.3,0.19,0.09,NA,NA) + x = find_ms1_max(rt,intensity,0.9,11.) + expect_equal(x[[1]],7.) + expect_equal(x[[2]],0.3) + + rt = c(1.,1.5,1.7,3.,5.6,7.,7.1,8.,9.5,10.) + intensity= c(NA,NA,NA,0.1,0.3,0.2,0.3,0.4,0.25,NA) + x = find_ms1_max(rt,intensity,c(3.,7.1),c(7.,10.)) + expect_equal(x[1,1],5.6) + expect_equal(x[1,2],8.) + expect_equal(x[2,1],0.3) + expect_equal(x[2,2],0.4) + x = find_ms1_max(rt,intensity,1.,1.7) + expect_true(is.na(x[[2]])) +}) diff --git a/tests/testthat/test-integration.R b/tests/testthat/test-integration.R index 628dfadbbe1f20340f7ec10444a368a778881922..3b46dc0ffb0c950440c359dd06e5b26959a0160c 100644 --- a/tests/testthat/test-integration.R +++ b/tests/testthat/test-integration.R @@ -12,5 +12,6 @@ test_that("Extraction returns what is needed.",{ cat = m$db$cat m = run(envopts=eo,m=m,phase="extract") + m = run(envopts=eo,m=m,phase="prescreen") expect_true(1==1) })