Newer
Older
## Copyright (C) 2020 by University of Luxembourg
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
## http://www.apache.org/licenses/LICENSE-2.0
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
stripext<-function(fn) {
bits<-strsplit(fn,split="\\.")[[1]]
if (length(bits)> 1) paste(head(bits,-1),collapse=".") else fn}
get_mz_cmp_l<-function(id,adduct,cmpL) {
res<-if (!is.null(mz) && !is.na(mz)) {
mde<-as.character(adduct)
wh<-ADDUCTMAP[[mde]]
RChemMass::getSuspectFormulaMass(smiles)[[wh]]
} else stop("Both SMILES and mz fields, for ID ",id,", found empty in the compound list. Aborting.")
calc_mz_from_formula_outer <- function(chform,adduct,id) {
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
check_chform <- enviPat::check_chemform(ISOTOPES,chform)
wind <- which(check_chform$warning)
if (length(wind) > 0) stop("Cannot understand the following formulas: ",
paste(check_chform$new_formula[wind],collapse = ","))
mol_form <- check_chform$new_formula
l_mol <- length(mol_form)
l_add <- length(adduct)
adds <- ADDUCTS[Name %in% adduct,.(Name,
add=as.character(Formula_add),
ded=as.character(Formula_ded),
charge=Charge)]
dt <- dtable(ID = rep(id,each = l_add),
mol_form = rep(mol_form,each = l_add),
adduct = rep(adds$Name,l_mol),
add = rep(adds$add,l_mol),
ded = rep(adds$ded,l_mol),
charge= rep(adds$charge,l_mol))
merger <- function (mol_form,add,ded) {
full_form <- rep(NA_character_,length(mol_form))
both_ind <- which(add != 'FALSE' & ded != 'FALSE')
add_only_ind <- which(add != 'FALSE' & ded == 'FALSE')
ded_only_ind <- which(ded != 'FALSE' & add == 'FALSE')
ainds <- c(both_ind,add_only_ind)
full_form[ainds] <- vapply(ainds,function (i) enviPat::mergeform(mol_form[[i]],add[[i]]),FUN.VALUE = character(1), USE.NAMES = F)
dinds <- c(both_ind,ded_only_ind)
full_form[dinds] <- vapply(dinds,function (i) {
z <- check_ded2(mol_form[[i]],ded[[i]])
if (z) enviPat::subform(mol_form[[i]],ded[[i]]) else NA_character_
},
FUN.VALUE = character(1))
full_form
}
dt[,("full_form"):=.(merger(mol_form,add,ded))]
dt[!is.na(full_form),("mz"):=.(mapply(function(ff,ch) enviPat::isopattern(ISOTOPES,chemforms = ff,
charge = ch, verbose = F)[[1]][1],
full_form,
charge, USE.NAMES = F))]
dt[is.na(full_form),("mz"):=NA_real_]
dt
}
calc_mz_from_formula <- function(chform,adduct,id) {
check_chform <- enviPat::check_chemform(ISOTOPES,chform)
wind <- which(check_chform$warning)
if (length(wind) > 0) stop("Cannot understand the following formulas: ",
paste(check_chform$new_formula[wind],collapse = ","))
mol_form <- check_chform$new_formula
uad <- unique(adduct)
uadds <- lapply(uad,function(a) ADDUCTS[Name==a,.(Name,
add=as.character(Formula_add),
ded=as.character(Formula_ded),
charge=Charge),on=""])
names(uadds) <- uad
adds <- rbindlist(l=lapply(adduct,function(a) uadds[[a]]))
merger <- function (mol_form,add,ded) {
res <- numeric(length(mol_form))
both_ind <- which(add != 'FALSE' & ded != 'FALSE')
add_only_ind <- which(add != 'FALSE' & ded == 'FALSE')
ded_only_ind <- which(ded != 'FALSE' & add == 'FALSE')
ainds <- c(both_ind,add_only_ind)
res[ainds] <- vapply(ainds,function (i) enviPat::mergeform(mol_form[[i]],add[[i]]),FUN.VALUE = character(1), USE.NAMES = F)
dinds <- c(both_ind,ded_only_ind)
res[dinds] <- vapply(dinds,function (i) {
z <- check_ded2(mol_form[[i]],ded[[i]])
if (z) enviPat::subform(mol_form[[i]],ded[[i]]) else NA_character_
},
FUN.VALUE = character(1))
res
}
forms <- merger(mol_form,adds$add,adds$ded)
## Check if formulas actually calculated.
bad_idx <- which(forms=="0")
bad_adducts <- adduct[bad_idx]
bad_ids <- id[bad_idx]
non_dups <- !duplicated(bad_idx)
bad_ids <- bad_ids[non_dups]
bad_adducts <- bad_adducts[non_dups]
if (length(bad_idx)>0) stop(paste0("Unable to process the adducts:\n",
paste(bad_adducts,collapse = ","),
"\nfor id-s:",
paste(bad_ids,collapse = ",")))
mz <- the_ifelse(!is.na(forms),
mapply(function(ff,ch) enviPat::isopattern(ISOTOPES,chemforms = ff,
charge = ch, verbose = F)[[1]][1],
forms,
adds$charge, USE.NAMES = F),
NA_real_)
mz
}
calc_mz_from_smiles <- function(smiles,adduct,id) {
mol <- lapply(smiles,function(s) try(RMassBank::getMolecule(s), silent = T))
check <- which(is.atomic(mol))
if (length(check) > 0)
stop("Errors in SMILES with IDs:",paste(id[which],collapse = ','))
mol_form <- sapply(mol,function(x) (rcdk::get.mol2formula(x))@string,USE.NAMES = F)
names(mol_form) <- id
calc_mz_from_formula(mol_form,adduct,id)
}
calc_mz_from_smiles_outer <- function(smiles,adduct,id) {
mol <- lapply(smiles,function(s) try(RMassBank::getMolecule(s), silent = T))
check <- which(is.atomic(mol))
if (length(check) > 0)
stop("Errors in SMILES with IDs:",paste(id[which],collapse = ','))
mol_form <- sapply(mol,function(x) (rcdk::get.mol2formula(x))@string,USE.NAMES = F)
names(mol_form) <- id
calc_mz_from_formula_outer(mol_form,adduct,id)
}
x<-cmpL[[cname]][[ind]]
if (!is.null(x)) x else NA
gen_clean_state_summ<-function(summ) {
summ$Comments <- ""
summ[c("MS1","MS2","Alignment","AboveNoise")] <- T
summ["MS2rt"] <- NA_real_
summ["iMS2rt"] <- NA_integer_
summ["rt"]<-NA_real_
summ["checked"]<-'NONE'
summ
gen_empty_summ <- function() {
EMPTY_SUMM
}
gen_summ <- function(comp,qa_ms1,qa_ms2) {
comp_cols <- intersect(SUMM_COLS,colnames(comp))
summ <- comp[,..comp_cols]
data.table::setkeyv(summ,BASE_KEY)
ms1_cols <- intersect(SUMM_COLS,colnames(qa_ms1))
ms1_cols <- setdiff(ms1_cols,colnames(summ))
summ <- qa_ms1[summ,c(..comp_cols,..ms1_cols),on=BASE_KEY]
ms2_cols <- intersect(colnames(qa_ms2),SUMM_COLS)
ms2_cols <- setdiff(ms2_cols,colnames(summ))
summ <- qa_ms2[summ,c(..comp_cols,..ms1_cols,..ms2_cols),on=BASE_KEY]
data.table::setkeyv(summ,c(BASE_KEY_MS2,"an"))
summ[,qa_ms1_exists:=the_ifelse(!is.na(qa_ms1_good_int),T,F)]
summ[,qa_ms2_exists:=the_ifelse(!is.na(CE),T,F)]
summ[,qa_pass:=apply(.SD,1,all),.SDcols=QA_FLAGS[!(QA_FLAGS %in% "qa_pass")]]
summ$Comments<-""
data.table::setkeyv(summ,DEF_KEY_SUMM)
data.table::setcolorder(summ,SUMM_COLS)
summ
}
## Returns indices that are ok to be auto processed.
which(summ$checked==SUMM_CHK_NONE | summ$checked==SUMM_CHK_AUTO)
preProc <- function (summ,noiseFac=3,errRT=0.5,intThreshMS1=1e5,intThreshMS2=5000.) {
wds<-unique(summ$wd)
fn_spec<-function(wd) readRDS(file.path(wd,FN_SPEC))
message("Loading RDS-es ...")
allData<-lapply(wds,fn_spec)
names(allData)<-wds
message("... done with RDSs")
## QA check plan:
##
## If MS1 does not exist, set MS1 to F, as well as everything else except MS2.
## If it exists, proceed to noise check.
## If noise check fails, set AboveNoise and Alignment to F.
##
##
## MS2 will be checked independently.
## If MS2 does not exist, set MS2 and Alignment to F.
## If it does, check the Alignment.
## If Alignment is wrong, set Alignment to F.
##
## Terminology: MS1 does not exist if the intensity is below the
## intensity threshold. MS2 does not exist if it was not picked up
## during the dataframe generation stage. In this case, the file
## with the corresponding ID will not be there.
wd <- summ$wd[ind]
id <- summ$ID[ind]
ii<-match(nid,MSnbase::fData(eics)[["ID"]]) #id, because id-s, not nid-s are in fData for ms1 eics;
eic1<-eics[[ii]]
eic<-data.frame(rt=MSnbase::rtime(eic1)/60.,intensity=MSnbase::intensity(eic1))
colnames(eic)<-c("rt","intensity")
if (nrow(eic)==0) {
warning("No chromatogram for id ",id," found in", wd, " . Skipping.")
ms1MaxInd<-which.max(eic$intensity)
maxInt<-eic$intensity[[ms1MaxInd]]
summ[ind,"rt"]<-eic$rt[[ms1MaxInd]]
if (maxInt < intThreshMS1) {
summ[ind,"Alignment"] <- F
summ[ind,"AboveNoise"] <- F
mInt <- mean(eic$intensity)
if (maxInt < noiseFac*mInt) {
summ[ind,"AboveNoise"] <- F
summ[ind,"Alignment"] <- F ## If noisy, this is
## probably meaningles, so
## F.
ms2nids<-names(ms2)
if (! (nid %in% ms2nids)) {
summ[ind,"MS2"] <- F
summ[ind,"Alignment"] <- F
## rtInd <- ms1MaxInd #match(maxInt,eic$intensity)
rtMS1Peak <- eic$rt[[ms1MaxInd]]
msms<-MSnbase::fData(sp)[,c("rtm","maxI")]
colnames(msms)<-c("rt","intensity")
rtInd <- which((msms$rt > rtMS1Peak - errRT) &
(msms$rt < rtMS1Peak + errRT)) #Close enough?
rtIndMS1 <- which((eic$rt > rtMS1Peak - errRT) &
(eic$rt < rtMS1Peak + errRT)) #Filter the relevant MS1 part.
eicFilt<- eic[rtIndMS1,]
eicFilt<- eicFilt[which(eicFilt$intensity>intThreshMS1),]
mInt<- maxInt #mean(eicFilt$intensity)
rtInd <- rtInd[which(msms$intensity[rtInd]>intThreshMS2)] #Intense enough?
msmsInt<- msms$intensity[rtInd]
msmsRTind <- which.min(abs(msmsRT - rtMS1Peak))
summ[ind,"iMS2rt"] <- rtInd[msmsRTind]
summ[ind,"MS2rt"] <- msmsRT[msmsRTind]
smiles2img <- function(smiles, kekulise=TRUE, width=300, height=300,
zoom=1.3,style="cow", annotate="off", abbr="on",suppressh=TRUE,
showTitle=FALSE, smaLimit=100, sma=NULL) {
dep <- rcdk::get.depictor(width = width, height = height, zoom = zoom, style = style, annotate = annotate,
abbr = abbr, suppressh = suppressh, showTitle = showTitle, smaLimit = smaLimit,
sma = NULL)
mol <- RMassBank::getMolecule(smiles)
z<-rcdk::view.image.2d(mol, depictor=dep)
grid::rasterGrob(z)
gen_ms2_spec_data <- function(id,tag,iMS2rt,data,luckyN=NA) {
## Given the id, tag and the index of the MS2 spectrum, return the
## dataframe of the spectrum, with luckyN number of lagerst
## intensity peaks.
nid<-id2name(id)
if (!is.na(iMS2rt)) {
d <- data$ms2[[nid]][[iMS2rt]]
if (!is.null(d)) {
x<-data.frame(mz=MSnbase::mz(d),intensity=MSnbase::intensity(d))
res<-if (!is.na(luckyN)) {
ord<-order(x$intensity,decreasing = T)
len<-length(ord)
sz<-min(len,luckyN)
nx<-x[ord,]
nx<-nx[1:sz,]
ord<-order(nx$mz)
nx[ord,]
} else x
return(res)
gen_ms2_spec_fn <- function(id,tag,adduct,set,width=6) {
suppressWarnings({
iid<-as.numeric(id)
iid<- if (!is.na(iid)) iid else id
num <- formatC(iid,width = width,format='d',flag='0')
ss<-trimws(paste(num,adduct,tag,set,sep="_"),which='both')
plot_id_msn <- function(ni,
data,
rtMS1,
rtMS2,
prop,
theme,
pal="Dark2",
cex=0.75,
rt_digits=2,
m_digits=4) {
clean_range<-function(def,rng) {
x1 <- rng[1]
x2 <- rng[2]
if (is.na(x1) || x1 == 0) x1 <- def[1]
if (is.na(x2) || x2 == 0) x2 <- def[2]
c(x1,x2)
mk_title<-function() paste("EIC (",
"m/z = ",
formatC(mass,format='f',digits=m_digits),
")",sep='')
mk_leg_lab<-function(tag,rt) {paste(tag,"; rt= ",formatC(rt[[tag]],format='f',digits=rt_digits),"min")}
sci10<-function(x) {ifelse(x==0, "0", parse(text=gsub("[+]", "", gsub("e", " %*% 10^", scales::scientific_format()(x)))))}
dfChrMS1<-NULL
dfChrMS2<-NULL
dfSpecMS2<-NULL
## MS1 time series.
dfschrms1<-lapply(tags,function(tag) {d<-data[[tag]]$eic
data.frame(rt=MSnbase::rtime(cg)/60.,
intensity=MSnbase::intensity(cg),tag=as.character(tag),legend=mk_leg_lab(tag,rtMS1))
dfChrMS1<-do.call(rbind,c(dfschrms1,list(make.row.names=F)))
dfsChrMS2<-lapply(tags,function(tag) {
if (!is.null(d)) {
df<-MSnbase::fData(d)[,c("rtm","maxI")]
colnames(df)<-c("rt","intensity")
df$tag<-as.character(tag)
if (!all(sapply(dfsChrMS2,is.null))) dfChrMS2<-do.call(rbind,c(dfsChrMS2,list(make.row.names=F)))
if (!all(sapply(dfsChrMS2,is.null))) {
dfsSpecMS2<-lapply(tags,function(tag) {
d<-data[[tag]]$ms2[[ni]]
if (!is.null(d)) {
if (!is.na(ind)) {
x<-data.frame(mz=MSnbase::mz(d[[ind]]),intensity=MSnbase::intensity(d[[ind]]))
x$tag<-tag
x
} else NULL
}
})
dfsSpecMS2<-dfsSpecMS2[!is.null(dfsSpecMS2)]
dfSpecMS2<-do.call(rbind,c(dfsSpecMS2,list(make.row.names=F)))
}
## Ranges
if (!is.null(dfChrMS1)) {
rrtMS1<-range(dfChrMS1$rt)
rrtMS1 <- if (is.null(prop$ms1$rt)) rrtMS1 else clean_range(rrtMS1,prop$ms1$rt)
rintMS1 <- if (is.null(prop$ms1$irng)) rintMS2 else clean_range(rintMS1,prop$ms1$irng)
rrtMS2 <- if (is.null(prop$ms2$rt)) rrtMS2 else clean_range(rrtMS2,prop$ms2$rt)
rintMS2<-range(dfChrMS2$intensity)
rintMS2 <- if (is.null(prop$ms2$irng)) rintMS2 else clean_range(rintMS2,prop$ms2$irng)
}
rmzSpMS2<-range(dfSpecMS2$mz)
rintSpMS2<-range(dfSpecMS2$intensity)
rmzSpMS2<- if (is.null(prop$spec$mzrng)) rmzSpMS2 else clean_range(rmzSpMS2,prop$spec$mzrng)
rintSpMS2<- if (is.null(prop$spec$irng)) rintSpMS2 else clean_range(rintSpMS2,prop$spec$irng)
}
ch_ms1_deco<-function(ggobj) {
titMS1<-mk_title()
scale_y<-if (!prop$ms1$axis=="log") {
ggplot2::scale_y_continuous
} else {
ggplot2::scale_y_log10
}
ggobj+
ggplot2::geom_line(ggplot2::aes(colour=legend),key_glyph=KEY_GLYPH)+
ggplot2::coord_cartesian(xlim = rrtMS1,
ylim = rintMS1)+
ggplot2::labs(x=CHR_GRAM_X,y=CHR_GRAM_Y,
title=titMS1,tag=i,
colour=PLOT_MS1_LEG_TIT)+
scale_y(labels=sci10)+theme()
}
ch_ms2_deco<-function(ggobj) {
scale_y<-if (!prop$ms2$axis=="log") {
ggplot2::scale_y_continuous
} else {
ggplot2::scale_y_log10
}
ggobj+
ggplot2::geom_linerange(ggplot2::aes(colour=legend),key_glyph=KEY_GLYPH)+
ggplot2::coord_cartesian(xlim = rrtMS2,
ylim = rintMS2)+
ggplot2::labs(x=CHR_GRAM_X,y=CHR_GRAM_Y,title=NULL,subtitle = "MS2",tag = " ")+
scale_y(labels=sci10)+
ggplot2::labs(colour=PLOT_MS2_LEG_TIT)+theme()
}
ch_spec_deco<-function(ggobj) {
scale_y<-if (!prop$spec$axis=="log") {
ggplot2::scale_y_continuous
} else {
ggplot2::scale_y_log10
}
ggobj+
ggplot2::geom_linerange(ggplot2::aes(colour=tag),key_glyph=KEY_GLYPH)+
ggplot2::coord_cartesian(xlim = rmzSpMS2,
ylim = rintSpMS2)+
ggplot2::labs(subtitle="MS2",y="intensity")+
scale_y(labels=sci10)+theme()
}
## MS1 time series.
plMS1<- if(is.data.frame(dfChrMS1) && nrow(dfChrMS1)>0) {
ch_ms1_deco(ggplot2::ggplot(data=dfChrMS1,ggplot2::aes(x=rt,y=intensity,group=legend)))
} else NULL
## Empty
plEmpty<-ggplot2::ggplot(data=dfChrMS1,ggplot2::aes(x=rt,y=intensity))+ggplot2::theme_void()
## MS2 time series.
plMS2 <- if (!all(sapply(dfsChrMS2,is.null))) {
ch_ms2_deco(ggplot2::ggplot(data=dfChrMS2,ggplot2::aes(x=rt,ymin=0,ymax=intensity,group=legend)))
} else plEmpty
## Structure
if (!is.null(smile) && !is.na(smile) && !nchar(smile)<1) {
g<-smiles2img(smile,width=500,height=500,zoom=4.5)
plStruc<-ggplot2::ggplot(data=dfChrMS1,ggplot2::aes(x=rt,y=intensity))+
ggplot2::geom_blank()+ggplot2::annotation_custom(g)+ggplot2::theme_void()
} else plStruc<-plEmpty
## MS2 Spectrum
if (!all(sapply(dfsChrMS2,is.null))) {
plSpecMS2<-if (is.data.frame(dfSpecMS2)) { #sometimes
#dfSpecMS2 ends up
#as a list of
#logicals; this
#probably happens
#when either MS2 is
#bad in some way,
#or the RT
#intervals are
ch_spec_deco(ggplot2::ggplot(data=dfSpecMS2,
ggplot2::aes(x=mz,
ymin=0,
ymax=intensity,group=tag)))
## Lucky N the most intense N TODO
## lckN<-if (is.data.frame(dfSpecMS2)) {
## ord<-order(dfSpecMS2$intensity,decreasing=T)
## ll<-length(ord)
## theL<-min(ll,MS2_1ST_N)
## mzN<-dfSpecMS2$mz[ord][1:theL]
## inN<-dfSpecMS2$intensity[ord][1:theL]
## df<-data.frame("m/z"=mzN,"intensity"=inN)
## message("DF:")
## str(df)
## message("---DF")
## gridExtra::tableGrob(df) #+ggplot2::labs(subtitle="Top m/z")
res<- if (!is.null(plMS1)) cowplot::plot_grid(plMS1,plStruc,plMS2,plEmpty,plSpecMS2,align = "hv",axis='l',ncol = 2,nrow=3,rel_widths=c(3,1)) else NULL
add_wd_to_mzml <- function(fn,proj) {
wd<-basename(tools::file_path_sans_ext(fn))
file.path(proj,wd)
getEntryFromComp<-function(entry,id,set,adduct,compTab) {
ind <- which(compTab$ID %in% id &
compTab$set %in% set &
res<- if (length(ind)==1) compTab[ind,entry] else {
if (length(ind)>1) {
warning("Nonunique selection in comprehensive table:")
for (i in ind) {
message('ID: ',compTab$ID[[i]],' set: ',compTab$set[[i]],' adduct: ',compTab$adduct[[i]])
warning("The compound set table likely containes duplicate IDs per set/adduct combination. Please correct this.")
warning("Entries not found for id ", id,"set ",set, "and adduct ", adduct, " .")
## add_comp_summ <- function(ft,ctab) {
## nR<-nrow(ft)
## mzCol<-rep(NA,nR)
## nmCol<-rep("",nR)
## rtCol<-rep(NA,nR)
## for (ir in 1:nR) {
## id<-ft[ir,"ID"]
## set<-ft[ir,"set"]
## entries<-getEntryFromComp(c("mz","Name","rt"),id,set,m,ctab)
## mzCol[[ir]]<- entries[["mz"]]
## nm<-entries[["Name"]]
## nmCol[[ir]]<- if (!is.na(nm)) nm else ""
## rtCol[[ir]]<- entries[["rt"]]
## }
## ft$mz<-mzCol
## ft$Name<-nmCol
## ft$rt<-rtCol
## ft
## }
get_set_adduct <- function(s,mzml) {
unique(mzml[set == s,adduct])
vald_comp_tab<-function(df,ndf,checkSMILES=F,checkMz=F,checkNames=F) {
## Fields.
if (is.null(df$ID)) stop("Column ID missing in ",ndf," .")
if (checkMz && is.null(df$mz)) stop("Column mz missing in ", ndf, " .")
if (checkSMILES && is.null(df$SMILES)) stop("Column SMILES missing in", ndf, " .")
if (checkNames && is.null(df$Name)) warning("Column Name missing in ", ndf," , continuing without.")
if (is.null(df$RT) && is.null(df$rt)) {
warning("Column RT (alternatively, rt) missing in ", ndf, ", continuing without.")
} else {
if (is.null(df$rt)) {
df$rt<-df$RT
df$RT<-NULL
}
}
## Missing IDs?
ind<-which(is.na(df$ID))
if (length(ind)>0) {
for (i in ind) {
warning("ID missing at row: ",i," . Big trouble ahead.")
}
## Unique IDs?
luids<-length(unique(df$ID))
if (length(df$ID) > luids) {
warning("Duplicate IDs in ", ndf, " are not allowed.")
return(NULL)
}
## Missing SMILES?
if (checkSMILES) {
ind<-which(is.na(df$SMILES))
if (length(ind)>0) {
for (i in ind) {
warning("SMILES missing at row: ",i, "; ID: ",df$ID[[i]]," .")
}
}
lsmiles<-nrow(df)
ll<-length(unique(df$SMILES))
if (ll<lsmiles) {
warning("There are duplicate SMILES in the compound list. Trouble ahead.")
}
}
## Missing mz?
if (checkMz) {
ind<-which(is.na(df$mz))
if (length(ind)>0) {
for (i in ind) {
warning("mz missing at row: ",i, "; ID: ",df$ID[[i]]," .")
}
read_setid <- function(fn,cmpds) {
assert(file.exists(fn),msg=paste("Please provide valid compounds set table:", fn))
assert(nrow(cmpds) > 0,msg="Please provide at least one compounds list.")
setid <- file2tab(fn,colClasses=c(ID="character"))
x<-cmpds[setid,on='ID'][,.SD,.SDcols=c(colnames(setid),'known')]
sids <- unique(setid$ID)
cids <- unique(cmpds$ID)
diff <- setdiff(sids,cids)
assert(length(diff)==0,msg=paste("The following IDs from set table have not been found in the compound table:","------",print_table(dtable(diff)),"------",sep = "\n"))
x
if (NROW(m$input$tab$mzml)>0) tab2file(tab=m$input$tab$mzml,file=file.path(m$conf$project,FN_DATA_TAB))
yaml::write_yaml(x=m$conf,file=fn)
write_state <- function(m,fn_conf) {
write_conf(m,fn_conf)
tab2file(tab=m$input$tab$mzml,file=file.path(m$conf$project,FN_DATA_TAB))
read_conf <- function(fn) {
cf <- yaml::yaml.load_file(fn)
fnl <- cf$compound$lists
nms <- character(0)
for (i in 1:length(fnl)) {
nms <- gen_uniq_lab(nms,pref = 'L')
}
names(fnl) <- nms
}
cf$compound$lists <- fnl
## conf_trans(cf)
cf
##' @export
get_fn_comp <- function(m) {
file.path(m$conf$project,FN_COMP_TAB)
}
##' @export
get_fn_summ <- function(m) {
file.path(m$conf$project, FN_SUMM)
}
##' @export
get_fn_extr <- function(m) {
file.path(m$conf$project, "extracted.rds")
}
##' @export
get_fn_conf <- function(m) {
file.path(m$conf$project, FN_CONF)
}
##' @export
get_fn_ftab <- function(m) {
file.path(m$conf$project, FN_DATA_TAB)
}
init_state <- function(m) {
m$out$tab <- list()
m$input$tab$mzml <- EMPTY_MZML
lab <- gen_uniq_lab(list(),pref="L")
m$input$tab$lists <- list()
m$input$tab[[lab[[1]]]] <- EMPTY_CMPD_LIST
base_conf <- function () {
m <- list()
m$conf <- list(project=getwd(),
compounds=list(lists=list(),
sets="",
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
m
}
extr_conf <- function(m) {
m$conf$tolerance <- list("ms1 coarse"=MS1_ERR_COARSE,
"ms1 fine"=MS1_ERR_FINE,
"eic"=EIC_ERR,
"rt"=RT_EXTR_ERR)
m
}
presc_conf <- function(m) {
m$conf$prescreen <- list("ms1_int_thresh"=1e5,
"ms2_int_thresh"=2.5e3,
"s2n"=3,
"ret_time_shift_tol"=0.5)
m
}
new_conf <- function() presc_conf(
extr_conf(
base_conf()))
verify_cmpd_l <- function(dt,fn) {
fields <- colnames(EMPTY_CMPD_LIST)
dtflds <- colnames(dt)
assert('ID' %in% dtflds, msg = paste('ID column must be present and filled in', fn))
ess <- c('SMILES','Formula','mz')
pres <- ess %in% dtflds
assert(length(pres) > 0,
msg = paste('Compound list from ',fn,
'does not contain any of "SMILES", "Formula", or "mz". \nThe compound list needs at least one of those to be valid.'))
exst <- ess[pres]
x <- lapply(exst,function (nm) do.call(all,as.list(is.na(dt[[nm]]))))
assert(!do.call(all,x), msg = paste('At least one of', paste(exst,collapse = ','),
'\nmust contain some values in compound list from',fn))
## INPUT TRANSLATORS
grab_unit <- function(entry,unit) {
what <- paste0("\\<",unit,"\\>$")
entry <- trimws(entry,which="both")
if (grepl(what,entry))
suppressWarnings(as.numeric(sub(paste0("^(.*)",unit),"\\1",entry))) else NA_real_
}
rt_in_min <- function(entry) {
xs <- grab_unit(entry,"s")
xm <- grab_unit(entry,"min")
x <- if (is.na(xm)) xs/60. else xm
x
}
conf_trans_pres <- function(pres_list) {
## Translate and validate prescreening input.
pres_list[CONF_PRES_NUM] <- sapply(pres_list[CONF_PRES_NUM],as.numeric)
for (par in CONF_PRES_NUM) {
assert(!suppressWarnings(is.na(pres_list[[par]])),msg=paste("Prescreen parameter",par,"is not a number."))
}
for (par in CONF_PRES_TU) {
xs <- grab_unit(pres_list[[par]],"s")
xm <- grab_unit(pres_list[[par]],"min")
x <- if (is.na(xm)) xs else xm
assert(!is.na(x),msg = paste("Time unit parameter error for",par,"Only s(econds) or min(utes) allowed."))
pres_list[[par]] <- x
}
pres_list
}
create_qa_table <- function(extr,conf_presc) {
## The first input argument is the extracted `ms`, table
## containing MS1 and MS2 spectra. The argument `conf_presc` is
## m$conf$prescreen, the prescreening parameters given in the conf
## file.
## The qa table is just a copy of ms with added quality control
## columns QA_COLS.
## The QA_FLAGS columns are flags specifying which properties of
## compounds are known well, or not.
## For each compound (mass) we ask the following questions:
## qa_ms1_exists -- does the MS1 spectrum exist at all?
## qa_ms2_exists -- do we have any MS2 spectra at all?
## qa_ms1_above_noise -- is MS1 above the noise treshold?
## qa_ms2_near -- is there any MS2 spectrum inside the tolerated
## retention time window around the MS1 peak? That is, are we
## non-RT-shifted?
## qa_ms2_good_int -- Is there any MS2 spectral intensity greater
## than the MS2 threshold and less than the MS1 peak?
## qa_pass -- did the spectrum pass all the checks?
## The columns in QA_NUM_REAL are:
##
## ms1_int -- the maximum intensity of MS1 spectrum over the
## The columns in QA_NUM_INT are:
##
## ms2_sel -- index of the selected MS2 spectrum; if not NA, the
## associated spectrum passed all the checks (qa_pass == T); the
## spectrum itself is in one of the member sublists of the `spec'
## column. The integer `ms2_sel' is then the index of the spectrum
## ms1_rt_ind -- TODO (but not important to end users).
checks <- extr$ms2[,{
z <-..QA_FLAGS
z[1:length(z)]<-F
names(z)<-..QA_FLAGS
z
},keyby=BASE_KEY_MS2]
checks[,(QA_NUM_INT):=NA_integer_]
checks[,(QA_NUM_REAL):=NA_real_]
setkeyv(checks,BASE_KEY_MS2)
qa$checks <- checks
qa
}
assess_ms1 <- function(m) {
qa <- m$qa
## Calculate auxiliary variables and indices.
qa_ms1 <- ms1[,.(ms1_rt_ind=which.max(intensity)),keyby=BASE_KEY]
qa_ms1 <- ms1[qa_ms1,.(ms1_rt_ind=ms1_rt_ind,
ms1_int=intensity[[ms1_rt_ind]],
ms1_rt=rt[[ms1_rt_ind]],
ms1_mean=mean(intensity)),on=BASE_KEY,by=.EACHI]
qa_ms1[,qa_ms1_good_int := ms1_int > qa$prescreen$ms1_int_thresh]
qa_ms1[,qa_ms1_above_noise := F]
qa_ms1[qa_ms1_good_int==T,qa_ms1_above_noise := .(ms1_int > qa$prescreen$s2n*ms1_mean)]
## checks[(!qa_ms1_above_noise),c("qa_ms2_good_int","qa_ms2_near","qa_ms2_exists","qa_pass"):=F]
## qa_ms1 <- check_ms1_noise(check_ms1(qa_ms1))
m$qa$ms1 <- qa_ms1
m
presconf <- conf_trans_pres(m$conf$prescreen)
ms1 <- m$extr$ms1
ms2 <- m$extr$ms2
qa_ms1 <- m$qa$ms1
qa_ms2 <- ms2[qa_ms1[qa_ms1_above_noise==T],.(CE=unique(CE),
pc_rt=i.ms1_rt,
pc_int=i.ms1_int,
an=unique(an)),on=BASE_KEY,by=.EACHI,nomatch=NULL]
rt_win2 <- presconf$ret_time_shift_tol
qa_ms2 <- ms2[qa_ms2,.(pc_rt=pc_rt,
pc_int=pc_int,
ms2_int=max(intensity),
qa_ms2_near=head(rt,1) < pc_rt + rt_win2 & head(rt,1) > pc_rt - rt_win2),
by=.EACHI,on=c(BASE_KEY_MS2,"an")]
qa_ms2$qa_ms2_good_int <-F
qa_ms2[qa_ms2_near==T,
qa_ms2_good_int := ms2_int > presconf$ms2_int_thresh & ms2_int < pc_int,
by=c(BASE_KEY_MS2,"an")]
## qa_ms2$qa_pass <- F
## qa_ms2[qa_ms2_good_int==T,qa_pass:=T]
qa_ms2[qa_ms2_good_int==T,ms2_sel:={
ind<-which.min(abs(ms2_rt-pc_rt))
z<-ms2_sel
z[[ind]]<-T
z
},by=BASE_KEY_MS2]
setkeyv(qa_ms2,BASE_KEY_MS2)
m$qa$ms2 <- qa_ms2