Skip to content
Snippets Groups Projects
Unverified Commit fd436356 authored by Todor Kondic's avatar Todor Kondic
Browse files

api, mix, resources: Try to get mk_comp_tab to work

parent c0bf64ca
No related branches found
No related tags found
No related merge requests found
......@@ -49,14 +49,14 @@ load_compound_input <- function(m) {
for (l in 1:length(fns)) {
fn <- fns[[l]]
fnfields <- colnames(fn)
dt <- file2tab(fn)
dt <- file2tab(fn, colClasses=c(ID="character"))
verify_cmpd_l(dt=dt,fn=fn)
nonexist <- setdiff(colnames,fields)
coll[[l]] <- dt[,(nonexist) := NULL]
nonexist <- setdiff(fnfields,fields)
coll[[l]] <- if (length(nonexist)==0) dt else dt[,(nonexist) := NULL]
coll[[l]]$ORIG <- fn
}
cmpds <- if (length(fns)>0) rbindlist(l=coll,use.names = T, fill = T) else EMPTY_CMPD_LIST
cmpds <- if (length(fns)>0) rbindlist(l=c(list(EMPTY_CMPD_LIST), coll), use.names = T, fill = T) else EMPTY_CMPD_LIST
cmpds[,("known"):=.(the_ifelse(!is.na(SMILES),"structure",the_ifelse(!is.na(Formula),"formula","mz")))]
m$input$tab$cmpds <- cmpds
m$input$tab$setid <- read_setid(m$conf$compounds$sets,
......@@ -80,40 +80,34 @@ load_inputs <- function(m) {
##' @export
mk_comp_tab <- function(m) {
message("Started assembling the lists of knowns and unknowns into the `comprehensive' table.")
setid <- m$input$tab$setid
setkey(setid,set)
mzml<- m$input$tab$mzml
setkey(mzml,set)
unk<-m$input$tab$unknown
setkey(unk,ID)
known<-m$input$tab$known
setkey(known,ID)
cmpds<-m$input$tab$cmpds
setkey(cmpds,ID)
mzml[,`:=`(wd=sapply(Files,add_wd_to_mzml,m$conf$project))]
assert(xor(nrow(unk)==0,nrow(known)==0),msg="No compound lists have been provided. At least one of the known, or unknown compound lists is required.")
message("Begin generation of comp table.")
## knowns
setidKnown<- mzml[setid[origin %in% "known"],.(tag,adduct,ID,set,Files,wd),on="set",allow.cartesian=T]
tab2file(tab=setidKnown,file="setidKnown.csv")
compKnown <- known[setidKnown,on=c("ID"),allow.cartesian=T]
setkey(compKnown,set,ID)
tab2file(tab=compKnown,file="compKnown.csv")
compKnown[,`:=`(mz=mapply(get_mz_from_smiles,SMILES,adduct,USE.NAMES = F))]
message("Generation of comp table: knowns done.")
## unknows
setidUnk<-mzml[setid[origin %in% "unknown"],.(tag,adduct,ID,set,Files,wd),on="set",allow.cartesian=T]
compUnk <- unk[setidUnk,on="ID"]
message("Generation of comp table: unknowns done.")
df<-rbindlist(l=list(compKnown, compUnk),fill = T)
setnames(df,names(COMP_NAME_MAP),
assert(nrow(cmpds)>0,msg="No compound lists have been provided.")
message("Begin generation of the comprehensive table.")
comp <- cmpds[setid,on="ID"][mzml,.(tag,adduct,ID,RT,set,Name,Files,wd,SMILES,Formula,mz,known),on="set",allow.cartesian=T]
tab2file(tab=comp,file=paste0("setidmerge",".csv"))
setkey(comp,known,set,ID)
## Known structure.
## comp[,`:=`(mz=mapply(calc_mz_from_smiles,SMILES,adduct,ID,USE.NAMES = F))]
comp[known=="structure",`:=`(mz=calc_mz_from_smiles(SMILES,adduct,ID))]
## Known formula.
comp[known=="formula",`:=`(mz=calc_mz_from_formula(Formula,adduct,ID))]
setnames(comp,names(COMP_NAME_MAP),
function(o) COMP_NAME_MAP[[o]])
setcolorder(df,COMP_NAME_FIRST)
setcolorder(comp,COMP_NAME_FIRST)
fn_out <- file.path(m$conf$project,FN_COMP_TAB)
tab2file(tab=df,file=fn_out)
tab2file(tab=comp,file=fn_out)
message("Generation of comp table finished.")
setkeyv(df,c("set","tag","mz"))
m$out$tab$comp <- df
setkeyv(comp,c("set","tag","mz"))
m$out$tab$comp <- comp
m
}
......
......@@ -31,7 +31,7 @@ get_mz_cmp_l<-function(id,adduct,cmpL) {
res
}
calc_mz_from_formula <- function(chform,adduct,id) {
calc_mz_from_formula_outer <- function(chform,adduct,id) {
check_chform <- enviPat::check_chemform(ISOTOPES,chform)
wind <- which(check_chform$warning)
if (length(wind) > 0) stop("Cannot understand the following formulas: ",
......@@ -76,12 +76,50 @@ calc_mz_from_formula <- function(chform,adduct,id) {
dt
}
calc_mz_from_formula <- function(chform,adduct,id) {
check_chform <- enviPat::check_chemform(ISOTOPES,chform)
wind <- which(check_chform$warning)
if (length(wind) > 0) stop("Cannot understand the following formulas: ",
paste(check_chform$new_formula[wind],collapse = ","))
mol_form <- check_chform$new_formula
uad <- unique(adduct)
uadds <- lapply(uad,function(a) ADDUCTS[Name==a,.(Name,
add=as.character(Formula_add),
ded=as.character(Formula_ded),
charge=Charge),on=""])
names(uadds) <- uad
adds <- rbindlist(l=lapply(adduct,function(a) uadds[[a]]))
merger <- function (mol_form,add,ded) {
res <- numeric(length(mol_form))
both_ind <- which(add != 'FALSE' & ded != 'FALSE')
add_only_ind <- which(add != 'FALSE' & ded == 'FALSE')
ded_only_ind <- which(ded != 'FALSE' & add == 'FALSE')
ainds <- c(both_ind,add_only_ind)
res[ainds] <- vapply(ainds,function (i) enviPat::mergeform(mol_form[[i]],add[[i]]),FUN.VALUE = character(1), USE.NAMES = F)
dinds <- c(both_ind,ded_only_ind)
res[dinds] <- vapply(dinds,function (i) {
z <- check_ded2(mol_form[[i]],ded[[i]])
if (z) enviPat::subform(mol_form[[i]],ded[[i]]) else NA_character_
},
FUN.VALUE = character(1))
res
}
forms <- merger(mol_form,adds$add,adds$ded)
mz <- the_ifelse(!is.na(forms),
mapply(function(ff,ch) enviPat::isopattern(ISOTOPES,chemforms = ff,
charge = ch, verbose = F)[[1]][1],
forms,
adds$charge, USE.NAMES = F),
NA_real_)
mz
}
calc_mz_from_smiles <- function(smiles,adduct,id) {
mol <- try(getMolecule(smiles), silent = T)
mol <- lapply(smiles,function(s) try(RMassBank::getMolecule(s), silent = T))
check <- which(is.atomic(mol))
if (length(check) > 0)
stop("Errors in SMILES with IDs:",paste(id[which],collapse = T))
stop("Errors in SMILES with IDs:",paste(id[which],collapse = ','))
mol_form <- sapply(mol,function(x) (rcdk::get.mol2formula(x))@string,USE.NAMES = F)
names(mol_form) <- id
......@@ -90,6 +128,19 @@ calc_mz_from_smiles <- function(smiles,adduct,id) {
}
calc_mz_from_smiles_outer <- function(smiles,adduct,id) {
mol <- lapply(smiles,function(s) try(RMassBank::getMolecule(s), silent = T))
check <- which(is.atomic(mol))
if (length(check) > 0)
stop("Errors in SMILES with IDs:",paste(id[which],collapse = ','))
mol_form <- sapply(mol,function(x) (rcdk::get.mol2formula(x))@string,USE.NAMES = F)
names(mol_form) <- id
calc_mz_from_formula_outer(mol_form,adduct,id)
}
......@@ -98,15 +149,7 @@ calc_mz_from_smiles <- function(smiles,adduct,id) {
## }
get_mz_from_smiles <- function(smiles,Formula,mz,adduct,id) {
mapply(function (sm,frm,mz) {
if (!is.na(sm)) {
RChemMass::getSuspectMasses(smiles=sm,adduct_list = adduct)
} else if (!is.na(frm)) {
RChemMass::getAdductMassesFromFormula
} })
RChemMass::getSuspectFormulaMass(smiles)[[ADDUCTMAP[[adduct]]]]
}
get_col_from_cmp_l<-function(id,cname,cmpL) {
ind<-match(id,cmpL$ID)
......@@ -642,7 +685,7 @@ vald_comp_tab<-function(df,ndf,checkSMILES=F,checkMz=F,checkNames=F) {
read_setid <- function(fn,cmpds) {
assert(file.exists(fn),msg=paste("Please provide valid compounds set table:", fn))
assert(nrow(cmpds) > 0,msg="Please provide at least one compounds list.")
setid <- file2tab(fn)
setid <- file2tab(fn,colClasses=c(ID="character"))
x<-cmpds[setid,on='ID'][,.SD,.SDcols=c(colnames(setid),'known')]
sids <- unique(setid$ID)
......@@ -685,8 +728,8 @@ verify_cmpd_l <- function(dt,fn) {
msg = paste('Compound list from ',fn,
'does not contain any of "SMILES", "Formula", or "mz". \nThe compound list needs at least one of those to be valid.'))
exst <- ess[pres]
x <- lapply(exst,function (nm) all(is.na(dt[[nm]])))
assert(!all(x), msg = paste('At least one of', paste(exst,collapse = T),
x <- lapply(exst,function (nm) do.call(all,as.list(is.na(dt[[nm]]))))
assert(!do.call(all,x), msg = paste('At least one of', paste(exst,collapse = ','),
'\nmust contain some values in compound list from',fn))
invisible(T)
......
......@@ -39,15 +39,16 @@ data(isotopes,package = "enviPat", envir = .envp)
ADDUCTS <- dtable(.envp$adducts)
ISOTOPES <- dtable(.envp$isotopes)
.envp <- NULL
ADDUCTMAP <- RChemMass:::adducts$Name
names(ADDUCTMAP) <- apply(ADDUCTS,1,function(row) {
nm <- row[["Name"]]
sgn <- row[["Charge"]]
suff <- if (sgn > 0) "+" else if (sgn < 0) "-" else ""
paste0("[",nm,"]",suff)
})
DISP_ADDUCTMAP <- c(c("UNSET"="UNSET_ADDUCT_ERROR"),ADDUCTMAP)
ADDUCTMAP <- ADDUCTS$Name
ADDUCTS$Name <- the_ifelse(ADDUCTS$Charge>0,paste0("[",ADDUCTS$Name,"]+"),paste0("[",ADDUCTS$Name,"]-"))
## names(ADDUCTMAP) <- apply(ADDUCTS,1,function(row) {
## nm <- row[["Name"]]
## sgn <- row[["Charge"]]
## suff <- if (sgn > 0) "+" else if (sgn < 0) "-" else ""
## paste0("[",nm,"]",suff)
## })
## ADDUCTS$Name <- names(ADDUCTMAP)
DISP_ADDUCTS <- c("UNSET",ADDUCTS$Name)
TAG_NA <- "::UNSET::"
SET_NA <- "::UNSET::"
TAG_DEF <- TAG_NA
......@@ -134,12 +135,11 @@ EMPTY_CMPD_LIST <- dtable(ID=character(),
COMP_LIST_COLS <- c("ID","Name","SMILES","Formula","RT","mz")
## Comprehensive table properties
COMP_NAME_MAP <- list(RT="rt")
## COMP_NAMES <-c("ID","mz","rt","adduct","set","origin","Name","SMILES")
COMP_NAME_FIRST <- c("ID","mz","rt","adduct","tag","set","Name","SMILES","Files","wd")
COMP_NAME_FIRST <- c("ID","mz","rt","adduct","tag","set","Name","known","SMILES","Formula","Files","wd")
## File table properties
FTAB_KEY=c("set","tag","mz")
FTAB_NAMES=c("ID", "mz", "rt", "tag", "adduct", "set", "Name", "SMILES", "Files" , "wd","origin")
FTAB_NAMES=c("ID", "mz", "rt", "tag", "adduct", "set", "Name", "SMILES", "Files" , "wd","known")
EMPTY_UNKNOWN <- dtable(ID=character(0),mz=numeric(0),RT=numeric(0),Name=character(0),CAS=character(0))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment