api, mix, resources: Try to get mk_comp_tab to work

fd436356 · Todor Kondic · c0bf64ca · fd436356 · fd436356 · fd436356
Unverified Commit fd436356 authored 4 years ago by Todor Kondic
--- a/R/api.R
+++ b/R/api.R
@@ -49,14 +49,14 @@ load_compound_input <- function(m) {
    for (l in 1:length(fns)) {
        fn <- fns[[l]]
        fnfields <- colnames(fn)
-        dt <- file2tab(fn)
+        dt <- file2tab(fn, colClasses=c(ID="character"))
        verify_cmpd_l(dt=dt,fn=fn)
-        nonexist <- setdiff(colnames,fields)
-        coll[[l]] <- dt[,(nonexist) := NULL]
+        nonexist <- setdiff(fnfields,fields)
+        coll[[l]] <- if (length(nonexist)==0) dt else dt[,(nonexist) := NULL]
        coll[[l]]$ORIG <- fn
        
    }
-    cmpds <- if (length(fns)>0) rbindlist(l=coll,use.names = T, fill = T) else EMPTY_CMPD_LIST
+    cmpds <- if (length(fns)>0) rbindlist(l=c(list(EMPTY_CMPD_LIST), coll), use.names = T, fill = T) else EMPTY_CMPD_LIST
    cmpds[,("known"):=.(the_ifelse(!is.na(SMILES),"structure",the_ifelse(!is.na(Formula),"formula","mz")))]
    m$input$tab$cmpds <- cmpds
    m$input$tab$setid <- read_setid(m$conf$compounds$sets,
@@ -80,40 +80,34 @@ load_inputs <- function(m) {

 ##' @export
 mk_comp_tab <- function(m) {
-    message("Started assembling the lists of knowns and unknowns into the `comprehensive' table.")
    setid <- m$input$tab$setid
    setkey(setid,set)
    mzml<- m$input$tab$mzml
    setkey(mzml,set)
-    unk<-m$input$tab$unknown
-    setkey(unk,ID)
-    known<-m$input$tab$known
-    setkey(known,ID)
+    cmpds<-m$input$tab$cmpds
+    setkey(cmpds,ID)
    mzml[,`:=`(wd=sapply(Files,add_wd_to_mzml,m$conf$project))]
-    assert(xor(nrow(unk)==0,nrow(known)==0),msg="No compound lists have been provided. At least one of the known, or unknown compound lists is required.")
-    message("Begin generation of comp table.")
-    ## knowns
-    setidKnown<- mzml[setid[origin %in% "known"],.(tag,adduct,ID,set,Files,wd),on="set",allow.cartesian=T]
-    tab2file(tab=setidKnown,file="setidKnown.csv")
-    compKnown <- known[setidKnown,on=c("ID"),allow.cartesian=T]
-    setkey(compKnown,set,ID)
-    tab2file(tab=compKnown,file="compKnown.csv")
-    compKnown[,`:=`(mz=mapply(get_mz_from_smiles,SMILES,adduct,USE.NAMES = F))]
-    message("Generation of comp table: knowns done.")
-    ## unknows
-    setidUnk<-mzml[setid[origin %in% "unknown"],.(tag,adduct,ID,set,Files,wd),on="set",allow.cartesian=T]
-    compUnk <- unk[setidUnk,on="ID"]
-    message("Generation of comp table: unknowns done.")
-    df<-rbindlist(l=list(compKnown, compUnk),fill = T)
-    setnames(df,names(COMP_NAME_MAP),
+    assert(nrow(cmpds)>0,msg="No compound lists have been provided.")
+    message("Begin generation of the comprehensive table.")
+   
+    comp <- cmpds[setid,on="ID"][mzml,.(tag,adduct,ID,RT,set,Name,Files,wd,SMILES,Formula,mz,known),on="set",allow.cartesian=T]
+    tab2file(tab=comp,file=paste0("setidmerge",".csv"))
+    setkey(comp,known,set,ID)
+
+    ## Known structure.
+    ## comp[,`:=`(mz=mapply(calc_mz_from_smiles,SMILES,adduct,ID,USE.NAMES = F))]
+    comp[known=="structure",`:=`(mz=calc_mz_from_smiles(SMILES,adduct,ID))]
+
+    ## Known formula.
+    comp[known=="formula",`:=`(mz=calc_mz_from_formula(Formula,adduct,ID))]
+    setnames(comp,names(COMP_NAME_MAP),
             function(o) COMP_NAME_MAP[[o]])
-    setcolorder(df,COMP_NAME_FIRST)
+    setcolorder(comp,COMP_NAME_FIRST)
    fn_out <- file.path(m$conf$project,FN_COMP_TAB)
-    
-    tab2file(tab=df,file=fn_out)
+    tab2file(tab=comp,file=fn_out)
    message("Generation of comp table finished.")
-    setkeyv(df,c("set","tag","mz"))
-    m$out$tab$comp <- df
+    setkeyv(comp,c("set","tag","mz"))
+    m$out$tab$comp <- comp
    m
 }


--- a/R/mix.R
+++ b/R/mix.R
@@ -31,7 +31,7 @@ get_mz_cmp_l<-function(id,adduct,cmpL) {
    res
 }

-calc_mz_from_formula <- function(chform,adduct,id) {
+calc_mz_from_formula_outer <- function(chform,adduct,id) {
    check_chform <- enviPat::check_chemform(ISOTOPES,chform)
    wind <- which(check_chform$warning)
    if (length(wind) > 0) stop("Cannot understand the following formulas: ",
@@ -76,12 +76,50 @@ calc_mz_from_formula <- function(chform,adduct,id) {
    dt
 }

+calc_mz_from_formula <- function(chform,adduct,id) {
+    check_chform <- enviPat::check_chemform(ISOTOPES,chform)
+    wind <- which(check_chform$warning)
+    if (length(wind) > 0) stop("Cannot understand the following formulas: ",
+                               paste(check_chform$new_formula[wind],collapse = ","))
+    mol_form <- check_chform$new_formula
+    uad <- unique(adduct)
+    uadds <- lapply(uad,function(a) ADDUCTS[Name==a,.(Name,
+                                                      add=as.character(Formula_add),
+                                                      ded=as.character(Formula_ded),
+                                                      charge=Charge),on=""])
+    names(uadds) <- uad
+    adds <- rbindlist(l=lapply(adduct,function(a) uadds[[a]]))
+    
+    merger <- function (mol_form,add,ded) {
+        res <- numeric(length(mol_form))
+        both_ind <- which(add != 'FALSE' & ded != 'FALSE')
+        add_only_ind <- which(add != 'FALSE' & ded == 'FALSE')
+        ded_only_ind <- which(ded != 'FALSE' & add == 'FALSE')
+        ainds <- c(both_ind,add_only_ind)
+        res[ainds] <- vapply(ainds,function (i) enviPat::mergeform(mol_form[[i]],add[[i]]),FUN.VALUE = character(1), USE.NAMES = F)
+        dinds <- c(both_ind,ded_only_ind)
+        res[dinds] <- vapply(dinds,function (i) {
+            z <- check_ded2(mol_form[[i]],ded[[i]])
+            if (z) enviPat::subform(mol_form[[i]],ded[[i]]) else NA_character_
+        },
+        FUN.VALUE = character(1))
+        res
+    }
+    forms  <- merger(mol_form,adds$add,adds$ded)
+    mz <- the_ifelse(!is.na(forms),
+                     mapply(function(ff,ch) enviPat::isopattern(ISOTOPES,chemforms = ff,
+                                                                charge = ch, verbose = F)[[1]][1],
+                            forms,
+                            adds$charge, USE.NAMES = F),
+                     NA_real_)
+    mz
+}
+
 calc_mz_from_smiles <- function(smiles,adduct,id) {
-    mol <- try(getMolecule(smiles), silent = T)
    mol <- lapply(smiles,function(s) try(RMassBank::getMolecule(s), silent = T))
    check <- which(is.atomic(mol))
    if (length(check) > 0)
-        stop("Errors in SMILES with IDs:",paste(id[which],collapse = T))
+        stop("Errors in SMILES with IDs:",paste(id[which],collapse = ','))

    mol_form <- sapply(mol,function(x) (rcdk::get.mol2formula(x))@string,USE.NAMES = F)
    names(mol_form) <- id
@@ -90,6 +128,19 @@ calc_mz_from_smiles <- function(smiles,adduct,id) {
    
 }

+calc_mz_from_smiles_outer <- function(smiles,adduct,id) {
+    mol <- lapply(smiles,function(s) try(RMassBank::getMolecule(s), silent = T))
+    check <- which(is.atomic(mol))
+    if (length(check) > 0)
+        stop("Errors in SMILES with IDs:",paste(id[which],collapse = ','))
+
+    mol_form <- sapply(mol,function(x) (rcdk::get.mol2formula(x))@string,USE.NAMES = F)
+    names(mol_form) <- id
+    calc_mz_from_formula_outer(mol_form,adduct,id)
+    
+    
+}
+
    
    
    
@@ -98,15 +149,7 @@ calc_mz_from_smiles <- function(smiles,adduct,id) {
    
 ## }

-get_mz_from_smiles <- function(smiles,Formula,mz,adduct,id) {
-    mapply(function (sm,frm,mz) {
-        if (!is.na(sm)) {
-            RChemMass::getSuspectMasses(smiles=sm,adduct_list = adduct)
-        } else if (!is.na(frm)) {
-            RChemMass::getAdductMassesFromFormula
-        } })
-    RChemMass::getSuspectFormulaMass(smiles)[[ADDUCTMAP[[adduct]]]]
-}
+

 get_col_from_cmp_l<-function(id,cname,cmpL) {
    ind<-match(id,cmpL$ID)
@@ -642,7 +685,7 @@ vald_comp_tab<-function(df,ndf,checkSMILES=F,checkMz=F,checkNames=F) {
 read_setid <- function(fn,cmpds) {
    assert(file.exists(fn),msg=paste("Please provide valid compounds set table:", fn))
    assert(nrow(cmpds) > 0,msg="Please provide at least one compounds list.")
-    setid <- file2tab(fn)
+    setid <- file2tab(fn,colClasses=c(ID="character"))
    x<-cmpds[setid,on='ID'][,.SD,.SDcols=c(colnames(setid),'known')]

    sids <- unique(setid$ID)
@@ -685,8 +728,8 @@ verify_cmpd_l <- function(dt,fn) {
           msg = paste('Compound list from ',fn,
                       'does not contain any of "SMILES", "Formula", or "mz". \nThe compound list needs at least one of those to be valid.'))
    exst <- ess[pres]
-    x <- lapply(exst,function (nm) all(is.na(dt[[nm]])))
-    assert(!all(x), msg = paste('At least one of', paste(exst,collapse = T),
+    x <- lapply(exst,function (nm) do.call(all,as.list(is.na(dt[[nm]]))))
+    assert(!do.call(all,x), msg = paste('At least one of', paste(exst,collapse = ','),
                                '\nmust contain some values in compound list from',fn))
    
    invisible(T)

--- a/R/resources.R
+++ b/R/resources.R
@@ -39,15 +39,16 @@ data(isotopes,package = "enviPat", envir = .envp)
 ADDUCTS <- dtable(.envp$adducts)
 ISOTOPES <- dtable(.envp$isotopes)
 .envp <- NULL
-ADDUCTMAP <- RChemMass:::adducts$Name
-names(ADDUCTMAP) <- apply(ADDUCTS,1,function(row) {
-    nm <- row[["Name"]]
-    sgn <- row[["Charge"]]
-    suff <- if (sgn > 0) "+" else if (sgn < 0) "-" else ""
-    paste0("[",nm,"]",suff)
-})
-
-DISP_ADDUCTMAP <- c(c("UNSET"="UNSET_ADDUCT_ERROR"),ADDUCTMAP)
+ADDUCTMAP <- ADDUCTS$Name
+ADDUCTS$Name <- the_ifelse(ADDUCTS$Charge>0,paste0("[",ADDUCTS$Name,"]+"),paste0("[",ADDUCTS$Name,"]-"))
+## names(ADDUCTMAP) <- apply(ADDUCTS,1,function(row) {
+##     nm <- row[["Name"]]
+##     sgn <- row[["Charge"]]
+##     suff <- if (sgn > 0) "+" else if (sgn < 0) "-" else ""
+##     paste0("[",nm,"]",suff)
+## })
+## ADDUCTS$Name <- names(ADDUCTMAP)
+DISP_ADDUCTS <- c("UNSET",ADDUCTS$Name)
 TAG_NA <- "::UNSET::"
 SET_NA <- "::UNSET::"
 TAG_DEF <- TAG_NA
@@ -134,12 +135,11 @@ EMPTY_CMPD_LIST <- dtable(ID=character(),
 COMP_LIST_COLS <- c("ID","Name","SMILES","Formula","RT","mz")
 ## Comprehensive table properties
 COMP_NAME_MAP <- list(RT="rt")
-## COMP_NAMES <-c("ID","mz","rt","adduct","set","origin","Name","SMILES")
-COMP_NAME_FIRST <- c("ID","mz","rt","adduct","tag","set","Name","SMILES","Files","wd")
+COMP_NAME_FIRST <- c("ID","mz","rt","adduct","tag","set","Name","known","SMILES","Formula","Files","wd")

 ## File table properties
 FTAB_KEY=c("set","tag","mz")
-FTAB_NAMES=c("ID", "mz", "rt", "tag", "adduct", "set", "Name", "SMILES", "Files" , "wd","origin")
+FTAB_NAMES=c("ID", "mz", "rt", "tag", "adduct", "set", "Name", "SMILES", "Files" , "wd","known")


 EMPTY_UNKNOWN <- dtable(ID=character(0),mz=numeric(0),RT=numeric(0),Name=character(0),CAS=character(0))