Fun process_cmpd_sets correctly "slugifies" filenames.

c0ec5dc9 · Todor Kondić · 2eb43aed · c0ec5dc9 · c0ec5dc9 · c0ec5dc9
Commit c0ec5dc9 authored 2 years ago by Todor Kondić
--- a/R/api.R
+++ b/R/api.R
@@ -98,38 +98,11 @@ run_in_dir <- function(m) {
    
 }

-
-
-
-
 ##' @export
 load_compound_input <- function(m) {
-    coll <- list()
-    fields <- colnames(EMPTY_CMPD_LIST)
-    fns <- file.path(m$run$paths$project,m$conf$compounds$lists)
-    message("fns:",paste0(fns,collapse=","))
-    coltypes <- c(ID="character",
-                  SMILES="character",
-                  Formula="character",
-                  Name="character",
-                  RT="numeric",
-                  mz="numeric")
-    for (l in 1:length(fns)) {
-        fn <- fns[[l]]
-
-        ## Figure out column headers.
-        nms <- colnames(file2tab(fn,nrows=0))
-
-        ## Read the table. Knowing column headers prevents unnecessary
-        ## warnings.
-        dt <- file2tab(fn, colClasses=coltypes[nms])
-        verify_cmpd_l(dt=dt,fn=fn)
-                                        # nonexist <- setdiff(fnfields,fields)
-        coll[[l]] <- dt #if (length(nonexist)==0) dt else dt[,(nonexist) := NULL]
-        coll[[l]]$ORIG <- fn
-    }
-    cmpds <- if (length(fns)>0) rbindlist(l=c(list(EMPTY_CMPD_LIST), coll), use.names = T, fill = T) else EMPTY_CMPD_LIST

+    fns <- file.path(m$run$paths$project,m$conf$compounds$lists)
+    cmpds = join_compound_lists(fns)
    ## Process sets.
    cmpds = process_cmpd_sets(cmpds)
    

--- a/R/base.R
+++ b/R/base.R
@@ -143,14 +143,3 @@ uniqy_slugs <- function(slugs) {
    dt = data.table::data.table(slug=slugs)
    dt[,slug:=fifelse(rep(.N==1L,.N),slug,paste0(slug,"_",seq(1L,.N))),by="slug"]$slug
 }
-
-process_cmpd_sets <- function(cmpdlist) {
-    ## Process sets.
-    if (! ("set" %in% colnames(cmpdlist))) cmpdlist$set=NA_character_ else cmpdlist[,set:=as.character(set)]
-    ## Extract set names and fill out the empty ones.
-    slugs = cmpdlist[,.(slug=fifelse(is.na(set),gen_fname_slug(ORIG),set)),by=ORIG]
-    slugs[,slug:=uniqy_slugs(slug)]
-
-    cmpdlist[slugs,set:=i.slug,on="ORIG"]
-    cmpdlist
-}
--- a/R/resources.R
+++ b/R/resources.R
@@ -124,13 +124,14 @@ NUM_INP_HEIGHT="5%"

 ## Possible compound list fields
 EMPTY_CMPD_LIST = dtable(ID=character(),
-                          SMILES=character(),
-                          Name=character(),
-                          Formula=character(),
-                          RT=numeric(),
-                          mz=numeric(),
-                          known=character(),
-                          ORIG=character())
+                         SMILES=character(),
+                         Name=character(),
+                         Formula=character(),
+                         RT=numeric(),
+                         mz=numeric(),
+                         known=character(),
+                         set=character(),
+                         ORIG=character())
 COMP_LIST_COLS = c("ID","Name","SMILES","Formula","RT","mz")
 ## Comprehensive table properties
 COMP_NAME_MAP = list(RT="rt")

--- a/R/shiny-state.R
+++ b/R/shiny-state.R
@@ -453,14 +453,10 @@ app_state2state <- function(input,gui,envopts,m=NULL) {
 }
    
 get_sets <- function(gui) {
-    ## TODO FIXME
-    ## Think about this
-    ## fn_lists <- file.path(gui$paths$project,gui$compounds$lists)
-
-    ## df <- fread(file=fn_lists)
-    ## if (!
-    ## res = df[,unique(set)]
-    if (length(res)==0L) res = "ALL"
+    fn_lists <- file.path(gui$paths$project,gui$compounds$lists)
+    cmpds = join_compound_lists(fn_lists)
+    cmpds = process_cmpd_sets(cmpds)
+    cmpds[,unique(set)]
 }



--- a/R/state.R
+++ b/R/state.R
@@ -431,3 +431,40 @@ pack_project <- function(m,fn_arch) {
    })
    fn_arch
 }
+
+join_compound_lists <- function(fname) {
+    coll <- list()
+    fields <- colnames(EMPTY_CMPD_LIST)
+    coltypes <- c(ID="character",
+                  SMILES="character",
+                  Formula="character",
+                  Name="character",
+                  RT="numeric",
+                  mz="numeric")
+    l=0L
+    for (fn in fname) {
+        l = l + 1L
+        ## Figure out column headers.
+        nms <- colnames(file2tab(fn,nrows=0))
+        
+        ## Read the table. Knowing column headers prevents unnecessary
+        ## warnings.
+        dt <- file2tab(fn, colClasses=coltypes[nms])
+        verify_cmpd_l(dt=dt,fn=fn)
+                                        # nonexist <- setdiff(fnfields,fields)
+        coll[[l]] <- dt #if (length(nonexist)==0) dt else dt[,(nonexist) := NULL]
+        coll[[l]]$ORIG <- fn
+    }
+    if (length(fname)>0) rbindlist(l=c(list(EMPTY_CMPD_LIST), coll), use.names = T, fill = T) else EMPTY_CMPD_LIST
+
+}
+
+process_cmpd_sets <- function(cmpdlist) {
+    if (nrow(cmpdlist)==0L) return(EMPTY_CMPD_LIST)
+    ## Process sets.
+    if (! ("set" %in% colnames(cmpdlist))) cmpdlist$set=NA_character_ else cmpdlist[,set:=as.character(set)]
+    slugs = cmpdlist[,.(slug=gen_fname_slug(ORIG)),by="ORIG"]
+    slugs[,slug:=uniqy_slugs(slug)]
+    cmpdlist[slugs,set:=fifelse(is.na(set),i.slug,set),on="ORIG"]
+    cmpdlist
+}
--- a/tests/testthat/_snaps/base.md
+++ b/tests/testthat/_snaps/base.md
-# process_cmpd_sets
+# uniqy_slugs

    Code
-      x
+      out
    Output
-         ID   ORIG  set
-      1:  1 f1.csv f1_2
-      2:  2 f1.csv f1_2
-      3:  3 f2.csv  aks
-      4:  4 f3.csv se32
-      5:  5 f3.csv se32
-
---
-
-    Code
-      x
-    Output
-         ID         ORIG  set
-      1:  1     b/f2.csv   f2
-      2:  2     a/f1.csv   f1
-      3:  3 q/f/g/f2.csv  aks
-      4:  4 d/e/f/f3.csv se31
-      5:  5 m/n/q/f3.csv se32
+       [1] "f1"   "f2_1" "f2_2" "f2_3" "f3_1" "f4_1" "f4_2" "f5"   "f3_2" "f6"  

--- a/tests/testthat/_snaps/state.md
+++ b/tests/testthat/_snaps/state.md
@@ -135,7 +135,7 @@
      list()
      
      $input$tab$L0
-      Empty data.table (0 rows and 8 cols): ID,SMILES,Name,Formula,RT,mz...
+      Empty data.table (0 rows and 9 cols): ID,SMILES,Name,Formula,RT,mz...
      
      
      
@@ -219,3 +219,50 @@
      [1] "test_project/file1.csv"             "test_project/file2.csv"            
      [3] "test_project/subdir/filesubdir.csv"

+# process_cmpd_sets
+
+    Code
+      x
+    Output
+         ID   ORIG  set
+      1:  1 f1.csv   f1
+      2:  2 f1.csv   f1
+      3:  3 f2.csv  aks
+      4:  4 f3.csv se31
+      5:  5 f3.csv se32
+
+---
+
+    Code
+      x
+    Output
+         ID         ORIG  set
+      1:  1     b/f2.csv f2_1
+      2:  2     a/f1.csv   f1
+      3:  3 q/f/g/f2.csv  aks
+      4:  4 d/e/f/f3.csv se31
+      5:  5 m/n/q/f3.csv se32
+
+---
+
+    Code
+      x
+    Output
+         ID     ORIG  set
+      1:  1 a/fn.csv fn_1
+      2:  2 a/fn.csv fn_1
+      3:  3 a/fn.csv  set
+      4:  4 a/fn.csv set2
+      5:  5 a/fn.csv fn_1
+      6:  6 b/fn.csv fn_2
+      7:  7 b/fn.csv set2
+      8:  8 b/fn.csv set2
+      9:  9 b/fn.csv fn_2
+
+---
+
+    Code
+      x
+    Output
+      Empty data.table (0 rows and 9 cols): ID,SMILES,Name,Formula,RT,mz...
+
--- a/tests/testthat/test-base.R
+++ b/tests/testthat/test-base.R
@@ -13,20 +13,5 @@ test_that("gen_fname_slug",{
 test_that("uniqy_slugs",{
    inp = c('f1','f2','f2','f2','f3','f4','f4','f5','f3','f6')
    out = uniqy_slugs(inp)
-    message(paste(out,coll=','))
-    expect_true(1==1)
-})
-
-test_that("process_cmpd_sets",{
-
-    ## Test case when no base filename is the same.
-    cmpdl = data.table(ID=1:5,ORIG=c("f1.csv","f1.csv","f2.csv","f3.csv","f3.csv"),set=c(NA_character_,NA_character_,"aks","se31","se32"))
-    x = process_cmpd_sets(cmpdl)
-    expect_snapshot(x)
-
-    ## Test case with similar base filenames.
-    cmpdl = data.table(ID=1:5,ORIG=c("b/f2.csv","a/f1.csv","q/f/g/f2.csv","d/e/f/f3.csv","m/n/q/f3.csv"),set=c(NA_character_,NA_character_,"aks","se31","se32"))
-    x = process_cmpd_sets(cmpdl)
-    expect_snapshot(x)
-
+    expect_snapshot(out)
 })
--- a/tests/testthat/test-state.R
+++ b/tests/testthat/test-state.R
@@ -33,3 +33,35 @@ test_that("pack_project",{
    })
    expect_snapshot(fls)
 })
+
+test_that("join_compound_lists with empty input returns empty output",{
+    x = join_compound_lists(character(0))
+    expect_true(nrow(x)==0L)
+})
+
+test_that("process_cmpd_sets",{
+
+    ## Test case when no base filename is the same.
+    cmpdl = data.table(ID=1:5,ORIG=c("f1.csv","f1.csv","f2.csv","f3.csv","f3.csv"),set=c(NA_character_,NA_character_,"aks","se31","se32"))
+    x = process_cmpd_sets(cmpdl)
+    expect_snapshot(x)
+
+    ## Test case with similar base filenames.
+    cmpdl = data.table(ID=1:5,ORIG=c("b/f2.csv","a/f1.csv","q/f/g/f2.csv","d/e/f/f3.csv","m/n/q/f3.csv"),set=c(NA_character_,NA_character_,"aks","se31","se32"))
+    x = process_cmpd_sets(cmpdl)
+    expect_snapshot(x)
+
+    ## Test semi-filled cmpd.
+    cmpdl1 = data.table(ID=1:5,ORIG=rep("a/fn.csv",5),set=c(NA_character_,NA_character_,"set","set2",NA_character_))
+    cmpdl2 = data.table(ID=6:9,ORIG=rep("b/fn.csv",4),set=c(NA_character_,"set2","set2",NA_character_))
+    cmpdl = rbindlist(list(cmpdl1,cmpdl2))
+    x = process_cmpd_sets(cmpdl)
+    expect_snapshot(x)
+    
+    ## Empty input.
+    cmpdl = EMPTY_CMPD_LIST
+    x = process_cmpd_sets(cmpdl)
+    expect_snapshot(x)
+
+
+})