From c0ec5dc9f55d93ee2b028e6ffd3038464b9cad10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Todor=20Kondi=C4=87?= <kontrapunkt@uclmail.net> Date: Sun, 5 Mar 2023 11:49:04 +0100 Subject: [PATCH] Fun process_cmpd_sets correctly "slugifies" filenames. --- R/api.R | 31 ++------------------- R/base.R | 11 -------- R/resources.R | 15 ++++++----- R/shiny-state.R | 12 +++------ R/state.R | 37 +++++++++++++++++++++++++ tests/testthat/_snaps/base.md | 23 +++------------- tests/testthat/_snaps/state.md | 49 +++++++++++++++++++++++++++++++++- tests/testthat/test-base.R | 17 +----------- tests/testthat/test-state.R | 32 ++++++++++++++++++++++ 9 files changed, 135 insertions(+), 92 deletions(-) diff --git a/R/api.R b/R/api.R index d14964c..f976db7 100644 --- a/R/api.R +++ b/R/api.R @@ -98,38 +98,11 @@ run_in_dir <- function(m) { } - - - - ##' @export load_compound_input <- function(m) { - coll <- list() - fields <- colnames(EMPTY_CMPD_LIST) - fns <- file.path(m$run$paths$project,m$conf$compounds$lists) - message("fns:",paste0(fns,collapse=",")) - coltypes <- c(ID="character", - SMILES="character", - Formula="character", - Name="character", - RT="numeric", - mz="numeric") - for (l in 1:length(fns)) { - fn <- fns[[l]] - - ## Figure out column headers. - nms <- colnames(file2tab(fn,nrows=0)) - - ## Read the table. Knowing column headers prevents unnecessary - ## warnings. - dt <- file2tab(fn, colClasses=coltypes[nms]) - verify_cmpd_l(dt=dt,fn=fn) - # nonexist <- setdiff(fnfields,fields) - coll[[l]] <- dt #if (length(nonexist)==0) dt else dt[,(nonexist) := NULL] - coll[[l]]$ORIG <- fn - } - cmpds <- if (length(fns)>0) rbindlist(l=c(list(EMPTY_CMPD_LIST), coll), use.names = T, fill = T) else EMPTY_CMPD_LIST + fns <- file.path(m$run$paths$project,m$conf$compounds$lists) + cmpds = join_compound_lists(fns) ## Process sets. cmpds = process_cmpd_sets(cmpds) diff --git a/R/base.R b/R/base.R index d64b4ef..320d6b6 100644 --- a/R/base.R +++ b/R/base.R @@ -143,14 +143,3 @@ uniqy_slugs <- function(slugs) { dt = data.table::data.table(slug=slugs) dt[,slug:=fifelse(rep(.N==1L,.N),slug,paste0(slug,"_",seq(1L,.N))),by="slug"]$slug } - -process_cmpd_sets <- function(cmpdlist) { - ## Process sets. - if (! ("set" %in% colnames(cmpdlist))) cmpdlist$set=NA_character_ else cmpdlist[,set:=as.character(set)] - ## Extract set names and fill out the empty ones. - slugs = cmpdlist[,.(slug=fifelse(is.na(set),gen_fname_slug(ORIG),set)),by=ORIG] - slugs[,slug:=uniqy_slugs(slug)] - - cmpdlist[slugs,set:=i.slug,on="ORIG"] - cmpdlist -} diff --git a/R/resources.R b/R/resources.R index a28ff1b..746b2a6 100644 --- a/R/resources.R +++ b/R/resources.R @@ -124,13 +124,14 @@ NUM_INP_HEIGHT="5%" ## Possible compound list fields EMPTY_CMPD_LIST = dtable(ID=character(), - SMILES=character(), - Name=character(), - Formula=character(), - RT=numeric(), - mz=numeric(), - known=character(), - ORIG=character()) + SMILES=character(), + Name=character(), + Formula=character(), + RT=numeric(), + mz=numeric(), + known=character(), + set=character(), + ORIG=character()) COMP_LIST_COLS = c("ID","Name","SMILES","Formula","RT","mz") ## Comprehensive table properties COMP_NAME_MAP = list(RT="rt") diff --git a/R/shiny-state.R b/R/shiny-state.R index 826f5bc..930f6eb 100644 --- a/R/shiny-state.R +++ b/R/shiny-state.R @@ -453,14 +453,10 @@ app_state2state <- function(input,gui,envopts,m=NULL) { } get_sets <- function(gui) { - ## TODO FIXME - ## Think about this - ## fn_lists <- file.path(gui$paths$project,gui$compounds$lists) - - ## df <- fread(file=fn_lists) - ## if (! - ## res = df[,unique(set)] - if (length(res)==0L) res = "ALL" + fn_lists <- file.path(gui$paths$project,gui$compounds$lists) + cmpds = join_compound_lists(fn_lists) + cmpds = process_cmpd_sets(cmpds) + cmpds[,unique(set)] } diff --git a/R/state.R b/R/state.R index 0914d93..9bec760 100644 --- a/R/state.R +++ b/R/state.R @@ -431,3 +431,40 @@ pack_project <- function(m,fn_arch) { }) fn_arch } + +join_compound_lists <- function(fname) { + coll <- list() + fields <- colnames(EMPTY_CMPD_LIST) + coltypes <- c(ID="character", + SMILES="character", + Formula="character", + Name="character", + RT="numeric", + mz="numeric") + l=0L + for (fn in fname) { + l = l + 1L + ## Figure out column headers. + nms <- colnames(file2tab(fn,nrows=0)) + + ## Read the table. Knowing column headers prevents unnecessary + ## warnings. + dt <- file2tab(fn, colClasses=coltypes[nms]) + verify_cmpd_l(dt=dt,fn=fn) + # nonexist <- setdiff(fnfields,fields) + coll[[l]] <- dt #if (length(nonexist)==0) dt else dt[,(nonexist) := NULL] + coll[[l]]$ORIG <- fn + } + if (length(fname)>0) rbindlist(l=c(list(EMPTY_CMPD_LIST), coll), use.names = T, fill = T) else EMPTY_CMPD_LIST + +} + +process_cmpd_sets <- function(cmpdlist) { + if (nrow(cmpdlist)==0L) return(EMPTY_CMPD_LIST) + ## Process sets. + if (! ("set" %in% colnames(cmpdlist))) cmpdlist$set=NA_character_ else cmpdlist[,set:=as.character(set)] + slugs = cmpdlist[,.(slug=gen_fname_slug(ORIG)),by="ORIG"] + slugs[,slug:=uniqy_slugs(slug)] + cmpdlist[slugs,set:=fifelse(is.na(set),i.slug,set),on="ORIG"] + cmpdlist +} diff --git a/tests/testthat/_snaps/base.md b/tests/testthat/_snaps/base.md index 3383291..5a43f71 100644 --- a/tests/testthat/_snaps/base.md +++ b/tests/testthat/_snaps/base.md @@ -1,24 +1,7 @@ -# process_cmpd_sets +# uniqy_slugs Code - x + out Output - ID ORIG set - 1: 1 f1.csv f1_2 - 2: 2 f1.csv f1_2 - 3: 3 f2.csv aks - 4: 4 f3.csv se32 - 5: 5 f3.csv se32 - ---- - - Code - x - Output - ID ORIG set - 1: 1 b/f2.csv f2 - 2: 2 a/f1.csv f1 - 3: 3 q/f/g/f2.csv aks - 4: 4 d/e/f/f3.csv se31 - 5: 5 m/n/q/f3.csv se32 + [1] "f1" "f2_1" "f2_2" "f2_3" "f3_1" "f4_1" "f4_2" "f5" "f3_2" "f6" diff --git a/tests/testthat/_snaps/state.md b/tests/testthat/_snaps/state.md index f6e6849..349134f 100644 --- a/tests/testthat/_snaps/state.md +++ b/tests/testthat/_snaps/state.md @@ -135,7 +135,7 @@ list() $input$tab$L0 - Empty data.table (0 rows and 8 cols): ID,SMILES,Name,Formula,RT,mz... + Empty data.table (0 rows and 9 cols): ID,SMILES,Name,Formula,RT,mz... @@ -219,3 +219,50 @@ [1] "test_project/file1.csv" "test_project/file2.csv" [3] "test_project/subdir/filesubdir.csv" +# process_cmpd_sets + + Code + x + Output + ID ORIG set + 1: 1 f1.csv f1 + 2: 2 f1.csv f1 + 3: 3 f2.csv aks + 4: 4 f3.csv se31 + 5: 5 f3.csv se32 + +--- + + Code + x + Output + ID ORIG set + 1: 1 b/f2.csv f2_1 + 2: 2 a/f1.csv f1 + 3: 3 q/f/g/f2.csv aks + 4: 4 d/e/f/f3.csv se31 + 5: 5 m/n/q/f3.csv se32 + +--- + + Code + x + Output + ID ORIG set + 1: 1 a/fn.csv fn_1 + 2: 2 a/fn.csv fn_1 + 3: 3 a/fn.csv set + 4: 4 a/fn.csv set2 + 5: 5 a/fn.csv fn_1 + 6: 6 b/fn.csv fn_2 + 7: 7 b/fn.csv set2 + 8: 8 b/fn.csv set2 + 9: 9 b/fn.csv fn_2 + +--- + + Code + x + Output + Empty data.table (0 rows and 9 cols): ID,SMILES,Name,Formula,RT,mz... + diff --git a/tests/testthat/test-base.R b/tests/testthat/test-base.R index 60792b9..87905dd 100644 --- a/tests/testthat/test-base.R +++ b/tests/testthat/test-base.R @@ -13,20 +13,5 @@ test_that("gen_fname_slug",{ test_that("uniqy_slugs",{ inp = c('f1','f2','f2','f2','f3','f4','f4','f5','f3','f6') out = uniqy_slugs(inp) - message(paste(out,coll=',')) - expect_true(1==1) -}) - -test_that("process_cmpd_sets",{ - - ## Test case when no base filename is the same. - cmpdl = data.table(ID=1:5,ORIG=c("f1.csv","f1.csv","f2.csv","f3.csv","f3.csv"),set=c(NA_character_,NA_character_,"aks","se31","se32")) - x = process_cmpd_sets(cmpdl) - expect_snapshot(x) - - ## Test case with similar base filenames. - cmpdl = data.table(ID=1:5,ORIG=c("b/f2.csv","a/f1.csv","q/f/g/f2.csv","d/e/f/f3.csv","m/n/q/f3.csv"),set=c(NA_character_,NA_character_,"aks","se31","se32")) - x = process_cmpd_sets(cmpdl) - expect_snapshot(x) - + expect_snapshot(out) }) diff --git a/tests/testthat/test-state.R b/tests/testthat/test-state.R index c62a49f..26f6146 100644 --- a/tests/testthat/test-state.R +++ b/tests/testthat/test-state.R @@ -33,3 +33,35 @@ test_that("pack_project",{ }) expect_snapshot(fls) }) + +test_that("join_compound_lists with empty input returns empty output",{ + x = join_compound_lists(character(0)) + expect_true(nrow(x)==0L) +}) + +test_that("process_cmpd_sets",{ + + ## Test case when no base filename is the same. + cmpdl = data.table(ID=1:5,ORIG=c("f1.csv","f1.csv","f2.csv","f3.csv","f3.csv"),set=c(NA_character_,NA_character_,"aks","se31","se32")) + x = process_cmpd_sets(cmpdl) + expect_snapshot(x) + + ## Test case with similar base filenames. + cmpdl = data.table(ID=1:5,ORIG=c("b/f2.csv","a/f1.csv","q/f/g/f2.csv","d/e/f/f3.csv","m/n/q/f3.csv"),set=c(NA_character_,NA_character_,"aks","se31","se32")) + x = process_cmpd_sets(cmpdl) + expect_snapshot(x) + + ## Test semi-filled cmpd. + cmpdl1 = data.table(ID=1:5,ORIG=rep("a/fn.csv",5),set=c(NA_character_,NA_character_,"set","set2",NA_character_)) + cmpdl2 = data.table(ID=6:9,ORIG=rep("b/fn.csv",4),set=c(NA_character_,"set2","set2",NA_character_)) + cmpdl = rbindlist(list(cmpdl1,cmpdl2)) + x = process_cmpd_sets(cmpdl) + expect_snapshot(x) + + ## Empty input. + cmpdl = EMPTY_CMPD_LIST + x = process_cmpd_sets(cmpdl) + expect_snapshot(x) + + +}) -- GitLab