Commit 847397af authored by Emma Schymanski's avatar Emma Schymanski
Browse files

Day 1 BioHackEU20

...traceable keyword file with vitamin example, using caching function to get CIDs. To expand on day 2 ...
parent a452d624
......@@ -155,15 +155,143 @@ length(unique(agro_list))
# length=29 with depth=2
# length=96 with depth=3
#### try to extract data (ignore XML and try JSON) ####
#### try to extract data via cache key with hid and hnid ####
#https://pubchem.ncbi.nlm.nih.gov/classification/cgi/classifications.fcgi?format=json&hid=101&hnid=4041191&cache_uid_type=cid
#https://pubchem.ncbi.nlm.nih.gov/list_gateway/list_gateway.cgi?format=json&action=cache_get&cache_key=UhX0mAgqbZZavG-l7d0mtIuf2v99qaTN3ui_gcX5rYDF4JE
# try to run these over vitamin_info
# Collect the CIDs behind every row of vitamin_info: for each row a cache key
# is requested with getPcHidCacheID(hid, nodeHNID), the CID list is fetched
# with getPcListCache(), stored "|"-separated in vitamin_info$nodeCIDs and
# pooled into `cids`. Rows without a usable key or CID list keep nodeCIDs = "".
# NOTE(review): getPcHidCacheID / getPcListCache must already be defined when
# this section runs - confirm they are sourced before this point.
i <- 1
vitamin_info$nodeCIDs <- ""
# One slot per row; filled slots are combined after the loop instead of
# growing `cids` with c() on every iteration
cids_per_row <- vector("list", length(vitamin_info$hid))
#for (i in 1:3) {
# seq_along() gives an empty loop for zero rows (1:length() would run 1, 0)
for (i in seq_along(vitamin_info$hid)) {
  hid <- vitamin_info$hid[i]
  hnid <- vitamin_info$Node[i]
  node_hnid <- vitamin_info$nodeHNID[i]
  cache_key <- getPcHidCacheID(hid, node_hnid)
  row_cids <- NULL
  # Length check first: is.na() on a NULL / zero-length key would make if() fail
  if (length(cache_key) == 1 && !is.na(cache_key)) {
    row_cids <- getPcListCache(cache_key)
    # getPcListCache() returns NA on failure; do not record that as a CID
    row_cids <- row_cids[!is.na(row_cids)]
    vitamin_info$nodeCIDs[i] <- paste(row_cids, collapse = "|")
  }
  # Assigning NULL to a list slot would delete it, so only store real results
  if (length(row_cids) > 0) {
    cids_per_row[[i]] <- row_cids
  }
}
# Later rows first, matching the previous c(row_cids, cids) accumulation order
cids <- unlist(rev(cids_per_row))
length(cids)
unique_cids <- unique(cids)
length(unique_cids)
write.csv(vitamin_info, "vitamin_info.csv", row.names = FALSE)
write.table(sort(unique_cids), "vitamin_unique_CIDs.txt",
            row.names = FALSE, col.names = FALSE)
# Evan would like a two-column file (keyword, CID) so that many keyword files
# can be merged together afterwards
write.table(cbind("vitamin", sort(unique_cids)), "vitamin_unique_CIDs_keyword.txt",
            row.names = FALSE, col.names = FALSE, quote = FALSE)
## This function uses a cache key (e.g. from getPcHidCacheID) to get a CID list
## from the PubChem list gateway (action=cache_get, JSON format).
##
## cache_key: the cache key, pasted into the "cache_key" URL parameter.
## Returns the ID_Container$ids$uint32 element of the parsed JSON reply, or NA
## if the key is missing, the download or JSON parsing fails, or the reply
## contains a Fault element.
getPcListCache <- function(cache_key) {
  # A missing key cannot be looked up, so skip the web request entirely
  if (length(cache_key) != 1 || is.na(cache_key)) {
    return(NA)
  }
  baseURL <- "https://pubchem.ncbi.nlm.nih.gov/list_gateway/list_gateway.cgi?"
  url <- paste0(baseURL, "format=json", "&action=cache_get", "&cache_key=", cache_key)
  # Download and parse in one guarded step: a reply that is not valid JSON is
  # treated like a failed request instead of stopping the caller with an error
  r <- tryCatch(
    {
      res <- GET(URLencode(url))
      data <- httr::content(res, type = "text", encoding = "UTF-8")
      fromJSON(data)
    },
    error = function(e) {
      NULL
    }
  )
  # Anything that is not a parsed JSON object, or that reports a Fault, is a miss
  if (!is.list(r) || !is.null(r$Fault)) {
    return(NA)
  }
  # The reply also carries ID_Container$identifier_type and
  # ID_Container$number_of_ids, but only the IDs are returned for now
  r$ID_Container$ids$uint32
}
## Example values and quick manual checks of the two cache functions
## NOTE(review): in this view getPcHidCacheID is defined below these calls;
## confirm both functions are already defined when this section is run.
hid_url <- "https://pubchem.ncbi.nlm.nih.gov/classification/cgi/classifications.fcgi?format=json&hid=101&hnid=4041191"
query <- "101"
hid <- "101"
hnid <- "4041191"
cache_uid <- "cid"
# Cache key only
getPcHidCacheID(101, 4041191)
# Cache key passed straight on to fetch the CID list
getPcListCache(getPcHidCacheID(101, 4041191))
getPcListCache(getPcHidCacheID(4, 2291199))
## This function returns a cache ID (cache key) to get a CID list, given a
## hid and hnid of the PubChem classification service.
##
## hid:       value pasted into the "hid" URL parameter.
## hnid:      value pasted into the "hnid" URL parameter.
## cache_uid: value pasted into the "cache_uid_type" URL parameter ("cid" by default).
## Returns the Hierarchies$CacheKey element of the parsed JSON reply, or NA if
## hid/hnid are missing, the download or JSON parsing fails, the reply reports
## an exception (with a warning) or a Fault, or no cache key is present.
getPcHidCacheID <- function(hid, hnid, cache_uid = "cid") {
  # Missing identifiers cannot be resolved; warn as for an unknown node and
  # skip the web request
  if (length(hid) != 1 || length(hnid) != 1 || is.na(hid) || is.na(hnid)) {
    warning(paste0("No data file for HID ", hid, " and HNID ", hnid))
    return(NA)
  }
  baseURL <- "https://pubchem.ncbi.nlm.nih.gov/classification/cgi/classifications.fcgi?"
  url <- paste0(baseURL, "format=json", "&hid=", hid, "&hnid=", hnid,
                "&cache_uid_type=", cache_uid)
  # Any download error is mapped to NULL and reported to the caller as NA
  data <- tryCatch(
    {
      res <- GET(URLencode(url))
      httr::content(res, type = "text", encoding = "UTF-8")
    },
    error = function(e) {
      NULL
    }
  )
  if (length(data) != 1 || is.na(data)) {
    return(NA)
  }
  # The service reports an unknown hid/hnid as text containing this marker
  if (grepl("Exception during execution: ", data, fixed = TRUE)) {
    warn_string <- paste0("No data file for HID ", hid, " and HNID ", hnid)
    warning(warn_string)
    return(NA)
  }
  # Otherwise proceed; a reply that is not valid JSON counts as a failure
  r <- tryCatch(fromJSON(data), error = function(e) NULL)
  if (!is.list(r) || !is.null(r$Fault)) {
    return(NA)
  }
  hierarchies <- r$Hierarchies
  if (!is.list(hierarchies)) {
    return(NA)
  }
  cache_key <- hierarchies$CacheKey
  # Return NA rather than NULL so callers can always test the result with is.na()
  if (length(cache_key) == 0) {
    return(NA)
  }
  cache_key
}
##### Functions and Testing ######
......
# Functions to extract data from PubChem Classification Trees
# E. Schymanski, 30/10/2020 - 7/11/2020
# E. Schymanski, 30/10/2020 - 9/11/2020
# Plus Evan Bolton and Paul Thiessen
# Functions ported from hid_tree_JSON.R
......@@ -271,3 +271,103 @@ getPcHidTreeInfo <- function(query, depth=2)
}
## This function uses a cache ID from getPcHidCacheID to get a CID list
## from the PubChem list gateway (action=cache_get, JSON format).
##
## cache_key: the cache key, pasted into the "cache_key" URL parameter.
## Returns the ID_Container$ids$uint32 element of the parsed JSON reply, or NA
## if the key is missing, the download or JSON parsing fails, or the reply
## contains a Fault element.
getPcListCache <- function(cache_key) {
  # A missing key cannot be looked up, so skip the web request entirely
  if (length(cache_key) != 1 || is.na(cache_key)) {
    return(NA)
  }
  baseURL <- "https://pubchem.ncbi.nlm.nih.gov/list_gateway/list_gateway.cgi?"
  url <- paste0(baseURL, "format=json", "&action=cache_get", "&cache_key=", cache_key)
  # Download and parse in one guarded step: a reply that is not valid JSON is
  # treated like a failed request instead of stopping the caller with an error
  r <- tryCatch(
    {
      res <- GET(URLencode(url))
      data <- httr::content(res, type = "text", encoding = "UTF-8")
      fromJSON(data)
    },
    error = function(e) {
      NULL
    }
  )
  # Anything that is not a parsed JSON object, or that reports a Fault, is a miss
  if (!is.list(r) || !is.null(r$Fault)) {
    return(NA)
  }
  # The reply also carries ID_Container$identifier_type and
  # ID_Container$number_of_ids, but only the IDs are returned for now
  r$ID_Container$ids$uint32
}
# # hid_url <- "https://pubchem.ncbi.nlm.nih.gov/classification/cgi/classifications.fcgi?format=json&hid=101&hnid=4041191"
# #
# # hid <- "101"
# # hnid <- "4041191"
# # cache_uid <- "cid"
# getPcHidCacheID(101,4041191)
#
# getPcListCache(getPcHidCacheID(101,4041191))
# getPcListCache(getPcHidCacheID(4,2291199))
## This function returns a cache ID to get a CID list given a hid and hnid
## of the PubChem classification service.
##
## hid:       value pasted into the "hid" URL parameter.
## hnid:      value pasted into the "hnid" URL parameter.
## cache_uid: value pasted into the "cache_uid_type" URL parameter ("cid" by default).
## Returns the Hierarchies$CacheKey element of the parsed JSON reply, or NA if
## hid/hnid are missing, the download or JSON parsing fails, the reply reports
## an exception (with a warning) or a Fault, or no cache key is present.
getPcHidCacheID <- function(hid, hnid, cache_uid = "cid") {
  # Missing identifiers cannot be resolved; warn as for an unknown node and
  # skip the web request
  if (length(hid) != 1 || length(hnid) != 1 || is.na(hid) || is.na(hnid)) {
    warning(paste0("No data file for HID ", hid, " and HNID ", hnid))
    return(NA)
  }
  baseURL <- "https://pubchem.ncbi.nlm.nih.gov/classification/cgi/classifications.fcgi?"
  url <- paste0(baseURL, "format=json", "&hid=", hid, "&hnid=", hnid,
                "&cache_uid_type=", cache_uid)
  # Any download error is mapped to NULL and reported to the caller as NA
  data <- tryCatch(
    {
      res <- GET(URLencode(url))
      httr::content(res, type = "text", encoding = "UTF-8")
    },
    error = function(e) {
      NULL
    }
  )
  if (length(data) != 1 || is.na(data)) {
    return(NA)
  }
  # The service reports an unknown hid/hnid as text containing this marker
  if (grepl("Exception during execution: ", data, fixed = TRUE)) {
    warn_string <- paste0("No data file for HID ", hid, " and HNID ", hnid)
    warning(warn_string)
    return(NA)
  }
  # Otherwise proceed; a reply that is not valid JSON counts as a failure
  r <- tryCatch(fromJSON(data), error = function(e) NULL)
  if (!is.list(r) || !is.null(r$Fault)) {
    return(NA)
  }
  hierarchies <- r$Hierarchies
  if (!is.list(hierarchies)) {
    return(NA)
  }
  cache_key <- hierarchies$CacheKey
  # Return NA rather than NULL so callers can always test the result with is.na()
  if (length(cache_key) == 0) {
    return(NA)
  }
  cache_key
}
This diff is collapsed.
1
253
461
598
838
864
892
936
938
978
1024
1050
1051
1052
1054
1130
1132
1593
2034
2091
2116
2320
2524
2557
2812
2972
3070
3249
4034
4055
4485
4677
4678
4760
4763
4873
5234
5382
5526
5743
5757
5816
5852
5886
5893
5957
5970
6019
6037
6042
6112
6116
6176
6202
6221
6613
6719
7436
8629
8714
8989
9068
9903
10209
10214
10359
10761
10762
10917
14457
14791
14985
14986
15032
17506
18989
20353
25517
27990
34755
36314
38882
47528
53232
55245
61711
62640
65036
65045
66250
68942
70846
71102
71124
71406
71433
83831
86472
87642
90484
92094
92140
92729
92745
92980
101989
102424
104817
104957
108150
121396
124886
125859
126941
131204
134070
155256
159247
160520
166583
171548
176168
184933
197855
214348
247704
363417
439423
440667
441298
443753
443894
443968
443990
444097
444679
445354
446313
449196
450783
452306
493570
638035
643975
643976
643989
644168
688382
2724354
2734019
2735208
3002119
3002120
3013212
3032279
3032771
3036928
3036929
3038097
3043678
3080672
3648667
4289526
4369188
4866774
5280374
5280447
5280453
5280483
5280489
5280531
5280540
5280585
5280675
5280793
5280795
5280838
5280845
5281010
5281058
5281104
5281107
5281234
5281235
5281915
5282168
5282181
5282190
5282347
5282348
5282349
5282350
5282367
5282368
5282375
5283547
5283674
5283675
5283676
5283677
5283678
5283679
5283681
5283682
5283683
5283684
5283685
5283686
5283687
5283688
5283689
5283690
5283691
5283692
5283693
5283694
5283695
5283696
5283697
5283698
5283699
5283700
5283703
5283704
5283705
5283706
5283707
5283708
5283709
5283710
5283711
5283712
5283713
5283714
5283715
5283716
5283717
5283718
5283719
5283720
5283721
5283722
5283723
5283724
5283725
5283726
5283727
5283728
5283729
5283730
5283731
5283732
5283733
5283734
5283735
5283736
5283737
5283738
5283739
5283740
5283741
5283742
5283743
5283744
5283745
5283746
5283747
5283748
5283749
5283750
5283751
5283752
5283753
5283754
5283755
5283756
5283757
5283758
5283759
5283760
5283761
5283762
5283763
5283764
5283765
5283766
5283767
5283768
5283769
5283770
5283771
5283772
5283773
5283774
5283775
5283776
5283777
5283778
5283779
5283780
5283781
5283782
5283783
5283784
5283785
5283786
5283787
5283788
5283789
5283790
5283791
5283792
5283793
5283794
5283795
5283796
5283797
5283798
5283799
5283800
5283906
5284515
5284607
5288149
5288670
5288783
5289547
5289548
5289549
5311071
5311498
5314030
5314031
5315257
5315463
5315709
5329098
5353325
5353466
5353527
5353584
5353610
5353843
5371993
5376004
5460135
5460204
5460373
5460702
5460703
5462265
5479203
5702045
5702050
5710148
5748487
5912745
5927371
6347547
6398761
6419724
6419725
6419877
6420088