Commit a041e764 authored by Emma Schymanski's avatar Emma Schymanski
Browse files

Update extractAnnotations.R

added getPcAnno.Transformations
parent a34bafa3
......@@ -38,6 +38,7 @@ library(RChemMass)
#'
#' @examples
#' n_pages <- getPcAnno.TotalPages('HSDB','Metabolism/Metabolites')
#' n_pages <- getPcAnno.TotalPages("NORMAN Suspect List Exchange","Transformations")
#'
#' @export
getPcAnno.TotalPages <- function(source, heading, timeout=100) {
......@@ -46,6 +47,8 @@ getPcAnno.TotalPages <- function(source, heading, timeout=100) {
midURL <- '&heading='
endURL <- '&heading_type=Compound&page=1'
url <- paste0(baseURL, source, midURL, heading, endURL)
#Example:
#https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/heading/JSON/?source=NORMAN%20Suspect%20List%20Exchange&heading_type=Compound&heading=Transformations&page=1
errorvar <- 0
currEnvir <- environment()
......@@ -409,6 +412,111 @@ getPcAnno.TPcids <- function(cid,anno_csv_name,tp_csv=TRUE) {
}
#' Retrieve CIDs with Transformations Entries (by page)
#'
#' Retrieve the CIDs with Transformations entries from the
#' annotation information at the Transformation
#' section of data sources, per page. As this requires
#' retrieving the entire JSON file (up to 1000 entries), this can take
#' some time (hence large default timeout). Thanks to Paul Thiessen,
#' Jeff Zhang and Evan Bolton from PubChem team for assistance.
#'
#' @usage getPcAnno.Transformations(source, heading, page=1, timeout=100)
#'
#' @param source Data source name of the annotations data, e.g. \code{"NORMAN Suspect List Exchange"}
#' @param heading Annotation category desired, e.g. \code{"Transformations"}
#' @param page Page number to retrieve. Use \link{getPcAnno.TotalPages} to
#' determine page numbers available
#' @param timeout The timeout, in seconds. Should be generous as these are large files
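#' @return A character vector with one CID per annotation on the requested
#' page (\code{NA} where an annotation has no linked CID), or \code{NA} if the
#' request fails or PubChem reports a fault.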
#'
#' @details
#' This function will only extract CIDs and thus probably only works for
#' Transformations sections (and/or any other section where only CIDs are
#' needed)
#'
#' @author Emma Schymanski <emma.schymanski@@uni.lu>
#'
#' @references
#' PubChem: \url{http://pubchem.ncbi.nlm.nih.gov/}
#'
#' PubChem Data Sources (with annotations):
#' \url{https://pubchem.ncbi.nlm.nih.gov/source/#type=Annotations}
#'
#' Transformations Example:
#' \url{https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/heading/JSON/?source=NORMAN%20Suspect%20List%20Exchange&heading_type=Compound&heading=Transformations&page=1}
#'
#' @seealso \code{\link{getPcAnno.TotalPages}}, \code{\link{getPcAnno.Metabolism}}
#'
#' @examples
#' # This retrieves only the CIDs from the first page
#' getPcAnno.Transformations(source = 'NORMAN Suspect List Exchange',heading = 'Transformations')
#' # This will retrieve all as a list of CIDs
#' source <- "NORMAN Suspect List Exchange"
#' heading <- "Transformations"
#' n_pages <- getPcAnno.TotalPages(source, heading)
#' for (i in 1:n_pages) {
#' if (i == 1) {
#' cids <- getPcAnno.Transformations(source, heading, page=i)
#' } else {
#' cids_px <- getPcAnno.Transformations(source, heading, page=i)
#' cids <- c(cids,cids_px)
#' }
#' }
#' # May 2021 this was 4730 CIDs
#'
#' @export
getPcAnno.Transformations <- function(source, heading, page=1, timeout=100) {
  # Build the PUG View "annotations by heading" URL for the requested
  # source / heading / page, e.g.
  # .../annotations/heading/JSON/?source=HSDB&heading=Metabolism/Metabolites&heading_type=Compound&page=1
  baseURL <- 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/heading/JSON/?source='
  midURL <- '&heading='
  endURL <- '&heading_type=Compound&page='
  url <- paste0(baseURL, source, midURL, heading, endURL, page)

  # Fetch and parse in a single guarded step: a network failure, a timeout
  # or a non-JSON body (e.g. an HTML error page) all yield NULL here and are
  # reported to the caller as NA below.
  r <- tryCatch(
    {
      # Pass the timeout on to the request so the `timeout` argument
      # actually limits how long we wait for these large files.
      res <- httr::GET(URLencode(url), httr::timeout(timeout))
      data <- httr::content(res, type="text", encoding="UTF-8")
      fromJSON(data)
    },
    error=function(e) {
      NULL
    })

  # NA signals "nothing usable retrieved": request/parse failure, an
  # unexpected (non-list) payload, or a Fault element in the parsed reply.
  if (!is.list(r) || !is.null(r$Fault)) {
    return(NA)
  }

  annotations <- r$Annotations$Annotation # up to 1000 per page

  # One entry per annotation, preserving order. vapply over an empty or
  # missing annotation list gives character(0) rather than looping over 1:0.
  cids <- vapply(
    annotations,
    function(anno) {
      cid <- anno$LinkedRecords$CID
      if (length(cid) == 0) {
        # No linked compound recorded for this annotation.
        NA_character_
      } else {
        # Keep exactly one CID per annotation (the first listed) so the
        # result stays aligned with the annotations on the page.
        as.character(cid[[1]])
      }
    },
    character(1),
    USE.NAMES=FALSE)

  return(cids)
}
#' Retrieve Transformations Table from PubChem Transformation Sections
#'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment