rpubchem: An Interface to the PubChem Collection

Documented in decodeCACTVS get.cid get.cid.list get.sid get.sid.list

## Taken from BMS package
.hexcode.binvec.convert <-
function (length.of.binvec) 
{
    if (length(length.of.binvec) > 1) 
        length.of.binvec = length(length.of.binvec)
    addpositions = 4 - length.of.binvec%%4
    positionsby4 = (length.of.binvec + addpositions)/4
    hexvec = c(0:9, "a", "b", "c", "d", "e", "f")
    hexcodelist = list(`0` = numeric(4), `1` = c(0, 0, 0, 1), 
        `2` = c(0, 0, 1, 0), `3` = c(0, 0, 1, 1), `4` = c(0, 
            1, 0, 0), `5` = c(0, 1, 0, 1), `6` = c(0, 1, 1, 0), 
        `7` = c(0, 1, 1, 1), `8` = c(1, 0, 0, 0), `9` = c(1, 
            0, 0, 1), a = c(1, 0, 1, 0), b = c(1, 0, 1, 1), c = c(1, 
            1, 0, 0), d = c(1, 1, 0, 1), e = c(1, 1, 1, 0), f = c(1, 
            1, 1, 1))
    return(list(as.hexcode = function(binvec) {
        incl = c(numeric(addpositions), binvec)
        dim(incl) = c(4, positionsby4)
        return(paste(hexvec[crossprod(incl, 2L^(3:0)) + 1], collapse = ""))
    }, as.binvec = function(hexcode) {
        return(unlist(hexcodelist[unlist(strsplit(hexcode, "", 
            fixed = TRUE), recursive = FALSE, use.names = FALSE)], 
            recursive = FALSE, use.names = FALSE)[-(1:addpositions)])
    }))
}
## Taken from BMS package
.hex2bin <-
function (hexcode) 
{
    if (!is.character(hexcode)) 
        stop("please input a character like '0af34c'")
    hexcode <- paste("0", tolower(hexcode), sep = "")
    hexobj <- .hexcode.binvec.convert(length(hexcode) * 16L)
    return(hexobj$as.binvec(hexcode))
}

## Convert base64 encoded CACTVS fp to fingerprint object
## As described in ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
## last 7 bits are padding

#' Convert a Base64 encoded Pubchem 881-bit fingerprint to a \code{fingerprint} object
#'
#' Pubchem computes 881-bit structural keys using the CACTVS toolkit, which are made
#' available as Base64 encoded strings. This method converts the Pubchem string to
#' a \code{fingerprint} object, which can be manipulated using the \code{fingerprint}
#' package.
#'
#' @param cactvs A character string containing the Base64 encoded fingerprint
#' @return A \code{fingerprint} object
#' @seealso \code{\link{get.cid}}
#'
#' @export 
decodeCACTVS <- function(cactvs) {
  h <- base64decode(cactvs)
  bits <- unlist(lapply(h[-c(1:4)], function(x) .hex2bin(as.character(x))))
  bits <- bits[1:881]
  fp <- methods::new('fingerprint', nbit=881, bits=which(bits==1), provider='pubchem')
  return(fp)
}

.extract.fields <- function(doc) {

  .itemNames <- c('IUPACName','CanonicalSmiles','MolecularFormula','MolecularWeight', 'TotalFormalCharge',
                  'XLogP', 'HydrogenBondDonorCount', 'HydrogenBondAcceptorCount',
                  'HeavyAtomCount', 'TPSA')
  docsums <- getNodeSet(doc, '/eSummaryResult/DocSum')
  ret <- lapply(docsums, function(docsum) {
    nodes <- Filter(function(x) xmlGetAttr(x, 'Name') %in% .itemNames, docsum['Item'])

    ## Need to handle Pubchem weirdness
    if (length(nodes) != length(.itemNames)) {
      tmp <- .itemNames
      tmp[2] <- 'CanonicalSmile'
      nodes <- Filter(function(x) xmlGetAttr(x, 'Name') %in% tmp, docsum['Item'])      
      if (length(nodes) != length(tmp)) stop("Pubchem eUtils XML format has changed (?)")
    }

    values <- sapply(nodes, xmlValue)
    values <- t(values)
    dat <- data.frame(values)
    names(dat) <- .itemNames

    ## set types appropriately
    types <- sapply(nodes, function(x) xmlGetAttr(x, 'Type'))
    for (i in 1:length(types)) {
      if (types[i] == 'String') dat[,i] <- as.character(dat[,i])
      else if (types[i] == 'Integer') dat[,i] <- as.integer(dat[,i])
      else if (types[i] == 'Float') dat[,i] <- as.numeric(dat[,i])    
    }
    
    ## Look for the CompoundIdList item
    cid <- NA
    cidl <- Filter(function(x) xmlGetAttr(x, 'Name') == 'CompoundIDList', docsum['Item'])
    if (length(cidl) > 0) {
      cid <- xmlValue(cidl['Item'][[1]])
    }
    return(cbind(CID=cid, dat))
  })
  return(do.call(rbind, ret))
}

get.sid <- function(sid, quiet=TRUE, from.file=FALSE) {
  
  datafile <- NA
  
  if (!from.file) {
    sidURL <- 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?tool=rpubchem&db=pcsubstance&id='
    url <- paste(sidURL, paste(sid,sep='',collapse=','), sep='', collapse='')
    datafile <- tempfile(pattern = 'sid')
    .get.xml.file(url, datafile, quiet)
  } else {
    datafile <- sid
  }

  doc <- xmlParse(datafile)
  .extract.fields(doc)
}

.isinchikey <- function(s) { 
     (s==toupper(s))&(nchar(s) == 27)&(substr(s,15,15)=="-")&(substr(s,26,26)=="-") }

get.cid <- function(cid, quiet=TRUE) {
  url <- sprintf('https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/%d/JSON', cid)
  page <- .read.url(url)
  if (is.null(page)) {
    warning(sprintf("No data found for %d", cid))
    return(NULL)
  }
  record <- fromJSON(content=page)$Record
  sections <- record$Section

  ## Identifiers
  ids <- .section.by.heading(sections, "Names and Identifiers")
  ids <- .section.by.heading(ids$Section, "Computed Descriptors")
  ivals <- lapply(ids$Section, .section.handler)
  ivals.unlisted <- unlist(Filter(function(x) !is.null(x), ivals), recursive=TRUE)
  ivals <- data.frame(do.call(cbind, as.list(ivals.unlisted)), stringsAsFactors=FALSE)

  ## Process chemprops
  props <- .section.by.heading(sections, "Chemical and Physical Properties")
  if (is.null(props)) {
    warning(sprintf("No phys/chem properties section for %d", cid))
    return(NULL)
  }
  
  propnames <- c("Molecular Weight", "XLogP3", "Hydrogen Bond Donor Count",
                 "Hydrogen Bond Acceptor Count", "Rotatable Bond Count",
                 "Exact Mass", "Monoisotopic Mass", "Topological Polar Surface Area",
                 "Heavy Atom Count", "Formal Charge", "Complexity")
  computed <- .section.by.heading(props$Section, "Computed Properties")
  cvals <- lapply(propnames, function(x) {
    sec <- .section.by.heading(computed$Section, x)
    return(unlist(.section.handler(sec)))
  })
  cvals <- data.frame(t(sapply(cvals,c)))

  experimental <- .section.by.heading(props$Section, "Experimental Properties")
  if (is.null(experimental)) {
    evals <- data.frame(pKa=NA,"Kovats Retention Index"=NA)
  } else {
    evals <- lapply(experimental$Section, .section.handler,
                    keep = c('pKa', "Kovats Retention Index"))
    evals <- unlist(Filter(function(x) !is.null(x), evals), recursive=FALSE)
    if (is.null(evals))
      evals <- data.frame(pKa=NA, "Kovats Retention Index"=NA)
    else
      evals <- do.call(cbind, evals)
  }

  ## Record descriptions, added as attrs to the final data.frame
  .find.ref.by.id <- function(rid) Filter(function(x) as.numeric(x$ReferenceNumber) == rid, record$Reference)
  
  ids <- .section.by.heading(sections, "Names and Identifiers")
  ids <- .section.by.heading(ids$Section, "Record Description")
  si.items <- lapply(ids$Information, function(x) {
      if ("Description" %in% names(x)) {
          desc <- x$Description
      } else desc <- ""
      ref <- as.numeric(x$ReferenceNumber)
      val <- x$StringValue

      refitem <- .find.ref.by.id(ref)
      if (length(refitem) == 1)
          refitem = refitem[[1]]
      
      list(refnum=ref, desc=desc, value=val,
           refsource=refitem$SourceName,
           refsourceid=refitem$SourceID,
           refurl=refitem$URL)
  })

  ret <- data.frame(CID=cid, ivals, cvals, evals, stringsAsFactors=FALSE)
  attr(ret, "Summary.Information") <- si.items
  
  return(ret)
}

## .get.cid.old  <- function(cid, quiet=TRUE, from.file=FALSE) {

##   datafile <- NA
  
##   if (!from.file) {
##     cidURL <- 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?tool=rpubchem&db=pccompound&id='
##     url <- paste(cidURL, paste(cid,sep='',collapse=','), sep='', collapse='')
##     datafile <- tempfile(pattern = 'cid')
##     .get.xml.file(url, datafile, quiet)
##   } else {
##     datafile <- cid
##   }

##   doc <- xmlParse(datafile)
##   dat <- .extract.fields(doc)
##   dat$CID <- cid
##   return(dat)
## }

get.cid.list <- function(sid,  quiet=TRUE) {
  return(.cmpd.id2id(sid, 'sid', 'cids', quiet))
}

get.sid.list <- function(cid, quiet=TRUE) {
  return(.cmpd.id2id(cid, 'cid', 'sids', quiet))    
}