R/rc.cmpd.get.pubchem.R

Defines functions rc.cmpd.get.pubchem

Documented in rc.cmpd.get.pubchem

#' rc.cmpd.get.pubchem
#'
#' use pubchem rest and view APIs to retrieve structures, CIDs (if a name or inchikey is given), synonyms, and optionally vendor data, when available. 
#' @details useful for moving from chemical name to digital structure representation. greek letters are assumed to be 'UTF-8' encoded, and are converted to latin text before searching.   if you are reading in your compound name list, do so with 'encoding' set to 'UTF-8'. 
#' @param ramclustObj RAMClust Object input.  if used, ramclustObj$CID, ramclustObj$inchikey, and ramclustObj$ann are used as input, in that order, and ramclustObj is returned with $pubchem slot appended.
#' @param cmpd.names character vector.  i.e. c("caffeine", "theobromine", "glucose")
#' @param cmpd.cid numeric integer vector.  i.e. c(2519, 5429, 107526)
#' @param cmpd.inchikey character vector.  i.e. c("RYYVLZVUVIJVGH-UHFFFAOYSA-N", "YAPQBXQYLJRXSA-UHFFFAOYSA-N", "GZCGUPFRVQAUEE-SLPGGIOYSA-N")
#' @param cmpd.smiles character vector.  i.e. c("CN1C=NC2=C1C(=O)N(C(=O)N2C)C", "CN1C=NC2=C1C(=O)NC(=O)N2C")
#' @param use.parent.cid logical.  If TRUE, the CID for each supplied name/inchikey is used to retrieve its parent CID (i.e. the parent of sodium palmitate is palmitic acid).  The parent CID is used to retrieve all other names, properties.
#' @param manual.entry logical.  if TRUE, user input is enabled for compounds not matched by name. A browser window will open with the pubchem search results in your default browser. 
#' @param get.vendors logical.  if TRUE, vendor data is returned for each compound with a matched CID.  Includes vendor count and vendor product URL, if available
#' @param priority.vendors charachter vector.  i.e. c("MyFavoriteCompany", "MySecondFavoriteCompany").  If these vendors are found, the URL returned is from priority vendors. Priority is given by order input by user. 
#' @param get.properties logical.  if TRUE, physicochemical property data are returned for each compound with a matched CID.
#' @param get.synonyms = TRUE. logical.  if TRUE, retrieve pubchem synonyms.  returned to $synonyms slot
#' @param find.short.lipid.name = TRUE. logical.  If TRUE, and get.synonyms = TRUE, looks for lipid short hand names in synonyms list (i.e. PC(36:6)). returned to $short.name slot.  Short names are assigned only if assign.short.names = TRUE.
#' @param find.short.synonym = TRUE. logical.  If TRUE, and get.synonyms = TRUE, looks for lipid short synonyms, with prioritization for names with fewer numeric characters (i.e. database accession numbers or CAS numbers). returned to $short.name slot.  Short names are assigned only if assign.short.names = TRUE.
#' @param max.name.length = 20.  integer.  If names are longer than this value, short names will be searched for, else, retain original name.
#' @param assign.short.name = TRUE.  If TRUE, short names from find.short.lipid.name and/or find.short.synonym = TRUE, short names are assigned the be the default annotation name ($ann slot), and original annotations are moved to $long.name slot.
#' @param all.props logical.  If TRUE, all pubchem properties (https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest$_Toc494865567) are returned.  If false, only a subset (faster).
#' @param get.bioassays logical. If TRUE, return a table summarizing existing bioassay data for that CID. 
#' @param get.pathways logical.  If TRUE, return a table of metabolic pathways for that CID.
#' @param write.csv logical.  If TRUE, write csv files of all returned pubchem data. 
#' @param search.name character.  optional name to assign to pubchem search to name output .csv files.   
#' @return returns a list with one or more of $pubchem (compound name and identifiers) - one row in dataframe per CID; $properties contains physicochemical properties - one row in dataframe per CID; $vendors contains the number of vendors for a given compound and selects a vendor based on 'priority.vendors' supplied, or randomly choses a vendor with a HTML link - one row in dataframe per CID;  $bioassays contains a summary of bioassay activity data from pubchem - zero to many rows in dataframe per CID
#' @author Corey Broeckling
#' 
#' @export 
#' 

rc.cmpd.get.pubchem <- function(
    ramclustObj = NULL,
    search.name = NULL,
    cmpd.names = NULL,
    cmpd.cid = NULL,
    cmpd.inchikey = NULL,
    cmpd.smiles = NULL,
    use.parent.cid = FALSE,
    manual.entry = FALSE,
    get.vendors = FALSE,
    priority.vendors = c("Sigma Aldrich", "Alfa Chemistry", "Acros Organics", "VWR", 
                         "Alfa Aesar", "molport", "Key Organics", "BLD Pharm"),
    get.properties = TRUE,
    all.props = FALSE,
    get.synonyms = TRUE,
    find.short.lipid.name = TRUE,
    find.short.synonym = TRUE,
    max.name.length = 30,
    assign.short.name = TRUE,
    get.bioassays = TRUE,
    get.pathways = TRUE,
    write.csv = TRUE
    
) {
  
  ## function to close failed pubchem queries to prevent 
  ## all connections are in use error
  closePubchemConnections <- function (desc.rem = "pubchem") {
    d <- showConnections(all = TRUE)
    desc <- d[,"description"]
    desc <- desc[grepl(desc.rem, desc)]
    set <- as.integer(as.numeric(names(desc)))
    if(length(set) > 0) {
      for (i in seq_along(set)) close(getConnection(set[i]))
    }
    gc()
    invisible()
  }
  
  
  ## test connection to pubchem servers
  html <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/58-08-2/property/inchikey/JSON"
  out <- tryCatch(
    {
      jsonlite::read_json(html)
    },
    error=function(cond) {
      closePubchemConnections()
      stop("pubchem rest connection could not be established. This may be due to:", '\n',
           "  -  lack of internet access", '\n',
           "  -  pubchem server is down", '\n',
           "  -  pubchem server has blocked access for this IP address (try restarting your R and Rstudio session)",
           '\n')
    },
    warning=function(cond) {
      warning("pubchem rest triggered a warning.", '\n')
    },
    finally={
      cons <- suppressWarnings(showConnections(all = TRUE)); rm(cons)
    }
  )
  rm(out)
  
  if(!is.null(ramclustObj)) {
    cmpd.names <- ramclustObj$ann
    if(!is.null(ramclustObj$inchikey)) {
      cmpd.inchikey <- ramclustObj$inchikey
    }
  }
  
  
  ## check that lengths of cmpd.* make sense, and standardize
  l <- c("cmpd.names" = length(cmpd.names), 
         "cmpd.cid" = length(cmpd.cid), 
         "cmpd.inchikey" = length(cmpd.inchikey),
         "cmpd.smiles" = length(cmpd.smiles)
  )
  if(l["cmpd.names"] != max(l) & l["cmpd.names"] > 0) {
    stop(
      paste(" Length cmpd.names = ", l['cmpd.names'], "while length", names(l)[which.max(l)], "=", max(l), '\n',
            " - these must be either exactly the same length or one must be NULL", sep = " ")
    )
  }
  
  if(l["cmpd.cid"] != max(l) & l["cmpd.cid"] > 0) {
    stop(
      paste(" Length cmpd.cid = ", l['cmpd.cid'], "while length", names(l)[which.max(l)], "=", max(l), '\n',
            " - these must be either exactly the same length or one must be NULL", sep = " ")
    )
  }
  
  if(l["cmpd.inchikey"] != max(l) & l["cmpd.inchikey"] > 0) {
    stop(
      paste(" Length cmpd.inchikey = ", l['cmpd.inchikey'], "while length", names(l)[which.max(l)], "=", max(l), '\n',
            " - these must be either exactly the same length or one must be NULL", sep = " ")
    )
  }
  
  if(l["cmpd.smiles"] != max(l) & l["cmpd.smiles"] > 0) {
    stop(
      paste(" Length cmpd.smiles = ", l['cmpd.smiles'], "while length", names(l)[which.max(l)], "=", max(l), '\n',
            " - these must be either exactly the same length or one must be NULL", sep = " ")
    )
  }
  
  if(l["cmpd.names"] < max(l)) cmpd.names <- rep(NA, max(l))
  if(l["cmpd.cid"] < max(l)) cmpd.cid <- rep(NA, max(l))
  if(l["cmpd.inchikey"] < max(l)) cmpd.inchikey <- rep(NA, max(l))
  if(l["cmpd.smiles"] < max(l)) cmpd.smiles <- rep(NA, max(l))
  
  ## store original data in data.frame
  d <- data.frame("user.cmpd" = cmpd.names, 
                  "user.cid" = cmpd.cid,
                  "user.inchikey" = cmpd.inchikey,
                  "use.smiles" = cmpd.smiles)
  
  pubchem <- list()
  
  ## clean up text
  cmpd.names <- trimws(cmpd.names)
  cmpd.names[which(nchar(cmpd.names) < 1)] <- NA
  cmpd.names <- gsub(" ", "%20", cmpd.names)
  cmpd.inchikey <- trimws(cmpd.inchikey)
  cmpd.inchikey[which(nchar(cmpd.inchikey) < 1)] <- NA
  cmpd.cid <- trimws(cmpd.cid)
  cmpd.cid[which(nchar(cmpd.cid) < 1)] <- NA
  cmpd.smiles <- trimws(cmpd.smiles)
  cmpd.smiles[which(nchar(cmpd.smiles) < 1)] <- NA
  
  
  greek <- read.csv(paste(find.package("RAMClustR"), "/params/greek.csv", sep=""), header=TRUE, encoding = "UTF-8", stringsAsFactors = FALSE)
  
  
  
  for(i in 1:nrow(greek)) {
    cmpd.names <- gsub(greek[i,2], greek[i,1], cmpd.names)
  }  
  
  if(!all(is.na(cmpd.smiles))) {
    html.smiles <- cmpd.smiles
    html.translation <- rbind(
      c("[", "%5B"),
      c("]", "%5D"),
      c("@", "%40"),
      c("/", "%2F"),
      c("\\", "%5C"),
      c("=", "%3D")
    )
    
    for(i in 1:length(html.smiles)) {
      if(is.na(html.smiles[i])) {next}
      html.smiles[i] <- as.character(html.smiles[i])
      html.smiles[i] <- trimws(unlist(strsplit(html.smiles[i], "|", fixed = TRUE))[1])
      for(j in 1:nrow(html.translation)) {
        html.smiles[i] <- gsub(html.translation[j,1], html.translation[j,2], html.smiles[i], fixed = TRUE)
      }
    }
    
  } else {
    html.smiles <- cmpd.smiles
  }
  
  
  
  ## get missing CIDs from inchikeys first
  ## if more than one inchikey per compound, lowest value CID is used
  do <- which(is.na(cmpd.cid) & !is.na(cmpd.inchikey))
  if(length(do) > 0) {
    cat("getting cid from inchikey", '\n')
    do <- cmpd.inchikey[do]
    do.l <- split(do, ceiling(seq_along(do)/1))
    for(i in 1:length(do.l)) {
      keep <- which(!do.l[[i]]=="NA")
      # cat(do.l[[i]][keep], '\n')
      Sys.sleep(0.2)
      if(length(keep)==0) next
      html <- paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/",
                     paste0(do.l[[i]][keep], collapse = ","),
                     "/property/", "inchikey", "/JSON")
      out <- tryCatch(
        {
          jsonlite::read_json(html)
        },
        error=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        warning=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        finally={
          closePubchemConnections()
          cons <- suppressWarnings(showConnections(all = TRUE)); rm(cons)
        }
      )
      if(is.na(out[1])) next
      tmp <- lapply(1:length(out$PropertyTable$Properties),
                    FUN = function(x) {
                      unlist(out$PropertyTable$Properties[[x]])
                    })
      tmp <- data.frame(t(data.frame(tmp, stringsAsFactors = FALSE)), stringsAsFactors = FALSE)
      tmp <- tmp[order(tmp[,"CID"], decreasing = TRUE),]
      
      for(x in 1:nrow(tmp)) {
        rp <- which(cmpd.inchikey == tmp[x,"InChIKey"])[1]
        if(!is.na(rp)) {cmpd.cid[rp] <- tmp[x, "CID"]}
      }
    }
  }
  
  
  ## get missing CIDs from smiles next
  do.ind <- which(is.na(cmpd.cid) & !is.na(cmpd.smiles))
  do <- html.smiles[do.ind]
  if(length(do) > 0) {
    for(i in 1:length(do)) {
      html <- paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/", do[i], "/cids/TXT")
      tmp <- tryCatch(
        {
          as.integer(as.character(readLines(html)))
        },
        error=function(cond) {
          closePubchemConnections()
          stop("pubchem rest connection could not be established. This may be due to:", '\n',
               "  -  lack of internet access", '\n',
               "  -  pubchem server is down", '\n',
               "  -  pubchem server has blocked access for this IP address (try restarting your R and Rstudio session)",
               '\n')
        },
        warning=function(x) {
          closePubchemConnections()
          NA
        },
        finally={
          closePubchemConnections()
          cons <- suppressWarnings(showConnections(all = TRUE)); rm(cons)
        }
        
      )
      if(!is.na(tmp)) cmpd.cid[do.ind[i]] <- tmp 
    }
  }
  
  ## get missing CIDs from names next
  do.ind <- which(is.na(cmpd.cid) & !is.na(cmpd.names))
  do <- cmpd.names[do.ind]
  # cat(do)
  if(length(do) > 0) {
    cat("getting cid from names", '\n')
    for(i in 1:length(do)) {
      Sys.sleep(0.2)
      # cat(do[i])
      html <- paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/",
                     do[i],
                     "/property/", "inchikey", "/JSON")
      # html <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/8007-46-3/property/inchikey/JSON"
      out <- tryCatch(
        {
          jsonlite::read_json(html)
        },
        error=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        warning=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        finally={
          closePubchemConnections()
          cons <- suppressWarnings(showConnections(all = TRUE)); rm(cons)
        }
      )
      if(is.na(out[1])) next
      tmp <- lapply(1:length(out$PropertyTable$Properties),
                    FUN = function(x) {
                      unlist(out$PropertyTable$Properties[[x]])
                    })
      tmp <- data.frame(t(data.frame(tmp, stringsAsFactors = FALSE)), stringsAsFactors = FALSE)
      tmp <- tmp[order(tmp[,"CID"], decreasing = TRUE),]
      cmpd.cid[do.ind[i]] <- tmp[1, "CID"]
      # cat(i, tmp[1, "CID"], do[i], '\n')
    }
  }
  
  
  ##
  cid <- cmpd.cid
  
  ### offer opportunity to revise CIDs with manual input, if need be. use name only (inchikey should be unambiguous)
  if(manual.entry){
    missing <- which(is.na(cid) & !is.na(cmpd.names)) 
    
    cat(length(missing), "missing CIDs", '\n',
        "  enter '1' to skip these compounds", '\n', 
        "  enter '2' to enter CIDs manually", '\n')
    readback <- readline()
    
    if(readback == '2') {
      for(i in missing) {
        cat("please enter CID number or hit 'enter' to skip to next (no CID found)", '\n',
            "If you wish to quit manual entry, enter 'q' to return current Output only.", '\n',
            cmpd.names[i], '\n')
        Sys.sleep(0.2)
        utils::browseURL(paste0("https://pubchem.ncbi.nlm.nih.gov/#query=", cmpd.names[i]))
        readback <- readline()
        if(readback == "q") {break}
        cmpd.cid[i] <- readback
        
      }
    }
  }
  
  cmpd.cid[which((cmpd.cid == "NA"))] <- NA
  cid <- cmpd.cid
  d <- data.frame(d, "cmpd.cid" = cmpd.cid, stringsAsFactors = FALSE)
  
  ## find.parent.cid, if TRUE
  if(use.parent.cid) {
    cat("getting parent cid from cid", '\n')
    parent.cid <- cmpd.cid
    # do.ind <- which(!is.na(cmpd.cid) & !is.na(cmpd.names))
    do.ind <- which(!is.na(cmpd.cid))
    for(i in do.ind) {
      Sys.sleep(0.2)
      html <- paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/",
                     cmpd.cid[i],
                     "/cids/TXT?cids_type=parent")
      out <- tryCatch(
        {
          readLines(html)
        },
        error=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        warning=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        finally={
          closePubchemConnections()
        }
      )
      if(is.na(out[1])) next
      out <- sort(as.numeric(out))
      parent.cid[i] <- out[1]
      rm(out)
    }
    cid <- parent.cid
    d <- data.frame(d, "parent.cid" = parent.cid, stringsAsFactors = FALSE)
  }
  
  d <- data.frame(d, 'cid' = cid, stringsAsFactors = FALSE)
  #  pubchem$compounds <- d
  
  cid.l <- split(cid, ceiling(seq_along(cid)/1))
  
  ## pubchem URL
  do <- which(!is.na(d$cid))
  urls <- paste0("https://pubchem.ncbi.nlm.nih.gov/compound/", d$cid[do])
  d$pubchem.url <- rep(NA, nrow(d))
  d$pubchem.url[do] <- urls
  
  ## get pubchem name
  pubchem.name <- rep(NA, length(cid))
  cat("getting pubchem compound name from cid", '\n')
  for(i in 1:length(cid.l)) {
    keep <- which(!cid.l[[i]]=="NA")
    if(length(keep) == 0) next
    # urls <- paste0("https://pubchem.ncbi.nlm.nih.gov/compound/", d$CID[do])
    # d$pubchem.url <- rep("", nrow(d))
    # d$pubchem.url[do] <- urls
    html <- paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/",
                   paste0(cid.l[[i]][keep], collapse = ","),
                   "/description/", "JSON")
    # out <- jsonlite::read_json(html)
    out <- tryCatch(
      {
        jsonlite::read_json(html)
      },
      error=function(cond) {
        closePubchemConnections()
        return(NA)
      },
      warning=function(cond) {
        closePubchemConnections()
        return(NA)
      },
      finally={
        closePubchemConnections()
      }
    )
    if(is.na(out[1])) next
    
    for(j in 1:length(out$InformationList$Information)) {
      tmp <- out$InformationList$Information[[j]]
      if(is.null(tmp$Title)) next
      tmp.title <- tmp$Title
      pubchem.name[which(cid == tmp$CID)] <- tmp.title
    }
  }
  
  d <- data.frame(d, "pubchem.name" = pubchem.name, stringsAsFactors = FALSE)
  pubchem$pubchem <- d
  
  ## Get properties
  if(get.properties) {
    cat("getting physicochemical properties and structure representations from cid", '\n')
    if(all.props) {
      props <- c(
        'MolecularFormula',
        'MolecularWeight',
        'CanonicalSMILES',
        'IsomericSMILES',
        'InChI',
        'InChIKey',
        'IUPACName',
        'XLogP',
        'ExactMass',
        'MonoisotopicMass',
        'TPSA',
        'Complexity',
        'Charge',
        'HBondDonorCount',
        'HBondAcceptorCount',
        'RotatableBondCount',
        'HeavyAtomCount',
        'IsotopeAtomCount',
        'AtomStereoCount',
        'DefinedAtomStereoCount',
        'UndefinedAtomStereoCount',
        'BondStereoCount',
        'DefinedBondStereoCount',
        'UndefinedBondStereoCount',
        'CovalentUnitCount',
        'Volume3D',
        'XStericQuadrupole3D',
        'YStericQuadrupole3D',
        'ZStericQuadrupole3D',
        'FeatureCount3D',
        'FeatureAcceptorCount3D',
        'FeatureDonorCount3D',
        'FeatureAnionCount3D',
        'FeatureCationCount3D',
        'FeatureRingCount3D',
        'FeatureHydrophobeCount3D',
        'ConformerModelRMSD3D',
        'EffectiveRotorCount3D',
        'ConformerCount3D',
        'Fingerprint2D'
      )
    } else {
      props <- c(
        'MolecularFormula',
        'MolecularWeight',
        'CanonicalSMILES',
        'IsomericSMILES',
        'InChI',
        'InChIKey',
        'XLogP',
        'ExactMass',
        'MonoisotopicMass',
        'TPSA',
        'HBondDonorCount',
        'HBondAcceptorCount'
      )
    }
    
    properties <- d[,0]
    for(i in 1:length(cid.l)) {
      Sys.sleep(0.2)
      keep <- which(!cid.l[[i]]=="NA")
      if(length(keep) == 0) next
      html <- paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/",
                     paste0(cid.l[[i]][keep], collapse = ","),
                     "/property/", paste0(props, collapse =","), "/JSON")
      # out <- jsonlite::read_json(html)
      out <- tryCatch(
        {
          jsonlite::read_json(html)
        },
        error=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        warning=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        finally={
          closePubchemConnections()
        }
      )
      if(is.na(out)) next
      for(j in 1:length(out$PropertyTable$Properties)) {
        tmp <- data.frame(out$PropertyTable$Properties[[j]], stringsAsFactors = FALSE)
        properties[which(cid == tmp$CID),names(tmp)] <- tmp
      }
    }
    dimnames(properties)[[2]][1] <- "cid"
    pubchem$properties <- properties
  }
  
  if(get.vendors) {
    cat("getting vendor data from cid", '\n')
    vendors <- d[,"cid", drop = FALSE]
    do <- which(!is.na(d$cid))
    urls <- paste0(pubchem$pubchem$pubchem.url[do], "#section=Chemical-Vendors")
    n.vendors <- rep(NA, nrow(d))
    vendor.urls <- rep(NA, nrow(d))
    pubchem.vendors.url <-rep(NA, nrow(d))
    for(i in do) {
      Sys.sleep(0.2)
      out <- tryCatch(
        {
          jsonlite::read_json(paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/categories/compound/",
                                     cid[i],
                                     "/JSON"))
        },
        error=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        warning=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        finally={
          closePubchemConnections()
        }
      )
      if(is.na(out)) next
      cats <- sapply(1:length(out$SourceCategories$Categories), FUN = function(x) {
        out$SourceCategories$Categories[[x]]$Category
      })
      cat.select <- which(cats == "Chemical Vendors")
      if(length(cat.select) == 0) next
      n.vendors[i] <- length(out$SourceCategories$Categories[[cat.select]]$Sources)
      
      vendor.names <- sapply(1:length(out$SourceCategories$Categories[[cat.select]]$Sources),
                             FUN = function(x) {
                               out$SourceCategories$Categories[[cat.select]]$Sources[[x]]$"SourceName"
                             }
      )
      for(j in priority.vendors) {
        use <- agrep(j, vendor.names,  max.distance = 0.2)
        if(length(use) > 0) {
          vendor.url <- out$SourceCategories$Categories[[cat.select]]$Sources[[use[1]]]$SourceRecordURL[1]
          if(length(vendor.url) == 0) vendor.url <- NA
          vendor.urls[i] <- vendor.url
          break
        }
      }
      if(is.na(vendor.urls[i]))  {
        vendor.url <- out$SourceCategories$Categories[[cat.select]]$Sources[[sample((1:length(vendor.names)), 1)]]$"SourceRecordURL"[1]
        if(length(vendor.url) == 0) vendor.url <- ""
        vendor.urls[i] <- vendor.url
      }
      pubchem.vendors.url[do] <- urls
    }
    
    vendors <- data.frame(vendors, 
                          "pubchem.vendors.url" = pubchem.vendors.url,
                          "n.vendors" = n.vendors, 
                          "vendor.url" = vendor.urls, 
                          stringsAsFactors = FALSE)
    pubchem$vendors <- vendors
    
  }
  
  if(get.synonyms) {
    cat("getting synonym data from cid", '\n')
    
    pubchem$synonyms <- as.list(rep(NA, length(ramclustObj$cmpd)))
    names(pubchem$synonyms) <- ramclustObj$cmpd
    
    if(find.short.lipid.name | find.short.synonym) {
      pubchem$short.name <- rep(NA, nrow(d))
    }
    
    for(i in 1:length(cid.l)) {
      keep <- which(!cid.l[[i]]=="NA")
      if(length(keep) == 0) next
      html <- paste0("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/",
                     paste0(cid.l[[i]][keep], collapse = ","),
                     "/synonyms/JSON")
      #out <- jsonlite::read_json(html)
      out <- tryCatch(
        {
          jsonlite::read_json(html)
        },
        error=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        warning=function(cond) {
          closePubchemConnections()
          return(NA)
        },
        finally={
          closePubchemConnections()
        }
      )
      if(is.na(out)) next
      
      for(j in 1:length(out$InformationList$Information)) {
        on <- which(cid ==  cid.l[[i]][keep[j]])
        if(length(on)==0) next
        if(length(on) > 1) {
          if(!(any(ls() == "multi.cid"))) {
            multi.cid <- rep(1, 1)
            names(multi.cid)[1] <- cid.l[[i]][keep[j]]
            on <- on[1]
          } else {
            if(any(names(multi.cid) == cid.l[[i]][keep[j]])) {
              which.multi <- which(names(multi.cid) == cid.l[[i]][keep[j]])
              multi.cid[which.multi] <- multi.cid[which.multi] + 1
              on <- on[multi.cid[which.multi]]
            } else {
              multi.cid <- c(multi.cid, 1)
              names(multi.cid)[length(multi.cid)] <- cid.l[[i]][keep[j]]
              on <- on[1]
            }
          }
        }
        tmp <- out$InformationList$Information[[j]]
        if(is.null(tmp$Synonym)) next
        syns <- unlist(tmp$Synonym)
        pubchem$synonyms[[on]] <- syns
        
        if(find.short.lipid.name) {
          if(length(pubchem$synonyms[[on]]) > 1) {
            tars <- pubchem$synonyms[[on]][which(
              stringr::str_detect(pubchem$synonyms[[on]], "\\([0-9]{1,2}\\:[0-9]{1,2}\\)") && (nchar(pubchem$synonyms[[on]]) <= max.name.length)
            )]
            if(length(tars) > 0) {
              nc <- nchar(tars)
              pubchem$short.name[on] <- tars[which.min(nc)]
            }
          }
        }
        
        if(find.short.synonym & is.na(pubchem$short.name[on])) {
          if(length(pubchem$synonyms[[on]]) > 1) {
            letter.rich <- nchar(pubchem$synonyms[[on]])
            nchars <- nchar(pubchem$synonyms[[on]])
            pattern <- "[[:digit:]]+"
            numbers <- sapply(1:length(pubchem$synonyms[[on]]), FUN = function(k) {
              nchar(paste(unlist(regmatches(pubchem$synonyms[[on]][k], 
                                            gregexpr(pattern, pubchem$synonyms[[on]][k]))), 
                          collapse = "", sep = ""))
            })
            letter.rich <- 1- (numbers/nchars)
            use <- which(nchars < max.name.length & letter.rich > 0.6)
            if(length(use) == 0) {
              use <- which(nchars < max.name.length & letter.rich > 0.3)
            }
            if(length(use) == 0) {
              use <- which(nchars < max.name.length)
            }
            if(length(use) == 0) {
              use <- which.min(nchars)
            }
            if(length(use) == 0) next
            use <- use[1]
            pubchem$short.name[on] <- pubchem$synonyms[[on]][use]
          }
        }
      }
      
      
      
      if(assign.short.name & !is.null(ramclustObj)) {
        ramclustObj$original.ann <- ramclustObj$ann
        ramclustObj$ann[which(!(is.na(pubchem$short.name)))] <- pubchem$short.name[which(!(is.na(pubchem$short.name)))]
      }
    }
  }
  
  
  if(get.bioassays) {
    cat("getting bioasssay from cid", '\n')
    for(i in 1:length(cid.l)) {
      keep <- which(!cid.l[[i]]=="NA")
      if(length(keep) == 0) next
      url <- paste0(
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/",
        paste(cid.l[[i]][keep], collapse = ","),
        "/assaysummary/CSV"
      )
      
      bd <- tryCatch(
        {
          read.csv(url)
        },
        error=function(cond) {
          closePubchemConnections()
          return( data.frame("cid" = rep(NA, 0)))
        },
        warning=function(cond) {
          closePubchemConnections()
          return( data.frame("cid" = rep(NA, 0)))
        },
        finally={
          closePubchemConnections()
        }
      )
      if(nrow(bd)==0) next
      if(any(ls()=="bioassays")) {
        # cat('registered TRUE')
        bioassays <- rbind(bioassays, bd)
        
      } else {
        # cat('registered FALSE')
        bioassays <- bd       
      }
    }
    if(any(ls()=="bioassays")) {
      dimnames(bioassays)[[2]][which(dimnames(bioassays)[[2]] == "CID")] <- "cid"
      pubchem$bioassays <- bioassays
    } else {
      bioassays <- data.frame("cid" = rep(NA, 0))
      pubchem$bioassays <- bioassays
    }
  }
  
  if(get.pathways) {
    cat("getting pathway from cid", '\n')

    for(i in 1:length(cid)) {
      pathway <- get.pathways(cid = cid[i])
      pathway <- cbind("cid" = rep(cid[i]), pathway)
      if(!any(ls()=="pathways")) pathways <- pathway[0,]
      if(is.na(pathway[1,'name'])) next
      pathways <- rbind(pathways, pathway)
      
    }
    pubchem$pathways <- pathways
  }
  
  for(i in 1:length(pubchem)) {
    if(!is.data.frame(pubchem[[i]])) next
    if(nrow(pubchem[[i]]) == length(ramclustObj$cmpd)) {
      pubchem[[i]] <- data.frame("cmpd" = ramclustObj$cmpd, pubchem[[i]])
    }
  }
  
  if(write.csv) {
    if(is.null(search.name)) {search.name = "pubchem.data"}
    dir.create(search.name)
    write.csv(pubchem[[1]], file = paste0(search.name,"/",names(pubchem)[1], ".csv"))
    if(length(pubchem)>1) {
      for(i in 2:length(pubchem)) {
        if(is.data.frame(pubchem[[i]])) {
          write.csv(pubchem[[i]], 
                    file = paste0(search.name,"/",names(pubchem)[i], ".csv"),
                    row.names = FALSE)
        }
      }
    }
  }
  
  if(!is.null(ramclustObj)) {
    ramclustObj$pubchem <- pubchem
    return(ramclustObj)
  } else {
    return(pubchem)
  }
  
}

Try the RAMClustR package in your browser

Any scripts or data that you put into this service are public.

RAMClustR documentation built on Oct. 20, 2023, 5:08 p.m.