R/NCBI_getters.R

Defines functions buildEntrezGeneDbFromWebServices getDataAndAddToDb makeOrgDB makeUnWoundGOTables unwindGOs makeSimpleDF convertNullToNA combineTwoLists mergeLists getEntrezGenesFromTaxId getGeneStuff getGOInfo modifyGOs padGOIds getSubNodeInfo checkRetrieveNode getDBTag getOtherSrcDocs

## This file is for code that calls NCBI and then retrieves data to populate a
## local DB (eventually on the fly).

## I need to have functions that will make tables in a DB, but I also need
## (separate) function(s)? that get the "stuff" from NCBI.  Let's start with
## that.

## Lets start by just making this thing get me some PMIDs for a string of EGs

## getNucleotideStuff <- function(x, ret="acc"){
##   baseUrl <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
##   xsep <- paste(x, collapse=",")
##   url <- paste(baseUrl,"db=nucleotide&id=",xsep,"&rettype=",ret, sep="")
##   readLines(url)
## }

## egs = c("10","4557270")
## getNucleotideStuff(egs)
## getNucleotideStuff(egs, "gb")
## getNucleotideStuff(egs, "gp")
## getNucleotideStuff(egs, "seqid")
## getNucleotideStuff(egs, "ft")



## Another way to approach this is something like this:
## str(sapply(EG, function(elt) unlist(xpathApply(xmlDoc(elt), "//PubMedId", xmlValue))))

## where EGSet is the whole doc ie.
## EGSet = xmlParse(url)
## for (i in 1:n) print(xpathApply(EGSet, sprintf("count(//Entrezgene[%d]//PubMedId)", i)))
## for (i in 1:n) print(xpathApply(EGSet, sprintf("//Entrezgene[%d]//PubMedId/text()", i), xmlValue))

## or
## make queries
## xpq = sprintf("//Entrezgene[%d]//PubMedId", 1:n)
## lapply(xpq, xpathApply, doc=EGSet, fun=xmlValue)

## Other notes from discussion of Xpath with Martin on how to use XPath to
## retrieve more from a node without allocating a node...
## library(XML)
## base <-
## url <-
## sprintf("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=%s&retmode=xml",
##                "1,10,100")
## eg <- xmlParse(url)
## sets <- getNodeSet(eg, "//Entrezgene")
## res <-
##     xpathApply(sets[[1]],
## "//Other-source[//Dbtag_db/text()='GO']//Object-id_id/text()", xmlValue)

## xpathApply(eg, "count(//Dbtag_db[text()='GO'])")
## xpathApply(eg, "//Dbtag_db[text()='GO']/text()")



#####################################################################
## helper functions for retrieving data from nodes 

## The following is a generic solution for getting stuff from sub-nodes
## Generic way to get src Docs

.getOtherSrcDocs <- function(doc, otherSourcePath){
  ## for each doc, get other Source nodes and make mini-docs
  srcDocs <- lapply(getNodeSet(doc, otherSourcePath), xmlDoc)
  srcDocs
}

## retrieve Db tag from such an other source doc.
.getDBTag <- function(srcDoc, dbTagPath){
  dbTag <- unlist(xpathApply(srcDoc, dbTagPath, xmlValue))
  if(is.null(dbTag)) dbTag <- "Bummer, the dbTag is NULL."
  dbTag
}


## check the node and return the value requested
.checkRetrieveNode <- function(doc, dbTagPath, resIDPath, type){
  results = vector("list",length(dbTagPath))
  for(i in seq_len(length(dbTagPath))){
    if(.getDBTag(doc, dbTagPath[i])==type
       || (is.null(type) & dbTagPath==resIDPath)){
      results[[i]] <- (unlist(xpathApply(doc, resIDPath[i], xmlValue)))
    }
  }
  if(!is.null(results))return(results)
}



## default values just get you KEGG gene IDs (for now)
.getSubNodeInfo <- function(doc,
                           type= "KEGG",
                           otherSourcePath = "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_source/Other-source",
                           dbTagPath = "/Other-source/Other-source_src/Dbtag/Dbtag_db",
                           resIDPath = "/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_str"){
  ## Get sub-nodes for looped parsing
  srcDocs <- .getOtherSrcDocs(doc, otherSourcePath)

  ## check to make sure we don't have unequal arguments
  if(length(dbTagPath) != length(resIDPath))
    stop("result and dbTag (checking) paths must have same number of elements")

  if(length(dbTagPath) == 1){
    results <- unlist(lapply(srcDocs, .checkRetrieveNode,
                             dbTagPath, resIDPath, type))
  }else{  ##compound result is expected so don't simplify it.
    results <- lapply(srcDocs, .checkRetrieveNode,
                             dbTagPath, resIDPath, type)
  }
}




## GO attempt #2 (still fails since there are apparently some GO IDs where
## there is not a GO evidence code available. So I need to fully vectorize the generic code.
## A convenience function to pad GO Ids
.padGOIds <- function(GOIds){
  pads <- 7-nchar(GOIds)
  res<-character(length(GOIds))
  for(i in seq_len(length(GOIds))){
    res[i] <- paste(paste(rep("0",pads[i]),collapse=""),GOIds[i],sep="")
  }
  paste("GO:",res,sep="")
}

.modifyGOs <- function(elem){
  elem[[1]] <- .padGOIds(elem[[1]])
  elem[[2]] <- gsub("evidence: ", "", elem[[2]])
  elem
}

## for GO retrieval I have partially vectorized the helper functions.
getGOInfo <- function(doc){
  
  GOIds <- .getSubNodeInfo(doc, type = "GO",
                    otherSourcePath = "/Entrezgene/Entrezgene_properties/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_source/Other-source",
                           dbTagPath = rep("/Other-source/Other-source_src/Dbtag/Dbtag_db", 2),
                           resIDPath = c("/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_id", "/Other-source/Other-source_post-text"))

  GOIds <- lapply(GOIds, .modifyGOs)  
  GOIds
}

getGeneStuff <- function(entrezGenes, dir = "files"){
  baseUrl <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
  xsep <- paste(entrezGenes, collapse=",")
  url <- paste(baseUrl,"db=gene&id=",xsep,"&retmode=xml", sep="")

  ## NOW we have to parse the available XML
  EGSet <- xmlParse(getURL(url, followlocation=TRUE))

  ## Here we will save the files out to a dir
  miniDocs <- lapply(getNodeSet(EGSet, "//Entrezgene"), xmlDoc)
  wd <- getwd()
  path <- paste(wd, dir, sep="/")
  if(!file.exists(path)){dir.create(path)}
  setwd(path)
  paths <- paste(paste(path, entrezGenes, sep="/"),".xml",sep="")
  for(i in seq_len(length(miniDocs))){
    saveXML( miniDocs[[i]], file = paths[[i]])
  }
  setwd(wd)
 
  ## attempt to remove miniDocs each time?
  ## (something tells me this is a token gesture only)
  rm(miniDocs)
  ## even just doing this (saving the files) takes nearly 30 gigs of RAM before
  ## too long (but at least it completed).


  
##   ## Some tags can only occur once per gene
##   entrezGeneID <- unlist(xpathApply(EGSet, "//Gene-track_geneid", xmlValue))
##   speciesName <- unlist(xpathApply(EGSet, "//Org-ref_taxname", xmlValue))

##   ## But most information is either more complex than that or is stored in a
##   ## more complex way by the XML
##   ## miniDocs are the individual entrez Gene records
## ##   miniDocs <- lapply(getNodeSet(EGSet, "//Entrezgene"), xmlDoc)

## ## ##   ## Sometimes we are lucky and the information we want has an obviously
## ## ##   ## unique tag.
## ## ##   ## Like pubmed Ids
## ##   pmIdPath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_refs/Pub/Pub_pmid/PubMedId"
## ##   pmIds <- lapply(miniDocs, function(x)
## ##                   unlist(xpathApply(x, pmIdPath, xmlValue)))


## ###########################################################################
## ## Lets try to get rid of the minidocs

## ## miniDocs <- lapply(getNodeSet(EGSet, "//Entrezgene"), xmlDoc)
  
##    ##pmIdPath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_refs/Pub/Pub_pmid/PubMedId"
## ##   pmIds <- lapply(miniDocs, function(x)
## ##                   unlist(xpathApply(x, pmIdPath, xmlValue)))

##   ## Can try this way: - no improvement?
## ##   egs <- getNodeSet(EGSet, "//Entrezgene")
## ##   ###pmIds <- xpathApply(xmlDoc(egs[[1]]), "//PubMedId", xmlValue)
## ##   pmIds <- sapply(egs, function(x) unlist(xpathApply(xmlDoc(x), "//PubMedId", xmlValue)))

##   ## Can also try this way: - but for 800 genes in a chunk,
##   ## - at least it tops out at 2 GB (for 800 genes/chunk = better on usage AND
##   ## - speed with smaller chunks, but it means we bug some server more
##   ## - often...
##   ## IT also takes MUCH longer though! (probably because of larger parse space))
##   n <- length(entrezGenes)
##   paths <- sprintf("//Entrezgene[%d]//PubMedId", 1:n)
##   pmids <- lapply(paths, xpathApply, doc=EGSet, fun=xmlValue)
##   pmIds <- lapply(pmids, unlist)

## ## This is frustrating.  Total process looks to take about 6-12 Gigs of ram (and be pretty slow for parsing only PMIDs ~ 1 hour).  I mean the ram usage is WAY down when I stop calling getNodeSet() (or maybe NOT - since it seems to climb more as things progress), but I am seriously tempted to just save out individual XML files to a tempDir, in an initial step, and then parse one at a time.  Worst part is that it seems to eat MORE ram the further along it goes...

##   ## I should really do this.  At least if I am working from files in a dir, I
##   ## can have better control over how the parsing is done and where the time
##   ## is going.

  
  
## ##   ## For GO I should be able to do something like this:
## ##   res <- xpathApply(egs[[1]],
## ##        "//Other-source[//Dbtag_db/text()='GO']//Object-id_id/text()", xmlValue)
## ##   ## checks
## ##   res1 <- xpathApply(egs[[1]], "count(//Dbtag_db[text()='GO'])")
## ##   res2 <- xpathApply(egs[[1]], "//Dbtag_db[text()='GO']/text()")
    



  
## ##   ## But sometimes we have to do some more checking to make sure that what we
## ##   ## are retrieving is in the context of certain kinds of tags, or of tags
## ##   ## that contain certain information.

## ##   ## custom helper to get and process the GO terms
## ##   GOIds <- lapply(miniDocs, getGOInfo)

## ##   ## KEGG Gene IDs are NOT the path IDs.
## ##   otherSourcePath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_source/Other-source"
## ##   dbTagPath <- "/Other-source/Other-source_src/Dbtag/Dbtag_db"
## ##   resIDPath_str <- "/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_str"
## ##   KEGGGeneIds <- lapply(miniDocs, .getSubNodeInfo, type = "KEGG",
## ##                     otherSourcePath = otherSourcePath,
## ##                            dbTagPath = dbTagPath,
## ##                            resIDPath = resIDPath_str)

## ##   KEGGPathIds <- lapply(miniDocs, .getSubNodeInfo, type = "KEGG pathway",
## ##                     otherSourcePath = otherSourcePath,
## ##                            dbTagPath = dbTagPath,
## ##                            resIDPath = resIDPath_str)

## ##   unigeneIds <- lapply(miniDocs, .getSubNodeInfo, type = "UniGene",
## ##                     otherSourcePath = otherSourcePath,
## ##                            dbTagPath = dbTagPath,
## ##                            resIDPath = resIDPath_str)
## ##   resIDPath_id <- "/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_id"
## ##   ## May only want to look for MIM if we are tax ID 9606 (or can be empty here)
## ##   MIMIds <- lapply(miniDocs, .getSubNodeInfo, type = "MIM",
## ##                     otherSourcePath = otherSourcePath,
## ##                            dbTagPath = dbTagPath,
## ##                            resIDPath = resIDPath_id)

## ##   ## Refseqs are in another couple of places (we can merge these later, or I
## ##   ## can write a helper like for GO and merge them as we go.)
## ##   RSOtherSourcePath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_comment/Gene-commentary"
## ##   RSdbTagPath <- "/Gene-commentary/Gene-commentary_heading" 
## ##   RSResIdTagPathRNA <- "/Gene-commentary/Gene-commentary_products/Gene-commentary/Gene-commentary_accession" 
## ##   RSRNAIds <- lapply(miniDocs, .getSubNodeInfo,
## ##     type = "RefSeqs maintained independently of Annotated Genomes",
## ##                     otherSourcePath = RSOtherSourcePath,
## ##                            dbTagPath = RSdbTagPath,
## ##                            resIDPath = RSResIdTagPathRNA)
  
## ##   RSResIdTagPathProt <- "/Gene-commentary/Gene-commentary_products/Gene-commentary/Gene-commentary_products/Gene-commentary/Gene-commentary_accession"   
## ##   RSProtIds <- lapply(miniDocs, .getSubNodeInfo,
## ##     type = "RefSeqs maintained independently of Annotated Genomes",
## ##                     otherSourcePath = RSOtherSourcePath,
## ##                            dbTagPath = RSdbTagPath,
## ##                            resIDPath = RSResIdTagPathProt)

## ##   ## Get official Symbol
## ##   symbolSourcePath <- "/Entrezgene/Entrezgene_properties/Gene-commentary/Gene-commentary_properties/Gene-commentary"
## ##   symbolDbTag <- "/Gene-commentary/Gene-commentary_label"
## ##   symbolresIDPath <- "/Gene-commentary/Gene-commentary_text"
## ##   symbolIds <- lapply(miniDocs, .getSubNodeInfo,
## ##                       type = "Official Symbol",
## ##                     otherSourcePath = symbolSourcePath,
## ##                            dbTagPath = symbolDbTag,
## ##                            resIDPath = symbolresIDPath)
## ##   ## Get official Name
## ##   fullNames <- lapply(miniDocs, .getSubNodeInfo,
## ##                       type = "Official Full Name",
## ##                     otherSourcePath = symbolSourcePath, ##same path as symbols
## ##                            dbTagPath = symbolDbTag,
## ##                            resIDPath = symbolresIDPath)
## ##   ## Get alternate symbols (merge with official ones when making table)
## ##   ## TODO: would be SAFER to do some of the 1:1 stuff like this too
## ##   ## Therefore redo how I get EG, PMID, and species name.
## ##   aliasSourcePath <- "/Entrezgene/Entrezgene_gene/Gene-ref/Gene-ref_syn"
## ##   aliasDbTag <- "/Gene-ref_syn/Gene-ref_syn_E"
## ##   aliasresIDPath <- "/Gene-ref_syn/Gene-ref_syn_E"
## ##   aliasIds <- lapply(miniDocs, .getSubNodeInfo,
## ##                       type = NULL,
## ##                     otherSourcePath = aliasSourcePath,
## ##                            dbTagPath = aliasDbTag,
## ##                            resIDPath = aliasresIDPath)


  
##   ## Data sanity checks:
##   ## All genes should be from the same critter:
##   ## TODO: move the checks on EG uniqueness to outside of this function
##   if(length(unique(entrezGeneID)) != length(entrezGeneID))
##      stop("Some of the entrez gene IDs have been repeated.")
##   if(length(unique(speciesName))>1)
##     stop("The IDs being processed need to all be from the same species.")
     
##   ## The following checks can stay at this level though
##   if(unique(entrezGeneID %in% entrezGenes) %in% FALSE) ##if any don't match
##     stop("The entrez Genes discovered don't match the IDs being looked up!")
##   if(length(entrezGenes) > length(entrezGeneID))
##     warning("Some of the entrez Genes beings sought were not found.")
##   if(length(entrezGeneID) > length(entrezGenes))
##     stop("There are more EGs being found than we expected.")
  
##   ## return a list of things
##   list(entrez = entrezGeneID,
##        species = speciesName,
##        pmIds = pmIds##, 
## ##       GOIds = GOIds##,  
## ##        KEGGGeneIds = KEGGGeneIds,
## ##        KEGGPathIds = KEGGPathIds,
## ##        aliasIds = aliasIds, 
## ##        symbolIds = symbolIds,
## ##        fullNames = fullNames,
## ##        RSProtIds = RSProtIds,
## ##        RSRNAIds = RSRNAIds,
## ##        MIMIds = MIMIds,
## ##        unigeneIds = unigeneIds
       
##        )
  
}




## NCBI got back to me
##An example approach is given below:

## 1) esearch with taxid in the following format:
## http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term=txid9606%5Borgn%5D+AND+alive%5Bprop%5D&usehistory=y

## 2) parse out the WebEnv and QueryKey value:
## http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&WebEnv=NCID_1_39931064_130.14.18.47_9001_1290210718_1522935737&query_key=1&rettype=uilist&retmode=text

## Regards,

## Tao Tao, PhD
## NCBI User Services


## get EGs from an NCBI tax ID
getEntrezGenesFromTaxId <- function(taxId){
  ## 1st retrieve the WebEnv and QueryKey values
  url1 <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term=txid",taxId,"%5Borgn%5D+AND+alive%5Bprop%5D&usehistory=y", sep="")
   
  ## NOW we have to parse the available XML
  XML <- xmlParse(getURL(url1, followlocation=TRUE))
  ## Some tags can only occur once per gene
  ## TODO: wire up the xpath for this
  webEnv <- unlist(xpathApply(XML, "//WebEnv", xmlValue))
  queryKey <- unlist(xpathApply(XML, "//QueryKey", xmlValue))

  ## Then assemble the final URL
  url2 <- paste(
     "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&WebEnv=",
     webEnv, "&query_key=", queryKey, "&rettype=uilist&retmode=text", sep="")
  readLines(url2)
}




## Then we need to combine the sub-lists found withing our "super"-list
.mergeLists <- function(listOfLists){
  ## mergeLists just takes two lists at a time, and merges them by
  ## concatenating their contents.
  list1 <- listOfLists[[1]]
  result <- vector("list", length(list1))
  for(i in seq_len(length(list1))){
    for(j in seq_len(length(listOfLists))){
      result[[i]] <- c(result[[i]], listOfLists[[j]][[i]])
    }
  }
  ## keep the names
  names(result) = names(list1)
  result
}

## General helper for merging two simple lists
.combineTwoLists <- function(list1,list2){
  if(length(list1) != length(list2)){
    stop("lists must be equal length to be merged")}
  result <- vector("list",length(list1))
  for(i in seq_len(length(list1))){
    if(is.null(list1[[i]]) && is.null(list2[[i]])){
      result[[i]] <- NA
    }else{
      result[[i]] <- c(list1[[i]], list2[[i]])
    }
  }
  result
}


## conversion Utility:
.convertNullToNA <- function(list){
  ind <- unlist(lapply(list, is.null))
  list[ind] <- NA
  list
}

## this just makes a nice named data.frame from your data lists
.makeSimpleDF <- function(entrez, fieldVals, name){
  if(length(entrez) != length(fieldVals))
    stop("To make data.frame from list + vector, both must be equal len.")
  names(fieldVals) <- entrez
  fieldVals <- .convertNullToNA(fieldVals)
  expandedList <- unlist2(fieldVals)
  result <- data.frame(names(expandedList), expandedList,
                       stringsAsFactors=FALSE) 
  colnames(result) <- c("gene_id", name)
  ## now I have to remove any rows with NA values across the whole row.
  result <- result[!is.na(result[,2]),]
  row.names(result) <- NULL ##1:dim(result)[1]
  result
}


## used to collapse GO lists to a data frame
.unwindGOs <- function(GOIds, entrez, type){
  res <- vector("list",length(GOIds))
  for(i in seq_len(length(GOIds))){
    for(j in seq_len(length(GOIds[[i]]))){
      if( is.null(unlist2(GOIds[[i]][[j]][[type]])) ){
        res[[i]] <- NA
      }else{
        res[[i]] <- c(res[[i]], unlist2(GOIds[[i]][[j]][[type]]))
      }
    }
  }
  names(res) <- entrez
  res <- .convertNullToNA(res)
  res
}


## used to make the 6 custom GO tables
.makeUnWoundGOTables <- function(entrez, GOIds, con){
  ## GOIds is a list of equal length to the entrez IDs
  if(length(entrez) != length(GOIds)){
    stop("There must be a list of GOIds")}  
  uw_gos <- .unwindGOs(GOIds, entrez, type=1)
  if(length(uw_gos[is.na(uw_gos)]) != length(uw_gos)){
    go_id <- unlist2(uw_gos)
    evidence <- unlist2(.unwindGOs(GOIds, entrez, type=2))
    ontology <- Ontology(go_id) ## This step REQUIRES that there be GO IDs
    baseFrame <- cbind(gene_id = names(go_id), go_id=go_id,
                       evidence=evidence, ontology=ontology)
    bp <- data.frame(matrix(split(baseFrame, ontology)$BP,
                            byrow=FALSE,
                            nrow=length(grep("BP",ontology))))[,1:3]
    mf <- data.frame(matrix(split(baseFrame, ontology)$MF,
                            byrow=FALSE,
                            nrow=length(grep("MF",ontology))))[,1:3]
    cc <- data.frame(matrix(split(baseFrame, ontology)$CC,
                            byrow=FALSE,
                            nrow=length(grep("CC",ontology))))[,1:3]
    headerNames = c("gene_id","go_id","evidence")
    names(bp) <- headerNames
    names(mf) <- headerNames
    names(cc) <- headerNames
    
    .makeSimpleTable(bp, table = "go_bp", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    
    .makeSimpleTable(mf, table = "go_mf", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    
    .makeSimpleTable(cc, table = "go_cc", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    
    ## Now expand the three data.frames to incl all ancestor terms 
    bp_all <- .expandGOFrame(bp, GO.db::GOBPANCESTOR)
    mf_all <- .expandGOFrame(mf, GO.db::GOMFANCESTOR)
    cc_all <- .expandGOFrame(cc, GO.db::GOCCANCESTOR)
    
    .makeSimpleTable(bp_all, table = "go_bp_all", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    
    .makeSimpleTable(mf_all, table = "go_mf_all", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    
    .makeSimpleTable(cc_all, table = "go_cc_all", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
  }else{
    ## as with other tables, if we have nothing to populate, then we don't
    ## want to even make a table!
    warning(paste("no values found for any GO tables",
                  " in this data chunk.", sep=""))
    return() 
  }
}


## TODO: flesh out the following:

#.makeMetaTables <- function(){}


.makeOrgDB <- function(sList, con){
  .makeCentralTable(sList$entrez, con)
  ## gene_info table is special
  ## I need to make the data.frame and do some minor filtering.
##   gene_infoData <- data.frame(
##     gene_id = sList$entrez,
##     gene_name = unlist(.convertNullToNA(as.list(sList$fullNames))),
##     symbol = unlist(.convertNullToNA(as.list(sList$symbolIds))))
##   gene_infoData <- ## still have to remove lines with no data! 
##     gene_infoData[!is.na(gene_infoData[,2]) & !is.na(gene_infoData[,2]),]
##   .makeSimpleTable(gene_infoData,
##                    table = "gene_info", con, fieldNameLens=c(255,80),
##                    indFields = character())
  
  .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
                                 fieldVals = sList$pmIds, "pubmed_id"),
                   table = "pubmed", con)
##   .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
##                                  fieldVals = .combineTwoLists(sList$alias,
##                                    sList$symbol), "alias_symbol"),
##                    table = "alias", con)
##   .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
##                                  fieldVals = sList$KEGGPathIds, "path_id"),
##                    table = "kegg", con)
##   .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
##                                  fieldVals = .combineTwoLists(sList$RSProtIds,
##                                    sList$RSRNAIds), "accession"),
##                    table = "refseq", con)
##   .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
##                                  fieldVals = sList$MIMIds, "omim_id"),
##                    table = "omim", con)
##   .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
##                                  fieldVals = sList$unigeneIds, "unigene_id"),
##                    table = "unigene", con)
##   ## GO tables are special
##   .makeUnWoundGOTables(entrez = sList$entrez, GOIds = sList$GOIds, con)
}

getDataAndAddToDb <- function(EGChunk, con){      
    list <- getGeneStuff(EGChunk)
#    .makeOrgDB(list, con)
    print(gc())
}

## Wrap the functionality like so:
buildEntrezGeneDbFromWebServices <- function(entrezGenes, file="test.sqlite"){
  EGs <- entrezGenes
  chunkSize <- 400  ## 800 is the max here but is probably NOT optimal
  ## TODO: check if there is a file and if so just remove it.
  ## file.remove(file) ##remove the old file when they re-run the code?
  con <- dbConnect(SQLite(), file)

  ## Then break it into chunks
  if(length(EGs)<chunkSize){
    sList <- getGeneStuff(EGs)
    .makeOrgDB(sList, con)
  }else{
    numChunks <- length(EGs) %/% chunkSize
    remChunks <- length(EGs) %% chunkSize
    splitFactor <- rep(seq_len(numChunks), each=chunkSize)
    splitFactor <- c(splitFactor, rep(numChunks+1, each=remChunks))
    EGChunks <- split(EGs, splitFactor)    
    ## Then we just need to apply through and make a super-list
##     superListOfLists <- lapply(EGChunks, getGeneStuff)
##     sList <- .mergeLists(superListOfLists)
##     .makeOrgDB(sList, con)
    ##lapply(EGChunks, getDataAndAddToDb, con)
    for(chunk in EGChunks){
      getDataAndAddToDb(chunk, con)
    }
  }
}





##############################################################################
## More TODO:
##
## 1) Wrap this so that we can 1) get all the EGs, then retrieve their results
## serially into one big super-list for import into a DB.
##
## 2) Some checking will have to be done as we add contents that are matched
## (thinking of the GO terms here) to the DB.  Specifically, each GO ID needs
## to have an evidence code and I know from making this work, that some of
## them will NOT have that.  Therefore, I have to check as they are formatted
## and put into the DB.

Try the AnnotationForge package in your browser

Any scripts or data that you put into this service are public.

AnnotationForge documentation built on May 2, 2018, 3:19 a.m.