R/NCBI_getters.R


## This file is for code that calls NCBI and then retrieves data to populate a
## local DB (eventually on the fly).
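
## Dependencies (a rough note, not a formal import list): the code below
## assumes the XML package (xmlParse, xmlDoc, getNodeSet, xpathApply,
## xmlValue, saveXML, free), RCurl (getURL), RSQLite/DBI (dbConnect, SQLite),
## AnnotationDbi (unlist2, Ontology) and GO.db (the GO*ANCESTOR maps) are
## available, e.g. via the package NAMESPACE.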

## I need to have functions that will make tables in a DB, but I also need
## (separate) functions that get the "stuff" from NCBI.  Let's start with
## that.

## Let's start by just making this thing get me some PMIDs for a string of EGs

## getNucleotideStuff <- function(x, ret="acc"){
##   baseUrl <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
##   xsep <- paste(x, collapse=",")
##   url <- paste(baseUrl,"db=nucleotide&id=",xsep,"&rettype=",ret, sep="")
##   readLines(url)
## }

## egs = c("10","4557270")
## getNucleotideStuff(egs)
## getNucleotideStuff(egs, "gb")
## getNucleotideStuff(egs, "gp")
## getNucleotideStuff(egs, "seqid")
## getNucleotideStuff(egs, "ft")



## Another way to approach this is something like this:
## str(sapply(EG, function(elt) unlist(xpathApply(xmlDoc(elt), "//PubMedId", xmlValue))))

## where EG is a node set of //Entrezgene elements and EGSet is the whole
## parsed doc, i.e.
## EGSet = xmlParse(url)
## for (i in 1:n) print(xpathApply(EGSet, sprintf("count(//Entrezgene[%d]//PubMedId)", i)))
## for (i in 1:n) print(xpathApply(EGSet, sprintf("//Entrezgene[%d]//PubMedId/text()", i), xmlValue))

## or
## make queries
## xpq = sprintf("//Entrezgene[%d]//PubMedId", 1:n)
## lapply(xpq, xpathApply, doc=EGSet, fun=xmlValue)

## Other notes from a discussion of XPath with Martin on how to use XPath to
## retrieve more from a node without allocating a new node...
## library(XML)
## url <- sprintf("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=%s&retmode=xml",
##                "1,10,100")
## eg <- xmlParse(url)
## sets <- getNodeSet(eg, "//Entrezgene")
## res <-
##     xpathApply(sets[[1]],
## "//Other-source[//Dbtag_db/text()='GO']//Object-id_id/text()", xmlValue)

## xpathApply(eg, "count(//Dbtag_db[text()='GO'])")
## xpathApply(eg, "//Dbtag_db[text()='GO']/text()")



#####################################################################
## helper functions for retrieving data from nodes 

## The following is a generic solution for getting stuff from sub-nodes
## Generic way to get src Docs

.getOtherSrcDocs <- function(doc, otherSourcePath){
  ## for each doc, get other Source nodes and make mini-docs
  srcDocs <- lapply(getNodeSet(doc, otherSourcePath), xmlDoc)
  srcDocs
}

## retrieve the Db tag from such an other-source doc.
.getDBTag <- function(srcDoc, dbTagPath){
  dbTag <- unlist(xpathApply(srcDoc, dbTagPath, xmlValue))
  ## return a sentinel string (it will never match a real db tag) rather
  ## than NULL, so callers can safely compare the result with the type
  if(is.null(dbTag)) dbTag <- "Bummer, the dbTag is NULL."
  dbTag
}


## check each sub-node and return the value requested
.checkRetrieveNode <- function(doc, dbTagPath, resIDPath, type){
  results <- vector("list", length(dbTagPath))
  for(i in seq_along(dbTagPath)){
    ## when type is NULL there is no tag value to check against, so we just
    ## require that the tag path and the result path coincide
    if((is.null(type) && identical(dbTagPath[i], resIDPath[i]))
       || (!is.null(type) && identical(.getDBTag(doc, dbTagPath[i]), type))){
      results[[i]] <- unlist(xpathApply(doc, resIDPath[i], xmlValue))
    }
  }
  results
}



## default values just get you KEGG gene IDs (for now)
.getSubNodeInfo <- function(doc,
                           type= "KEGG",
                           otherSourcePath = "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_source/Other-source",
                           dbTagPath = "/Other-source/Other-source_src/Dbtag/Dbtag_db",
                           resIDPath = "/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_str"){
  ## Get sub-nodes for looped parsing
  srcDocs <- .getOtherSrcDocs(doc, otherSourcePath)

  ## check to make sure we don't have unequal arguments
  if(length(dbTagPath) != length(resIDPath))
    stop("result and dbTag (checking) paths must have same number of elements")

  if(length(dbTagPath) == 1){
    results <- unlist(lapply(srcDocs, .checkRetrieveNode,
                             dbTagPath, resIDPath, type))
  }else{  ## a compound result is expected, so don't simplify it
    results <- lapply(srcDocs, .checkRetrieveNode,
                      dbTagPath, resIDPath, type)
  }
  results
}
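
## Example usage (a sketch; "files/1.xml" is a hypothetical saved record,
## i.e. one of the per-gene files written out by getGeneStuff() below):
## doc <- xmlParse("files/1.xml")       ## a single-gene Entrezgene record
## keggGeneIds <- .getSubNodeInfo(doc)  ## defaults retrieve KEGG gene IDs
## unigeneIds  <- .getSubNodeInfo(doc, type = "UniGene")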




## GO attempt #2 (still fails, since there are apparently some GO IDs for
## which no GO evidence code is available, so I need to fully vectorize the
## generic code).
## A convenience function to pad GO IDs
.padGOIds <- function(GOIds){
  ## NCBI returns the numeric part of the GO ID; left-pad it with zeros to
  ## the canonical 7 digits and add the "GO:" prefix
  sprintf("GO:%07d", as.integer(GOIds))
}

.modifyGOs <- function(elem){
  elem[[1]] <- .padGOIds(elem[[1]])               ## pad the GO IDs
  elem[[2]] <- gsub("evidence: ", "", elem[[2]])  ## strip prefix from evidence codes
  elem
}

## for GO retrieval I have partially vectorized the helper functions.
getGOInfo <- function(doc){
  
  GOIds <- .getSubNodeInfo(doc, type = "GO",
                    otherSourcePath = "/Entrezgene/Entrezgene_properties/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_source/Other-source",
                           dbTagPath = rep("/Other-source/Other-source_src/Dbtag/Dbtag_db", 2),
                           resIDPath = c("/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_id", "/Other-source/Other-source_post-text"))

  GOIds <- lapply(GOIds, .modifyGOs)  
  GOIds
}
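
## Example (a sketch on the same kind of hypothetical single-gene doc): each
## element of the result corresponds to one Other-source node and holds the
## padded GO IDs plus their (prefix-stripped) evidence codes, e.g.
## getGOInfo(xmlParse("files/1.xml"))
## ## -> list(list("GO:0005515", "IPI"), ...)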

getGeneStuff <- function(entrezGenes, dir = "files"){
  baseUrl <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
  xsep <- paste(entrezGenes, collapse=",")
  url <- paste(baseUrl,"db=gene&id=",xsep,"&retmode=xml", sep="")

  ## NOW we have to parse the available XML
  EGSet <- xmlParse(getURL(url, followlocation=TRUE))

  ## Here we will save the files out to a dir (one <EG>.xml per gene)
  miniDocs <- lapply(getNodeSet(EGSet, "//Entrezgene"), xmlDoc)
  path <- file.path(getwd(), dir)
  if(!file.exists(path)){ dir.create(path) }
  ## saveXML() is given absolute paths, so no setwd() dance is needed
  paths <- file.path(path, paste(entrezGenes, ".xml", sep=""))
  for(i in seq_along(miniDocs)){
    saveXML(miniDocs[[i]], file = paths[[i]])
  }

  ## rm() alone is mostly a token gesture for these C-level documents;
  ## XML::free() actually releases the internal representation
  lapply(miniDocs, free)
  rm(miniDocs)
  ## even just doing this (saving the files) takes nearly 30 gigs of RAM before
  ## too long (but at least it completed).


  
##   ## Some tags can only occur once per gene
##   entrezGeneID <- unlist(xpathApply(EGSet, "//Gene-track_geneid", xmlValue))
##   speciesName <- unlist(xpathApply(EGSet, "//Org-ref_taxname", xmlValue))

##   ## But most information is either more complex than that or is stored in a
##   ## more complex way by the XML
##   ## miniDocs are the individual entrez Gene records
## ##   miniDocs <- lapply(getNodeSet(EGSet, "//Entrezgene"), xmlDoc)

## ## ##   ## Sometimes we are lucky and the information we want has an obviously
## ## ##   ## unique tag.
## ## ##   ## Like pubmed Ids
## ##   pmIdPath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_refs/Pub/Pub_pmid/PubMedId"
## ##   pmIds <- lapply(miniDocs, function(x)
## ##                   unlist(xpathApply(x, pmIdPath, xmlValue)))


## ###########################################################################
## ## Lets try to get rid of the minidocs

## ## miniDocs <- lapply(getNodeSet(EGSet, "//Entrezgene"), xmlDoc)
  
##    ##pmIdPath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_refs/Pub/Pub_pmid/PubMedId"
## ##   pmIds <- lapply(miniDocs, function(x)
## ##                   unlist(xpathApply(x, pmIdPath, xmlValue)))

##   ## Can try this way: - no improvement?
## ##   egs <- getNodeSet(EGSet, "//Entrezgene")
## ##   ###pmIds <- xpathApply(xmlDoc(egs[[1]]), "//PubMedId", xmlValue)
## ##   pmIds <- sapply(egs, function(x) unlist(xpathApply(xmlDoc(x), "//PubMedId", xmlValue)))

##   ## Can also try this way: - but for 800 genes in a chunk,
##   ## - at least it tops out at 2 GB (for 800 genes/chunk = better on usage AND
##   ## - speed with smaller chunks, but it means we bug some server more
##   ## - often...
##   ## IT also takes MUCH longer though! (probably because of larger parse space))
##   n <- length(entrezGenes)
##   paths <- sprintf("//Entrezgene[%d]//PubMedId", 1:n)
##   pmids <- lapply(paths, xpathApply, doc=EGSet, fun=xmlValue)
##   pmIds <- lapply(pmids, unlist)

## ## This is frustrating.  Total process looks to take about 6-12 Gigs of
## ## RAM (and be pretty slow for parsing only PMIDs ~ 1 hour).  I mean the
## ## RAM usage is WAY down when I stop calling getNodeSet() (or maybe NOT -
## ## since it seems to climb more as things progress), but I am seriously
## ## tempted to just save out individual XML files to a tempDir, in an
## ## initial step, and then parse one at a time.  Worst part is that it
## ## seems to eat MORE RAM the further along it goes...

##   ## I should really do this.  At least if I am working from files in a dir, I
##   ## can have better control over how the parsing is done and where the time
##   ## is going.

  
  
## ##   ## For GO I should be able to do something like this:
## ##   res <- xpathApply(egs[[1]],
## ##        "//Other-source[//Dbtag_db/text()='GO']//Object-id_id/text()", xmlValue)
## ##   ## checks
## ##   res1 <- xpathApply(egs[[1]], "count(//Dbtag_db[text()='GO'])")
## ##   res2 <- xpathApply(egs[[1]], "//Dbtag_db[text()='GO']/text()")
    



  
## ##   ## But sometimes we have to do some more checking to make sure that what we
## ##   ## are retrieving is in the context of certain kinds of tags, or of tags
## ##   ## that contain certain information.

## ##   ## custom helper to get and process the GO terms
## ##   GOIds <- lapply(miniDocs, getGOInfo)

## ##   ## KEGG Gene IDs are NOT the path IDs.
## ##   otherSourcePath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_comment/Gene-commentary/Gene-commentary_source/Other-source"
## ##   dbTagPath <- "/Other-source/Other-source_src/Dbtag/Dbtag_db"
## ##   resIDPath_str <- "/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_str"
## ##   KEGGGeneIds <- lapply(miniDocs, .getSubNodeInfo, type = "KEGG",
## ##                     otherSourcePath = otherSourcePath,
## ##                            dbTagPath = dbTagPath,
## ##                            resIDPath = resIDPath_str)

## ##   KEGGPathIds <- lapply(miniDocs, .getSubNodeInfo, type = "KEGG pathway",
## ##                     otherSourcePath = otherSourcePath,
## ##                            dbTagPath = dbTagPath,
## ##                            resIDPath = resIDPath_str)

## ##   unigeneIds <- lapply(miniDocs, .getSubNodeInfo, type = "UniGene",
## ##                     otherSourcePath = otherSourcePath,
## ##                            dbTagPath = dbTagPath,
## ##                            resIDPath = resIDPath_str)
## ##   resIDPath_id <- "/Other-source/Other-source_src/Dbtag/Dbtag_tag/Object-id/Object-id_id"
## ##   ## May only want to look for MIM if we are tax ID 9606 (or can be empty here)
## ##   MIMIds <- lapply(miniDocs, .getSubNodeInfo, type = "MIM",
## ##                     otherSourcePath = otherSourcePath,
## ##                            dbTagPath = dbTagPath,
## ##                            resIDPath = resIDPath_id)

## ##   ## Refseqs are in another couple of places (we can merge these later, or I
## ##   ## can write a helper like for GO and merge them as we go.)
## ##   RSOtherSourcePath <- "/Entrezgene/Entrezgene_comments/Gene-commentary/Gene-commentary_comment/Gene-commentary"
## ##   RSdbTagPath <- "/Gene-commentary/Gene-commentary_heading" 
## ##   RSResIdTagPathRNA <- "/Gene-commentary/Gene-commentary_products/Gene-commentary/Gene-commentary_accession" 
## ##   RSRNAIds <- lapply(miniDocs, .getSubNodeInfo,
## ##     type = "RefSeqs maintained independently of Annotated Genomes",
## ##                     otherSourcePath = RSOtherSourcePath,
## ##                            dbTagPath = RSdbTagPath,
## ##                            resIDPath = RSResIdTagPathRNA)
  
## ##   RSResIdTagPathProt <- "/Gene-commentary/Gene-commentary_products/Gene-commentary/Gene-commentary_products/Gene-commentary/Gene-commentary_accession"   
## ##   RSProtIds <- lapply(miniDocs, .getSubNodeInfo,
## ##     type = "RefSeqs maintained independently of Annotated Genomes",
## ##                     otherSourcePath = RSOtherSourcePath,
## ##                            dbTagPath = RSdbTagPath,
## ##                            resIDPath = RSResIdTagPathProt)

## ##   ## Get official Symbol
## ##   symbolSourcePath <- "/Entrezgene/Entrezgene_properties/Gene-commentary/Gene-commentary_properties/Gene-commentary"
## ##   symbolDbTag <- "/Gene-commentary/Gene-commentary_label"
## ##   symbolresIDPath <- "/Gene-commentary/Gene-commentary_text"
## ##   symbolIds <- lapply(miniDocs, .getSubNodeInfo,
## ##                       type = "Official Symbol",
## ##                     otherSourcePath = symbolSourcePath,
## ##                            dbTagPath = symbolDbTag,
## ##                            resIDPath = symbolresIDPath)
## ##   ## Get official Name
## ##   fullNames <- lapply(miniDocs, .getSubNodeInfo,
## ##                       type = "Official Full Name",
## ##                     otherSourcePath = symbolSourcePath, ##same path as symbols
## ##                            dbTagPath = symbolDbTag,
## ##                            resIDPath = symbolresIDPath)
## ##   ## Get alternate symbols (merge with official ones when making table)
## ##   ## TODO: would be SAFER to do some of the 1:1 stuff like this too
## ##   ## Therefore redo how I get EG, PMID, and species name.
## ##   aliasSourcePath <- "/Entrezgene/Entrezgene_gene/Gene-ref/Gene-ref_syn"
## ##   aliasDbTag <- "/Gene-ref_syn/Gene-ref_syn_E"
## ##   aliasresIDPath <- "/Gene-ref_syn/Gene-ref_syn_E"
## ##   aliasIds <- lapply(miniDocs, .getSubNodeInfo,
## ##                       type = NULL,
## ##                     otherSourcePath = aliasSourcePath,
## ##                            dbTagPath = aliasDbTag,
## ##                            resIDPath = aliasresIDPath)


  
##   ## Data sanity checks:
##   ## All genes should be from the same critter:
##   ## TODO: move the checks on EG uniqueness to outside of this function
##   if(length(unique(entrezGeneID)) != length(entrezGeneID))
##      stop("Some of the entrez gene IDs have been repeated.")
##   if(length(unique(speciesName))>1)
##     stop("The IDs being processed need to all be from the same species.")
     
##   ## The following checks can stay at this level though
##   if(any(!(entrezGeneID %in% entrezGenes))) ## if any don't match
##     stop("The entrez Genes discovered don't match the IDs being looked up!")
##   if(length(entrezGenes) > length(entrezGeneID))
##     warning("Some of the entrez Genes being sought were not found.")
##   if(length(entrezGeneID) > length(entrezGenes))
##     stop("There are more EGs being found than we expected.")
  
##   ## return a list of things
##   list(entrez = entrezGeneID,
##        species = speciesName,
##        pmIds = pmIds##, 
## ##       GOIds = GOIds##,  
## ##        KEGGGeneIds = KEGGGeneIds,
## ##        KEGGPathIds = KEGGPathIds,
## ##        aliasIds = aliasIds, 
## ##        symbolIds = symbolIds,
## ##        fullNames = fullNames,
## ##        RSProtIds = RSProtIds,
## ##        RSRNAIds = RSRNAIds,
## ##        MIMIds = MIMIds,
## ##        unigeneIds = unigeneIds
       
##        )
  
}
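
## Example usage (as currently written this just saves one <EG>.xml file per
## gene under ./files; the list-building above is all commented out):
## getGeneStuff(c("1", "10", "100"))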




## NCBI got back to me
## An example approach is given below:

## 1) esearch with taxid in the following format:
## http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term=txid9606%5Borgn%5D+AND+alive%5Bprop%5D&usehistory=y

## 2) parse out the WebEnv and QueryKey values and efetch with them:
## http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&WebEnv=NCID_1_39931064_130.14.18.47_9001_1290210718_1522935737&query_key=1&rettype=uilist&retmode=text

## Regards,

## Tao Tao, PhD
## NCBI User Services


## get EGs from an NCBI tax ID
getEntrezGenesFromTaxId <- function(taxId){
  ## 1st retrieve the WebEnv and QueryKey values
  url1 <- paste("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term=txid",taxId,"%5Borgn%5D+AND+alive%5Bprop%5D&usehistory=y", sep="")
   
  ## NOW we have to parse the available XML
  XML <- xmlParse(getURL(url1, followlocation=TRUE))
  ## WebEnv and QueryKey each occur exactly once in an esearch result
  webEnv <- unlist(xpathApply(XML, "//WebEnv", xmlValue))
  queryKey <- unlist(xpathApply(XML, "//QueryKey", xmlValue))

  ## Then assemble the final URL
  url2 <- paste(
     "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&WebEnv=",
     webEnv, "&query_key=", queryKey, "&rettype=uilist&retmode=text", sep="")
  readLines(url2)
}
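
## e.g. all live human gene IDs (taxonomy ID 9606); the result is a
## character vector with one ID per line of the efetch output:
## humanEGs <- getEntrezGenesFromTaxId("9606")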




## Then we need to combine the sub-lists found within our "super"-list
.mergeLists <- function(listOfLists){
  ## mergeLists just takes two lists at a time, and merges them by
  ## concatenating their contents.
  list1 <- listOfLists[[1]]
  result <- vector("list", length(list1))
  for(i in seq_len(length(list1))){
    for(j in seq_len(length(listOfLists))){
      result[[i]] <- c(result[[i]], listOfLists[[j]][[i]])
    }
  }
  ## keep the names
  names(result) <- names(list1)
  result
}
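
## A tiny sketch of what it does:
## .mergeLists(list(list(a=1:2, b="x"), list(a=3, b="y")))
## ## -> list(a=c(1,2,3), b=c("x","y"))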

## General helper for merging two simple lists
.combineTwoLists <- function(list1, list2){
  if(length(list1) != length(list2)){
    stop("lists must be equal length to be merged")}
  result <- vector("list",length(list1))
  for(i in seq_len(length(list1))){
    if(is.null(list1[[i]]) && is.null(list2[[i]])){
      result[[i]] <- NA
    }else{
      result[[i]] <- c(list1[[i]], list2[[i]])
    }
  }
  result
}


## conversion Utility:
.convertNullToNA <- function(list){
  ind <- unlist(lapply(list, is.null))
  list[ind] <- NA
  list
}
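
## e.g.
## .combineTwoLists(list(NULL, "a"), list(NULL, "b"))  ## list(NA, c("a","b"))
## .convertNullToNA(list("a", NULL, 1:2))              ## list("a", NA, 1:2)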

## this just makes a nice named data.frame from your data lists
.makeSimpleDF <- function(entrez, fieldVals, name){
  if(length(entrez) != length(fieldVals))
    stop("To make data.frame from list + vector, both must be equal len.")
  names(fieldVals) <- entrez
  fieldVals <- .convertNullToNA(fieldVals)
  expandedList <- unlist2(fieldVals)
  result <- data.frame(names(expandedList), expandedList,
                       stringsAsFactors=FALSE) 
  colnames(result) <- c("gene_id", name)
  ## drop any rows where the value column is NA
  result <- result[!is.na(result[,2]),]
  row.names(result) <- NULL
  result
}
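
## A small sketch (the PMIDs are illustrative); gene "100" has no values
## here, so its NA row is dropped:
## .makeSimpleDF(entrez = c("10","100"),
##               fieldVals = list(c("1234567","7654321"), NULL),
##               name = "pubmed_id")
## ##   gene_id pubmed_id
## ## 1      10   1234567
## ## 2      10   7654321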


## used to collapse GO lists to a data frame
.unwindGOs <- function(GOIds, entrez, type){
  res <- vector("list", length(GOIds))
  for(i in seq_along(GOIds)){
    for(j in seq_along(GOIds[[i]])){
      if( is.null(unlist2(GOIds[[i]][[j]][[type]])) ){
        ## append (rather than assign) the NA so that values already
        ## collected for this gene are not clobbered
        res[[i]] <- c(res[[i]], NA)
      }else{
        res[[i]] <- c(res[[i]], unlist2(GOIds[[i]][[j]][[type]]))
      }
    }
  }
  names(res) <- entrez
  res <- .convertNullToNA(res)
  res
}
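
## A sketch of the nested structure it expects (one outer element per gene,
## and one list(GO ids, evidence codes) pair per Other-source node, as built
## by getGOInfo()):
## gos <- list(list(list("GO:0007154", "IEA")),
##             list(list(c("GO:0045202","GO:0007154"), c("IDA","IEA"))))
## .unwindGOs(gos, entrez = c("10","100"), type = 1)
## ## -> $`10` "GO:0007154" ; $`100` "GO:0045202" "GO:0007154"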


## used to make the 6 custom GO tables
.makeUnWoundGOTables <- function(entrez, GOIds, con){
  ## GOIds is a list of equal length to the entrez IDs
  if(length(entrez) != length(GOIds)){
    stop("GOIds must be a list with one element per entrez gene ID")}
  uw_gos <- .unwindGOs(GOIds, entrez, type=1)
  if(!all(is.na(uw_gos))){ ## proceed only if at least one gene had GO data
    go_id <- unlist2(uw_gos)
    evidence <- unlist2(.unwindGOs(GOIds, entrez, type=2))
    ontology <- Ontology(go_id) ## This step REQUIRES that there be GO IDs
    baseFrame <- cbind(gene_id = names(go_id), go_id=go_id,
                       evidence=evidence, ontology=ontology)
    ## split() treats the character matrix as a column-major vector and
    ## recycles `ontology` along it, so split(baseFrame, ontology)$BP picks
    ## out the BP rows column-by-column; matrix() then rebuilds them
    bp <- data.frame(matrix(split(baseFrame, ontology)$BP,
                            byrow=FALSE,
                            nrow=length(grep("BP",ontology))))[,1:3]
    mf <- data.frame(matrix(split(baseFrame, ontology)$MF,
                            byrow=FALSE,
                            nrow=length(grep("MF",ontology))))[,1:3]
    cc <- data.frame(matrix(split(baseFrame, ontology)$CC,
                            byrow=FALSE,
                            nrow=length(grep("CC",ontology))))[,1:3]
    headerNames = c("gene_id","go_id","evidence")
    names(bp) <- headerNames
    names(mf) <- headerNames
    names(cc) <- headerNames
    
    .makeSimpleTable(bp, table = "go_bp", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    
    .makeSimpleTable(mf, table = "go_mf", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    
    .makeSimpleTable(cc, table = "go_cc", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    
    ## Now expand the three data.frames to incl all ancestor terms 
    bp_all <- .expandGOFrame(bp, GO.db::GOBPANCESTOR)
    mf_all <- .expandGOFrame(mf, GO.db::GOMFANCESTOR)
    cc_all <- .expandGOFrame(cc, GO.db::GOCCANCESTOR)
    
    .makeSimpleTable(bp_all, table = "go_bp_all", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    
    .makeSimpleTable(mf_all, table = "go_mf_all", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    
    .makeSimpleTable(cc_all, table = "go_cc_all", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
  }else{
    ## as with other tables, if we have nothing to populate, then we don't
    ## want to even make a table!
    warning(paste("no values found for any GO tables",
                  " in this data chunk.", sep=""))
    return() 
  }
}


## TODO: flesh out the following:

#.makeMetaTables <- function(){}


.makeOrgDB <- function(sList, con){
  .makeCentralTable(sList$entrez, con)
  ## gene_info table is special
  ## I need to make the data.frame and do some minor filtering.
##   gene_infoData <- data.frame(
##     gene_id = sList$entrez,
##     gene_name = unlist(.convertNullToNA(as.list(sList$fullNames))),
##     symbol = unlist(.convertNullToNA(as.list(sList$symbolIds))))
##   gene_infoData <- ## still have to remove lines with no data!
##     gene_infoData[!is.na(gene_infoData[,2]) & !is.na(gene_infoData[,3]),]
##   .makeSimpleTable(gene_infoData,
##                    table = "gene_info", con, fieldNameLens=c(255,80),
##                    indFields = character())
  
  .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
                                 fieldVals = sList$pmIds, "pubmed_id"),
                   table = "pubmed", con)
##   .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
##                                  fieldVals = .combineTwoLists(sList$alias,
##                                    sList$symbol), "alias_symbol"),
##                    table = "alias", con)
##   .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
##                                  fieldVals = sList$KEGGPathIds, "path_id"),
##                    table = "kegg", con)
##   .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
##                                  fieldVals = .combineTwoLists(sList$RSProtIds,
##                                    sList$RSRNAIds), "accession"),
##                    table = "refseq", con)
##   .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
##                                  fieldVals = sList$MIMIds, "omim_id"),
##                    table = "omim", con)
##   .makeSimpleTable(.makeSimpleDF(entrez = sList$entrez,
##                                  fieldVals = sList$unigeneIds, "unigene_id"),
##                    table = "unigene", con)
##   ## GO tables are special
##   .makeUnWoundGOTables(entrez = sList$entrez, GOIds = sList$GOIds, con)
}

getDataAndAddToDb <- function(EGChunk, con){
    sList <- getGeneStuff(EGChunk)  ## don't shadow base::list here
#    .makeOrgDB(sList, con)
    print(gc())  ## report memory usage after each chunk
}

## Wrap the functionality like so:
buildEntrezGeneDbFromWebServices <- function(entrezGenes, file="test.sqlite"){
  EGs <- entrezGenes
  chunkSize <- 400  ## 800 is the max here but is probably NOT optimal
  ## remove any stale DB file so a re-run doesn't append to an old one
  if(file.exists(file)) file.remove(file)
  con <- dbConnect(SQLite(), file)

  ## Then break it into chunks
  if(length(EGs)<chunkSize){
    sList <- getGeneStuff(EGs)
    .makeOrgDB(sList, con)
  }else{
    numChunks <- length(EGs) %/% chunkSize
    remChunks <- length(EGs) %% chunkSize
    splitFactor <- rep(seq_len(numChunks), each=chunkSize)
    splitFactor <- c(splitFactor, rep(numChunks+1, times=remChunks))
    EGChunks <- split(EGs, splitFactor)    
    ## Then we just need to apply through and make a super-list
##     superListOfLists <- lapply(EGChunks, getGeneStuff)
##     sList <- .mergeLists(superListOfLists)
##     .makeOrgDB(sList, con)
    ##lapply(EGChunks, getDataAndAddToDb, con)
    for(chunk in EGChunks){
      getDataAndAddToDb(chunk, con)
    }
  }
}
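
## Example end-to-end usage (a sketch; fetching a whole genome's worth of
## records is slow and memory hungry, per the notes above):
## egs <- getEntrezGenesFromTaxId("9606")
## buildEntrezGeneDbFromWebServices(egs, file = "org.Hs.test.sqlite")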





##############################################################################
## More TODO:
##
## 1) Wrap this so that we can get all the EGs, then retrieve their results
## serially into one big super-list for import into a DB.
##
## 2) Some checking will have to be done as we add contents that are matched
## (thinking of the GO terms here) to the DB.  Specifically, each GO ID needs
## to have an evidence code, and I know from making this work that some of
## them will NOT have one.  Therefore, I have to check as they are formatted
## and put into the DB.