R/create_kegg_db.R

Defines functions download.organisms.KEGG get_path2name prepare_pkg_skeleton create_kegg_db

Documented in create_kegg_db

##' create KEGG.db package
##'
##' 
##' @title create_kegg_db
##' @param species one of KEGG supported species, e.g. hsa for human
##' @return KEGG.db package generated in working directory
##' @export
##' @author Guangchuang Yu and Ziru Chen
create_kegg_db <- function(species) {
    packagedir <- tempfile() # tempdir() maynot empty

    ## skeleton
    prepare_pkg_skeleton(packagedir)

    ## sqlite
    sqlite_path <- paste(packagedir, "inst", "extdata", sep=.Platform$file.sep)
    prepare_kegg_db(species, sqlite_path)

    ## build pkg
    pkgbuild::build(packagedir, dest_path = ".")
}


prepare_pkg_skeleton <- function(packagedir) {
    .fcp <- function(..., todir, file) {
        file.copy(from = system.file("KEGG.db", ..., file, package = "createKEGGdb"),
                  to = paste(todir, file, sep = .Platform$file.sep))
    }

    if(!dir.exists(packagedir)) {
        dir.create(packagedir)
    }

    ## to store sqlite
    sqlite_path <- paste(packagedir, "inst", "extdata", sep=.Platform$file.sep)
    if(!dir.exists(sqlite_path)){
        dir.create(sqlite_path,recursive = TRUE)
    }

    R_src <- paste(packagedir, "R", sep=.Platform$file.sep)
    if(!dir.exists(R_src)){
        dir.create(R_src,recursive = TRUE)
    }

    .fcp("R", todir = R_src, file = "zzz.R")
    .fcp(todir = packagedir, file = "DESCRIPTION")
    .fcp(todir = packagedir, file = "LICENSE")
    .fcp(todir = packagedir, file = "NAMESPACE")
}


##' @importFrom magrittr %<>%
get_path2name <- function(species){
  if (length(species) == 1) {
    keggpathid2name.df <- clusterProfiler:::kegg_list("pathway", species)
  } else {
    keggpathid2name.list <- vector("list", length(species))
    names(keggpathid2name.list) <- species
    for (i in species) {
      keggpathid2name.list[[i]] <- clusterProfiler:::kegg_list("pathway", i)    
    }
    keggpathid2name.df <- do.call(rbind, keggpathid2name.list)
    rownames(keggpathid2name.df) <- NULL
  }
  keggpathid2name.df[,2] <- sub("\\s-\\s[a-zA-Z ]+\\(\\w+\\)$", "", keggpathid2name.df[,2])
  # keggpathid2name.df[,1] %<>% gsub("path:map", "", .)
  colnames(keggpathid2name.df) <- c("path_id","path_name")
  return(keggpathid2name.df)
}


##' @importFrom magrittr %<>%
download.organisms.KEGG <- function(organism) {
  keggpathid2extid.df <- clusterProfiler:::kegg_link(organism, "pathway")
  if (is.null(keggpathid2extid.df)){
    write(paste(Sys.time(),"Pathway data of",organism,"is null."), stderr())
  }else{
    message(paste0(Sys.time()," Getting KEGG data of ",organism,"."))
    keggpathid2extid.df[,1] %<>% gsub("[^:]+:", "", .)
    keggpathid2extid.df[,2] %<>% gsub("[^:]+:", "", .)
    colnames(keggpathid2extid.df) <- c("pathway_id","gene_or_orf_id")
    message(paste(Sys.time(),"KEGG data of",organism,"has been downloaded."))
  }
  return(keggpathid2extid.df)
}


get_organisms_list <- function(db){
  organisms <- clusterProfiler:::kegg_list(db)
  organisms_list <- as.character(organisms[,2])
  return(organisms_list)
}


##' @importFrom clusterProfiler download_KEGG
##' @importFrom RSQLite dbDriver
##' @importFrom RSQLite dbConnect
##' @importFrom RSQLite dbWriteTable
##' @importFrom RSQLite dbDisconnect
prepare_kegg_db <- function(organisms, sqlite_path) {
  dbfile <- file.path(sqlite_path, "KEGG.sqlite")
  unlink(dbfile)
  ###################################################
  ### create database
  ###################################################
  drv <- dbDriver("SQLite")
  db <- dbConnect(drv, dbname=dbfile)

  KEGGPATHID2NAME <- get_path2name(organisms)
  ###################################################
  ### put the pathway2name data into the tables 
  ###################################################
  dbWriteTable(conn = db, "pathway2name", KEGGPATHID2NAME, row.names=FALSE)
  
  if (length(organisms) == 1){
    if(organisms == "all"){
      organisms <- get_organisms_list("organism")
    }
  }
  for(organism in organisms){
    KEGGPATHID2EXTID <- download.organisms.KEGG(organism)
    if(!is.null(KEGGPATHID2EXTID)){
      ###################################################
      ### put the pathway2gene data into the tables 
      ###################################################
      # 数据是直接添加进去,不会自动去重
      dbWriteTable(conn = db, "pathway2gene", KEGGPATHID2EXTID, row.names=FALSE,append = TRUE)
      message(paste(Sys.time(),"KEGG data of",organism,"has been added to the sqlite database."))
      
    }
  }
  
  ###################################################
  ### append the metadata
  ###################################################
  metadata <- rbind(c("PATHNAMESOURCENAME", "KEGG PATHWAY"),
                    c("PATHNAMESOURCEURL", "ftp://ftp.genome.jp/pub/kegg/pathway"),
                    c("PATHNAMESOURCEDATE", format(Sys.Date(), "%Y%m%d")),
                    c("KEGGSOURCENAME", "KEGG GENOME"),
                    c("KEGGSOURCEURL", "ftp://ftp.genome.jp/pub/kegg/genomes"),
                    c("KEGGSOURCEDATE", format(Sys.Date(), "%Y%m%d")),
                    c("GOEXTSOURCEDATE", "2015-Sepec2go27"),
                    c("GOEXTSOURCENAME", "Gene Ontology External Link"),
                    c("GOEXTSOURCEURL", "http://www.geneontology.org/external2go"),
                    c("Db type", "KEGGDB"),
                    c("DBSCHEMA", "KEGG_DB"),
                    c("DBSCHEMAVERSION", "2.1"))
  
  metadata <- as.data.frame(metadata)
  colnames(metadata) <- c("name", "value") #makeAnnDbPkg规定的
  dbWriteTable(conn = db, "metadata", metadata, row.names=FALSE)
  
  map.counts <- rbind(c("pathway2name", nrow(KEGGPATHID2NAME)),
                      c("pathway2gene", nrow(KEGGPATHID2EXTID)))
  map.counts <- as.data.frame(map.counts)
  colnames(map.counts) <- c("map_name","count")
  dbWriteTable(conn = db, "map_counts", map.counts, row.names=FALSE)

  map.metadata <- rbind(c("ENZYMEID2GO","Gene Ontology External Link","http://www.geneontology.org/external2go","2015-Sepec2go27"),
                        c("GO2ENZYMEID","Gene Ontology External Link","http://www.geneontology.org/external2go","2015-Sepec2go27"),
                        c("EXTID2PATHID","KEGG GENOME","ftp://ftp.genome.jp/pub/kegg/genomes","2011-Mar15"),
                        c("PATHID2EXTID","KEGG GENOME","ftp://ftp.genome.jp/pub/kegg/genomes","2011-Mar15"),
                        c("PATHNAME2ID","KEGG PATHWAY","ftp://ftp.genome.jp/pub/kegg/pathway",format(Sys.Date(),"%Y%m%d")),
                        c("PATHID2NAME","KEGG PATHWAY","ftp://ftp.genome.jp/pub/kegg/pathway",format(Sys.Date(),"%Y%m%d")))
  map.metadata <- as.data.frame(map.metadata)
  colnames(map.metadata) <- c("map_name","source_name","source_url","source_date")
  dbWriteTable(conn = db, "map_metadata", map.metadata, row.names=FALSE)
  
  dbDisconnect(db)
  invisible(dbfile)
}


utils::globalVariables(".")
YuLab-SMU/createKEGGdb documentation built on March 15, 2023, 3:48 p.m.