R/kegg_database.R

Defines functions get_kegg_compound

Documented in get_kegg_compound

####KEGG database
#' @title get_kegg_compound
#' @description Get compound from KEGG
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param local Use have downloaded database or from online.
#' @param threads threads
#' @importFrom KEGGREST keggList keggGet
#' @export

get_kegg_compound <-
  function(local = TRUE,
           threads = 3) {
    if (local) {
      data("kegg_compound_database", envir = environment())
      message(
        crayon::yellow(
          "This database is downloaded in",
          kegg_compound_database@database.info$Version
        )
      )
      # cat("\n")
      return(kegg_compound_database)
    } else{
      message(crayon::yellow("It may take a while...\n"))
      compound_ID <-
        KEGGREST::keggList(database = "compound") %>%
        names() %>%
        unique() %>%
        stringr::str_replace_all(., "cpd:", "")
      
      temp_fun = function(x) {
        KEGGREST::keggGet(dbentries = x)[[1]]
      }
      
      kegg_compound_database <-
        pbapply::pblapply(compound_ID, function(x) {
          KEGGREST::keggGet(dbentries = x)[[1]]
        })
      
      future::plan(future::multisession, workers = threads)
      kegg <-
        kegg_compound_database %>%
        furrr::future_map(
          .f = function(x) {
            KEGG.ID = x$ENTRY
            Compound.name = paste(x$NAME, collapse = "{}")
            Formula = x$FORMULA
            if (is.null(x$FORMULA)) {
              Formula = NA
            }
            mz = as.numeric(x$EXACT_MASS)
            if (is.null(x$EXACT_MASS)) {
              mz = NA
            }
            CAS.ID = stringr::str_replace(grep("CAS",
                                               x$DBLINKS,
                                               value = TRUE),
                                          "CAS: ", "") %>%
              stringr::str_trim(side = "both")
            
            PubChem.ID = stringr::str_replace(grep("PubChem",
                                                   x$DBLINKS, value = TRUE),
                                              "PubChem: ", "") %>%
              stringr::str_trim(side = "both")
            
            if (length(CAS.ID) == 0) {
              CAS.ID = NA
            }
            
            if (length(PubChem.ID) == 0) {
              PubChem.ID = NA
            }
            
            data.frame(
              Lab.ID = KEGG.ID,
              Compound.name,
              Formula,
              mz,
              CAS.ID,
              HMDB.ID = NA,
              KEGG.ID,
              PubChem.ID
            )
          },
          .progress = TRUE
        ) %>%
        do.call(rbind, .) %>%
        as.data.frame()
      
      kegg =
        kegg %>%
        dplyr::filter(!is.na(mz)) %>%
        dplyr::mutate(synonym = Compound.name) %>%
        dplyr::mutate(
          RT = NA,
          mz.pos = NA,
          mz.neg = NA,
          Submitter = "KEGG"
        ) %>%
        dplyr::select(
          Lab.ID,
          Compound.name,
          mz,
          RT,
          CAS.ID,
          HMDB.ID,
          KEGG.ID,
          Formula,
          mz.pos,
          mz.neg,
          Submitter,
          dplyr::everything()
        )
      
      kegg$Compound.name =
        kegg$Compound.name %>%
        stringr::str_split(pattern = "\\{\\}") %>%
        purrr::map(function(x) {
          x[1]
        }) %>%
        unlist() %>%
        stringr::str_replace(pattern = ";", "")
      
      openxlsx::write.xlsx(kegg, file = "kegg.xlsx", asTable = TRUE)
      
      keggMS1datbase =
        metid::construct_database(
          path = ".",
          version = as.character(Sys.Date()),
          metabolite.info.name = "kegg.xlsx",
          source = "KEGG",
          link = "https://www.genome.jp/kegg/compound/",
          creater = "metpath",
          email = "shenxt@stanford.edu",
          rt = FALSE,
          threads = threads
        )
      return(keggMS1datbase)
      unlink(x = "kegg.xlsx")
    }
  }



#' @title get_kegg_pathway
#' @description Get pathway from KEGG
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param local Use have downloaded database or from online.
#' @param organism organism
#' @param threads threads
#' @importFrom KEGGREST keggList keggGet
#' @export

get_kegg_pathway <- function(local = TRUE,
                             organism = "hsa",
                             threads = 3) {
  organism = match.arg(organism)
  if (local) {
    if (organism == "hsa") {
      data("kegg_hsa_pathway", envir = environment())
      message(
        crayon::yellow(
          "This database is downloaded in",
          kegg_hsa_pathway@database_info$version
        )
      )
      # cat("\n")
      return(kegg_hsa_pathway)
    }
  } else{
    message(crayon::yellow("It may take a while...\n"))
    organism = match.arg(organism)
    pathway_ID <-
      KEGGREST::keggList(database = "pathway", organism = organism) %>%
      names() %>%
      unique() %>%
      stringr::str_replace_all(., "path:", "")
    
    kegg_hsa_pathway_database <-
      pbapply::pblapply(pathway_ID, function(x) {
        KEGGREST::keggGet(dbentries = x)[[1]]
      })
    
    pathway_id =
      kegg_hsa_pathway_database %>%
      purrr::map(function(x) {
        unname(x$ENTRY)
      }) %>%
      unlist()
    
    pathway_name =
      kegg_hsa_pathway_database %>%
      purrr::map(function(x) {
        unname(x$PATHWAY_MAP)
        # stringr::str_split(pattern = " - ") %>%
        # `[[`(1) %>%
        # `[`(1)
      }) %>%
      unlist()
    
    pathway_name =
      kegg_hsa_pathway_database %>%
      purrr::map(function(x) {
        unname(x$PATHWAY_MAP)
      }) %>%
      unlist()
    
    describtion =
      kegg_hsa_pathway_database %>%
      purrr::map(function(x) {
        unname(x$DESCRIPTION)
      })
    
    pathway_class =
      kegg_hsa_pathway_database %>%
      purrr::map(function(x) {
        unname(x$CLASS)
      })
    
    gene_list =
      kegg_hsa_pathway_database %>%
      purrr::map(function(x) {
        gene = x$GENE
        if (is.null(gene)) {
          return(data.frame())
        }
        data.frame(
          KEGG.ID = gene[seq(1, length(gene) - 1, by = 2)],
          Gene.name = gene[seq(2, length(gene), by = 2)],
          stringsAsFactors = FALSE
        )
      })
    
    compound_list =
      kegg_hsa_pathway_database %>%
      purrr::map(function(x) {
        data.frame(
          KEGG.ID = names(x$COMPOUND),
          Compound.name = x$COMPOUND,
          stringsAsFactors = FALSE
        )
      })
    
    reference_list =
      kegg_hsa_pathway_database %>%
      purrr::map(function(x) {
        purrr::map(
          x$REFERENCE,
          .f = function(y) {
            y = lapply(y, function(z) {
              if (length(z) > 1) {
                paste(z, collapse = "{}")
              } else{
                z
              }
            })
            y = unlist(y)
            if (any(names(y) == "JOURNAL")) {
              names(y)[names(y) == "JOURNAL"] = "JOURNAL1"
              c(y, JOURNAL2 = "")
            }
          }
        ) %>%
          do.call(rbind, .) %>%
          as.data.frame()
      })
    
    related_disease =
      kegg_hsa_pathway_database %>%
      purrr::map(function(x) {
        data.frame(
          Disease.ID = names(x$DISEASE),
          Disease.name = x$DISEASE,
          stringsAsFactors = FALSE
        )
      })
    
    
    related_module =
      kegg_hsa_pathway_database %>%
      purrr::map(function(x) {
        data.frame(
          Module.ID = names(x$MODULE),
          Module.name = x$MODULE,
          stringsAsFactors = FALSE
        )
      })
    
    pathway =
      new(
        Class = "pathway_database",
        database_info = list(source = "KEGG",
                             version = as.character(Sys.Date())),
        pathway_id = pathway_id,
        pathway_name = pathway_name,
        describtion = describtion,
        pathway_class = pathway_class,
        gene_list = gene_list,
        compound_list = compound_list,
        protein_list = list(),
        reference_list = reference_list,
        related_disease = related_disease,
        related_module = related_module
      )
    
    if (length(pathway@gene_list) == 0) {
      pathway@gene_list = vector(mode = "list",
                                 length = length(pathway@pathway_id)) %>%
        purrr::map(function(x) {
          x = data.frame()
          x
        })
    }
    
    if (length(pathway@compound_list) == 0) {
      pathway@compound_list = vector(mode = "list",
                                     length = length(pathway@pathway_id)) %>%
        purrr::map(function(x) {
          x = data.frame()
          x
        })
    }
    
    if (length(pathway@protein_list) == 0) {
      pathway@protein_list = vector(mode = "list",
                                    length = length(pathway@pathway_id)) %>%
        purrr::map(function(x) {
          x = data.frame()
          x
        })
    }
    
    return(pathway)
  }
}
tidymass/metpath documentation built on July 21, 2023, 3:11 p.m.