R/KEGG.R

Defines functions format_KEGG collect_KEGG do_keggGet get_KEGG

Documented in collect_KEGG do_keggGet format_KEGG get_KEGG

#' Wrapper function to import KEGG database
#'
#' This function includes collect_KEGG() and format_KEGG()
#'
#' @param orgdb String containing database of organism using KEGG naming (e.g. "hsa".)
#'
#' @return List.
#' @export
#'
#' @examples get_KEGG("hsa")
get_KEGG <- function(orgdb = "hsa"){
  temp_dict_KEGG <- collect_KEGG(orgdb)
  temp_dict_KEGG <- list(pathway = temp_dict_KEGG[["pathway"]][["success"]])
  temp_tidy_KEGG <- format_KEGG(dict = temp_dict_KEGG)

  return(list(dict_KEGG = temp_dict_KEGG,
              tidy_KEGG = temp_tidy_KEGG
              ))

}

#' ***Please clarify***
#'
#' @param data
#'
#' @return
#' @export
#'
#' @examples
do_keggGet <- function(data){
  return(keggGet(data))
}


#' Import KEGG database
#'
#' @param organism Organism for which KEGG database to import (Available from KEGGREST::keggList("organism"))
#' @param category_names Category of pathway to import (Available from KEGGREST::listDatabases())
#'
#' @return KEGG dictionary ***Please add description***
#' @export
#'
#' @examples collect_KEGG(organism = "hsa", category_names = c("pathway"))
collect_KEGG <- function(organism, category_names){
  #--------------------------------------------------
  # collect KEGG
  #--------------------------------------------------
  res <- list()
  for(category in category_names){
    ids <- keggLink(category, organism)  # Note: names(ids) are gene IDs
    map <- data.frame(
      ID = ids,
      Description = NA,
      KEGG_geneID = names(ids)
    )
    map$NCBI_geneID <- NA
    flags <- c()
    I <- length(map$ID)
    for(i in 1:I){
      Sys.sleep(0.1)
      #--------------------------------------------------
      # Print the current process
      #--------------------------------------------------
      if((i == 1) || (i %% floor(0.25 * I) == 0 && i < 0.95 * I) || (i == I)){
        text <- paste("Now processing ", i, "/", I, " for ", category, "...\n", sep = "")
        cat(text)
      }
      #--------------------------------------------------
      # (i) Description
      #--------------------------------------------------
      category_id <- map$ID[i]
      if(category == "module")
        category_id <- str_extract(category_id, "(?<=_)(.*)")
      tmp_description <- try(do_keggGet(category_id), silent = TRUE)
      if(class(tmp_description) == "try-error"){
        flags <- c(flags, i)
        next
      }
      #--------------------------------------------------
      # (ii) NCBI_geneID
      #--------------------------------------------------
      tmp_gene <- try(do_keggGet(map$KEGG_geneID[i]), silent = TRUE)
      if(class(tmp_gene) == "try-error"){
        flags <- c(flags, i)
        next
      }
      #------------------------------
      # (i)
      #------------------------------
      name <- tmp_description[[1]][["NAME"]]
      map$Description[i] <- ifelse(is.null(name), "No_record", name)
      #------------------------------
      # (ii)
      #------------------------------
      dblinks <- tmp_gene[[1]][["DBLINKS"]]
      id <- dblinks[grep("NCBI-GeneID", dblinks)]
      id <- gsub("NCBI-GeneID: ", "", id)
      map$NCBI_geneID[i] <- id
    }
    res[[category]][["success"]] <- map
    #--------------------------------------------------
    # Rescue the failures
    #--------------------------------------------------
    if(length(flags) > 0){
      failure <- c()
      for(i in flags){
        #------------------------------
        # (i) Description
        #------------------------------
        category_id <- map$ID[i]
        if(category == "module")
          category_id <- str_extract(category_id, "(?<=_)(.*)")
        tmp_description <- try(do_keggGet(category_id), silent = TRUE)
        cnt = 0
        while(class(tmp_description) == "try-error"){
          cnt <- cnt + 1
          if(cnt >= 10)
            break
          Sys.sleep(1)
          tmp_description <- try(do_keggGet(category_id), silent = TRUE)
        }
        if(cnt >= 10){
          failure <- rbind(failure, c("ID", map$ID[i]))
          next
        }
        #------------------------------
        # (ii) NCBI_geneID
        #------------------------------
        tmp_gene <- try(do_keggGet(map$KEGG_geneID[i]), silent = TRUE)
        cnt = 0
        while(class(tmp_gene) == "try-error"){
          cnt <- cnt + 1
          if(cnt >= 10)
            break
          Sys.sleep(1)
          tmp_gene <- try(do_keggGet(map$KEGG_geneID[i]), silent = TRUE)
        }
        if(cnt >= 10){
          failure <- rbind(failure, c("KEGG_geneID", map$KEGG_geneID[i]))
          next
        }
        #----------
        # (i)
        #----------
        name <- tmp_description[[1]][["NAME"]]
        map$Description[i] <- ifelse(is.null(name), "No_record", name)
        #----------
        # (ii)
        #----------
        dblinks <- tmp_gene[[1]][["DBLINKS"]]
        id <- dblinks[grep("NCBI-GeneID", dblinks)]
        id <- gsub("NCBI-GeneID: ", "", id)
        map$NCBI_geneID[i] <- id
      }
      res[[category]][["success"]] <- map
      tmp <- data.frame(
        matrix(ncol = 2, nrow = 0, dimnames = list(NULL, c(
          "Input_category_for_keggGet", "Input_for_keggGet"
        ))))
      tmp <- rbind(tmp, data.frame(
        Input_category_for_keggGet = unique(failure)[,1],
        Input_for_keggGet = unique(failure)[,2]
      ))
      res[[category]][["failure"]] <- tmp
    }
  }

  return(res)
}



#' Format KEGG database to be ASURAT-friendly
#'
#' @param dict list of KEGG pathways imported ***Please clarify***
#'
#' @return Formatted KEGG dictionary ***Please clarify***
#' @export
#'
#' @examples
#' dict_KEGG_raw <- collect_KEGG(organism = "hsa", category_names = c("pathway"))
#' dict_KEGG <- list(pathway = dict_KEGG_raw[["pathway"]][["success"]])
#' tidy_KEGG <- format_KEGG(dict = dict_KEGG)
format_KEGG <- function(dict){
  #--------------------------------------------------
  # Definitions
  #--------------------------------------------------
  category_names <- names(dict)

  #--------------------------------------------------
  # Reformat
  #--------------------------------------------------
  res <- list()
  for(category in category_names){
    tmp <- dict[[category]]
    map <- unique(data.frame(
      ID = tmp[["ID"]],
      Description = tmp[["Description"]],
      Count = NA,
      Gene = NA,
      GeneID = NA
    ))
    for(i in 1:nrow(map)){
      #------------------------------
      # Gene and Count
      #------------------------------
      genes <- unique(tmp[which(tmp[["ID"]] == map[["ID"]][i]),]$NCBI_geneID)
      map$GeneID[i] <- paste(genes, collapse = "/")
      map$Count[i] <- length(genes)
    }
    rownames(map) <- 1:nrow(map)
    res[[category]] <- map
  }

  return(res)
}
johannesnicolaus/ASURAT_source documentation built on Dec. 21, 2021, 2:11 a.m.