R/go.R

Defines functions proc_obo

Documented in proc_obo

#' Helper to create `db_path`
#' 
#' @inheritParams prepGO
create_db_path <- function (db_path) 
{
  if (!fs::dir_exists(db_path)) {
    new_db_path <- fs::path_expand_r(db_path)
    new_db_path2 <- fs::path_expand(db_path)
    
    if (fs::dir_exists(new_db_path)) {
      db_path <- new_db_path
    } 
    else if (fs::dir_exists(new_db_path2)) {
      db_path <- new_db_path2
    } 
    else {
      dir.create(file.path(db_path, "cache"), recursive = TRUE, showWarnings = FALSE)
      db_path <- new_db_path
    }
  }
  
  invisible(db_path)
}


#' Helper to save `obo` without header
#' 
#' @param fn_obo filename according to \code{obo_url}
#' @inheritParams prepGO
proc_obo <- function(db_path, fn_obo, 
                     type = c("biological_process", "cellular_component", "molecular_function")) 
{
  filepath <- file.path(db_path, "cache", fn_obo)
  
  if (!file.exists(filepath)) 
    stop("File not found ", filepath, ".", call. = FALSE)

  suppressWarnings(df <- readLines(filepath))
  first_row <- grep("\\[Term\\]", df)[1]
  last_row <- grep("\\[Typedef\\]", df)[1] - 2
  df <- df[first_row:last_row]
  
  go_ids <- df %>% .[grepl("^id:\\s{1}", .)] %>% gsub("^id:\\s{1}", "", .)
  go_nms <- df %>% .[grepl("^name:\\s{1}", .)] %>% gsub("^name:\\s{1}", "", .)
  go_type <- df %>% .[grepl("^namespace:\\s{1}", .)] %>% gsub("^namespace:\\s{1}", "", .)
  
  df <- tibble::tibble(go_id = go_ids, go_name = go_nms, go_space = go_type) %>% 
    dplyr::filter(go_space %in% type) %>% 
    dplyr::select(-go_space) 
}


#' Helper to save `gaf` without header
#' 
#' @param fn_gaf filename according to \code{gaf_url}
#' @inheritParams prepGO
proc_gaf <- function(db_path, fn_gaf) 
{
  filepath <- file.path(db_path, "cache", fn_gaf)
  if (!file.exists(filepath)) stop("File not found ", filepath, ".", call. = FALSE)

  con <- gzfile(path.expand(filepath))
  
  suppressWarnings(df <- readLines(con))
  first_row <- grep("UniProtKB", df)[1]
  last_row <- length(df)
  df <- df[first_row:last_row]
  
  out_nm <- gsub("\\.gz$", "_hdr_rm.txt", fn_gaf)
  writeLines(df, file.path(db_path, out_nm))
  try(close(con))
  
  df <- readr::read_tsv(file.path(db_path, out_nm), col_names = FALSE, 
                        col_types = cols(
                          X1 = col_character(),
                          X2 = col_character(),
                          X3 = col_character(),
                          X4 = col_character(),
                          X5 = col_character(),
                          X6 = col_character(),
                          X7 = col_character(),
                          X8 = col_character(),
                          X9 = col_character(),
                          X10 = col_character(),
                          X11 = col_character(),
                          X12 = col_character(),
                          X13 = col_character(),
                          X14 = col_character(),
                          X15 = col_character(),
                          X16 = col_character(),
                          X17 = col_character()
                        )) 
  
  unlink(file.path(db_path, out_nm))
  
  df <- df %>% 
    dplyr::select(X2:X3, X5) %>% 
    `colnames<-`(c("uniprot_acc", "gene", "go_id"))
}


#' Helper to map `SYMBOL` to `ENTREZID` 
#' 
#' @param keys Identifier such as gene names.
#' @param from the type of \code{keys}
#' @param to the type of target IDs
#' @inheritParams prepGO
#' @import dplyr purrr 
annot_from_to <- function(abbr_species = "Hs", keys = NULL, from = "SYMBOL", to = "ENTREZID") 
{
  if (all(is.null(keys))) stop("Argument `keys` cannot be NULL", call. = FALSE)

  pkg_nm <- paste("org", abbr_species, "eg.db", sep = ".")
  
  if (!requireNamespace(pkg_nm, quietly = TRUE)) {
    stop("Run `BiocManager::install(\"", pkg_nm, "\")` first.", call. = FALSE)
  }

  x <- tryCatch(
    get(pkg_nm), 
    error = function(e) 1
  )
  
  if (!is.object(x)) {
    if (x == 1) stop("Did you forget to run `library(", pkg_nm, ")`?", call. = FALSE)
  }

  accessions <- purrr::quietly(AnnotationDbi::select)(
    get(pkg_nm), 
    keys = keys,
    columns = to,
    keytype = from
  )$result
}


#' Helper to get a complete list of `ENTREZID` from `egUNIPROT`
#' 
#' @inheritParams prepGO
#' @inheritParams annot_from_to
get_full_entrez <- function(species, from = "egUNIPROT") 
{
  abbr_species <- sp_lookup_Ul(species)
  
  pkg_nm <- paste("org", abbr_species, "eg.db", sep = ".")
  pkg_nm_from <- paste("org", abbr_species, from, sep = ".")
  
  if (!requireNamespace(pkg_nm, quietly = TRUE)) {
    stop("Run `BiocManager::install(\"", pkg_nm, "\")` first.", call. = FALSE)
  }
  
  pkg_nm_from %>% get() %>% AnnotationDbi::mappedkeys()
}


#' Helper to find human orthologs
#' 
#' @inheritParams prepMSig
#' @examples \donttest{res <- find_human_orthologs(mouse)}
find_human_orthologs <- function(species, ortho_mart) 
{
  if (species == "human") {
    stop("Ortholog `species` needs to be different to `human`.", call. = FALSE)
  }
  
  out_nm <- paste0("ortho_hs", sp_lookup(species))

  data(package = "proteoQ", mart_hs, envir = environment())
  
  if (species == "mouse") {
    data(package = "proteoQ", mart_mm, envir = environment())
    martL <- mart_mm
  } 
  else if (species == "rat") {
    data(package = "proteoQ", mart_rn, envir = environment())
    martL <- mart_rn
  } 
  else {
    martL <- biomaRt::useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = ortho_mart)
  }
  
  res <- biomaRt::getLDS(
    attributes = c("entrezgene_id"),
    filters = "entrezgene_id", 
    values = get_full_entrez(species = "human", from = "egUNIPROT"), 
    mart = mart_hs,
    attributesL = c("entrezgene_id"), 
    martL = martL
  ) %>% 
    `colnames<-`(c("human", species)) 
  
  invisible(res)
}


#' Helper to look up GO species 
#' 
#' @inheritParams prepMSig
sp_lookup_go <- function(species) 
{
  switch(species, 
         human = "human",
         mgi = "mouse",
         rgd = "rat")
}


#' Helper to find the two-letter character string of abbreviated species
#' 
#' @import stringr
#' @inheritParams prepMSig
find_abbr_species <- function(species = "human", abbr_species = NULL) 
{
  species <- rlang::as_string(rlang::enexpr(species))
  abbr_species <- rlang::enexpr(abbr_species)
  
  if (is.null(abbr_species) || species %in% c("human", "mouse", "rat")) {
    abbr_species <- switch(species, 
           human = "Hs",
           mouse = "Mm",
           rat = "Rn",
           stop("`species` not in one of `human`, `mouse` or `rat`.", 
                "\nThus users need to provide a two-letter abbreviation of `abbr_species`, ", 
                "\ni.e., `abbr_species = Ce` for later uses with `org.Ce.eg.db` annotation.", 
                call. = FALSE)
    )
  } 
  else {
    abbr_species <- rlang::as_string(abbr_species)
    
    if (stringr::str_length(abbr_species) != 2) {
      warning("The number of characters is typically `2` for `abbr_species`.", call. = FALSE)
    }
    
    if (abbr_species != stringr::str_to_title(abbr_species)) {
      warning("An `abbr_species` is typically in Title case, i.e., `Xx`.", call. = FALSE)
    }    
  }

  return(abbr_species)
}


#' Helper to set the output file name for a data base
#' 
#' @param signature A character string, i.e. "go" for uses in an output filename.
#' @inheritParams prepMSig
set_db_outname <- function(filename = NULL, species = "human", signature) 
{
  filename <- rlang::enexpr(filename)
  
  if (is.null(filename)) {
    abbr_species_lwr <- switch(species, 
                               human = "hs", 
                               mouse = "mm", 
                               rat = "rn", 
                               stop("`species` not in one of `human`, `mouse` or `rat`.", 
                                    "\nThus users need to provide a `filename`.", 
                                    call. = FALSE))
    
    filename <- paste0(signature, "_", abbr_species_lwr, ".rds")
  } 
  else {
    filename <- rlang::as_string(filename)
    fn_prefix <- gsub("\\.[^.]*$", "", filename)
    fn_suffix <- gsub("^.*\\.([^.]*)$", "\\1", filename)
    if (fn_prefix == fn_suffix) stop("No '.' to separate a basename and an extension.", call. = FALSE)
    if (fn_suffix != "rds") stop("File extension must be `.rds`.", call. = FALSE)
    filename <- paste0(fn_prefix, ".rds")
  }
}


#' Helper to download `gmt`
#' 
#' @inheritParams prepMSig
dl_msig <- function(msig_url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.0/c2.all.v7.0.entrez.gmt", 
                    db_path = "~/proteoQ/dbs/msig", overwrite = FALSE) 
{
  if (!grepl("\\.entrez\\.gmt$", msig_url)) {
    stop("Use a link to a `.entrez.gmt` file; not `.symbols.gmt`.", call. = FALSE)
  }
  
  if (!requireNamespace("downloader", quietly = TRUE)) {
    stop("\n====================================================================", 
         "\nNeed package \"downloader\" for this function to work.",
         "\n====================================================================",
         call. = FALSE)
  }

  fn_msig <- msig_url %>% gsub("^.*/(.*)$", "\\1", .)
  
  if ((!file.exists(file.path(db_path, "cache", fn_msig))) | overwrite)  {
    downloader::download(msig_url, file.path(db_path, "cache", fn_msig), mode = "wb")
  }
  
  return(fn_msig)
}


#' Helper to save `gmt` 
#' 
#' @param fn_gmt filename of downloaded gmt results.
#' @inheritParams prepMSig
proc_gmt <- function(species, abbr_species, ortho_mart, fn_gmt, db_path, filename) 
{
  filepath <- file.path(db_path, "cache", fn_gmt)
  
  if (!file.exists(filepath)) 
    stop("File not found ", filepath, ".", call. = FALSE)
  
  df <- suppressWarnings(readr::read_tsv(filepath, col_names = FALSE)) %>% 
    `names_pos<-`(1, "term")
  
  df <- local({
    cols_entrez <- purrr::map_lgl(df, is.numeric) %>% which()
    
    df[, c(1, cols_entrez)] %>% 
      tidyr::gather("col", "entrez", -term) %>% 
      dplyr::select(-col)  %>% 
      dplyr::filter(!is.na(entrez)) 
  })
  
  if (species != "human") {
    df <- local({
      orthos <- find_human_orthologs(species, ortho_mart) %>% 
        `names<-`(c("from", "to"))
      
      df <- df %>% 
        dplyr::left_join(orthos, by = c("entrez" = "from")) %>% 
        dplyr::filter(!is.na(to))  %>% 
        dplyr::select(-entrez) %>% 
        dplyr::rename(entrez = to)      
    })
  }
  
  gsets <- df %>% 
    split(., .$term, drop = TRUE) %>% 
    purrr::map(`[[`, 2) %>% 
    `names<-`(paste(tolower(abbr_species), names(.), sep = "_"))
  
  saveRDS(gsets, file.path(db_path, filename))
  
  invisible(file.path(db_path, filename))
}


#'Download and prepare gene ontology
#'
#'\code{prepGO} downloads and prepares data bases of
#'\href{http://current.geneontology.org/products/pages/downloads.html}{gene
#'ontology} (GO) for enrichment analysis by gene sets.
#'
#'@import dplyr purrr fs readr org.Hs.eg.db org.Mm.eg.db
#'  org.Rn.eg.db
#'@importFrom magrittr %>% %T>% %$% %<>% 
#'@param overwrite Logical; if TRUE, overwrite the downloaded database(s). The
#'  default is FALSE.
#'@param species Character string; the name of a species for the
#'  \emph{conveninent} preparation of GO. The species available for the
#'  convenience feature is in one of \code{c("human", "mouse", "rat")} with
#'  \code{"human"} being the default. The argument is not required for other
#'  species; instead, users will provide values under arguments
#'  \code{abbr_species}, \code{gaf_url} and \code{obo_url}.
#'@param abbr_species Two-letter character string; the abbreviated name of
#'  species used with
#'  \href{https://bioconductor.org/packages/release/data/annotation/html/org.Hs.eg.db.html}{org.Xx.eg.db}.
#'  The value of \code{abbr_species} will be determined automatically if the
#'  species is in one of \code{c("human", "mouse", "rat")}. Otherwise, for
#'  example, users need to provide \code{abbr_species = Ce} for fetching the
#'  \code{org.Ce.eg.db} package in the name space of proteoQ. 
#'  
#'  For analysis against
#'  \href{http://current.geneontology.org/products/pages/downloads.html}{gene
#'  ontology} and \href{https://www.gsea-msigdb.org/gsea/index.jsp}{Molecular
#'  Signatures}, the argument is further applied to differentiate the same
#'  biological terms under different species; e.g., \code{GO~0072686 mitotic
#'  spindle} becomes \code{hs_GO~0072686 mitotic spindle} for human and
#'  \code{mm_GO~0072686 mitotic spindle} for mouse.
#'@param db_path Character string; the local path for database(s). The default
#'  is \code{"~/proteoQ/dbs/go"}.
#'@param gaf_url A URL to
#'  \href{http://current.geneontology.org/products/pages/downloads.html}{GO
#'  Annotation File} (GAF). A valid web address is required for species other
#'  than \code{c("human", "mouse", "rat")}. At the NULL default and the species
#'  in one of \code{c("human", "mouse", "rat")}, the link will be determined
#'  automatically; note that users can overwrite the default GAF by providing
#'  their own URL.
#'@param obo_url A URL link to GO terms in an OBO format. At the NULL default,
#'  the web address will be determined automatically. Users can overwrite the
#'  default OBO by providing their own URL.
#'@param filename Character string; An output file name. At the NULL default,
#'  the name will be determined automatically at a given \code{species}; i.e.,
#'  \code{go_hs.rds} for human data. The file is saved as a \code{.rds} object
#'  for uses with \code{\link{prnGSPA}}.
#'@param type Character vector. The name space in gene ontology to be included.
#'  The default is to include all in \code{c("biological_process",
#'  "cellular_component", "molecular_function")}. In the example of \code{type =
#'  c("biological_process", "cellular_component")}, terms of
#'  \code{molecular_function} will be excluded.
#' @examples
#' \donttest{
#' library(proteoQ)
#'
#' # `human` and `mouse` with a default OBO;
#' # outputs under `db_path`
#' prepGO(human)
#' prepGO(mouse)
#' 
#' # head(readRDS(file.path("~/proteoQ/dbs/go/go_hs.rds")))
#' # head(readRDS(file.path("~/proteoQ/dbs/go/go_mm.rds")))
#' 
#' # enrichment analysis with custom `GO`
#' prnGSPA(
#'   gset_nms = c("~/proteoQ/dbs/go/go_hs.rds",
#'                "~/proteoQ/dbs/go/go_mm.rds"),
#' )
#'
#' # `mouse` with a slim OBO
#' prepGO(
#'   species = mouse,
#'   obo_url = "http://current.geneontology.org/ontology/subsets/goslim_mouse.obo",
#'   filename = mm_slim.rds,
#' )
#'
#' # `worm` not available for default GO preparation
#' if (!requireNamespace("BiocManager", quietly = TRUE))
#'     install.packages("BiocManager")
#' BiocManager::install("org.Ce.eg.db")
#'
#' library(org.Ce.eg.db)
#'
#' prepGO(
#'   # species = worm,
#'   abbr_species = Ce,
#'   gaf_url = "http://current.geneontology.org/annotations/wb.gaf.gz",
#'   obo_url = "http://purl.obolibrary.org/obo/go/go-basic.obo", 
#'   filename = go_ce.rds,
#' )
#' }
#' 
#'@export
prepGO <- function(species = "human", abbr_species = NULL, gaf_url = NULL, obo_url = NULL, 
                   db_path = "~/proteoQ/dbs/go", 
                   type = c("biological_process", "cellular_component", "molecular_function"), 
                   filename = NULL, overwrite = FALSE) 
{
  old_opts <- options()
  options(warn = 1L)
  on.exit(options(old_opts), add = TRUE)
  
  if (!requireNamespace("downloader", quietly = TRUE)) {
    stop("\n====================================================================", 
         "\nNeed package \"downloader\" for this function to work.",
         "\n====================================================================",
         call. = FALSE)
  }

  if (!requireNamespace("AnnotationDbi", quietly = TRUE)) {
    stop("\n====================================================================", 
         "\nNeed package \"AnnotationDbi\" for this function to work.\n",
         "\nif (!requireNamespace(\"BiocManager\", quietly = TRUE)) ", 
         "\n\tinstall.packages(\"BiocManager\")",
         "\nBiocManager::install(\"AnnotationDbi\")", 
         "\n====================================================================",
         call. = FALSE)
  }
  

  species <- rlang::as_string(rlang::enexpr(species))

  db_path <- create_db_path(db_path)

  if (is.null(gaf_url)) {
    gaf_url <- switch(species, 
                      human = c("goa_human.gaf.gz" = "http://current.geneontology.org/annotations/goa_human.gaf.gz"), 
                      mouse = c("goa_mouse.gaf.gz" = "http://current.geneontology.org/annotations/mgi.gaf.gz"), 
                      rat = c("goa_rat.gaf.gz" = "http://current.geneontology.org/annotations/rgd.gaf.gz"), 
                      stop("`species` need to be one of `human`, `mouse` or `rat` for an auto lookup of GAF files.", 
                           call. = FALSE))
    fn_gaf <- names(gaf_url)
  } 
  else {
    fn_gaf <- gaf_url %>% gsub("^.*/(.*)$", "\\1", .)
    
    species <- local({
      species_go <- fn_gaf %>% 
        gsub("([^\\.]*)[\\.].*", "\\1", .) %>% 
        gsub("^goa_", "", .) %>% 
        sp_lookup_go()
      
      if (is.null(species_go)) {
        species_go <- "unknown"
      } else if (species_go != species) {
        message("The species is `", species_go, "`.")
      }
      
      species <- species_go      
    })
  }
  
  if (is.null(obo_url)) {
    obo_url <- c("go-basic.obo" = "http://purl.obolibrary.org/obo/go/go-basic.obo")
    fn_obo <- names(obo_url)
  } 
  else {
    fn_obo <- obo_url %>% gsub("^.*/(.*)$", "\\1", .)
  }

  if ((!file.exists(file.path(db_path, "cache", fn_gaf))) || overwrite)  {
    downloader::download(gaf_url, file.path(db_path, "cache", fn_gaf), mode = "wb")
    # download.file(gaf_url, file.path(db_path, "cache", fn_gaf), mode = "w")
  }
  
  if ((!file.exists(file.path(db_path, "cache", fn_obo))) | overwrite) {
    downloader::download(obo_url, file.path(db_path, "cache", fn_obo), mode = "wb")
    # download.file(obo_url, file.path(db_path, "cache", fn_obo), mode = "w")
  }
  
  abbr_species <- find_abbr_species(!!species, !!rlang::enexpr(abbr_species))
  filename <- set_db_outname(!!rlang::enexpr(filename), species, "go")

  df <- local({
    df_gaf <- proc_gaf(db_path, fn_gaf)
    df_obo <- proc_obo(db_path, fn_obo, type)
    
    dplyr::left_join(df_gaf, df_obo, by = "go_id") %>% 
      dplyr::mutate(go_name = paste(go_id, go_name)) %>% 
      dplyr::select(go_name, gene)
  })
  
  accessions <- annot_from_to(abbr_species = abbr_species, 
                              keys = unique(df$gene), from = "SYMBOL", to = "ENTREZID") %>% 
    dplyr::rename(gene = SYMBOL, entrez = ENTREZID) %>% 
    dplyr::filter(!is.na(entrez), !is.na(gene))  %>% 
    dplyr::filter(!duplicated(gene))
  
  gsets <- accessions %>% 
    dplyr::left_join(df, by = "gene") %>% 
    dplyr::select(-gene) %>% 
    split(., .$go_name, drop = TRUE) %>% 
    purrr::map(`[[`, 1) %>% 
    `names<-`(paste(tolower(abbr_species), names(.), sep = "_"))
  
  saveRDS(gsets, file.path(db_path, filename))
  
  invisible(file.path(db_path, filename))
}


#'Preparation of Molecular Signatures
#'
#'\code{prepMSig} downloads and prepares data bases of
#'\href{https://www.gsea-msigdb.org/gsea/index.jsp}{Molecular Signatures}
#'(MSig) for enrichment analysis by gene sets.
#'
#'@import dplyr purrr fs readr org.Hs.eg.db
#'@importFrom magrittr %>% %T>% %$% %<>% 
#'@inheritParams prepGO
#'@param species Character string; the name of a species for the
#'  \emph{conveninent} preparation of \code{MSig} data bases. The species
#'  available for the convenience feature is in one of \code{c("human", "mouse",
#'  "rat")} with \code{"human"} being the default. The argument is not required
#'  for other species; instead, users will provide values under arguments
#'  \code{ortho_mart} for the lookup of orthologs to human.
#'@param db_path Character string; the local path for database(s). The default
#'  is \code{"~/proteoQ/dbs/msig"}.
#'@param msig_url A URL to
#'  \href{https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.0/}{MSig
#'  }. At the \code{NULL} default, a \code{c2.all.v[...].entrez.gmt} data will
#'  be used for all species. A valid web address is required for a custom data
#'  base. For simplicity, only files with entrez IDs will be handled; 
#'  files of \code{c2.all.v[...].symbols.gmt} will not be parsed. 
#'@param ortho_mart Character string; a dataset name from
#'  \code{\link[biomaRt]{useMart}} and/or \code{\link[biomaRt]{listDatasets}}
#'  for the lookup of orthologs to \code{human} genes. For species in
#'  \code{c("human", "mouse", "rat")}, the value will be determined
#'  automatically unless otherwise specified.
#'@param filename Character string; An output file name. At the \code{NULL}
#'  default, the name will be determined automatically at a given
#'  \code{species}; i.e., \code{msig_hs.rds} for \code{human} data. The file is
#'  saved as a \code{.rds} object for uses with \code{\link{prnGSPA}}.
#' @examples
#' \donttest{
#' library(proteoQ)
#'
#' ## the default `MSig` is `c2.all`
#' # `human`; outputs under `db_path`
#' prepMSig()
#' head(readRDS(file.path("~/proteoQ/dbs/msig/msig_hs.rds")))
#' 
#' prnGSPA(
#'   gset_nms = file.path("~/proteoQ/dbs/msig/msig_hs.rds"), 
#' )
#'
#' # `mouse`
#' prepMSig(species = mouse, filename = msig_mm.rds)
#' head(readRDS(file.path("~/proteoQ/dbs/msig/msig_mm.rds")))
#'
#' # `rat`
#' prepMSig(species = rat, filename = msig_rn.rds)
#' head(readRDS(file.path("~/proteoQ/dbs/msig/msig_rn.rds")))
#'
#' # `dog`; need `ortho_mart` for species other than `human`, `mouse` and `rat`
#' # (try `?biomaRt::useMart` for a list of marts)
#' prepMSig(
#'   # species = dog,
#'   abbr_species = Cf, 
#'   ortho_mart = cfamiliaris_gene_ensembl,
#'   filename = msig_cf.rds,
#' )
#'
#' # also `dog`
#' prepMSig(
#'   species = my_dog,
#'   abbr_species = Cf, 
#'   ortho_mart = cfamiliaris_gene_ensembl,
#'   filename = msig_cf2.rds,
#' )
#'
#' msig_cf <- readRDS(file.path("~/proteoQ/dbs/msig/msig_cf.rds"))
#' msig_cf2 <- readRDS(file.path("~/proteoQ/dbs/msig/msig_cf2.rds"))
#'identical(msig_cf, msig_cf2)
#'
#' ## use an `MSig`other than the default of `c2.all`
#' prepMSig(
#'   msig_url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.0/c2.cgp.v7.0.entrez.gmt",
#'   species = human,
#'   filename = c2_cgp_hs.rds,
#' )
#'
#' prepMSig(
#'   msig_url = "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.0/c2.cgp.v7.0.entrez.gmt",
#'   species = dog,
#'   ortho_mart = cfamiliaris_gene_ensembl,
#'   filename = c2_cgp_cf.rds,
#' )
#' }
#' 
#' \dontrun{
#' # enrichment analysis with custom `MSig`
#' prnGSPA(
#'   gset_nms = c("~/proteoQ/dbs/msig/msig_hs.rds",
#'                "~/proteoQ/dbs/msig/msig_mm.rds"),
#' )
#' }
#'
#'@export
prepMSig <- function(species = "human", msig_url = NULL, abbr_species = NULL, 
                     ortho_mart = switch(species, 
                                         mouse = "mmusculus_gene_ensembl", 
                                         rat = "rnorvegicus_gene_ensembl", 
                                         human = "to_itself",
                                         "unknown"), 
                     db_path = "~/proteoQ/dbs/msig", filename = NULL, overwrite = FALSE) 
{
  old_opts <- options()
  options(warn = 1)
  on.exit(options(old_opts), add = TRUE)
  
  if (!requireNamespace("biomaRt", quietly = TRUE)) {
    stop("\n====================================================================", 
         "\nNeed package \"biomaRt\" for this function to work.\n",
         "\nif (!requireNamespace(\"BiocManager\", quietly = TRUE)) ", 
         "\n\tinstall.packages(\"BiocManager\")",
         "\nBiocManager::install(\"biomaRt\")", 
         "\n====================================================================",
         call. = FALSE)
  }

  msig_url <- if (is.null(msig_url))
    "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.0/c2.all.v7.0.entrez.gmt"
  else
    rlang::as_string(rlang::enexpr(msig_url))

  db_path <- create_db_path(db_path)
  
  fn_gmt <- dl_msig(msig_url, db_path, overwrite)
  
  species <- rlang::as_string(rlang::enexpr(species))
  
  ortho_mart <- local({
    ok <- tryCatch(force(ortho_mart), error = function(e) 1)
    
    if (ok == 1) {
      ortho_mart <- rlang::as_string(rlang::enexpr(ortho_mart))
    } 
    else {
      ortho_mart <- ok
    }
  })
  
  if (species == "human" && ortho_mart != "to_itself") species <- "unknown"

  if (ortho_mart == "unknown") 
    stop(
      "Specify the value of `ortho_mart` for species other than \"human\", \"mouse\", and \"rat\".", 
      call. = FALSE
    )

  abbr_species <- find_abbr_species(!!species, !!rlang::enexpr(abbr_species))
  filename <- set_db_outname(!!rlang::enexpr(filename), species, "msig")
  out_path <- proc_gmt(species, abbr_species, ortho_mart, fn_gmt, db_path, filename)
  invisible(out_path)
}


#' Map UniProt or Refseq accessions to Entrez IDs
#'
#' @param from Character string; the type of accession keys in c("UNIPROT",
#'   "REFSEQ", "ACCNUM").
#' @inheritParams prepMSig
#' @inheritParams annot_from_to
#' @import dplyr purrr tidyr org.Hs.eg.db org.Mm.eg.db
#'   org.Rn.eg.db
#' @export
map_to_entrez <- function(species = "human", abbr_species = NULL, from = "UNIPROT", 
                          filename = NULL, db_path = "~/proteoQ/dbs/entrez", 
                          overwrite = FALSE) 
{
  old_opts <- options()
  options(warn = 1)
  on.exit(options(old_opts), add = TRUE)
  
  db_path <- create_db_path(db_path)
  species <- rlang::as_string(rlang::enexpr(species))
  abbr_species <- find_abbr_species(!!species, !!rlang::enexpr(abbr_species))
  from <- rlang::as_string(rlang::enexpr(from))
  filename <- set_db_outname(!!rlang::enexpr(filename), 
                             species, paste(tolower(from), 
                                            "entrez", 
                                            sep = "_" ))
  
  if ((!file.exists(file.path(db_path, filename))) || overwrite)  {
    pkg_nm <- paste("org", abbr_species, "eg.db", sep = ".")
    
    if (!requireNamespace(pkg_nm, quietly = TRUE)) {
      stop("Run `BiocManager::install(\"", pkg_nm, "\")` first, 
           then `library(", pkg_nm, ")`", call. = FALSE)
    }
    
    new_from <- paste0("eg", from)
    
    x <- tryCatch(
      get(paste("org", abbr_species, new_from, sep = ".")),
      error = function(e) 1
    )
    
    if (!is.object(x)) {
      if (x == 1) stop("Did you forget to run `library(", pkg_nm, ")`?", 
                       call. = FALSE)
    }
    
    entrez_ids <- AnnotationDbi::mappedkeys(x) 
    
    accessions <- as.list(x[entrez_ids]) %>% 
      list_to_dataframe() %>% 
      `names_pos<-`(., 1, c("entrez")) %>% 
      `names_pos<-`(., 2:ncol(.), paste(new_from, 1:(length(.)-1), sep = ".")) %>% 
      dplyr::mutate_at(.vars = grep("^eg", names(.)), ~ as.character(.x)) %>% 
      tidyr::gather("variable", "value", -entrez) %>% 
      dplyr::filter(!is.na(entrez), !is.na(value)) %>% 
      dplyr::select(-c("variable")) %>% 
      dplyr::mutate(species = species)
    
    if (from == "UNIPROT") {
      accessions <- accessions %>% 
        dplyr::rename(uniprot_acc = value)
      
      accessions <- annot_from_to(abbr_species, unique(accessions$uniprot_acc), 
                                  "UNIPROT", "SYMBOL") %>% 
        dplyr::rename(uniprot_acc = UNIPROT, gene = SYMBOL) %>% 
        dplyr::filter(!is.na(uniprot_acc), !is.na(gene))  %>% 
        dplyr::left_join(accessions, by = "uniprot_acc") %>% 
        dplyr::mutate(entrez = as.numeric(entrez)) %>% 
        dplyr::select(c("uniprot_acc", "gene", "entrez", "species"))
    } 
    else if (from == "REFSEQ") {
      accessions <- accessions %>% dplyr::rename(refseq_acc = value)
      
      accessions <- annot_from_to(abbr_species, unique(accessions$refseq_acc), 
                                  "REFSEQ", "SYMBOL") %>% 
        dplyr::rename(refseq_acc = REFSEQ, gene = SYMBOL) %>% 
        dplyr::filter(!is.na(refseq_acc), !is.na(gene))  %>% 
        dplyr::left_join(accessions, by = "refseq_acc") %>% 
        dplyr::mutate(entrez = as.numeric(entrez)) %>% 
        dplyr::select(c("refseq_acc", "gene", "entrez", "species"))
    } 
    else {
      stop("Variable `from` needs to be either `UNIPROT` or `REFSEQ`.", 
           call. = FALSE)
    }
    
    saveRDS(accessions, file.path(db_path, filename))

    invisible(accessions)
  } 
  else {
    invisible(NULL)
  }
}


#'Map UniProt accessions to Entrez IDs
#'
#'\code{Uni2Entrez} prepares lookup tables between UniProt accessions and
#'Entrez IDs for uses with \link{normPSM} and downstream gene-set analysis such
#'as \link{prnGSPA}. The utility is optional for \code{human}, \code{mouse} and
#'\code{rat} data. It is \strong{required} for other species with \link{prnGSPA}
#'in users' workflows. It can also be used to update and overrule the lookups
#'for \code{human}, \code{mouse} and \code{rat} that are defaulted by
#'\code{proteoQ}.
#'
#'@param species Character string; the name of a species. 
#'@param db_path Character string; the local path for database(s). The default
#'  is \code{"~/proteoQ/dbs/entrez"}.
#'@inheritParams prepMSig
#'@import dplyr purrr tidyr org.Hs.eg.db org.Mm.eg.db org.Rn.eg.db
#'@example inst/extdata/examples/prepEntrez_.R
#'@export
Uni2Entrez <- function(species = "human", abbr_species = NULL, filename = NULL, 
                       db_path = "~/proteoQ/dbs/entrez", overwrite = FALSE) 
{
  map_to_entrez(!!rlang::enexpr(species), 
                !!rlang::enexpr(abbr_species), 
                "UNIPROT", 
                !!rlang::enexpr(filename), 
                db_path, 
                overwrite)
}


#'Map RefSeq accessions to Entrez IDs and gene names
#'
#'\code{Ref2Entrez} prepares lookup tables between RefSeq accessions and
#'Entrez IDs and gene names for uses with \link{normPSM} and downstream gene-set
#'analysis such as \link{prnGSPA}. The utility is optional for \code{human} and
#'\code{mouse} data. It is \strong{required} for other species with
#'\link{prnGSPA} in users' workflows. It can also be used to update and overrule
#'the lookups for \code{human} and \code{mouse} that are defaulted by
#'\code{proteoQ}.
#'
#'@rdname Uni2Entrez
#'@import dplyr purrr tidyr org.Hs.eg.db org.Mm.eg.db org.Rn.eg.db
#'@example inst/extdata/examples/prepEntrez_.R
#'@export
Ref2Entrez <- function(species = "human", abbr_species = NULL, filename = NULL, 
                       db_path = "~/proteoQ/dbs/entrez", overwrite = FALSE) 
{
  map_to_entrez(!!rlang::enexpr(species), 
                !!rlang::enexpr(abbr_species), 
                "REFSEQ", 
                !!rlang::enexpr(filename), 
                db_path, 
                overwrite)
}


#' Map uniprot or refseq to entrez (not currently used)
#'
#' @param os_name An organism name by UniProt.
#' @inheritParams Uni2Entrez
#' @inheritParams annot_from_to
#' @import dplyr purrr tidyr org.Hs.eg.db org.Mm.eg.db org.Rn.eg.db
map_to_entrez_os_name <- function(species = "human", abbr_species = NULL, 
                                  os_name = "Homo sapiens", 
                                  from = "UNIPROT", 
                                  filename = NULL, db_path = "~/proteoQ/dbs/entrez", 
                                  overwrite = FALSE) 
{
  # the value of species will be used under column `species` in Protein.txt etc
  # add column species in the rds output
  
  db_path <- create_db_path(db_path)
  species <- rlang::as_string(rlang::enexpr(species))
  os_name <- rlang::as_string(rlang::enexpr(os_name))
  
  create_os_lookup(species, os_name, overwrite)
  
  abbr_species <- find_abbr_species(!!species, !!rlang::enexpr(abbr_species))
  filename <- set_db_outname(!!rlang::enexpr(filename), species, 
                             paste(tolower(from), "entrez", sep = "_" ))
  
  if ((!file.exists(file.path(db_path, filename))) || overwrite)  {
    pkg_nm <- paste("org", abbr_species, "eg.db", sep = ".")
    
    if (!requireNamespace(pkg_nm, quietly = TRUE)) {
      stop("Run `BiocManager::install(\"", pkg_nm, "\")` first, 
           then `library(", pkg_nm, ")`", call. = FALSE)
    }
    
    new_from <- paste0("eg", from)
    
    x <- tryCatch(
      get(paste("org", abbr_species, new_from, sep = ".")),
      error = function(e) 1
    )
    
    if (!is.object(x)) {
      if (x == 1) 
        stop("Did you forget to run `library(", pkg_nm, ")`?")
    }
    
    entrez_ids <- AnnotationDbi::mappedkeys(x) 
    
    accessions <- as.list(x[entrez_ids]) %>% 
      list_to_dataframe() %>% 
      `names_pos<-`(., 1, c("entrez")) %>% 
      `names_pos<-`(., 2:ncol(.), paste(new_from, 1:(length(.)-1), sep = ".")) %>% 
      dplyr::mutate_at(.vars = grep("^eg", names(.)), ~ as.character(.x)) %>% 
      tidyr::gather("variable", "value", -entrez) %>% 
      dplyr::filter(!is.na(entrez), !is.na(value)) %>% 
      dplyr::select(-c("variable"))
    
    if (from == "UNIPROT") {
      accessions <- accessions %>% dplyr::rename(uniprot_acc = value)
    } 
    else if (from == "REFSEQ") {
      accessions <- accessions %>% dplyr::rename(refseq_acc = value)
    } 
    else {
      stop("Variable `from` needs to be either `UNIPROT` or `REFSEQ`.", call. = FALSE)
    }
    
    saveRDS(accessions, file.path(db_path, filename))
    
    invisible(accessions)
  } 
  else {
    invisible(NULL)
  }
}



#' create a lookup table and remove duplicated entries
#' 
#' @inheritParams map_to_entrez_os_name
create_os_lookup <- function(species, os_name, overwrite = FALSE) 
{
  my_lookup <- c(
    "Homo sapiens" = "human",
    "Mus musculus" = "mouse",
    "Rattus norvegicus" = "rat"
  )

  # check if conflict between species and os_name
  convert_default_os <- function (species, os_name) {
    if (species %in% my_lookup) {
      lookup_nm <- my_lookup %>% .[. %in% species] %>% names()
      if (!purrr::is_empty(lookup_nm) && lookup_nm != os_name) {
        os_name <- lookup_nm
      }
    }
    
    return(os_name)
  }
  
  dat_dir <- get_gl_dat_dir()
  
  os_name <- convert_default_os(species, os_name)
  curr_lookup <- uniprot_entrez_lookup <- setNames(species, os_name)

  if (os_name == "Homo sapiens" && species != "human") 
    stop("`os_name = Homo sapiens` is reversed for `species = human`.", call. = FALSE)
  
  if (os_name == "Mus musculus" && species != "mouse") 
    stop("`os_name = Mus musculus` is reversed for `species = mouse`.", call. = FALSE)
  
  if (os_name == "Rattus norvegicus" && species != "rat") 
    stop("`os_name = Rattus norvegicus` is reversed for `species = rat`.", call. = FALSE)  
  
  # check if the same os_name but diferent species
  file <- file.path(dat_dir, "uniprot_entrez_lookup.rda")
  
  if (file.exists(file)) {
    load(file)
    
    if (overwrite) {
      uniprot_entrez_lookup <- uniprot_entrez_lookup %>% 
        .[! names(.) == os_name] %>% 
        .[! . == species]
    } 
    else {
      old_sp <- uniprot_entrez_lookup %>% .[names(.) == os_name]
      curr_sp <- curr_lookup

      if (!purrr::is_empty(old_sp) && (curr_sp != old_sp)) {
        stop("`", names(curr_lookup), "` was previously linked to `species = ", old_sp, "`.", 
             "\nTo overwrite the value of `os_name`, set `overwrite = TRUE`.", 
             call. = FALSE)
      }
      
      old_nm <- uniprot_entrez_lookup %>% .[. == species] %>% names()
      curr_nm <- os_name

      if (!purrr::is_empty(old_nm) && (curr_nm != old_nm)  && !overwrite) {
        stop("`", species, "` was previously linked to `os_name = ", old_nm, "`.", 
             "\nTo overwrite, set `overwrite = TRUE`.", 
             call. = FALSE)
      }
    }

    uniprot_entrez_lookup <- c(curr_lookup, uniprot_entrez_lookup) %>% 
      .[!duplicated(.)]
  }
  
  save(uniprot_entrez_lookup, file = file)
}
qzhang503/proteoQ documentation built on March 16, 2024, 5:27 a.m.