R/enrichment.R

Defines functions enrichment

Documented in enrichment

#' Enrich pathways with genes from putative miRNA-target gene interactions.
#'
#' This function will do function analysis with genes from potential
#' miRNA-target gene interactions in the input data.frame, which is
#' generated by \link{database_support}, with total 4 kinds of pathway
#' databases, including mouse and human two species, beseides, this
#' function will permute 5000 times (Default) for each pathway to show
#' an empirical p_value to avoid the bias from hypergeometric p-value,
#' this indicating that it would take a few minutes to do functional
#' analysis.
#'
#' @return matrix format. There are 7 columns in it, including
#'    database, term, total genes of the term,  targets in the term,
#'    targets in total genes of the term (%), raw p-value, empirical
#'    p-value.
#'
#' @seealso \code{\link[stats]{Hypergeometric}} for details.
#'
#' @param data_support matrix format generated from
#'    \link{database_support}.
#' @param org species of genes and miRNAs, only support "hsa", "mmu"
#' @param per_time Times of permutation about each enriched pathways,
#'    higher times, more precise empirical p-value user can obtain,
#'    meanwhile, this function would cost more time. Default is 5000.
#'
#' @examples
#' ## Use the internal dataset
#' data("mirna", package = "anamiR", envir = environment())
#' data("pheno.mirna", package = "anamiR", envir = environment())
#' data("mrna", package = "anamiR", envir = environment())
#' data("pheno.mrna", package = "anamiR", envir = environment())
#'
#' ## SummarizedExperiment class
#' require(SummarizedExperiment)
#' mirna_se <- SummarizedExperiment(
#'  assays = SimpleList(counts=mirna),
#'  colData = pheno.mirna)
#'
#' ## SummarizedExperiment class
#' require(SummarizedExperiment)
#' mrna_se <- SummarizedExperiment(
#'  assays = SimpleList(counts=mrna),
#'  colData = pheno.mrna)
#'
#' ## Finding differential miRNA from miRNA expression data with t.test
#' mirna_d <- differExp_discrete(
#'    se = mirna_se,
#'    class = "ER",
#'    method = "t.test"
#' )
#'
#' ## Finding differential mRNA from mRNA expression data with t.test
#' mrna_d <- differExp_discrete(
#'    se = mrna_se,
#'    class = "ER",
#'    method = "t.test"
#' )
#'
#' ## Convert annotation to miRBse 21
#' mirna_21 <- miR_converter(data = mirna_d, original_version = 17)
#'
#' ## Correlation
#' cor <- negative_cor(mrna_data = mrna_d, mirna_data = mirna_21)
#'
#' ## Intersect with known databases
#' sup <- database_support(cor_data = cor)
#'
#' ## Functional analysis
#' pat <- enrichment(data_support = sup, org = "hsa", per_time = 100)
#'
#' @import stats
#' @import RMySQL
#' @import DBI
#' @export
enrichment <- function(
  data_support,
  org = c("hsa", "mmu"),
  per_time = 5000
) {
  org <- match.arg(org)
  db <- RMySQL::dbConnect(RMySQL::MySQL(),
                          user = "visitor",
                          password = "visitor",
                          dbname = "visitor",
                          host = "anamir.cgm.ntu.edu.tw")
  gene_list <- unique(data_support[, 2])
  identified <- length(gene_list)
  if (org %in% "hsa"){
    databases <- c("KEGG_hsa", "Reactome_hsa", "BioCarta_hsa")
    query <- paste0("SELECT DISTINCT Gene_symbol FROM `all_hsa`;")
    gene_all <- DBI::dbGetQuery(db, query)
    total <- length(gene_all[["Gene_symbol"]])
  }
  if (org %in% "mmu"){
    databases <- c("KEGG_mmu", "Reactome_mmu", "MouseCyc_mmu")
    query <- paste0("SELECT DISTINCT gene_symbol FROM `all_mmu`;")
    gene_all <- DBI::dbGetQuery(db, query)
    total <- length(gene_all[["gene_symbol"]])
  }
  pathway_table <- list()
  number <- 1
  for (data in databases) {
    query <- paste0("SELECT DISTINCT term FROM `", data, "`;")
    term_all <- DBI::dbGetQuery(db, query)
    for (i in seq_len(length(term_all[["term"]]))) {
      if (grepl("'", term_all[i, 1])){
        tmp <- strsplit(term_all[i, 1], "'")
        term_all[i, 1] <- paste(tmp[[1]][1], "''", tmp[[1]][2])
      }
      query <- paste0("SELECT * FROM `", data,
                      "` WHERE term LIKE '",
                      term_all[i, 1], "' ;")

      term_info <- DBI::dbGetQuery(db, query)
      gene_set <- length(term_info[["gene_symbol"]])
      term_name <- term_info[1, 2]
      overlap <- length(intersect(term_info[, 1], gene_list))
      if (overlap == 0) {
        next
      }
      percent <- overlap / gene_set
      p <- stats::phyper(overlap - 1,
                    gene_set,
                    total - gene_set,
                    identified,
                    lower.tail = FALSE)
      number <- number + 1
      #permutation
      count <- vector(mode = "numeric", length = per_time + 1)
      permu_time <- per_time
      for (j in seq_len(permu_time)) {
        gene_per <- sample(gene_all[["Gene_symbol"]], identified)
        if (org %in% "mmu"){
          gene_per <- sample(gene_all[["gene_symbol"]], identified)
        }
        overlap_per <- length(intersect(gene_per, term_info[, 1]))
        p_per <- stats::phyper(overlap_per - 1, gene_set,
                        total - gene_set,
                        identified,
                        lower.tail = FALSE)

        count[j] <- p_per
      }
      count[permu_time + 1] <- p
      count <- sort.default(count)
      empirical_p <- which(count == p) / (per_time + 1)
      if (length(empirical_p) > 1) {
        empirical_p <- mean.default(empirical_p)
      }
      pathway_table[[number]] <- c(data, term_name,
                                   gene_set, overlap,
                                   percent, p, empirical_p)
    }
  }
  #disconnect db
  cons <- RMySQL::dbListConnections(RMySQL::MySQL())
  for (con in cons) RMySQL::dbDisconnect(con)
  pathway_table <- do.call(rbind, pathway_table)
  colnames(pathway_table) <- c("Database",
                               "Term",
                               "Total Genes of the Term",
                               "Targets in the Term",
                               "Targets in Total Genes of the Term(%)",
                               "Raw P-Value",
                               "Empirical P-Value")
  return(pathway_table)
}
AllenTiTaiWang/anamiR documentation built on May 5, 2019, 4:55 a.m.