R/enrichment.R

Defines functions gseaCompareDotplot ep_str_wrap compareGSEA enrich_cnetplot parse_ratio enrich_barplot_by_level enrich_barplot.data.frame enrich_barplot.enrichResult enrich_barplot enrich_add_go_level enrichment_analysis

Documented in compareGSEA enrich_add_go_level enrich_barplot enrich_barplot_by_level enrich_barplot.data.frame enrich_barplot.enrichResult enrich_cnetplot enrichment_analysis gseaCompareDotplot

#' Perform enrichment analysis by using clusterProfiler.
#'
#' @param genes A vector of gene id.
#' @param go_data A data.frame contains four columns:
#' \enumerate{
#'   \item First column must be named as \code{Gene_ID}
#'   \item Second column must be named as \code{GO_ID}
#'   \item Third column must be named as \code{GO_Name}
#'   \item Fourth column must be named as \code{GO_Level}
#' }
#' @param ... Arguments passed to \code{\link[clusterProfiler]{enricher}}
#' @return A \code{enrichResult} instance
#' @export
enrichment_analysis <- function(genes, go_data, ...) {
  # Discard genes without annotation
  go_data <- go_data %>%
    dplyr::filter(GO_ID != "")

  # Construct TERM2GENE and TERM2NAME
  # TERM2GENE is a data.frame with first column of term ID and second column of
  # corresponding mapped gene and TERM2NAME is a data.frame with first column of
  # term ID and second column of corresponding term name
  go2gene <- go_data %>%
    dplyr::select(GO_ID, Gene_ID)
  go2name <- go_data %>%
    dplyr::select(GO_ID, GO_Name)

  # Perform GO enrichment analysis
  enrich_results <- clusterProfiler::enricher(
    genes, TERM2GENE = go2gene, TERM2NAME = go2name, ...)

  enrich_results
}

#' Add GO level data to result of enrichment analysis.
#'
#' @param enr A \code{enrichResult} instance
#' @inheritParams enrichment_analysis
#' @return A data.frame
#' @export
enrich_add_go_level <- function(enr, go_data) {
  # Discard genes without annotation
  go_data <- go_data %>%
    dplyr::filter(GO_ID != "")

  # Add GO_Level to results
  go_level_table <- go_data %>%
    dplyr::group_by(GO_ID) %>%
    dplyr::summarize(Level = unique(GO_Level))

  enrich_results_with_level <- enr %>%
    tibble::as_tibble() %>%
    dplyr::left_join(go_level_table, by = c("ID" = "GO_ID")) %>%
    dplyr::select(ID, Description, Level, dplyr::everything())

  enrich_results_with_level <- enrich_results_with_level %>%
    # RichFactor means DEGs of a GO term divided by All Genes of this GO term.
    dplyr::mutate(
      RichFactor = Count / as.numeric(sub("/\\d+", "", BgRatio))) %>%
    dplyr::select(ID:GeneRatio, BgRatio, RichFactor, dplyr::everything())

  enrich_results_with_level
}

#' Make barplot of enrichment analysis.
#'
#' @param obj Results returned by \code{enrichment_analysis}
#' @param x Which column used as x-axis
#' @param show_category How many annotation terms to show.
#' @param title Title of plot.
#' @export
enrich_barplot <- function(
  obj, x = "GeneRatio", show_category = 10, title = "") {
  UseMethod("enrich_barplot")
}

#' @describeIn enrich_barplot Barplot for \code{enrichResult} instance
#' @method enrich_barplot enrichResult
#' @export
enrich_barplot.enrichResult <- function(
  enr, x = "GeneRatio", show_category = 10, title = "") {
  enrich_barplot(
    enr@result, x = x, show_category = show_category, title = title)
}

#' @describeIn enrich_barplot Barplot for \code{data.frame}
#' @method enrich_barplot data.frame
#' @export
enrich_barplot.data.frame <- function(
  resdf, x = "GeneRatio", show_category = 10, title = "") {
  # Filter empty terms
  resdf <- resdf[!is.na(resdf$Description), ]
  resdf <- resdf[resdf$Count != 0, ]

  # Calculate ratios
  resdf$GeneRatio <- parse_ratio(resdf$GeneRatio)
  resdf$BgRatio <- parse_ratio(resdf$BgRatio)

  # Get top N GO terms
  if (show_category <= nrow(resdf)) {
      resdf <- resdf[1:show_category, ]
  }

  # Make top Go terms get higher y values, because of reverse levels
  resdf$Description <- factor(resdf$Description,
                          levels = rev(unique(resdf$Description)))

  # Set x to user specified col and then
  resdf <- dplyr::mutate(resdf, x = eval(parse(text = x)))

  # Re-order Go term description by x values,
  idx <- order(resdf[["x"]], decreasing = TRUE)
  resdf$Description <- factor(
    resdf$Description, levels = rev(unique(resdf$Description[idx])))

  # Make the plot
  offset <- max(resdf[[x]]) * 0.01
  resdf[["p.adjust.log"]] <- -log10(resdf[["p.adjust"]])

  ggplot2::ggplot(resdf,
      ggplot2::aes_string(x, "Description")) +
      ggplot2::geom_col(
        ggplot2::aes_string(alpha = "p.adjust.log"), fill = "deepskyblue") +
      ggplot2::scale_color_viridis_c() +
      ggplot2::scale_size_continuous(range = c(2, 10)) +
      ggplot2::scale_x_continuous(expand = c(0, 0)) +
      ggplot2::scale_alpha_continuous(name = "-Log10(p.adjust)") +
      ggplot2::ylab(NULL) +
      ggplot2::geom_text(
        ggplot2::aes_string(label = "Description"),
        x = offset, hjust = 0) +
      ggplot2::ggtitle(title) +
      cowplot::theme_cowplot() +
      ggplot2::theme(axis.text.y = ggplot2::element_blank())
}

#' Plot barplot of each GO level.
#'
#' @inheritParams enrich_barplot
#' @export
enrich_barplot_by_level <- function(
  resdf, x = "GeneRatio", show_category = 20) {
  p_list <- list()
  mf <- resdf %>%
      dplyr::filter(Level == "molecular_function")
  if (nrow(mf) > 0) {
    p_list[[length(p_list) + 1]] <- mf %>% go_barplot(x = x,
      show_category = show_category, title = "Molecular Function")
  }

  bp <- resdf %>%
    dplyr::filter(Level == "biological_process")
  if (nrow(bp) > 0) {
    p_list[[length(p_list) + 1]] <- bp %>% go_barplot(x = x,
      show_category = show_category, title = "Biological Process")
  }

  cc <- resdf %>%
      dplyr::filter(Level == "cellular_component")
  if (nrow(cc) > 0) {
    p_list[[length(p_list) + 1]] <- cc %>% go_barplot(x = x,
      show_category = show_category, title = "Cellular Component")
  }

  p <- patchwork::wrap_plots(p_list)

  p
}

parse_ratio <- function(ratio) {
  ratio <- sub("^\\s*", "", as.character(ratio))
  ratio <- sub("\\s*$", "", ratio)
  numerator <- as.numeric(sub("/\\d+$", "", ratio))
  denominator <- as.numeric(sub("^\\d+/", "", ratio))
  return(numerator / denominator)
}

#' Make Gene-Concept Network plot for enrichment analysis
#'
#' @param enr A \code{enrichResult} instance.
#' @param fold_change A named numeric vector of fold Change.
#'  The names of \code{fold_change} should be consistent with
#'  ID of \code{enr}
#' @param show_category How many annotation terms to show.
#' @param circular Whether using circular layout.
#' @export
enrich_cnetplot <- function(
  enr,
  fold_change = NULL,
  show_category = 10,
  circular = FALSE
) {
  p <- enr %>%
    enrichplot::cnetplot(
      foldChange = fold_change,
      showCategory = show_category,
      circular = circular,
      node_label = "none",
      colorEdge = circular
    )

  p <- p + ggraph::geom_node_text(
    ggplot2::aes_(label = ~name),
    data = p$data[1:show_category, ],
    repel = TRUE,
    bg.color = "white"
  )

  p
}

#' Perform GSEA on gene clusters.
#'
#' @param ranked_clusters a list of ranked logFC vector.
#' @param ... pass to \code{\link[clusterProfiler]{gseGO}}
#' @return A un-official \code{compareClusterResult} object
#' @export
compareGSEA <- function(ranked_clusters, ...) {
  gsea_list <- lapply(ranked_clusters, function(i) {
    x <- suppressMessages(clusterProfiler::gseGO(i, ...))
    if (class(x) == "gseaResult") {
      as.data.frame(x)
    }
  })

  clusters_levels <- names(ranked_clusters)

  df <- dplyr::bind_rows(gsea_list, .id = "Cluster")
  df[["Cluster"]] <- factor(df$Cluster, levels = clusters_levels)

  new("compareClusterResult",
    compareClusterResult = df,
    geneClusters = ranked_clusters,
    fun = "enrichGO", # Make simplify work.
    .call = match.call(expand.dots = TRUE)
  )
}

#' ep_str_wrap internal string wrapping function
#' @param string the string to be wrapped
#' @param width the maximum number of characters before wrapping to a new line
#' @author Guangchuang Yu
#' @noRd
ep_str_wrap <- function(string, width) {
  x <- gregexpr(" ", string)
  vapply(seq_along(x),
    FUN = function(i) {
      y <- x[[i]]
      n <- nchar(string[i])
      len <- (c(y, n) - c(0, y))
      idx <- len > width
      j <- which(!idx)
      if (length(j) && max(j) == length(len)) {
        j <- j[-length(j)]
      }
      if (length(j)) {
        idx[j] <- len[j] + len[j + 1] > width
      }
      idx <- idx[-length(idx)]
      start <- c(1, y[idx] + 1)
      end <- c(y[idx] - 1, n)
      words <- substring(string[i], start, end)
      paste0(words, collapse = "\n")
    },
    FUN.VALUE = character(1)
  )
}

#' Compare dotplot for GSEA
#'
#' @param object input object
#' @param show_category number of enriched terms to display
#' @param font_size font size
#' @param label_format a numeric value sets wrap length, alternatively
#'  a custom function to format axis labels. by default wraps names
#'  longer that 30 characters
#' @param title plot title
#' @export
gseaCompareDotplot <- function(
  object,
  show_category = 5,
  font_size = 12,
  label_format = 30,
  title = ""
) {
  df <- as.data.frame(object)

  if (is.null(show_category)) {
    result <- df
  } else if (is.numeric(show_category)) {
    top_n <- function(res, show_category) {
      res %>%
        dplyr::group_split(Cluster) %>%
        purrr::map_dfr(function(df, N) {
          if (length(df$setSize) > N) {
            idx <- order(df$pvalue, decreasing = FALSE)[1:N]
            return(df[idx, ])
          } else {
            return(df)
          }
        }, N = show_category)
    }

    result <- top_n(df, show_category)
  } else {
    result <- subset(df, Description %in% show_category)
  }

  ## remove zero count
  result$Description <- as.character(result$Description) ## un-factor
  GOlevel <- result[, c("ID", "Description")] ## GO ID and Term
  GOlevel <- unique(GOlevel)

  result <- result[result$setSize != 0, ]
  result$Description <- factor(
    result$Description, levels = rev(GOlevel$Description))

  core_count <- function(core_enrichment) {
    core_enrichment %>%
      stringr::str_split("/") %>%
      sapply(length)
  }

  result <- dplyr::mutate(result, Count = core_count(core_enrichment))

  label_func <- function(str) {
    ep_str_wrap(str, label_format)
  }

  if (is.function(label_format)) {
      label_func <- label_format
  }

  p <- ggplot2::ggplot(result, ggplot2::aes_string(
      x = "Cluster", y = "Description", size = "Count")) +
    ggplot2::geom_point(ggplot2::aes_string(color = "p.adjust")) +
    ggplot2::scale_color_continuous(
      low = "red", high = "blue",
      guide = ggplot2::guide_colorbar(reverse = TRUE)) +
    ggplot2::ylab(NULL) +
    ggplot2::ggtitle(title) +
    DOSE::theme_dose(font_size) +
    ggplot2::scale_size_continuous(range = c(3, 8)) +
    ggplot2::scale_y_discrete(labels = label_func)

  p
}
altairwei/brookite documentation built on Nov. 1, 2022, 7:03 p.m.