glacier: Gene List Annotation, Calculation and Illustration of Enrichment in R

Documented in calculate calculate_post calculate_pre explore_annotation process_annotations process_database process_input_seurat process_input_text

# constants
split_1 <- stringr::str_c(sep = "|", " in comparison between ", " upon ",
                          " in comparison of ", " in ", " during ", " after ")
split_2 <- stringr::str_c(sep = "|", " versus ", " vs ", " before ", " after ",
                          " compared to ")
form_lhs <- stringr::str_glue("(?:{split_1})(.*)(?:{split_2})")
form_rhs <- stringr::str_glue("(?:{split_2})(.*?)\\.?$")

#' Process annotations
#'
#' Choose annotations to include in annotation list, from gene set names, gene
#' set symbols, gene set descriptions, annotations automatically extracted from
#' gene set descriptions or manual annotations.
#'
#' @param anno output of \code{\link{import_annotations}}
#' @param info tibble: "name" gene set name "info" gene set descriptions
#' @param options \code{"name"} for gene set names, \code{"syms"} for gene set
#'   symbols, \code{"info"} for gene set descriptions, \code{"auto"} for
#'   automatically generated annotations and/or \code{"file"} for manual
#'   annotations
#'
#' @return
#' \code{gs_annos} tibble: gene sets and annotations
#'
#' \code{annos} vector: annotations
#' @keywords internal
#'
#' @importFrom magrittr %>%
#' @examples \dontrun{
#' anno_path <- system.file("extdata", "ex_anno.csv", package = "glacier")
#' anno <- import_annotations(anno_path, ",", TRUE, c(2, 4), 5)
#' info <- anno[c("name", "info")]
#'
#' anno_proc <- process_annotations(anno, info, "file")
#' }
process_annotations <- function(anno, info, options) {
  anno_proc <- anno %>% dplyr::select("name")
  info <- anno_proc %>%
    dplyr::left_join(info, by = "name") %>%
    dplyr::pull("info")

  # extract annotations
  if ("name" %in% options) anno_proc$anno_name <- anno_proc$name
  if ("syms" %in% options) anno_proc$anno_syms <-
    stringr::str_match(anno_proc$name, "_(.*)")[, 2]
  if ("info" %in% options) anno_proc$anno_info <- info
  if ("auto" %in% options) anno_proc$anno_auto <-
    ifelse(
      stringr::str_detect(info, "up-regulated"),
      stringr::str_match(info, form_lhs)[, 2],
      stringr::str_match(info, form_rhs)[, 2]
    )
  if ("file" %in% options) anno_proc <- anno_proc %>%
    tibble::add_column(dplyr::select(anno, dplyr::starts_with("anno_")))

  # generate annotation list
  annos <- anno_proc %>%
    dplyr::select(dplyr::starts_with("anno_")) %>%
    unlist(use.names = F) %>%
    unique()

  list(gs_annos = anno_proc, annos = annos[!is.na(annos) & annos != ""])
}

#' Process database
#'
#' Choose database category and organism for use in analysis.
#'
#' @param data output of \code{\link{import_database}} or
#'   \code{\link{import_msigdb}}
#' @param categories optional: categories to include; default all
#' @param organisms optional: organisms to include; default all
#'
#' @return
#' \code{gs_genes} list: names: gene set names vector: genes
#'
#' \code{gs_info} tibble: gene set information
#'
#' \code{genes} vector: list of genes
#' @keywords internal
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @examples \dontrun{
#' data_path <- system.file("extdata", "ex_data.csv", package = "glacier")
#' data <- import_database(data_path, ",", FALSE, c(2, 4), 0)
#'
#' data_proc <- process_database(data, "Not assigned", "Not assigned")
#' }
process_database <- function(data, categories = FALSE, organisms = FALSE) {
  # filter categories and organisms
  gs_info <- data$gs_info
  if (is.null(categories) || categories != FALSE)
    gs_info <- gs_info %>% dplyr::filter(.data$category %in% categories)
  if (is.null(organisms) || organisms != FALSE)
    gs_info <- gs_info %>% dplyr::filter(.data$organism %in% organisms)

  # extract gene sets and genes
  gs_genes <- data$gs_genes[gs_info$name]
  genes <- gs_genes %>% unlist(use.names = F) %>% unique()

  list(gs_genes = gs_genes, gs_info = gs_info, genes = genes)
}

#' Process text input
#'
#' Removes duplicate genes. If multiple values for the same gene are found, only
#' the first value will be kept.
#'
#' @param text character: input with genes and optionally values
#'
#' @return tibble: "gene" gene names "value" gene values
#' @export
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @examples
#' input <- process_input_text("FCN1 FTL CLU")
#' input <- process_input_text("FCN1 0.1 FTL 0.8 CLU 0.05")
process_input_text <- function(text) {
  tokens <- text %>%
    stringr::str_split("[ \t\r\n,;]") %>%
    unlist() %>%
    purrr::discard(~. == "")
  values <- suppressWarnings(as.numeric(tokens))

  # process
  tokens[!is.na(values)] <- NA
  values <- values[-1] %>% c(NA)

  # store results
  genes <- tibble::tibble(gene = tokens, value = values) %>%
    tidyr::drop_na(.data$gene) %>%
    dplyr::distinct(.data$gene, .keep_all = T)
}

#' Extract differentially expressed genes from Seurat object
#'
#' Finds differentially expressed genes, records adjusted P-value and filters
#' for values less than \code{max_p}.
#'
#' @param seurat Seurat object
#' @param clst_1 first cluster
#' @param clst_2 optional: second cluster; default all others
#' @param max_p P-value cutoff, only genes with P-values less than this will be
#'   returned
#'
#' @return tibble: "gene" gene names "value" gene values
#' @export
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @examples
#' seu_path <- system.file("extdata", "ex_seurat.rds", package = "glacier")
#' seurat <- readRDS(seu_path)
#'
#' input <- process_input_seurat(seurat, 0)
process_input_seurat <- function(seurat, clst_1, clst_2 = NULL, max_p = 0.05) {
  if (!requireNamespace("Seurat", quietly = T))
    stop("Package 'Seurat' is required for this feature")
  if (!is.null(clst_2) && clst_1 == clst_2)
    return(tibble::tibble(gene = character(), value = numeric()))

  seurat %>%
    Seurat::FindMarkers(ident.1 = clst_1, ident.2 = clst_2) %>%
    tibble::rownames_to_column("gene") %>%
    tibble::tibble() %>%
    dplyr::select("gene", value = "p_val_adj") %>%
    dplyr::filter(.data$value <= max_p)
}

#' Find an annotation's associated gene sets and genes
#'
#' @param annotation annotation to explore
#' @param gs_annos value from \code{\link{process_annotations}}
#' @param gs_genes value from \code{\link{process_database}}
#' @param genes optional: genes to match, or (default) all
#'
#' @return
#' \code{"names"} vector: gene set names
#'
#' \code{"genes"} vector: genes
#' @keywords internal
#'
#' @importFrom magrittr %>%
#' @examples \dontrun{
#' anno_path <- system.file("extdata", "ex_anno.csv", package = "glacier")
#' data_path <- system.file("extdata", "ex_data.csv", package = "glacier")
#' anno <- import_annotations(anno_path, ",", TRUE, c(2, 4), 5)
#' data <- import_database(data_path, ",", FALSE, c(2, 4), 0)
#' info <- anno[c("name", "info")]
#'
#' anno_proc <- process_annotations(anno, info, "file")
#' data_proc <- process_database(data, "Not assigned", "Not assigned")
#' anno_assoc <- explore_annotation("Carcinogen", anno_proc$gs_annos,
#'                                  data_proc$gs_genes)
#' }
explore_annotation <- function(annotation, gs_annos, gs_genes, genes = NULL) {
  index <- (gs_annos == annotation) %>% rowSums(na.rm = T) %>% as.logical()
  match <- gs_genes[gs_annos$name[index]]
  if (!is.null(genes))
    match <- match %>% purrr::map(~intersect(., genes)) %>% purrr::compact()

  names <- names(match)
  genes <- match %>% unlist(use.names = F) %>% unique()
  list(names = names, genes = if (is.null(genes)) character() else genes)
}

#' Begin calculating overlap statistics
#'
#' @param input output of \code{\link{process_input_text}} or
#'   \code{\link{process_input_seurat}}
#' @param annos value from \code{\link{process_annotations}}
#' @param gs_annos value from \code{\link{process_annotations}}
#' @param gs_genes value from \code{\link{process_database}}
#'
#' @return \code{stats_pre} tibble: overlap statistics (incomplete)
#'
#' \code{matches} list: names: annotations vector: matched genes
#' @keywords internal
#'
#' @importFrom magrittr %>%
#' @examples \dontrun{
#' anno_path <- system.file("extdata", "ex_anno.csv", package = "glacier")
#' data_path <- system.file("extdata", "ex_data.csv", package = "glacier")
#' anno <- import_annotations(anno_path, ",", TRUE, c(2, 4), 5)
#' data <- import_database(data_path, ",", FALSE, c(2, 4), 0)
#' info <- anno[c("name", "info")]
#'
#' anno_proc <- process_annotations(anno, info, "file")
#' data_proc <- process_database(data, "Not assigned", "Not assigned")
#' input <- process_input_text("FCN1 0.1 FTL 0.8 CLU 0.05")
#' calc_pre <- calculate_pre(input, anno_proc$annos, anno_proc$gs_annos,
#'                           data_proc$gs_genes)
#' }
calculate_pre <- function(input, annos, gs_annos, gs_genes) {
  stat <- tibble::tibble(name = annos, n_sets = 0L, n_gene = 0L, n_hits = 0L,
                         g_hits = "")
  hits <- list()

  # iterate over annotations
  for (i in seq_along(annos)) {
    # get related genes and find overlap
    overlap <- explore_annotation(annos[i], gs_annos, gs_genes)
    matches <- intersect(overlap$genes, input$gene)

    # store information
    stat$n_sets[i] <- length(overlap$names)
    stat$n_gene[i] <- length(overlap$genes)
    stat$n_hits[i] <- length(matches)
    stat$g_hits[i] <- matches %>% stringr::str_c(collapse = ", ")
    hits[[annos[i]]] <- matches
  }

  list(stats_pre = stat, matches = hits)
}

#' Finish calculating overlap statistics
#'
#' @param stats_pre value from \code{\link{calculate_pre}}
#' @param input_size number of genes in input
#' @param universe number of genes in universe
#'
#' @return tibble: overlap statistics
#' @keywords internal
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @examples \dontrun{
#' anno_path <- system.file("extdata", "ex_anno.csv", package = "glacier")
#' data_path <- system.file("extdata", "ex_data.csv", package = "glacier")
#' anno <- import_annotations(anno_path, ",", TRUE, c(2, 4), 5)
#' data <- import_database(data_path, ",", FALSE, c(2, 4), 0)
#' info <- anno[c("name", "info")]
#'
#' anno_proc <- process_annotations(anno, info, "file")
#' data_proc <- process_database(data, "Not assigned", "Not assigned")
#' input <- process_input_text("FCN1 0.1 FTL 0.8 CLU 0.05")
#' calc_pre <- calculate_pre(input, anno_proc$annos, anno_proc$gs_annos,
#'                           data_proc$gs_genes)
#' calc <- calculate_post(calc_pre$stats_pre, nrow(input), 10000)
#' }
calculate_post <- function(stats_pre, input_size, universe) {
  stat <- stats_pre %>% tibble::add_column(pvalue = 0, odds_r = 0)

  # calculate statistics
  for (i in seq_len(nrow(stat))) {
    # fisher's exact test: [1, ] belongs to annotation [, 1] entered in list
    data <- matrix(nrow = 2, ncol = 2)
    data[1, 1] <- stat$n_hits[i]
    data[1, 2] <- stat$n_gene[i] - data[1, 1]
    data[2, 1] <- input_size - data[1, 1]
    data[2, 2] <- universe - data[1, 1] - data[1, 2] - data[2, 1]

    # assign statistics
    test <- data %>% stats::fisher.test(alternative = "greater")
    stat$pvalue[i] <- test$p.value
    stat$odds_r[i] <- test$estimate[[1]]
  }

  # post-processing
  stat$enrich <- (stat$n_hits / input_size) / (stat$n_gene / universe)
  stat$adj_pv <- stat$pvalue %>% stats::p.adjust(method = "fdr")
  stat$adj_fe <- stat$enrich / -log(stat$adj_pv)

  stat %>% dplyr::select(
    Annotation = .data$name,
    `# gene sets` = .data$n_sets,
    `# genes` = .data$n_gene,
    `# matches` = .data$n_hits,
    `P-value` = .data$pvalue,
    `Adjusted P-value` = .data$adj_pv,
    `Odds Ratio` = .data$odds_r,
    `Fold Enrichment` = .data$enrich,
    `Adjusted Fold Enrichment` = .data$adj_fe,
    Matches = .data$g_hits
  )
}

#' Calculate overlap statistics
#'
#' @param input output of \code{\link{process_input_text}} or
#'   \code{\link{process_input_seurat}}
#' @param annos value from \code{\link{process_annotations}}
#' @param gs_annos value from \code{\link{process_annotations}}
#' @param gs_genes value from \code{\link{process_database}}
#' @param universe optional: number of genes in universe; default calculate from
#'   \code{gs_genes}
#'
#' @return \code{stats} tibble: overlap statistics
#'
#' \code{matches} list: names: annotations vector: matched genes
#' @keywords internal
#'
#' @importFrom magrittr %>%
#' @examples \dontrun{
#' anno_path <- system.file("extdata", "ex_anno.csv", package = "glacier")
#' data_path <- system.file("extdata", "ex_data.csv", package = "glacier")
#' anno <- import_annotations(anno_path, ",", TRUE, c(2, 4), 5)
#' data <- import_database(data_path, ",", FALSE, c(2, 4), 0)
#' info <- anno[c("name", "info")]
#'
#' anno_proc <- process_annotations(anno, info, "file")
#' data_proc <- process_database(data, "Not assigned", "Not assigned")
#' input <- process_input_text("FCN1 0.1 FTL 0.8 CLU 0.05")
#' calc <- calculate(input, anno_proc$annos, anno_proc$gs_annos,
#'                   data_proc$gs_genes)
#' }
calculate <- function(input, annos, gs_annos, gs_genes, universe) {
  if (is.null(universe)) universe <- gs_genes %>%
      unlist(use.names = F) %>%
      c(input$gene) %>%
      unique() %>%
      length()

  calc_pre <- calculate_pre(input, annos, gs_annos, gs_genes)
  calc <- calculate_post(calc_pre$stats_pre, nrow(input), universe)

  list(stats = calc, matches = calc_pre$matches)
}

lilin-yang/glacier documentation built on Oct. 3, 2020, 8:06 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

lilin-yang/glacier
Gene List Annotation, Calculation and Illustration of Enrichment in R

R/process.R
In lilin-yang/glacier: Gene List Annotation, Calculation and Illustration of Enrichment in R

Defines functions calculate calculate_post calculate_pre explore_annotation process_input_seurat process_input_text process_database process_annotations

Documented in calculate calculate_post calculate_pre explore_annotation process_annotations process_database process_input_seurat process_input_text

R Package Documentation

Browse R Packages

We want your feedback!

lilin-yang/glacier Gene List Annotation, Calculation and Illustration of Enrichment in R

R/process.R In lilin-yang/glacier: Gene List Annotation, Calculation and Illustration of Enrichment in R

Defines functions calculate calculate_post calculate_pre explore_annotation process_input_seurat process_input_text process_database process_annotations

Documented in calculate calculate_post calculate_pre explore_annotation process_annotations process_database process_input_seurat process_input_text

R Package Documentation

Browse R Packages

We want your feedback!

lilin-yang/glacier
Gene List Annotation, Calculation and Illustration of Enrichment in R

R/process.R
In lilin-yang/glacier: Gene List Annotation, Calculation and Illustration of Enrichment in R