R/DAVID_functions.R

Defines functions run_DAVID

Documented in run_DAVID

#' @title DAVID Gene Ontology Wrapper
#'
#' @description DAVID Gene Ontology Wrapper
#' @param davidWS A DAVIDWebService class object.
#' @param input_data data.frame with LFC values in column 1, and FDR values in column 2
#' @param min_lfc minimum value of the absolute value of the LFC required to be significant
#' @param list_name A string that will be used to identify the submitted list.
#' @param FA_cats A vector of strings indicating which annotation categories to search for. Use getAllAnnotationCategoryNames() to see a list of all possible annotation categories.
#' @param split_at a character at which annotation term strings will be split. If NULL, strings will not be split.
#' @import limma
#' @import stringi
#' @import RDAVIDWebService
#' @import org.Hs.eg.db
#' @import AnnotationDbi
#' @import GOplot
#' @import dplyr
#' @export
#' @examples
#' run_DAVID(davidWS, deg_list, list_name, FA_cats, split_at)

run_DAVID <- function(davidWS, input_data, min_lfc, list_name, FA_cats, split_at) {
  # Convert gene symbols into Ensembl IDs -------------------------------------
    degs <- subset(
      input_data,
      abs(input_data[, 1]) >= min_lfc & input_data[, 2] <= 0.05 &
        !is.na(input_data[, 1]) & !is.na(input_data[, 2])
    )

    if (nrow(degs) < 1) {
      print("No DEGs detected")
      return(NULL)
    }

    gs_data <- data.frame(rownames(degs), degs)
    colnames(gs_data) <- c("Gene_Symbol", "logFC", "padj")

    deg_list <- AnnotationDbi::mapIds(
      org.Hs.eg.db, rownames(degs), keytype = "SYMBOL", column = "ENSEMBL"
    )

    ensembl_IDs <- data.frame(deg_list)
    ensembl_IDs <- data.frame(rownames(ensembl_IDs), ensembl_IDs)
    colnames(ensembl_IDs) <- c("Gene_Symbol", "Ensembl_ID")
    ensembl_IDs <- dplyr::left_join(ensembl_IDs, gs_data)

  # Add gene list if needed then select correct gene list ---------------------
    gene_lists <- getGeneListNames(davidWS)
    list_position <- 0

    if (length(gene_lists) > 0) {
      for (x in 1:length(gene_lists)) {
        if (gene_lists[x] == list_name) list_position <- x
      }
    }

    if (list_position == 0) {
      addList(
        davidWS, deg_list, idType = "ENSEMBL_GENE_ID",
        listName = list_name, listType = "Gene"
      )
      gene_lists <- getGeneListNames(davidWS)
    } else setCurrentGeneListPosition(davidWS, list_position)

    print("Running DAVID analysis on gene list:")
    print(gene_lists[getCurrentGeneListPosition(davidWS)])

  # Run GO analysis -----------------------------------------------------------
    setAnnotationCategories(davidWS, FA_cats)
    FA_chart <- getFunctionalAnnotationChart(davidWS)
    if (nrow(FA_chart) < 1) return(NULL)

  # Convert Ensembl IDs back into gene symbols --------------------------------
    gene_list <- data.frame(FA_chart$Genes)

    for (r in 1:nrow(gene_list)) {
      gene_names <- limma::strsplit2(gene_list[r, 1], ", ")[1, ]

      for (n in 1:length(gene_names)) {
        ID_sub <- subset(ensembl_IDs, ensembl_IDs$Ensembl_ID == gene_names[n])
        gene_names[n] <- ID_sub$Gene_Symbol[1]
      }

      gene_names <- stri_join_list(list(gene_names), sep = ", ")
      gene_list[r, 1] <- gene_names
    }

  # Split "Term" column into 2 if applicable & put relevant data in a table ---
    if (is.null(split_at)) {
      FA_sub <- data.frame(
        FA_chart$Category, FA_chart$Term, FA_chart$Count, FA_chart$X.,
        gene_list, FA_chart$Fold.Enrichment, FA_chart$FDR/100
      )
      colnames(FA_sub) <- c(
        "Annotation_Category", "Term", "Count",
        "Percent_of_input_list", "Genes", "Fold.Enrichment", "FDR"
      )
      return(FA_sub)
    } else {
      split_IDs <- limma::strsplit2(FA_chart$Term, split_at)

      if (ncol(split_IDs) > 2) {
        for (x in nrow(split_IDs)) {
          new_cell <- split_IDs[x, 2]

          for (y in 3:ncol(split_IDs)) {
            if (split_IDs[x, y] != "") {
              new_cell <- stri_join(c(new_cell, split_IDs[x, y]), sep = ":")
            }
          }

          split_IDs[x, 2] <- new_cell
        }
        split_IDs <- split_IDs[, 1:2]
      }

      FA_sub <- data.frame(
        FA_chart$Category, split_IDs, FA_chart$Count, FA_chart$X.,
        gene_list, FA_chart$Fold.Enrichment, FA_chart$FDR/100
      )
      colnames(FA_sub) <- c(
        "Annotation_Category", "ID", "Term", "Count",
        "Percent_of_input_list", "Genes", "Fold.Enrichment", "FDR"
      )

      terms_data <- data.frame(FA_sub[, c(1:3, 8, 6)])
      colnames(terms_data) <- c(
        "category", "ID", "term", "adj_pval", "genes"
      )
    }

  # Calculate Z-scores for each row -------------------------------------------
    genes_data <- data.frame(ensembl_IDs$Gene_Symbol, ensembl_IDs$logFC)
    colnames(genes_data) <- c("ID", "logFC")
    genes_data <- subset(genes_data, !is.na(genes_data$logFC))

    cd <- GOplot::circle_dat(terms_data, genes_data)

    z_scores <- cd[, c(2, 8)]
    z_scores <- z_scores[!duplicated(z_scores), ]
    z_scores <- dplyr::left_join(FA_sub, z_scores, by = "ID")

  # Return a list with all of the results -------------------------------------
    list(
      DAVID_FA = z_scores,
      GOplot_results = cd,
      ensembl_conversion = ensembl_IDs
    )
}
Emask7/ZIKVtranscriptomics documentation built on Jan. 25, 2022, 12:32 a.m.