R/DE_analysis.R

Defines functions filter normalization DE comparison

Documented in comparison DE filter normalization

# parameters extraction ---------------------------------------------------
# comptage matrix



# filtering the count table -----------------------------------------------

#' Filter the counting table
#'
#' Remove all the lines with at least the count per million is superior than 1 on at least 3 columns
#'
#' @param counts The counting table
#'
#' @return the matrix that had been filtered
#' @export
#'
#' @importFrom edgeR cpm
filter <- function(counts) {
  if (!is.matrix(counts)) {
    counts <- as.matrix(counts, rownames = names(counts)[1])
  }

  counts <- na.omit(counts)
  keep.exprs <- rowSums(cpm(counts) > 1) >= 3 # we select the where the cpm is superior than 1 on at least 3 columns
  counts <- counts[keep.exprs, ] ## keep the rows selected

  return(counts)
}




# normalization -----------------------------------------------------------

#' Normalize the dataset
#'
#' @param counts matrix of the counting table
#' @param groups the groups (flowcells) for each of the column of the data
#'
#' @return a list with the normalized dataset in DGElist and the plot associated
#' @export
#'
#' @importFrom edgeR DGEList calcNormFactors
#' @importFrom limma plotMDS
#' @importFrom ggplot2 ggplot ggtitle geom_point geom_text xlab ylab scale_color_discrete aes
#' @importFrom ggrepel geom_text_repel
normalization <- function(counts, groups) {

  # without normalization
  y <- DGEList(counts, group = groups)
  coord_raw <- plotMDS(y, main = "MDSplot on the raw dataset", plot = F)

  gg_list <- list()

  gg_list$raw <- with(coord_raw, {
    ggplot(NULL, aes(x = x, y = y, label = names(x), color = groups)) +
      ggtitle("MDSplot on the raw dataset")
  })


  # with normalization
  y <- calcNormFactors(y, method = "TMM")
  coord <- plotMDS(y, top = min(500, nrow(y$samples$norm.factors)), method = "logFC", gene.selection = "pairwise", plot = FALSE)

  gg_list$normalize <- with(coord, {
    ggplot(NULL, aes(x = x, y = y, label = names(x), color = groups)) +
      ggtitle("MDSplot on the normalized dataset")
  })

  gg_list <- lapply(gg_list, function(x) {
    x +
      geom_point() +
      geom_text_repel(show.legend = F, inherit.aes = T) +
      xlab("Leading logFC dimension 1") +
      ylab("Leading logFC dimension 2") +
      scale_color_discrete(name = "groups") + theme_gray()
  })



  return(list(normalized_dataset = y, plot = gg_list))
}



# Differential expression analysis ----------------------------------------
#' Differential expression
#'
#' @param y DGElist normalized
#' @param conditions conditions of the analysis
#'
#' @return the glm fit tagwise dispersion
#' @export
#'
#' @importFrom stats model.matrix
#' @importFrom edgeR estimateGLMCommonDisp estimateGLMTrendedDisp estimateGLMTagwiseDisp glmFit
DE <- function(y, conditions) {
  conditions <- factor(conditions, unique(conditions), ordered = T)

  design <- model.matrix(~ 0 + conditions)

  y_tmp <- estimateGLMCommonDisp(y, design) ## Common dispersion estimation: the same dispersion value is used to model the variance of a gene
  y_tmp <- estimateGLMTrendedDisp(y_tmp, design) ## Trended dispersion estimation: esitmation with different dispersion values for each gene, to give an idea
  y_tmp <- estimateGLMTagwiseDisp(y_tmp, design) ## Tagwise to take into account a specific dispersion for each gene
  fit <- glmFit(y_tmp, design) ## Tagwise dispersion Glm

  return(fit)
}



# comparisons -------------------------------------------------------------
#' Do the comparison
#'
#' Make all the comparison that is inside the contrast arguments
#'
#' @param fit the glmfit form edgeR
#' @param contrast the contrast data.table
#'
#' @return list with 3 elements:
#'  - data, a data.table with:
#'     * rn: the name of the genes
#'     * logFC: log fold change
#'     * pval_adj: pvalue adjusted
#'     * comp_name: the name of the comparison
#'  - plot, a list with the names of the comparison, inside each of them there is another list with:
#'     * the Smear plot (mean-difference plot)
#'     * volcano plot
#' @export
#'
#' @importFrom data.table setDT := as.data.table rbindlist
#' @importFrom edgeR glmLRT
#' @importFrom purrr map transpose
#' @importFrom magrittr %>%
comparison <- function(fit, contrast) {
  setDT(contrast)

  table <- contrast %>%
    transpose() %>%
    map(function(x) {
      x <- unlist(x)
      comp <- glmLRT(fit, contrast = as.numeric(x[-1]))

      table <- as.data.table(comp$table, keep.rownames = T)
      table[, ":="(pval_adj = p.adjust(PValue, "BH"))]

      table[, .(rn, logFC, pval_adj, logCPM, comp_name = x[1])]
    }) %>%
    rbindlist()

  return(table[, .(rn, logFC, pval_adj, AveLogCPM = logCPM, comp_name)])
}




# plot --------------------------------------------------------------------
#' @importFrom ggplot2 ggplot labs scale_color_manual scale_shape_manual geom_smooth geom_point ylab
#' @importFrom purrr map flatten %>%
#' @importFrom data.table setDT
plot_volcano_smear <- function(data, pvalue, comparisons = NULL) {
  setDT(data)
  if (is.null(comparisons)) comparisons <- data[, unique(comp_name)]
  if (is.na(pvalue)) {
    return()
  }


  comparisons %>%
    map(function(title) {
      subdata <- data[comp_name == title]

      is_inferior <- subdata[, as.character(pval_adj < pvalue)]

      name_legend <- paste("adj PValue <=", pvalue)
      gg_begin <- ggplot(subdata, aes(col = is_inferior, shape = is_inferior)) +
        labs(shape = name_legend, colour = name_legend, title = title) +
        scale_color_manual(values = color_true_false) +
        scale_shape_manual(values = shape_true_false) + theme_gray()

      out <- list(
        Smear = gg_begin + geom_point(aes(x = AveLogCPM, y = logFC)) + geom_smooth(aes(x = AveLogCPM, y = logFC)),
        volcano = gg_begin + geom_point(aes(x = logFC, y = -log10(pval_adj))) + ylab("-log10(adj PValue)")
      )

      names(out) <- paste0(title, "_", c("Smear", "volcano"))
      return(out)
    }) %>%
    flatten()
}
ArthurPERE/RNASeqDE documentation built on Sept. 17, 2019, 7:34 p.m.