R/multi_diff.R

Defines functions upsets_from_contrasts limma_wrap build_cont_mat build_design_mat multi_diff

Documented in multi_diff

#' Differential gene expression of multi-normalised NanoString mRNA data
#'
#' @description Provides a wrapper around limma to performs differential
#' gene expression analysis of all possible pairwise comparisons on all
#' normalised assays within a count_set object.
#'
#' @param count_set a normalised, count_set summarized experiment.
#'  Default = NULL
#' @param adj_method method for multiple test corrections. Options are:
#'  "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr" or "none".
#'  See ?p.adjust.methods for a details description and references.
#'  Default = "fdr".
#' @param p_cut_off p value threshold for DGE.
#' @param logFC_cut_off log2 fold change threshold for DGE
#' @param pairing if data is paired "paired", if unpaired "unpaired" (Default)
#'
#' @examples
#' # biological groups
#' rnf5_group <- c(rep("WT", 5), rep("KO", 5))
#'
#' # sample ids
#' rnf5_sampleid <- c("GSM3638131", "GSM3638132", "GSM3638133", "GSM3638134",
#'                   "GSM3638135", "GSM3638136", "GSM3638137", "GSM3638138",
#'                   "GSM3638139", "GSM3638140")
#'
#' # build count_set
#' rnf5_count_set <- count_set(count_data = Rnf5,
#'                             group = rnf5_group,
#'                             samp_id = rnf5_sampleid)
#' # normalize
#' rnf5_count_set_norm <- multi_norm(count_set = rnf5_count_set,
#'                                   positive_control_scaling = TRUE,
#'                                   background_correct = "mean2sd",
#'                                   #plot_dir = "~/Dropbox/git/NanoStringClustR/plot_test/"
#'                                   )
#' # rank normalizations
#' rnf5_eval <- norm_rank(rnf5_count_set_norm)
#'
#' # differential gene expression
#' rnf5_multi_diff <- multi_diff(count_set = rnf5_count_set_norm,
#'                               adj_method = "fdr",
#'                               p_cut_off = 0.05,
#'                               logFC_cut_off = 0)
#'
#' @return Returns a list containing number of DGE for each normalization
#'  method, upset plots and full DEG results and fits.
#'
#' @export multi_diff
#'
#' @importFrom limma lmFit contrasts.fit eBayes topTable makeContrasts
#' @importFrom SummarizedExperiment assays
#' @importFrom ggplot2 ggplot aes geom_bar geom_point theme_classic scale_fill_brewer element_text xlab ylab theme vars facet_wrap
#' @importFrom Biobase subListExtract
#' @importFrom UpSetR upset fromList
#'
#' @references
#'
#'   Ritchie, M.E., Phipson, B., Wu, D., Hu, Y., Law, C.W., Shi, W., and Smyth, G.K. (2015). limma powers differential
#'   expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Research 43(7), e47.
#'
#'   Nils Gehlenborg (2019). UpSetR: A More Scalable Alternative to Venn and Euler Diagrams for Visualizing Intersecting Sets.
#'   R package version 1.4.0. https://CRAN.R-project.org/package=UpSetR
#'
#'


multi_diff <-   function(count_set = NULL,
                        adj_method = "fdr",
                        p_cut_off = 0.1,
                        logFC_cut_off = 0,
                        pairing = "unpaired") {

  ####### Input checks #######---------------------------------------------------

  # check count_set is provided
  if(is.null(count_set)) stop("A count_set list generated by count_set must be provided.")



  if(!is.null(count_set)) {
    # check the file format
    if(count_set@class != "SummarizedExperiment"){
      stop(paste("summarized file provided is not a SummarizedExperiment,
                 Please produce a SummarizedExperiment using count_set.", "\n",
                 sep=""))
    }
  }

  # check that the count_set has been normalised
  if(length(names(assays(count_set))) == 1) stop("The count_set input must be normalised
                                                 with multi_norm")

  # check the p-value adjustment method is correct
  if((adj_method %in% c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
                           "fdr", "none")) == FALSE){
    stop("adjust_method must be one of the following methods: \"holm\",
         \"hochberg\", \"hommel\", \"bonferroni\", \"BH\", \"BY\", \"fdr\" or
         \"none\"\n")
  }

  ####### Input checks finished #######---------------------------------------

  ### 1. Design matrix ###

  design <- build_design_mat(count_set = count_set, pairing = pairing)

  design_table <-design$table
  design <- design$design

  ### 2. Contrast matrix ###

  cont_mat <- build_cont_mat(design = design, design_table = design_table)

  ####### DGE ######----------------------------------------------------------
  assays_all <- unique(names(assays(count_set)))
  #drop <- c("counts", "background_corrected", "positive_control_scaled")
  #assays_all <- assays_all[!(assays_all %in% drop)]

  ### 3. Limma ###

  full_results <- sapply(seq_along(assays_all), function(i){
    limma_wrap(count_set = count_set,
               norm_method = assays_all[i],
               design = design,
               cont_mat = cont_mat,
               adj_method = adj_method)
  })

  colnames(full_results) <- assays_all

  full_fits <- full_results[2, ]
  full_results <- full_results[1, ]


###### no. of DEG ######----------------------------------------------------------

  nDEG <- lapply(seq_along(full_results), function(x)
    sapply(seq_along(full_results[[x]]), function(i)
      length(which((full_results[[x]][[i]]$adj.P.Val < p_cut_off) &
                   (abs(full_results[[x]][[i]]$logFC) > logFC_cut_off)))))

  names(nDEG) <- names(full_results)
  nDEG_table <- as.data.frame(nDEG)
  rownames(nDEG_table) <- names(full_results$housekeeping_scaled)

###### plot no. of DEGs ######---------------------------------------------------------

  for_plot <- nDEG_table
  for_plot$contrast <- rownames(for_plot)
  for_plot <- reshape::melt(for_plot, id = "contrast")
  colnames(for_plot) <- c("contrast", "norm_method", "No_DEG")

  #plot
  DEG_plot <- ggplot2::ggplot(data = for_plot, aes(x = norm_method, y = No_DEG))+
    geom_bar(stat="identity", aes(fill = norm_method), colour = "black")+
    ggplot2::facet_wrap(ggplot2::vars(contrast))+
    ggplot2::scale_fill_brewer()+
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          legend.position = "none")+
    xlab("Contrast")+
    ylab(ifelse(adj_method == "none", paste("No. DEG (p <", p_cut_off, ")", sep=""),
                paste("No. DEG (", adj_method, " adj.p <", p_cut_off, ")", sep="")))

###### upset diagrams ######----------------------------------------------------

  contrasts <- names(full_results[1:length(full_results)][[1]])

  #remove contrasts that have no DEG or UpSet won't work
  contrasts <- contrasts[ ! contrasts %in% (rownames(nDEG_table[rowSums(nDEG_table) == 0,])) ]

  length_contrasts <- vector("list", length(contrasts))

  #if there are no DEGs
  if(length(length_contrasts) == 0){
    warning(paste("No overlap in DEG at p = ", p_cut_off," & logFC > ", logFC_cut_off,
                  " with adj_method = ", adj_method, ". UpSets not made.", sep=""))
  } else {

  upsets <- lapply(seq_along(contrasts), function(i){

    upsets_from_contrasts(cont = contrasts[i],
                          full_results = full_results,
                          p_cut_off = p_cut_off,
                          logFC_cut_off = logFC_cut_off)

    })

  }


  return(list("plot_DEG" = DEG_plot,
              "overlap_DEG" = upsets,
              "summary_DEG" = nDEG_table,
              "results_DEG" = full_results,
              "fits" = full_fits))

}

###### function to build design matrix #######--------------------------------

build_design_mat <- function(count_set = NULL,
                             pairing = "unpaired") {

  group <- factor(count_set$group)

  if(pairing == "unpaired"){
    design <- stats::model.matrix(~ 0 + group)
    design_table <- data.frame("sample" = colnames(count_set),
                               "group" = group)
  }

  if(pairing == "paired"){
    pairs <- factor(count_set$pair)
    design <- stats::model.matrix(~ 0 + group + pairs)
    design_table <- data.frame("sample" = colnames(count_set),
                               "group" = group,
                               "pairs" = pairs)
  }

  rownames(design) <- colnames(count_set)
  colnames(design) <- gsub("group", "", colnames(design))
  colnames(design) <- gsub("pairs", "", colnames(design))

  return(list("design" = design, "table" = design_table))

}

###### function to build contrast matrix ####### ------------------------------

build_cont_mat <- function(design = design, design_table = design_table){

  all_c <- levels(design_table$group)
  all_com <- utils::combn(all_c, 2)
  all_combs <- c(sapply(seq_len(ncol(all_com)), function(i)
    paste(all_com[1,i], all_com[2,i], sep="-")))
  all_combinations <- paste(all_combs, collapse=",")

  contrast_cmd <- paste("limma::makeContrasts(", all_combinations, ",levels = design)",
                        sep="")

  contrast_matrix <- eval(parse(text=contrast_cmd))

  return(contrast_matrix)
}

###### function to wrap limma ##### ------------------------------------

limma_wrap <- function(count_set = NULL,
                       norm_method = "housekeeping_scaled",
                       design = design,
                       cont_mat = cont_mat,
                       adj_method = adj_method){

  get_counts <- paste0("assays(count_set)$", norm_method)
  data_in <- na.omit(eval(parse(text=get_counts)))

  fit<- limma::lmFit(data_in, design = design)

  all_fit <- lapply(seq_len(ncol(cont_mat)), function(i)
    limma::contrasts.fit(fit = fit,
                         contrasts = t(data.frame(t(cont_mat[,i]),
                                                  row.names = colnames(cont_mat)[i]))))

  all_fit <- lapply(seq_len(length(all_fit)), function(i)
    limma::eBayes(all_fit[[i]], robust = TRUE))

  all_results <- lapply(seq_len(length(all_fit)), function(i)
    limma::topTable(all_fit[[i]], number="inf", lfc = 0, adjust.method = adj_method))

  names(all_results) <- colnames(cont_mat)

  return(list("all_results" = all_results, "all_fit" = all_fit))

}

###########function to make upsets ####### -----------------------------------------

upsets_from_contrasts <- function(cont = contrast,
                                  full_results =  full_results,
                                  p_cut_off = p_cut_off,
                                  logFC_cut_off = logFC_cut_off){

  topTable <- Biobase::subListExtract(full_results, cont)

  DEG <- lapply(seq_along(topTable), function(j)
    topTable[[j]][which((topTable[[j]]$adj.P.Val < p_cut_off) &
                            (abs(topTable[[j]]$logFC) > logFC_cut_off)) ,])

  names(DEG) <- names(full_results)

  DEG_rownames <- lapply(seq_along(DEG), function(j)
    rownames(DEG[[j]]))

  names(DEG_rownames) <- names(DEG)

  #check there is an overlap to plot
  check_overlaps <- fromList(DEG_rownames)

  if (length(check_overlaps[,which(colSums(check_overlaps) > 0)])>1){

    upset_i <- UpSetR::upset(fromList(DEG_rownames),
                             nsets = length(fromList(DEG_rownames)),
                             number.angles = 0,
                             point.size = 4,
                             line.size = 2,
                             mainbar.y.label = "norm_method intersections",
                             sets.x.label = "DEG per norm_method",
                             text.scale = c(2, 2, 2, 2, 2, 2))
    return(upset_i)

  } else {
    warning("There is no overlap of DEG between normalization methods in contrast ", contrast_i, ".", sep = "")
  }

}
MarthaCooper/NanoStringClustR documentation built on June 25, 2021, 9:41 p.m.