R/multi_norm.R

Defines functions multi_norm

Documented in multi_norm

#'  Normalise nanostring mRNA data using multiple algorithms
#'
#' @description Performs nCounter scaling factor based normalisations using
#' spike-in controls and housekeeping genes and uses wrappers for geNorm,
#' variance stabilising normalisation (vsn), cyclic loess, quantile and
#' RUV-III normalisation.
#'
#' @param count_set A count_set of mRNA counts generated using count_set
#' @param norm_method types of normalizations to perform. DEFAULT = "all",
#' other = c("housekeeping_scaled", "all_endogenous_scaled",
#' "geNorm_housekeeping", "loess", "vsn", "quantile", ruvIII")
#' @param background_correct Background correction by calculating the
#'  proportion of each sample that are background counts, based on spike in
#'  negative controls. DEFAULT = "proportional", other = "mean2sd", "none"
#' @param positive_control_scaling Option to perform scaling factor
#' normalisation to the geometric mean of the positive controls. TRUE/FALSE
#' Default = TRUE
#' @param count_threshold Threshold above which at which a gene is considered
#'  to be expressed. Any integer. Default = 0. Options are "mean2sd" or any
#'  number. To apply no threshold, use -1.
#' @param geNorm_n number of housekeeping genes to keep after ranking using
#'the geNorm algorithm. Default = 5.
#' @param plot_dir Where to write the files to? The directory must already
#' exist. e.g. "/full/path/to/my/plots/". Default = NULL. No plot will be
#' saved if NULL
#' @param ruv_k k value for RUVIII normalisation. Default = 1. See RUV package
#' for more details.
#'
#' @examples
#'
#' # biological groups
#' rnf5_group <- c(rep("WT", 5), rep("KO", 5))
#'
#' # sample ids
#' rnf5_sampleid <- c("GSM3638131", "GSM3638132", "GSM3638133", "GSM3638134",
#'                   "GSM3638135", "GSM3638136", "GSM3638137", "GSM3638138",
#'                   "GSM3638139", "GSM3638140")
#'
#' # build count_set
#' rnf5_count_set <- count_set(count_data = Rnf5,
#'                             group = rnf5_group,
#'                             samp_id = rnf5_sampleid)
#' # normalize
#' rnf5_count_set_norm <- multi_norm(count_set = rnf5_count_set,
#'                        positive_control_scaling = TRUE,
#'                        background_correct = "mean2sd",
#'                        #plot_dir = "~/Dropbox/git/NanoStringClustR/plot_test/"
#'                        )
#'
#' @return Returns a count_set containing log2 transformed, normalised data and
#' a diagnostic plot report
#'
#' @export multi_norm
#'
#' @importFrom SummarizedExperiment assays colData rowData colData<- rowData<-
#' assays<-
#' @importFrom FSA geomean
#' @importFrom matrixStats rowMins
#' @importFrom affy normalize.loess
#' @importFrom vsn justvsn
#' @importFrom preprocessCore normalize.quantiles
#' @importFrom ruv RUVIII replicate.matrix
#' @importFrom stats sd var
#' @importFrom graphics par axis mtext
#' @importFrom ctrlGene geNorm
#'
#' @references
#'   Wolfgang Huber, Anja von Heydebreck, Holger Sueltmann, Annemarie Poustka and Martin Vingron. Variance Stabilization
#'   Applied to Microarray Data Calibration and to the Quantification of Differential Expression. Bioinformatics 18, S96-S104
#'   (2002).
#'
#'   Gautier, L., Cope, L., Bolstad, B. M., and Irizarry, R. A. 2004. affy---analysis of Affymetrix GeneChip data at the probe
#'   level. Bioinformatics 20, 3 (Feb. 2004), 307-315.
#'
#'   Ben Bolstad (2018). preprocessCore: A collection of pre-processing functions. R package version 1.44.0.
#'   https://github.com/bmbolstad/preprocessCore
#'
#'   Johann Gagnon-Bartsch (2018). ruv: Detect and Remove Unwanted Variation using Negative Controls. R package version 0.9.7.
#'   https://CRAN.R-project.org/package=ruv
#'

multi_norm <- function(count_set = NULL,
                       norm_method = "all",
                       background_correct = "proportional",
                       positive_control_scaling = TRUE,
                       count_threshold = 0,
                       geNorm_n = 5,
                       plot_dir = NULL,
                       ruv_k = 1) {

  ####### Input checks #######------------------------------------------------

  # check count_set is provided
  if(is.null(count_set)) stop("A CountSet list generated by count_set must be provided.")

  if(!is.null(count_set)) {
    # check the file format
    if(count_set@class != "SummarizedExperiment"){
      stop(paste("summarized file provided is not a SummarizedExperiment,
                 produce a SummarizedExperiment using count_set.", "\n",
                 sep=""))
    }
  }

  #check norm_method
  if(!any(c("all","housekeeping_scaled", "all_endogenous_scaled",
             "geNorm_housekeeping", "loess", "vsn", "quantile", "RUVIII") %in% norm_method)){
    stop(paste("norm_method must be one or more of c(\"all\",\"housekeeping_scaled\",
    \"all_endogenous_scaled\", \"geNorm_housekeeping\", \"loess\", \"vsn\",
    \"quantile\", \"RUVIII\"). "))
  }

  #check background_correct
  if(!(background_correct %in% c("proportional", "mean2sd", "none"))) {
    stop(paste("background_correct must be \"proportional\",\"mean2sd\" or \"none\".",
               "\n", sep= ""))
  }

  #check geNorm
  if(!is.numeric(geNorm_n)){
    stop(paste("geNorm_n must be numeric", "\n",
               sep = ""))
  }

  #check count_threshold
  if(!(count_threshold == "mean2sd") & (!is.numeric(count_threshold))){
    stop(paste("count_threshold must be \"mean2sd\" or a positive number",
               "\n", sep = "" ))
  }

  #checks for code_class
  if(sum(grepl("Endogenous", unique(rowData(count_set)$Code_Class))) < 1 ){
    stop(paste("Endogenous probes missing in Code_Class"))
  }

  if(!("Positive" %in% unique(rowData(count_set)$Code_Class))){
    stop(paste("Positive probes missing in Code_Class"))
  }

  if(!("Negative" %in% unique(rowData(count_set)$Code_Class))){
    stop(paste("Negative probes missing in Code_Class"))
  }

  if(!("Housekeeping" %in% unique(rowData(count_set)$Code_Class))){
    stop(paste("Housekeeping probes missing in Code_Class"))
  }

  #check plot_dir
  if(is.null(plot_dir)){
    message("### No plot directory provided. Plots will not be saved", "\n", sep = "" )
  }

  # set variables for plotting results
  plot_this <- FALSE

  # if dir is not null, set plot_this to TRUE
  if(!is.null(plot_dir)){
    plot_this <- TRUE
  }


  ####### Input checks finished #######---------------------------------------

  #log2 transform counts for plotting, remove pos and neg probes
  count_set_in <- count_set
  assays(count_set_in)$counts <- log2((assays(count_set_in)$counts)+1)
  count_set_in <- count_set_in[!grepl("NEG..", rownames(rowData(count_set_in))) ,]
  count_set_in <- count_set_in[!grepl("POS..", rownames(rowData(count_set_in))) ,]

  ####### Optional pre-processing ######--------------------------------------

  ### 1. Background correction ###

  if(background_correct == "proportional") {
    message("### Performing background correction. Proportional", "\n", sep = "")

    # calculate ratio of negative counts to total counts
    negatives_se <- count_set[rowData(count_set)$Code_Class == "Negative" ,]
    sum_negatives <- apply(assays(negatives_se)$counts, 2, function(x) sum(x))
    sum_total_counts <- apply(assays(count_set)$counts, 2, function(x) sum(x))
    ratio <- 1 - (sum_negatives/sum_total_counts)

    #remove this proportion of counts from every count
    background_corrected <- t(apply(assays(count_set)$counts, 1, function(x){
      x*ratio
    }))

    #add to count_set
    assays(count_set)$background_corrected <- background_corrected
  }

  if(background_correct == "mean2sd") {
    message("### Performing background correction. Mean+2SD", "\n", sep = "")

    #calculate mean + 2sd of negatives
    negatives_se <- count_set[rowData(count_set)$Code_Class == "Negative" ,]
    background <- apply(assays(negatives_se)$counts, 2, function(x) mean(x) + 2*sd(x))
    background_corrected <- t(apply(assays(count_set)$counts, 1, "-", background))

    #convert all negative numbers to 0
    background_corrected[background_corrected < 0] <- 0

    #add to count_set
    assays(count_set)$background_corrected <- background_corrected
  }

  if(background_correct == "none") {
    #add to count_set
    assays(count_set)$background_corrected <- assays(count_set)$counts
  }

  ### apply threshold of expression. Default > 0 ###

  if(count_threshold == "mean2sd"){

    #calculate mean + 2sd of negatives
    negatives_se <- count_set[rowData(count_set)$Code_Class == "Negative" ,]
    background <- apply(assays(negatives_se)$background_corrected, 2, function(x) mean(x) + 2*sd(x))

    expressed <- matrixStats::rowMins(assays(count_set)$background_corrected) > mean(background)
    count_set <- count_set[expressed, ]
  }

  #remove negative controls from count_set
  count_set <- count_set[!grepl("NEG..", rownames(rowData(count_set))) ,]

  # if threshold of expression is numeric

  if(is.numeric(count_threshold)){
    expressed <- matrixStats::rowMins(assays(count_set)$background_corrected) > count_threshold
    count_set <- count_set[expressed, ]
  }

  ### 2.  Scaling factor spike-in positive controls  ###

  if(positive_control_scaling == TRUE){

    message("### Performing positive control scaling.", "\n", sep = "")
    #select positive controls
    positives_se <- count_set[rowData(count_set)$Code_Class == "Positive" ,]

      #calculate geometric mean of positive controls for each lane
      geomean <- apply(assays(positives_se)$background_corrected, 2,
                       function(x) FSA::geomean(x, na.rm = TRUE, zneg.rm = TRUE))
      #Divide this arithmetic mean by the geometric mean of each lane to
      ##generate a lane-specific normalization factor.
      am_geomeans <- mean(geomean)
      sf <- sapply(geomean, function(x) {am_geomeans/x})
      #multiply all counts by sf
      positive_control_scaled <- t(apply(assays(count_set)$background_corrected, 1, "*", sf))
      assays(count_set)$positive_control_scaled <- positive_control_scaled

  }

  #remove negative controls from count_set
  count_set <- count_set[!grepl("POS..", rownames(rowData(count_set))), ]

  #select pre-processed data for input to filtering and normalisations
  if(positive_control_scaling == FALSE ){
    assays(count_set)$normalisation_input <- assays(count_set)$background_corrected
  } else if(positive_control_scaling == TRUE){
    assays(count_set)$normalisation_input <- assays(count_set)$positive_control_scaled
  }

  #remove ligation and miRNA spike in controls from count_set, use currently not supported
  count_set <- count_set[!grepl("LIG_POS_.",rownames(rowData(count_set))), ]
  count_set <- count_set[!grepl("LIG_NEG_.",rownames(rowData(count_set))), ]
  count_set <- count_set[!grepl("osa-miR414", rownames(rowData(count_set))), ]
  count_set <- count_set[!grepl("osa-miR442", rownames(rowData(count_set))), ]
  count_set <- count_set[!grepl("cel-miR-254", rownames(rowData(count_set))), ]
  count_set <- count_set[!grepl("cel-miR-248", rownames(rowData(count_set))), ]
  count_set <- count_set[!grepl("ath-miR159a", rownames(rowData(count_set))), ]

  ####### Normalisations ####### --------------------------------------------------------

  message("### Performing scaling factor normalisations.", "\n", sep = "")

  if(any(c("all", "housekeeping_scaled") %in% norm_method) == TRUE){

    ### 1. Scaling factor normalisation based on the geometric mean of all housekeeping genes ###
    #select housekeeping genes
    housekeeping_se <- count_set[rowData(count_set)$Code_Class == "Housekeeping" ,]
    #calculate geometric mean of housekeeping genes for each lane
    geomean <- apply(assays(housekeeping_se)$normalisation_input, 2, function(x) FSA::geomean(x, na.rm = TRUE, zneg.rm = TRUE))
    #Divide this arithmetic mean by the geometric mean of each lane to generate a lane-specific normalization factor.
    am_geomeans <- mean(geomean)
    sf <- sapply(geomean, function(x) {am_geomeans/x})
    #multiply all counts by sf
    housekeeping_scaled <- t(apply(assays(count_set)$normalisation_input, 1, "*", sf))

    #log2 transform, add to counts_set
    assays(count_set)$housekeeping_scaled <- log2(housekeeping_scaled + 1)

  }

  if(any(c("all", "all_endogenous_scaled") %in% norm_method) == TRUE){

    ### 3. Scaling factor normalisation based on the geometric mean of total counts ###
    #calculate geometric mean of all genes for each lane
    geomean <- apply(assays(count_set)$normalisation_input, 2, function(x) FSA::geomean(x, na.rm = TRUE, zneg.rm = TRUE))
    #Divide this arithmetic mean by the geometric mean of each lane to generate a lane-specific normalization factor.
    am_geomeans <- mean(geomean)
    sf <- sapply(geomean, function(x) {am_geomeans/x})
    #multiply all counts by sf
    all_endogenous_scaled <- t(apply(assays(count_set)$normalisation_input, 1, "*", sf))

    #log2 transform, add to count_set
    assays(count_set)$all_endogenous_scaled <- log2(all_endogenous_scaled + 1)

  }

  if(any(c("all", "geNorm_housekeeping") %in% norm_method) == TRUE){

    ### 4. Scaling factor with geNorm selected HKs ###
    #rank genes by M value using geNorm algorithm
    M_vals <-ctrlGene::geNorm(t(assays(housekeeping_se)$normalisation_input), ctVal=FALSE)
    M_vals <- M_vals[order(M_vals$Avg.M),]
    stable_hk_genes <- as.character(M_vals$Genes)
    stable_hk_genes <- unlist(strsplit(stable_hk_genes, "[-]"))
    stable_hk_genes <- stable_hk_genes[1:geNorm_n]
    #select these genes from housekeeping_se
    stable_hk_se <-  count_set[rownames(rowData(count_set)) %in% stable_hk_genes ,]
    #calculate geometric mean of housekeeping genes for each lane
    geomean <- apply(assays(stable_hk_se)$normalisation_input, 2, function(x) FSA::geomean(x, na.rm = TRUE, zneg.rm = TRUE))
    #Divide this arithmetic mean by the geometric mean of each lane to generate a lane-specific normalization factor.
    am_geomeans <- mean(geomean)
    sf <- sapply(geomean, function(x) {am_geomeans/x})
    #multiply all counts by sf
    geNorm_housekeeping <- t(apply(assays(count_set)$normalisation_input, 1, "*", sf))
    #log2 transform
    assays(count_set)$geNorm_housekeeping <- log2(geNorm_housekeeping + 1)

  }

  if(any(c("all", "loess") %in% norm_method) == TRUE){
    ### 5. Loess ###
    message("### Performing loess normalisation.", "\n", sep = "")
    loess <- affy::normalize.loess(((assays(count_set)$normalisation_input)+1), verbose = FALSE)
    #make empty loess assay in count_set
    assays(count_set)$loess <- log2(loess) #+1 already done
  }

  if(any(c("all", "vsn") %in% norm_method) == TRUE){

    ### 6. VSN ###
    if (nrow(assays(count_set)$normalisation_input) > 42){
      message("### Performing VSN normalisation.", "\n", sep = "")
      vsn <- vsn::justvsn(((assays(count_set)$normalisation_input)+1), verbose = FALSE)
      #make empty vsn assay in count_set.  #outputs glog2 counts
      assays(count_set)$vsn <- vsn
    } else {
      message("### Performing VSN normalisation.", "\n", sep = "")
      message("#####The number of genes in this count_set after background correction
      is too low for reliable estimation of the vsn transformation parameters. Check the
      vsn package for more details (Huber et al., 2002).", "\n", sep = "")
      message("##### VSN normalisation not performed.", "\n", sep = "")
    }

  }

  if(any(c("all", "quantile") %in% norm_method) == TRUE){

    ### 7. Quantile ###
    message("### Performing quantile normalisation.", "\n", sep = "")
    log_counts <- log2((assays(count_set)$normalisation_input)+1)
    quant_norm <- preprocessCore::normalize.quantiles(log_counts) #outputs log2 normalised counts
    colnames(quant_norm)<-colnames(log_counts)
    rownames(quant_norm)<-rownames(log_counts)
    #make empty quantile assay in count_set. #outputs log2 counts
    assays(count_set)$quantile <- quant_norm

  }

  if(any(c("all", "ruvIII") %in% norm_method) == TRUE){
    ### 8. RUVIII ###
    #use RUV if replicate samples are present based on sample id and grouping

    sample_names <- paste(count_set$samp_id, count_set$group, sep = "_")
    not_replicates <- unique(sample_names)

    if(length(not_replicates) < length(count_set$samp_id)) { #are there replicates?

      message("### Replicates present. Performing RUV")

      rep_mat <- ruv::replicate.matrix(sample_names)
      message("### RUVIII normalisation: There are ", nrow(rep_mat),
                " samples; ", ncol(rep_mat)," unique samples with ",
                nrow(rep_mat)-ncol(rep_mat), " replicate(s).")

      #Identifying Control Genes
      ##1. Use all genes as negative controls

      RUV_norm <- ruv::RUVIII(Y = t(log_counts),
                              M = rep_mat,
                              ctl = c(1:nrow(assays(count_set)$normalisation_input)))
      RUV_norm <- t(RUV_norm)

      #2. Select the most stably expressed genes
      stable_genes <- apply(RUV_norm, 1, var)
      control_genes <- which(stable_genes < 0.5)

      #3. RUVIII correction using control_genes
      ##Only k=1 is supported at the moment
      RUV_norm <-ruv::RUVIII(Y = t(log_counts), M = rep_mat, ctl = control_genes, k = ruv_k)
      RUV_norm <- t(RUV_norm) #outputs log2, RUVIII transformed, normalised counts

      #make empty RUVIII assay in count_set
      assays(count_set)$ruvIII <- RUV_norm


    } else if(length(not_replicates) == length(count_set$samp_id)) {
    message("### No replicates present. RUV not performed")
    }

  }

  # remove normalisation input assay
  assays(count_set)$normalisation_input <- NULL


  ###### Return all counts in log2 ######--------------------------------------

  #log2 normalise raw count data
  assays(count_set)$counts <- log2((assays(count_set)$counts)+1)

  #log2 transform background_corrected_counts
  assays(count_set)$background_corrected <- log2((assays(count_set)$background_corrected)+1)

  #log2 transform positive_control_scaled

  if(positive_control_scaling == TRUE){
  assays(count_set)$positive_control_scaled <- log2((assays(count_set)$positive_control_scaled)+1)
  }

  if(background_correct == "none" & !(count_threshold == "mean2sd") & (!is.numeric(count_threshold))) {
    assays(count_set)$background_corrected <- NULL
  }

  ####### plots #######---------------------------------------------------------

  if (plot_this == TRUE){

  #refactorise groups & batches in normalized and input count_sets
  count_set_in$group <- factor(count_set_in$group)
  count_set_in$batch <- factor(count_set_in$batch)
  count_set$group <- factor(count_set$group)
  count_set$batch <- factor(count_set$batch)

  #create vector of count_set assays after normalisations
  assays_all <- names(assays(count_set)[2:length(assays(count_set))])

  #Density Plot
  grDevices::pdf(file = paste(plot_dir, "Density.pdf", sep = ""), width = 10, height = 10)
  par(mfrow = c(3,3), mar = c(7,4.1,4.1,2.1))
  density_plot_wrap(count_set = count_set_in,
                    norm_method = "counts",
                    title = "counts",
                    colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"))
  lapply(seq_along(assays_all), function(i){
    data_in <- assays_all[i]
    density_plot_wrap(count_set = count_set,
                      norm_method = data_in,
                      title = data_in,
                      colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"),
                      legend = FALSE)
  })

  grDevices::dev.off()

  #PCA

  grDevices::pdf(file = paste(plot_dir, "PCA.pdf", sep = ""), width = 10, height = 10)
  par(mfrow = c(3,3), mar = c(7,4.1,4.1,2.1))
  pca_plot_wrap(count_set = count_set_in,
                norm_method = "counts",
                title = "counts",
                colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"))
  lapply(seq_along(assays_all), function(i){
    data_in <- assays_all[i]
    pca_plot_wrap(count_set = count_set,
                  norm_method = data_in,
                  title = data_in,
                  colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"),
                  legend = FALSE)
  })

  grDevices::dev.off()


    #RLE
  grDevices::pdf(file = paste(plot_dir, "RLE.pdf", sep = ""), width = 10, height = 10)
  par(mfrow = c(3,3), mar = c(7,4.1,4.1,2.1))
  rle_plot_wrap(count_set = count_set_in,
                norm_method = "counts",
                title = "counts",
                colors =  RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"))
  lapply(seq_along(assays_all), function(i){
    data_in <- assays_all[i]
    rle_plot_wrap(count_set = count_set,
                  norm_method = data_in,
                  title = data_in,
                  colors =  RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"),
                  legend = FALSE)
  })

  grDevices::dev.off()

   #Clustering
  grDevices::pdf(file = paste(plot_dir, "hclust.pdf", sep = ""), width = 10, height = 10)
  par(mfrow = c(3,3), mar = c(7,4.1,4.1,2.1))
  hclust_plot_wrap(count_set = count_set_in,
                   norm_method = "counts",
                   title = "counts",
                   colors =  RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"))
  lapply(seq_along(assays_all), function(i){
    data_in <- assays_all[i]
    hclust_plot_wrap(count_set = count_set,
                  norm_method = data_in,
                  title = data_in,
                  colors =  RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"),
                  legend = FALSE)
   })

  grDevices::dev.off()

  }

  return(count_set)

}
MarthaCooper/NanoStringClustR documentation built on June 25, 2021, 9:41 p.m.