R/norm_rank.R

Defines functions variation_around_med dunn_wrap norm_rank

Documented in norm_rank

#' Ranks NanoString normalizations using cluster validity indices
#'
#' @description ranks NanoString normalisation by 1) clustering of
#' groups and; 2) variation in relative log expression.
#' Clustering is assessed by a Generalised Dunn Index using the
#' average linkage and average diamter for inter and intracluster
#' distances, respectively. Overall variation is assessed by Relative
#' Log Expression.
#'
#' @param count_set a normalised, count_set summarized experiment
#' generated by count_set.Default = NULL
#'
#' @examples
#'# biological groups
#' rnf5_group <- c(rep("WT", 5), rep("KO", 5))
#'
#' # sample ids
#' rnf5_sampleid <- c("GSM3638131", "GSM3638132", "GSM3638133", "GSM3638134",
#'                   "GSM3638135", "GSM3638136", "GSM3638137", "GSM3638138",
#'                   "GSM3638139", "GSM3638140")
#'
#' # build count_set
#' rnf5_count_set <- count_set(count_data = Rnf5,
#'                             group = rnf5_group,
#'                             samp_id = rnf5_sampleid)
#' # normalize
#' rnf5_count_set_norm <- multi_norm(count_set = rnf5_count_set,
#'                                   positive_control_scaling = TRUE,
#'                                   background_correct = "mean2sd")
#'                                   #plot_dir = "~/Dropbox/git/NanoStringClustR/plot_test/")
#' # rank normalizations
#' rnf5_eval <- norm_rank(rnf5_count_set_norm)
#'
#' @return ranked normalizations
#'
#' @export norm_rank
#'
#' @importFrom clv cls.scatt.data clv.Dunn
#'
#'
#' @references
#'
#' Lukasz Nieweglowski (2013). clv: Cluster Validation Techniques.
#' R package version 0.3-2.1. https://CRAN.R-project.org/package=clv
#'


norm_rank <- function(count_set = count_set){

  ####### Input checks #######------------------------------------------------

  # check count_set is provided
  if(is.null(count_set)) stop("A count_set list generated by count_set must
                              be provided.")

  if(!is.null(count_set)) {
    # check the file format
    if(count_set@class != "SummarizedExperiment"){
      stop(paste("summarized file provided is not a SummarizedExperiment,
                 Please produce a SummarizedExperiment using count_set.", "\n",
                 sep=""))
    }
  }

  # check that the count_set has been normalised
  if(length(names(assays(count_set))) == 1)
    stop("The count_set input must be normalised with multi_norm")


  ####### Input checks finished #######-----------------------------------------

  ### refactorise count_set groups
  count_set$group <- factor(count_set$group)

  ### dunn indicies for groups ### --------------------------------------------

  assays_all <- names(assays(count_set))

  all_group_dunns <- lapply(seq_along(assays_all), function (i){

    dunn_wrap(count_set = count_set,
              norm_method = assays_all[i],
              clusters = as.vector(as.integer((count_set$group))))

  })

  names(all_group_dunns) <- assays_all

  combined_group_dunns <- lapply(seq_along(names(all_group_dunns)), function(i){
    method_i <- data.frame("norm_method" = names(all_group_dunns)[i],
                           "GDI" = all_group_dunns[[i]][1])
  })

  group_dunns <- do.call("rbind", combined_group_dunns)

  ### rank group dunns ###  -------------------

    ### lower rank = higher GDI = better clustering

  group_dunns_ordered <- group_dunns[order(group_dunns$GDI), ]
  group_dunns_ordered$GDI_rank <- c(nrow(group_dunns_ordered):1)

  ### variation around median RLE ### -------------------------------------------

  all_vars <- lapply(seq_along(assays_all), function(i){
    variation_around_med(count_set = count_set,
                         norm_method = assays_all[i])
  })

  names(all_vars) <- assays_all
  vars <- as.data.frame(do.call("rbind", all_vars))
  vars_df <- data.frame("norm_method" = rownames(vars),
                        "MRLE" = vars$V1)

  ### rank RLE ###  -------------------

    ### lower rank = lower sum variation = better RLE

  vars_ordered <- vars_df[order(vars_df$MRLE),]
  vars_ordered$MRLE_rank <- c(1:nrow(vars_ordered))

  ### overall ranks ### -------------------------------------------

    merged <- merge(vars_ordered, group_dunns_ordered, by = "norm_method")
    merged$NanoNormRank <- merged$MRLE_rank + merged$GDI_rank
    merged_ordered <- merged[order(merged$NanoNormRank),]

    return(merged_ordered)

}


######## function for dunn index ranking ####-----------------------------------

dunn_wrap <- function(count_set = count_set,
                      norm_method = "housekeeping_scaled",
                      clusters = as.vector(as.integer((count_set$group)))){

  get_counts <- paste0("assays(count_set)$", norm_method)
  data_in <- na.omit(eval(parse(text=get_counts)))

  cluster_data_euk <- clv::cls.scatt.data(t(data_in),
                                          clust = clusters,
                                          dist = "euclidean")

  #calculate GDI with average intra and intercluster distances
  intraclust = c("average")
  interclust = c("average")

  dunn_data_euk <- clv::clv.Dunn(cluster_data_euk, intraclust, interclust)

  return(dunn_data_euk)

}

####### function for calculating sum variation around median RLE ##### -------


variation_around_med <- function(count_set = count_set,
                                 norm_method = "housekeeping_scaled"){

  get_counts <- paste0("assays(count_set)$", norm_method)
  data_in <- na.omit(eval(parse(text=get_counts)))

  #calculate median expression of each gene
  med_gene <- apply(data_in, 1, median)
  #calculate deviations from this median genecount-med_gene
  dev_from_med <- apply(data_in, 2, function(i) (i - med_gene))
  #calculate median deviations per sample - median of RLE plpot
  med <- apply(dev_from_med, 2, function(i) median(i))
  #sum absolute differences of the medians from 0
  sum_delta <- sum(abs(med))

  return(sum_delta)

}
MarthaCooper/NanoStringClustR documentation built on June 25, 2021, 9:41 p.m.