R/rnaSeqNormalizer.R

Defines functions rnaSeqNormalizer rnaSeqNormalizer.gtex .categorizeDataFrame .transformEnsemblColumn1DataFrameToGeneSymbolMatrix .logAndScale.normalize .asinh.normalize .vst.normalize

Documented in rnaSeqNormalizer rnaSeqNormalizer.gtex

#' @importFrom methods new
#' @importFrom AnnotationDbi keys select
#' @import org.Hs.eg.db
#' @import EnsDb.Hsapiens.v79
#' @import DESeq
#'
#' @title rnaSeqNormalizer
#------------------------------------------------------------------------------------------------------------------------
#' @name rnaSeqNormalizer-class
#' @rdname rnaSeqNormalizer-class
#' @aliases rnaSeqNormalizer
#'

.rnaSeqNormalizer <- setClass ("rnaSeqNormalizer",
                          representation = representation(
                             state="environment",
                             algorithm="character",
                             duplicate.selection.statistic="character",
                             raw.data.type="character"
                             )
                            )

#------------------------------------------------------------------------------------------------------------------------
setGeneric('normalize', signature='obj', function(obj) standardGeneric('normalize'))
setGeneric('getNormalizedMatrix', signature='obj', function(obj) standardGeneric('getNormalizedMatrix'))
setGeneric('standardizeToGeneSymbolMatrix', signature='obj', function(obj) standardGeneric('standardizeToGeneSymbolMatrix'))
#------------------------------------------------------------------------------------------------------------------------
#' Define an object of class rnaSeqNormalizer
#'
#' @description
#' Cory, Michael and Max brewed up their preferred method, to which I have added
#' asinh and vst (variance stabilizing normalization), here packaged up for easy reuse.
#' ensg identifers are mapped to geneSymbols. duplicated geneSymbols are eliminated in
#' favor of the one with the highest score according to the selected statistic:
#' mean, median, sd (standard deviation)
#'
#' @rdname rnaSeqNormalizer-class
#'
#' @param x a matix or data.frame of RNA-seq counts, rows are genes, columnns are mostly samples
#' @param algorithm a character string, one of "log+scale", "asinh", "vsn
#' @param duplicate.selection.statistc a character string, one of "median", "mean", "sd".  when
#' duplicate geneSymbols are supplied or obtained from name mapping, the highest scoring of the
#' duplicates, by the specified meansure, is kept; all others are dropped
#'
#' @return A normalized matrix with dimensions, row and column names preserved
#'
#' @export
#'
#'
rnaSeqNormalizer <- function(x, algorithm, duplicate.selection.statistic)
{
   stopifnot(algorithm %in% c("log+scale", "asinh", "vst"))
   stopifnot(duplicate.selection.statistic %in% c("mean", "median", "sd"))

   state <- new.env(parent=emptyenv())
   state$matrix <- matrix()
   state$tbl <- data.frame()

   if(is.matrix(x)){
     stop(sprintf("rnaSeqNormalizer currently supports only data.frames with ENSG ids in the first column"))
      }

   if(is.data.frame(x)){
      raw.data.type <- "data.frame"
      state$tbl <- x
      if(length(x$ensembl_id) != nrow(x))
         stop("rnaSeqNormalizer currently supports only data.frames with ENSG ids in one (any) column")
      }

   .rnaSeqNormalizer(state=state, algorithm=algorithm,
                     duplicate.selection.statistic=duplicate.selection.statistic,
                     raw.data.type=raw.data.type)

} # ctor: for data.frames with ensembl_id in first column
#------------------------------------------------------------------------------------------------------------------------
#' Define an object of class rnaSeqNormalizer from a GTEx matrix
#'
#' @description
#' Cory, Michael and Max brewed up their preferred method, to which I have added
#' asinh and vst (variance stabilizing normalization), here packaged up for easy reuse.
#' ensg identifers are mapped to geneSymbols. duplicated geneSymbols are eliminated in
#' favor of the one with the highest score according to the selected statistic:
#' mean, median, sd (standard deviation)
#'
#' @rdname rnaSeqNormalizer.gtex
#'
#' @param x a matix or data.frame of RNA-seq counts, rows are genes, columnns are mostly samples
#' @param algorithm a character string, one of "log+scale", "asinh", "vsn
#' @param duplicate.selection.statistc a character string, one of "median", "mean", "sd".  when
#' duplicate geneSymbols are supplied or obtained from name mapping, the highest scoring of the
#' duplicates, by the specified meansure, is kept; all others are dropped
#'
#' @return A normalized matrix with dimensions, row and column names preserved
#'
#' @export
#'
#'
rnaSeqNormalizer.gtex <- function(x, algorithm, duplicate.selection.statistic)
{
   stopifnot(algorithm %in% c("log+scale", "asinh", "vst"))
   stopifnot(duplicate.selection.statistic %in% c("mean", "median", "sd"))
   state <- new.env(parent=emptyenv())

   state$tbl <- data.frame()

   if(!is.matrix(x)){
     stop(sprintf("rnaSeqNormalizer.gtex currently supports only matrices with geneSymbol rownames"))
     }

   raw.data.type <- "matrix"

   if(any(duplicated(rownames(x)))){
      mean <- apply(x, 1, mean)
      median <- apply(x, 1, median)
      sd <- apply(x, 1, sd)
      x <- cbind(x, mean=mean, median=median, sd=sd)
      new.order <- order(rownames(x), x[, duplicate.selection.statistic], decreasing=TRUE)
      x <- x[new.order,]
        # discard extra rows with duplicated geneSymbols, keeping the biggest by selection.statistic
      dup.syms <- which(duplicated(rownames(x)))
      if(length(dup.syms) > 0){
         x <- x[-dup.syms,]
         }
      start.of.extra.columns <- grep("mean", colnames(x))
      end.of.extra.columns <- grep("sd", colnames(x))
      x <- x[, -(start.of.extra.columns:end.of.extra.columns)]
      dim(x)
      }

   state$matrix <- x

   .rnaSeqNormalizer(state=state, algorithm=algorithm,
                     duplicate.selection.statistic=duplicate.selection.statistic,
                     raw.data.type=raw.data.type)

} # ctor: for data.frames with ensembl_id in first column
#------------------------------------------------------------------------------------------------------------------------
#' get a summary of the objects
#'
#' @rdname show
#' @aliases show
#'
#' @param obj An object of class rnaSeqNormalizer
#'
#' @export

setMethod('show', 'rnaSeqNormalizer',

    function(object) {
       cat(sprintf ('--- rnaSeqNormalizer'), '\n', sep='')
       cat(sprintf("data.frame dimensions: %d x %d", nrow(object@state$tbl), ncol(object@state$tbl)))
       cat("\n")
       })

#------------------------------------------------------------------------------------------------------------------------
#' convert data.frame to a geneSymbol matrix
#'
#' @rdname toGeneSymbolMatrix
#' @aliases toGeneSymbolMatrix
#'
#' @param obj An object of class rnaSeqNormalizer
#' @param method A character string specifying statistic to use to resolve the case in which
#'        multiple ENSEMBL identifiers map to the same gene symbol: one of "median", "mean", "sd"
#'
#' @export

setMethod('standardizeToGeneSymbolMatrix', 'rnaSeqNormalizer',

   function(obj) {

      if(obj@raw.data.type == "data.frame"){
         obj@state$matrix <- .transformEnsemblColumn1DataFrameToGeneSymbolMatrix(obj@state$tbl,
                                                                                 obj@duplicate.selection.statistic)
         }
      if(obj@raw.data.type == "matrix"){
         stop("no support yet for input matrices in standardizeGeneSymbolToMatrix")
         }
      })

#------------------------------------------------------------------------------------------------------------------------
# are gene identifiers in the rownames?  in the first column?  are they gene symbols, ensembl ids?
# the goal is to
.categorizeDataFrame <- function(tbl)
{
     # create an id map data.frame to assocate ensembl gene ids with gene symbols
   stopifnot("ensembl_id" %in% colnames(tbl))
   ensg <- tbl$ensembl_id
     # remove any trailing ensg version numbers, e.g., ENSG00000279457.3 -> ENSG00000279457
   ensg <- sub("\\..*$", "", ensg)

   tbl.map <- select(EnsDb.Hsapiens.v79, key=ensg,  columns=c("SYMBOL"),  keytype="GENEID")
   potential.matches <- ensg

   ensembl.matches <- which(!is.na(unlist(lapply(potential.matches,
                                                    function(id) match(id, tbl.map$GENEID, nomatch=NA)))))

   browser()
   stopifnot(length(ensembl.matches) == length(potential.matches))
   return("ensembl column")

} # .categorizeDataFrame
#------------------------------------------------------------------------------------------------------------------------
.transformEnsemblColumn1DataFrameToGeneSymbolMatrix <- function(tbl, duplicate.selection.statistic)
{
   #printf("--- .transformEnsemblColumn1DataFrameToGeneSymbolMatrix")

   dim(tbl)
   mean <- apply(tbl[, -1], 1, mean)
   median <- apply(tbl[, -1], 1, median)
   sd <- apply(tbl[, -1], 1, sd)
   tbl$mean <- mean
   tbl$median <- median
   tbl$sd <- sd

   browser()

   id.column <- grep("ensembl_id", colnames(tbl))
   ensg <- tbl[, id.column]
   ensg <- sub("\\..*$", "", ensg)

   tbl.map <- select(EnsDb.Hsapiens.v79, key=ensg,  columns=c("SYMBOL"),  keytype="GENEID")
   nas <- which(is.na(tbl.map$SYMBOL))
   stopifnot(length(nas) == 0)
   indices <- match(ensg, tbl.map$GENEID)
   length(indices)
   head(indices)
   which(is.na(indices))
   tbl$sym <- tbl.map$SYMBOL[indices]
   mapping.failures <- which(is.na(tbl$sym))

   if(length(mapping.failures) > 0){
      tbl$sym[mapping.failures] <- tbl[mapping.failures,"ensembl_id"]
      }

   dim(tbl)
   dup.syms <- which(duplicated(tbl$sym))
   length(dup.syms)
   new.order <- order(tbl$sym, tbl[, duplicate.selection.statistic], decreasing=TRUE)
   tbl <- tbl[new.order,]

      # discard extra rows added because a single ENSEMBL id is mapped to multiple gene symbols
   deleters <- which(duplicated(tbl$sym))
   if(length(deleters) > 0){
      tbl <- tbl[-deleters,]
      }
   dim(tbl)
   rownames(tbl) <- tbl$sym
   geneSymbols <- tbl$sym

   columns.to.delete <- c(match(c("ensembl_id", "mean", "median", "sd", "sym"), colnames(tbl)))
   tbl <- tbl[, -columns.to.delete]
   mtx <- as.matrix(tbl)
   mtx <- mtx[sort(rownames(mtx)),]

   return(mtx)

} # .transformEnsemblColumn1DataFrameToGeneSymbolMatrix
#------------------------------------------------------------------------------------------------------------------------
#' process according to algorithm and duplicate.selection.statistic
#'
#' @rdname getNormalizedMatrix
#' @aliases getNormalizedMatrix
#'
#' @param obj An object of class rnaSeqNormalizer
#'
#' @export

setMethod('getNormalizedMatrix', 'rnaSeqNormalizer',

   function(obj) {

      if(obj@raw.data.type == "data.frame")
         standardizeToGeneSymbolMatrix(obj)
      mtx <- obj@state$matrix

      if(obj@algorithm == "asinh")
         return(.asinh.normalize(mtx))
      if(obj@algorithm == "log+scale")
         return(.logAndScale.normalize(mtx))
      if(obj@algorithm == "vst")
         return(.vst.normalize(mtx))
      })


#------------------------------------------------------------------------------------------------------------------------
.logAndScale.normalize <- function(mtx)
{
   minValue <- min(mtx[mtx > 0])
   if(minValue == 0)
      minValue <- .Machine$double.eps

   mtx.1 <- mtx + minValue
   mtx.2 <- log10(mtx.1)
   mtx.3 <- t(scale(t(mtx.2)))
   means <- apply(mtx.2, 2, mean)

   mtx.3

} # .logAndScale.normalize
#------------------------------------------------------------------------------------------------------------------------
.asinh.normalize <- function(mtx)
{
   asinh(mtx)

} # .asinh.normalize
#------------------------------------------------------------------------------------------------------------------------
# from jim macdonald via alison paquqette: the variance statilizing transformation
.vst.normalize <- function(mtx)
{
   if(is.numeric(mtx[1,1])){
      warning("the vst algorithm needs a matrix of integer counts; converting...")
      mtx.i <- matrix(data=as.integer(mtx), nrow=nrow(mtx))
      colnames(mtx.i) <- colnames(mtx)
      rownames(mtx.i) <- rownames(mtx)
      mtx <- mtx.i
      }

   condition <- factor(rep("AD", ncol(mtx)))
   countdata <- newCountDataSet(mtx, condition) # DESseq
   countdata <- estimateSizeFactors(countdata)
   mtx.vst <- DESeq::getVarianceStabilizedData(estimateDispersions(countdata, method="blind"))
   mtx.vst

} # .vst.normalize
#------------------------------------------------------------------------------------------------------------------------
PriceLab/rnaSeqNormalization documentation built on Jan. 23, 2020, 4:48 a.m.