#' Cumulative sum scaling normalization of count data
#'
#' The method normalizes count data by cumulative sum scaling, using a specified quantile (e.g., 75th quantile)
#'
#' @param e_data a \eqn{p \times n} data.frame of count data, where \eqn{p} is the number of features and \eqn{n} is the number of samples. Each row corresponds to data for a feature, with the first column giving the feature name.
#' @param edata_id character string indicating the name of the feature identifier. Usually obtained by calling \code{attr(omicsData, "cnames")$edata_cname}.
#' @param q a number to indicate which quantile to normalize with. Default is 0.75.
#' @param qg a number to use for scaling all samples. Default is 1000 (as in reference). Can also specify "median" to use the median value scaling value across all samples.
#'
#' @details Count data is normalized by a given quantile, dividing by the sum of all values up to and including the given quantile of the sample and multiplying by the given scaling value (either 1000 or the median scaling value across all samples).
#'
#' @return List containing 3 elements: norm_data is a data.frame with same structure as e_data that contains the quantile-normalized data, location_param is NULL, and scale_param is a numeric vector containing, for every sample, the value of the sample at the designated quantile (q) divided by the value of the global quantile (q).
#'
#' @references Paulson, Joseph N, O Colin Stine, Hector Corrada Bravo, and Mihai Pop. "Differential abundance analysis for microbial marker-gene surveys." Nature Methods. 10.12 (2013)
#'
#' @examples
#' \dontrun{
#' library(mintJansson)
#' data(rRNA_data)
#' rRNA_CSS <- CSS_Norm(e_data = rRNA_data$e_data, edata_id = attr(rRNA_data, "cnames")$edata_cname)
#' norm_factors <- attr(rRNA_CSS,"data_info")$scale_param
#' }
#'
#' @author Allison Thompson, Lisa Bramer
#'
CSS_Norm <- function(e_data, edata_id, q=0.75, qg="median"){
e_data_norm <- e_data[,-which(colnames(e_data)==edata_id)]
# calculate the q quantile of data, per sample and globally #
col.q = apply(e_data_norm, 2, function(x) sum(x[x<=quantile(x[x!=0], probs = q, na.rm=TRUE)], na.rm=TRUE))
#g.q = sum(e_data_norm[e_data_norm <= quantile(e_data_norm, probs = q, na.rm=TRUE)], na.rm=TRUE)
if(qg == 1000){
g.q = 1000
}else if(qg == "median"){
g.q = median(col.q, na.rm=TRUE)
}else{
stop("Invalid value for qg")
}
# normalize omics_data data by q quantile and transform back to count data #
for(i in 1:ncol(e_data_norm)){
e_data_norm[,i] = (e_data_norm[,i] / col.q[i]) * g.q
}
# put it all back together
e_data <- data.frame(e_data[,which(colnames(e_data)==edata_id)],e_data_norm)
colnames(e_data)[1] <- edata_id
return(list(normed_data=e_data, location_param=NULL, scale_param=col.q/g.q))
}
#' Normalization of count data via DESeq scale factors
#'
#' The method normalizes count data by scaling the raw counts by the median scaled counts. The normalized factors used by
#' this procedure are also called DEseq size scores (Anders et al. 2010). The same normalization technique is used in DESeq2 as well (Love et al. 2014).
#'
#' @param e_data a \eqn{p \times n} data.frame of count data, where \eqn{p} is the number of features and \eqn{n} is the number of samples. Each row corresponds to data for a feature, with the first column giving the feature name.
#' @param edata_id character string indicating the name of the feature identifier. Usually obtained by calling \code{attr(omicsData, "cnames")$edata_cname}.
#'
#' @details Count data is normalized by the median scaled score
#'
#' @return List containing 3 elements: norm_data is a data.frame with same structure as e_data that contains the normalized data, location_param is NULL, scale_param is a numeric vector of DESeq scores.
#'
#' @author Bryan Stanfill
#'
#' @references
#' Anders, Simon, and Wolfgang Huber. "Differential expression analysis for sequence count data." Genome biol 11.10 (2010): R106.
#' Love, Michael I., Wolfgang Huber, and Simon Anders. "Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2." Genome biology 15.12 (2014): 1-21.
#' @examples
#' library(mintJansson)
#' e_data_id <- attr(rRNA_data, "cnames")$edata_cname
#' deseq_n <- med_scounts_norm(e_data = rRNA_data$e_data, e_data_id)
#'
med_scounts_norm <- function(e_data, edata_id){
e_data[e_data == 0] <- NA
#Which column has the ID variable we want to remove?
col_to_omit <- which(colnames(e_data)==edata_id)
if(length(col_to_omit)==0){
stop("The supplied edata_id name couldn't be found in e_data.")
}
log_scale_mean <- rowMeans(log(e_data[,-col_to_omit]),na.rm=TRUE)
#Compute size factors: sjs
sjs <- as.numeric(apply(e_data[,-col_to_omit], 2, function(cnts) median(exp(log(cnts) - log_scale_mean),na.rm=T)))
e_data[is.na(e_data)] <- 0
e_data[,-col_to_omit] <- data.matrix(e_data[,-col_to_omit])%*%diag(1/sjs)
names(sjs) <- colnames(e_data)[-col_to_omit]
#if(nas){
# edata_nona[which_na] <- NA
#}
return(list(normed_data=e_data, location_param=NULL, scale_param=sjs))
}
#' Normalization of count data via Poisson Sampling
#'
#' The method normalizes count data by resampling the data from a Poisson distribution with parameter estimated from the raw counts. More specifically,
#' the quantity that is returned is given by equation 2.4 of Li and Tibshirani (2013).
#'
#' @param e_data a \eqn{p \times n} data.frame of count data, where \eqn{p} is the number of features and \eqn{n} is the number of samples. Each row corresponds to data for a feature, with the first column giving the feature name.
#' @param edata_id character string indicating the name of the feature identifier. Usually obtained by calling \code{attr(omicsData, "cnames")$edata_cname}.
#'
#' @details Count data resampled from the Poisson distribution
#'
#' @return List containing 3 elements: norm_data is a data.frame with same structure as e_data that contains the normalized data, location_param is NULL, scale_param is a numeric vector of DESeq scores.
#'
#' @author Bryan Stanfill
#'
#' @references
#' Li, Jun, and Robert Tibshirani. "Finding consistent patterns: a nonparametric approach for identifying differential expression in RNA-Seq data." Statistical methods in medical research 22.5 (2013): 519-536.
#'
#' @examples
#' library(mintJansson)
#' e_data <- rRNA_data$e_data
#' e_data_id <- attr(rRNA_data, "cnames")$edata_cname
#' pois_ndata <- poisson_norm(rRNA_data$e_data, e_data_id)
#'
poisson_norm <- function(e_data, edata_id){
#Compute sequence depts for each experiment
ds <- colSums(e_data[,-which(colnames(e_data) == edata_id)],na.rm=T)
#Compute geometric mean of counts by computing the mean on the log scale and exponentiating
dbar <- exp(mean(log(ds)))
#Which column has the ID variable we want to remove?
col_to_omit <- which(colnames(e_data)==edata_id)
if(length(col_to_omit)==0){
stop("The supplied edata_id name couldn't be found in e_data.")
}
#For each non-nNA cell, sample from a poisson dist with parameter (dbar/ds[i])N_{ij}
e_data[,-col_to_omit] <- apply(e_data[,-col_to_omit],2,pois_samp,dbar=dbar)
return(list(normed_data=e_data, location_param=NULL, scale_param=dbar/ds))
}
pois_samp <- function(Nij,dbar){
rmna <- which(is.na(Nij) | Nij == 0)
di <- sum(Nij,na.rm=T)
Nij[-rmna] <- rpois(length(Nij)-length(rmna),Nij[-rmna]*dbar/di)
return(Nij)
}
#' Quantile normalization of count data
#'
#' The method normalizes count data by a specified quantile (e.g., 75th quantile)
#'
#' @param e_data a \eqn{p \times n} data.frame of count data, where \eqn{p} is the number of features and \eqn{n} is the number of samples. Each row corresponds to data for a feature, with the first column giving the feature name.
#' @param edata_id character string indicating the name of the feature identifier. Usually obtained by calling \code{attr(omicsData, "cnames")$edata_cname}.
#' @param q a number to indicate which quantile to normalize with. Default is 0.75.
#'
#' @details Count data is normalized by a given quantile, dividing by the given quantile of the sample and multiplying by the given quantile of the entire dataset.
#'
#' @return List containing 3 elements: norm_data is a data.frame with same structure as e_data that contains the quantile-normalized data, location_param is NULL, and scale_param is a numeric vector containing, for every sample, the value of the sample at the designated quantile (q) divided by the value of the global quantile (q).
#'
#' @examples
#' library(mintJansson)
#' data(cDNA_hiseq_data)
#' cDNA_quant <- Quant_Norm(e_data = cDNA_hiseq_data$e_data, edata_id = attr(cDNA_hiseq_data, "cnames")$edata_cname)
#' norm_factors <- attr(cDNA_quant,"data_info")$scale_param
#'
#' @author Allison Thompson, Lisa Bramer
#'
Quant_Norm <- function(e_data, edata_id, q=0.75){
e_data[e_data == 0] <- NA
e_data_norm <- e_data[,-which(colnames(e_data)==edata_id)]
# calculate the q quantile of data, per sample and globally #
col.q = apply(e_data_norm, 2, function(x) quantile(x, probs = q, na.rm=TRUE))
g.q = quantile(e_data_norm, probs = q, na.rm=TRUE)
# normalize omics_data data by q quantile and transform back to count data #
for(i in 1:ncol(e_data_norm)){
e_data_norm[,i] = (e_data_norm[,i] / col.q[i]) * g.q
}
# put it all back together #
e_data <- data.frame(e_data[,which(colnames(e_data)==edata_id)],e_data_norm)
colnames(e_data)[1] <- edata_id
e_data[is.na(e_data)] <- 0
return(list(normed_data=e_data, location_param=NULL, scale_param=col.q/g.q))
}
#' Rarefying of count data
#'
#' The method normalizes count data by rarefying
#'
#' @param e_data a \eqn{p \times n} data.frame of count data, where \eqn{p} is the number of features and \eqn{n} is the number of samples. Each row corresponds to data for a feature, with the first column giving the feature name.
#' @param edata_id character string indicating the name of the feature identifier. Usually obtained by calling \code{attr(omicsData, "cnames")$edata_cname}.
#' @param size the library size to rarefy down to. Default uses the minimum sample size.
#'
#' @details Count data is normalized by rarefying, subsampling samples down to a specified library size. If the specified library size is larger than a sample's library size, the sample will be discarded. A warning message will display which samples are discarded. This normalization method is likely not the best course of action.
#'
#' @return List containing 4 elements: norm_data is a data.frame with same structure as e_data that contains the rarefied data, location_param is NULL, and scale_param is the library size counts were rarefied to.
#'
#' @examples
#' library(mintJansson)
#' data(cDNA_hiseq_data)
#' cDNA_rarefy <- Rarefy(e_data = cDNA_hiseq_data$e_data, edata_id = attr(cDNA_hiseq_data, "cnames")$edata_cname)
#' library_size <- attr(cDNA_rarefy,"data_info")$scale_param
#'
#' @author Allison Thompson
#'
#' @references McMurdie, Paul J., and Susan Holmes. "Waste Not, Want Not: Why Rarefying Microbiome Data Is Inadmissible." PLOS Computational Biology. 10.4 (2014)
#'
Rarefy <- function(e_data, edata_id, size=NULL){
e_data_norm <- e_data[,-which(colnames(e_data)==edata_id)]
# check library size
if(is.null(size)){
size <- min(colSums(e_data_norm,na.rm=TRUE))
}else{
size <- size
}
# if input library size is larger than any library size for any sample, discard that sample #
samples <- NA
if(any(colSums(e_data_norm,na.rm=TRUE) < size)){
samples <- names(which(colSums(e_data_norm,na.rm=TRUE) < size))
# write a warning stating which samples are being removed #
warning(paste("Input size is larger than the library size of samples ", paste(samples,collapse=", "), ". These samples will be removed from the data.",sep=""))
}
# Change NA to 0 in order to extract features
e_data_norm[is.na(e_data_norm)] <- 0
# Create a temporary empty matrix to fill with rarefied data
temp <- matrix(nrow=nrow(e_data_norm),ncol=ncol(e_data_norm))
rownames(temp) <- e_data[,edata_id]
colnames(temp) <- colnames(e_data_norm)
for(i in 1:ncol(e_data_norm)){
# Create a vector of features
features <- rep(e_data[,edata_id],e_data_norm[,i])
# Subsample features to a predetermined library size
subsamp <- sample(features, ifelse(length(features)>=size,size,0),replace=FALSE)
data <- table(subsamp)
# Put rarefied counts back into temporary matrix
for(j in 1:length(data)){
temp[which(rownames(temp)==names(data)[j]),i] <- data[j]
}
colnames(temp)[i] <- colnames(e_data_norm)[i]
}
if(any(rowSums(temp, na.rm=TRUE) == 0)){
# Remove any extra features
temp <- temp[-which(rowSums(temp,na.rm=TRUE)==0),]
}
# Set 0 back to NA
temp[temp==0] <- NA
# Reorganize so this resembles e_data
temp <- as.data.frame(temp)
temp <- data.frame(rownames(temp),temp)
names(temp)[1] <- edata_id
rownames(temp) <- NULL
# Replace e_data with rarefied counts
e_data <- temp
# add fake scale param for later use
scale_param <- rep(1, ncol(e_data[,-which(colnames(e_data) == edata_id)]))
names(scale_param) <- colnames(e_data)[-which(colnames(e_data) == edata_id)]
return(list(normed_data=e_data, location_param=size, scale_param=scale_param))
}
#' Trimmed Mean of M Values normalization of count data
#'
#' The method normalizes count data by the trimmed mean of m values in each sample
#'
#' @param e_data a \eqn{p \times n} data.frame of count data, where \eqn{p} is the number of features and \eqn{n} is the number of samples. Each row corresponds to data for a feature, with the first column giving the feature name.
#' @param edata_id character string indicating the name of the feature identifier. Usually obtained by calling \code{attr(omicsData, "cnames")$edata_cname}.
#' @param reference which column in e_data should be used as the reference, default is to use the sample with the least amount of missing data.
#' @param qm percentage by which to trim M values (gene-wise log-fold-changes), default is 0.30 (30\%)
#' @param qa percentage by which to trim A values (absolute expression levels), default is 0.05 (5\%)
#'
#' @details Count data is normalized by the trimmed mean of m values.
#'
#' @return List containing 3 elements: norm_data is a data.frame with same structure as e_data that contains the TMM-normalized data, location_param is a numeric vector of the TMM values for each sample, and scale_param is NULL.
#'
#' @examples
#' library(mintJansson)
#' data(cDNA_hiseq_data)
#' cDNA_TMM <- TMM_Norm(e_data = cDNA_hiseq_data$e_data, edata_id = attr(cDNA_hiseq_data, "cnames")$edata_cname)
#' norm_factors <- attr(cDNA_TMM,"data_info")$scale_param
#'
#' @author Allison Thompson, Lisa Bramer
#'
TMM_Norm <- function(e_data,edata_id, reference=NULL,qm=0.30,qa=0.05){
old <- e_data
e_data[e_data==0] <- NA
e_data_norm <- e_data[,-which(colnames(e_data)==edata_id)]
# determine which sample should be used as the reference
if(is.null(reference)){
# determine which sample has the least amount of missing data
ref = which(apply(e_data_norm, 2, function(x) length(which(is.na(x)))) == min(apply(e_data_norm, 2,function(x) length(which(is.na(x))))))
# if more than one sample can be the reference with equal amounts of missing data, choose the one with the max mean
if(length(ref) > 1){
ref = which(apply(e_data_norm[,ref],2,function(x) mean(x,na.rm=TRUE)) == max(apply(e_data_norm[,ref],2,function(x) mean(x,na.rm=TRUE))))
}
# if still more than one, choose the one with the largest total abundance
if(length(ref) > 1){
ref = which(colSums(e_data_norm[,ref], na.rm=TRUE) == max(colSums(e_data_norm[,ref], na.rm=TRUE)))
}
# if still more than one, just pick the first one
if(length(ref) > 1){
ref = ref[1]
}
}else{
ref = reference
}
# remove genes where the reference has a value of NA
data <- e_data_norm[-which(is.na(e_data_norm[,ref])),]
names <- e_data[,edata_id]
names <- names[-which(is.na(e_data_norm[,ref]))]
l2TMM <- vector()
TMM <- vector()
for(k in 1:ncol(data)){
# remove genes where the sample of interest has a value of NA
if(length(which(is.na(data[,k]))) > 0){
trimmed.data = data[-which(is.na(data[,k])),]
}else{
trimmed.data = data
}
# determine the total number of reads in the reference and interest samples
Nr = colSums(trimmed.data,na.rm=TRUE)[ref]
Nk = colSums(trimmed.data,na.rm=TRUE)[k]
names2 <- names[-which(is.na(data[,k]))]
# create empty matrices for M and A values
Mgkr <- matrix(nrow=nrow(trimmed.data),ncol=1)
rownames(Mgkr) <- names2
Agkr <- matrix(nrow=nrow(trimmed.data),ncol=1)
rownames(Agkr) <- names2
# create empty vectors for counts
Ygr <- vector()
Ygk <- vector()
for(g in 1:nrow(trimmed.data)){
Ygr[g] = trimmed.data[g,ref]
Ygk[g] = trimmed.data[g,k]
# calculate M and A for each gene and each sample
Mgkr[g,1] = log2((Ygk[g]/Nk) / (Ygr[g]/Nr))
Agkr[g,1] = 1/2 * log2(Ygk[g]/Nk * Ygr[g]/Nr)
}
# trim M values by qm
Qm = quantile(Mgkr[,1],c(qm,1-qm),na.rm=TRUE)
Mgkr = Mgkr[which(Qm[1] < Mgkr[,1] & Mgkr[,1] < Qm[2]),]
Agkr = Agkr[which(rownames(Agkr) %in% names(Mgkr)),]
# trim A values by qa
Qa = quantile(Agkr,c(qa,1-qa),na.rm=TRUE)
Agkr = Agkr[which(Qa[1] < Agkr & Agkr < Qa[2])]
Mgkr = Mgkr[which(names(Mgkr) %in% names(Agkr))]
# only keep counts that remain after trimming
Ygr = Ygr[which(names2 %in% names(Mgkr))]
Ygk = Ygk[which(names2 %in% names(Mgkr))]
wgkr <- vector()
for(g in 1:length(Mgkr)){
# calculate weights
wgkr[g] = (Nk-Ygk[g])/(Nk*Ygk[g]) + (Nr-Ygr[g])/(Nr*Ygr[g])
}
# calculate log2(TMM) values for sample
l2TMM[k] = sum(wgkr*Mgkr) / sum(wgkr)
# normalize sample by log2(TMM)
TMM[k] <- 2^l2TMM[k]
e_data_norm[,k] <- e_data_norm[,k] / TMM[k]
}
# put data back together
e_data <- data.frame(e_data[,edata_id],e_data_norm)
names(e_data)[1] <- edata_id
e_data[,which(colnames(e_data) == names(ref))] <- old[,which(colnames(old) == names(ref))]
names(TMM) <- names(data)
TMM[is.na(TMM)] <- 1
return(list(normed_data=e_data, location_param=NULL, scale_param=TMM))
}
#' Total sum scaling normalization of count data
#'
#' The method normalizes count data by the total number of counts in each sample
#'
#' @param e_data a \eqn{p \times n} data.frame of count data, where \eqn{p} is the number of features and \eqn{n} is the number of samples. Each row corresponds to data for a feature, with the first column giving the feature name.
#'@param edata_id character string indicating the name of the feature identifier. Usually obtained by calling \code{attr(omicsData, "cnames")$edata_cname}.
#'
#' @details Count data is normalized by the total count, dividing by the total count of each sample
#'
#' @return List containing 3 elements: norm_data is a data.frame with same structure as e_data that contains the TSS-normalized data, location_param is NULL, scale_param is a numeric vector of the total count in each sample.
#'
#' @examples
#' library(mintJansson)
#' data(cDNA_hiseq_data)
#' cDNA_TSS <- TSS_Norm(e_data = cDNA_hiseq_data$e_data, edata_id = attr(cDNA_hiseq_data, "cnames")$edata_cname)
#' norm_factors <- attr(cDNA_TSS,"data_info")$scale_param
#'
#' @author Allison Thompson, Lisa Bramer
#'
TSS_Norm <- function(e_data, edata_id){
e_data_norm <- e_data[,-which(colnames(e_data)==edata_id)]
# normalize by the total sum in each sample
scale_param <- colSums(e_data_norm, na.rm=T)
e_data_norm <- apply(e_data_norm, 2, function(x) x / sum(x, na.rm = TRUE))
# put data back together
e_data <- data.frame(e_data[,which(colnames(e_data)==edata_id)],e_data_norm)
colnames(e_data)[1] <- edata_id
return(list(normed_data=e_data, location_param=NULL, scale_param=scale_param))
}
#' Log transformation of count data
#'
#' The method normalizes count data by log2 transforming all of the counts
#'
#' @param e_data a \eqn{p \times n} data.frame of count data, where \eqn{p} is the number of features and \eqn{n} is the number of samples. Each row corresponds to data for a feature, with the first column giving the feature name.
#'@param edata_id character string indicating the name of the feature identifier. Usually obtained by calling \code{attr(omicsData, "cnames")$edata_cname}.
#'
#' @details Count data is normalized by a log2 transformation
#'
#' @return List containing 3 elements: norm_data is a data.frame with same structure as e_data that contains the Log2-normalized data, location_param is NULL, scale_param is a numeric vector of 1's, for later use.
#'
#' @examples
#' library(mintJansson)
#' data(cDNA_hiseq_data)
#' cDNA_log <- Log_Norm(e_data = cDNA_hiseq_data$e_data, edata_id = attr(cDNA_hiseq_data, "cnames")$edata_cname)
#'
#' @author Allison Thompson
#'
Log_Norm <- function(e_data, edata_id){
e_data_norm <- e_data[,-which(colnames(e_data)==edata_id)]
# change 0 to NA #
e_data_norm[e_data_norm == 0] <- NA
# normalize by log transforming the data
e_data_norm <- log2(e_data_norm)
# put data back together
e_data <- data.frame(e_data[,which(colnames(e_data)==edata_id)],e_data_norm)
colnames(e_data)[1] <- edata_id
# add fake scale param for later use
scale_param <- rep(1, ncol(e_data[,-which(colnames(e_data) == edata_id)]))
names(scale_param) <- colnames(e_data)[-which(colnames(e_data) == edata_id)]
return(list(normed_data=e_data, location_param=NULL, scale_param=scale_param))
}
#' Centered-log Ratio Normalization of count data
#'
#' The method normalizes count data by a centered-log ratio
#'
#' @param e_data a \eqn{p \times n} data.frame of count data, where \eqn{p} is the number of features and \eqn{n} is the number of samples. Each row corresponds to data for a feature, with the first column giving the feature name.
#'@param edata_id character string indicating the name of the feature identifier. Usually obtained by calling \code{attr(omicsData, "cnames")$edata_cname}.
#'
#' @details Count data is normalized by centered-log ratios
#'
#' @return List containing 3 elements: norm_data is a data.frame with same structure as e_data that contains the Log2-normalized data, location_param is NULL, scale_param is a numeric vector of 1's, for later use.
#'
#' @examples
#' library(mintJansson)
#' data(cDNA_hiseq_data)
#' cDNA_log <- Log_Norm(e_data = cDNA_hiseq_data$e_data, edata_id = attr(cDNA_hiseq_data, "cnames")$edata_cname)
#'
#' @author Allison Thompson
#'
CLR_Norm <- function(e_data, edata_id, prior){
if((!is.numeric(prior) & !is.null(prior)) | length(prior) > 1){
stop("prior must either be NULL or a numeric number")
}
e_data_norm <- e_data[,-which(colnames(e_data)==edata_id)]
# Add prior, if given
if(!is.null(prior)){
e_data_norm <- e_data_norm + prior
}
# change 0 to NA #
e_data_norm[e_data_norm == 0] <- NA
# add fake scale param for later use
scale_param <- apply(e_data_norm, 2, function(x) mean(log2(x), na.rm=TRUE))
# normalize by log transforming the data
e_data_norm <- apply(log2(e_data_norm), 2, function(x) x - mean(x, na.rm=TRUE))
# put data back together
e_data <- data.frame(e_data[,which(colnames(e_data)==edata_id)],e_data_norm)
colnames(e_data)[1] <- edata_id
return(list(normed_data=e_data, location_param=NULL, scale_param=scale_param))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.