### The TMM calculation functions in this file were adapted from the edgeR package (Robinson et al. 2010)
### The methodology behind TMM normalization was originally described by Robinson & Oshlack (2010)
calcFactorQuantile <- function(data, lib.size, p=0.75)
# Generalized version of upper-quartile normalization
# Mark Robinson and Gordon Smyth
# adapted from the edgeR package
{
  f <- rep_len(1, ncol(data))
  for (j in seq_len(ncol(data))) f[j] <- quantile(data[, j], probs=p, na.rm=TRUE)
  if(min(f) == 0) warning("One or more quantiles are zero")
  f / lib.size
}
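# Illustrative sketch (kept as comments so this file has no top-level side
# effects; toy numbers only): upper-quartile factors for a 4-SNP x 2-sample
# depth matrix.
# m <- matrix(c(10, 20, 30, 40, 5, 10, 15, 20), ncol = 2)
# calcFactorQuantile(m, lib.size = colSums(m))  # 0.75-quantile of each column / library size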
TMM <- function(obs, ref, logratioTrim=.3, sumTrim=0.05, Weighting=TRUE, Acutoff=-1e10)
# TMM between two libraries
# Mark Robinson
# adapted from the edgeR package (Robinson et al. 2010)
{
  obs <- as.numeric(obs)
  ref <- as.numeric(ref)
  nO <- sum(obs)
  nR <- sum(ref)
  logR <- log2((obs/nO)/(ref/nR))            # M values: log-ratio of proportions
  absE <- (log2(obs/nO) + log2(ref/nR))/2    # A values: average log abundance
  v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref     # asymptotic binomial variance of M
  fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)
  logR <- logR[fin]
  absE <- absE[fin]
  v <- v[fin]
  if(max(abs(logR)) < 1e-6) return(1)
  # double trim: logratioTrim on the M values, sumTrim on the A values
  n <- length(logR)
  loL <- floor(n * logratioTrim) + 1
  hiL <- n + 1 - loL
  loS <- floor(n * sumTrim) + 1
  hiS <- n + 1 - loS
  keep <- (rank(logR)>=loL & rank(logR)<=hiL) & (rank(absE)>=loS & rank(absE)<=hiS)
  if(Weighting){
    # inverse-variance weighted mean of the kept M values
    f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
  } else {
    f <- mean(logR[keep], na.rm=TRUE)
  }
  if(is.na(f)) f <- 0
  2^f
}
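# Illustrative sketch (comments only; toy numbers): the TMM factor is ~1 when
# two libraries differ only in sequencing depth, i.e. identical composition.
# obs <- c(100, 200, 300, 400)  # same composition as ref, scaled 2x
# ref <- c( 50, 100, 150, 200)
# TMM(obs, ref)                 # returns 1: all M values are zero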
TMMex <- function(obs, ref, logratioTrim=.3, sumTrim=0.05, Weighting=TRUE, Acutoff=-1e10)
# TMM with singleton pairing, for large or sparse data sets
# Gordon Smyth
# adapted from the edgeR package (Robinson et al. 2010)
{
  ref <- as.numeric(ref)
  obs <- as.numeric(obs)
  eps <- 1e-14 # floating-point zero
  pos.obs <- (obs > eps)
  pos.ref <- (ref > eps)
  npos <- 2L * pos.obs + pos.ref
  # remove rows that are zero in both libraries
  i <- which(npos==0L | is.na(npos))
  if(length(i)) {
    obs <- obs[-i]
    ref <- ref[-i]
    npos <- npos[-i]
  }
  # library sizes, computed before singleton pairing
  libsize.obs <- sum(obs)
  libsize.ref <- sum(ref)
  # pair the largest counts that are zero in one library with the largest
  # counts that are zero in the other
  zero.obs <- (npos == 1L)  # zero in obs, positive in ref
  zero.ref <- (npos == 2L)  # positive in obs, zero in ref
  k <- (zero.obs | zero.ref)
  n.eligible.singles <- min(sum(zero.obs), sum(zero.ref))
  if(n.eligible.singles > 0L) {
    refk <- sort(ref[k], decreasing=TRUE)[1:n.eligible.singles]
    obsk <- sort(obs[k], decreasing=TRUE)[1:n.eligible.singles]
    obs <- c(obs[!k], obsk)
    ref <- c(ref[!k], refk)
  } else {
    obs <- obs[!k]
    ref <- ref[!k]
  }
  n <- length(obs)
  if(n==0L) return(1)
  obs.p <- obs / libsize.obs
  ref.p <- ref / libsize.ref
  M <- log2(obs.p / ref.p)
  A <- 0.5 * log2(obs.p * ref.p)
  if(max(abs(M)) < 1e-6) return(1)
  # break ties in M using shrunk proportions
  obs.p.shrunk <- (obs+0.5) / (libsize.obs+0.5)
  ref.p.shrunk <- (ref+0.5) / (libsize.ref+0.5)
  M.shrunk <- log2(obs.p.shrunk / ref.p.shrunk)
  o.M <- order(M, M.shrunk)
  o.A <- order(A)
  # double trim: logratioTrim on M, sumTrim on A
  loM <- as.integer(n * logratioTrim) + 1L
  hiM <- n + 1L - loM
  keep.M <- rep_len(FALSE, n)
  keep.M[o.M[loM:hiM]] <- TRUE
  loA <- as.integer(n * sumTrim) + 1L
  hiA <- n + 1L - loA
  keep.A <- rep_len(FALSE, n)
  keep.A[o.A[loA:hiA]] <- TRUE
  keep <- keep.M & keep.A
  M <- M[keep]
  if(Weighting) {
    # inverse of asymptotic binomial variance, damped to avoid division by zero
    obs.p <- obs.p[keep]
    ref.p <- ref.p[keep]
    v <- (1-obs.p)/obs.p/libsize.obs + (1-ref.p)/ref.p/libsize.ref
    w <- (1+1e-6) / (v+1e-6)
    TMM <- sum(w*M) / sum(w)
  } else {
    TMM <- mean(M)
  }
  2^TMM
}
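# Illustrative sketch (comments only; toy numbers): unlike TMM(), TMMex()
# tolerates zeros by pairing singleton counts across the two libraries.
# obs <- c(0, 10, 20, 30)
# ref <- c(5, 12, 18, 28)
# TMMex(obs, ref)  # close to 1 for these similarly composed toy libraries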
## quantile normalization following Robinson MD and Oshlack A (2010)
# qn1
quantile_normalisation <- function(df, het.table=NULL, verbose=TRUE, cl){
  df_rank <- parApply(cl=cl, df, 2, rank, ties.method="min")
  df_sorted <- data.frame(apply(df, 2, sort, na.last=TRUE))
  df_mean <- parApply(cl=cl, df_sorted, 1, mean, na.rm=TRUE)
  if(!is.null(het.table)){
    het.table <- het.table[, -c(1:4)]
  }
  if(verbose) message("\ncalculating normalized depth")
  df_final <- parLapply(cl=cl, 1:ncol(df_rank), index_to_mean, my_mean=df_mean, indx=df_rank, al=het.table)
  return(df_final)
}
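# Illustrative sketch (comments only): quantile-normalizing a toy depth matrix.
# The two-worker PSOCK cluster here is an assumption for the example; any
# cluster created with parallel::makeCluster() works.
# cl <- parallel::makeCluster(2)
# m <- matrix(c(5, 2, 3, 4, 4, 1, 4, 2), ncol = 2)
# do.call(cbind, quantile_normalisation(m, verbose = FALSE, cl = cl))
# parallel::stopCluster(cl)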
# qn2
index_to_mean <- function(x, indx, my_mean, al=NULL){
  my_index <- indx[, x]
  nr <- my_mean[my_index]  # normalized total depth from rank-matched means
  if(!is.null(al)){
    # split the "ref,alt" strings and redistribute the normalized total
    # to the two alleles in their original proportions
    alleles <- al[, x]
    sm <- do.call(cbind, lapply(data.table::tstrsplit(alleles, ","), as.numeric))
    af <- proportions(sm, 1)
    af <- round(af * nr, 0)
    af <- paste0(af[, 1], ",", af[, 2])
    af[af == "NaN,NaN" | af == "NA,NA"] <- "0,0"  # keep the "x,y" format for missing rows
    return(af)
  } else {
    return(nr)
  }
}
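# Illustrative sketch (comments only; toy values): a raw AD string "3,7" has
# allele proportions 0.3/0.7, so a rank-matched mean of 20 is redistributed
# as "6,14".
# indx <- matrix(c(2, 1), ncol = 1)        # ranks of one sample's depths
# my_mean <- c(10, 20)                     # row means of the sorted depth table
# al <- matrix(c("3,7", "5,5"), ncol = 1)  # raw allele-depth strings
# index_to_mean(1, indx, my_mean, al)      # "6,14" "5,5"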
#' Calculate normalization factor for each sample
#'
#' This function calculates the normalization factor for each sample using
#' different methods. See details.
#'
#' @param df a data frame or matrix of allele depth values
#' (total depth per SNP per sample)
#' @param method character. method to be used (see details). Default \code{TMM}
#' @param logratioTrim numeric. fraction (0 - 1) of the log-ratios
#' (\dQuote{M} values) to be trimmed
#' @param sumTrim numeric. amount of trim to use on the combined absolute
#' levels (\dQuote{A} values) for method \code{TMM}
#' @param Weighting logical, whether to compute (asymptotic binomial precision)
#' weights
#' @param Acutoff numeric, cutoff on \dQuote{A} values to use before trimming
#'
#' @details Originally described for normalization of RNA sequences
#' (Robinson & Oshlack 2010), this function computes normalization (scaling)
#' factors to convert observed library sizes into effective library sizes.
#' It uses the trimmed mean of M-values (TMM) method proposed by Robinson &
#' Oshlack (2010). See the original publication and the \code{edgeR} package
#' for more information.
#' The method \code{MedR} is median ratio normalization (see Maza, Elie,
#' et al. 2013 for a comparison of methods); quantile normalization (QN)
#' is available through the \code{cpm.normal} function.
#'
#' @return Returns a numerical vector of normalization factors for each sample
#'
#' @author Piyal Karunarathne
#'
#' @references
#' \itemize{
#' \item{Robinson MD, and Oshlack A (2010). A scaling normalization method for
#' differential expression analysis of RNA-seq data. Genome Biology 11, R25}
#' \item{Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
#' package for differential expression analysis of digital gene expression
#' data. Bioinformatics 26}
#' }
#'
#' @examples
#' \dontrun{vcf.file.path <- paste0(path.package("rCNV"), "/example.raw.vcf.gz")
#' vcf <- readVCF(vcf.file.path)
#' df<-hetTgen(vcf,"AD-tot",verbose=FALSE)
#' norm.fact(df)}
#'
#'
#' @export
norm.fact <- function(df, method=c("TMM","TMMex","MedR"), logratioTrim=.3, sumTrim=0.05, Weighting=TRUE, Acutoff=-1e10){
  # drop the four info columns (CHROM, POS, ID, ALT) if present
  if(setequal(c("CHROM","POS","ID"), colnames(df)[1:3])){df <- df[, -c(1:4)]}
  method <- match.arg(method, several.ok=TRUE)
  if(length(method) > 1){method <- if(nrow(df) < 10001) "TMM" else "TMMex"}
  if(method=="TMM"){
    # pick the reference sample as in edgeR: the one whose upper quartile
    # is closest to the mean upper quartile
    f75 <- suppressWarnings(calcFactorQuantile(data=as.matrix(df), lib.size=colSums(as.matrix(df)), p=0.75))
    if(median(f75) < 1e-20) {
      ref <- which.max(colSums(sqrt(as.matrix(df))))
    } else {
      ref <- which.min(abs(f75 - mean(f75)))
    }
    out <- apply(df, 2, FUN=TMM, ref=df[, ref], logratioTrim=logratioTrim, sumTrim=sumTrim, Weighting=Weighting, Acutoff=Acutoff)
  } else if(method=="TMMex"){
    ref <- which.max(colSums(sqrt(as.matrix(df))))
    out <- apply(df, 2, FUN=TMMex, ref=df[, ref], logratioTrim=logratioTrim, sumTrim=sumTrim, Weighting=Weighting, Acutoff=Acutoff)
  } else if(method=="MedR"){
    # median of per-SNP ratios to the geometric mean across samples (pseudo-reference)
    pseudo <- apply(df, 1, function(xx){exp(mean(log(as.numeric(xx)[as.numeric(xx) > 0])))})
    out <- apply(df, 2, function(xx){median(as.numeric(xx)/pseudo, na.rm=TRUE)})
  }
  out <- data.frame(lib.size=colSums(df), norm.factor=out)
  return(out)
}
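# Illustrative sketch (comments only): as in the edgeR framework, the effective
# library size of each sample is its observed size times its factor.
# nf <- norm.fact(df)  # df as in the documented example above
# eff.lib.size <- nf$lib.size * nf$norm.factor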
#' Calculate normalized depth for alleles
#'
#' This function outputs normalized depth values for each allele separately,
#' calculated using a normalization factor based on the trimmed mean of
#' M-values of sample libraries, median ratio normalization, or quantile
#' normalization. See details.
#'
#' @param het.table allele depth table generated from the function
#' \code{hetTgen}
#' @param method character. method to be used (see details). Default \code{TMM}
#' @param logratioTrim numeric. fraction (0 - 1) of the log-ratios
#' (\dQuote{M} values) to be trimmed
#' @param sumTrim numeric. amount of trim to use on the combined absolute
#' levels (\dQuote{A} values) for method \code{TMM}
#' @param Weighting logical, whether to compute (asymptotic binomial precision)
#' weights
#' @param Acutoff numeric, cutoff on \dQuote{A} values to use before trimming
#' (only for TMM(ex))
#' @param verbose logical. show progress
#' @param plot logical. plot a barplot of sample library sizes (total depth),
#' highlighting outlier samples
#'
#' @details This function converts an observed depth value table to an
#' effective depth value table using one of several normalization methods:
#' 1. TMM normalization (see the original publication for more information).
#' It differs from the function \code{normz} only in that the counts per
#' million are calculated for each allele separately rather than for the
#' total depth. The \code{TMMex} method is an extension of \code{TMM} for
#' large data sets with more than 10,000 SNPs;
#' 2. The method \code{MedR} is median ratio normalization;
#' 3. QN - quantile normalization (see Maza, Elie, et al. 2013 for a
#' comparison of methods);
#' 4. PCA - a modified Kaiser's rule applied to the depth values: leading
#' principal components are removed up to the first point where the gap
#' between consecutive singular values falls below 0.7, to eliminate the
#' effect of sample library size
#'
#' @return Returns a list with (AD) a data frame of normalized depth values,
#' similar to the output of the \code{hetTgen} function, and (outliers) a
#' data frame of outlier sample names and their column indices
#'
#' @author Piyal Karunarathne, Qiujie Zhou
#'
#' @references
#' \itemize{
#' \item{Robinson MD, Oshlack A (2010). A scaling normalization method for
#' differential expression analysis of RNA-seq data. Genome Biology 11, R25}
#' \item{Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
#' package for differential expression analysis of digital gene expression
#' data. Bioinformatics 26}
#' \item{Maza, Elie, et al. "Comparison of normalization methods for
#' differential gene expression analysis in RNA-Seq experiments: a matter of
#' relative size of studied transcriptomes." Communicative & integrative
#' biology 6.6 (2013): e25849}
#' }
#'
#' @examples
#' \dontrun{data(ADtable)
#' ADnormalized<-cpm.normal(ADtable)}
#'
#'
#' @export
cpm.normal <- function(het.table, method=c("MedR","QN","pca","TMM","TMMex"),
                       logratioTrim=.3, sumTrim=0.05, Weighting=TRUE,
                       Acutoff=-1e10, verbose=TRUE, plot=TRUE){
  method <- match.arg(method)
  dm <- dim(het.table)
  pb_Total <- dm[2] - 4L
  if(verbose){
    message("calculating normalization factor")
    pb <- txtProgressBar(min=0, max=pb_Total, width=50, style=3)
  }
  # split the "ref,alt" AD strings into two integer matrices
  y1 <- y2 <- matrix(NA_integer_, dm[1], pb_Total)
  for(i in seq_len(pb_Total)){
    if(verbose) setTxtProgressBar(pb, i)
    tmp <- stringr::str_split_fixed(het.table[, i+4L], ",", 2L)
    y1[, i] <- as.integer(tmp[, 1])
    y2[, i] <- as.integer(tmp[, 2])
  }
  if(verbose) close(pb)
  tdep <- y1 + y2  # total depth per SNP per sample
  colnames(tdep) <- colnames(het.table)[-c(1:4)]
  # find and warn about outlier samples (by total depth)
  ot <- boxplot.stats(colSums(tdep, na.rm=TRUE))$out
  cols <- rep("dodgerblue", ncol(tdep))
  ot.ind <- which(colnames(tdep) %in% names(ot))
  cols[ot.ind] <- 2
  if(length(ot) > 0){
    if(plot){
      barplot(colSums(tdep, na.rm=TRUE), col=cols, border=NA, xlab="sample",
              ylab="total depth")
    }
    message("OUTLIERS DETECTED\nConsider removing the samples: ",
            paste(colnames(tdep)[ot.ind], collapse=" "))
  }
  if(method %in% c("TMM","TMMex")){
    if(verbose) message("\ncalculating normalized depth")
    nf <- norm.fact(tdep, method=method, logratioTrim=logratioTrim,
                    sumTrim=sumTrim, Weighting=Weighting, Acutoff=Acutoff)
    # scale to the median library size: effective size = lib.size * norm.factor
    sc <- median(colSums(tdep)) / (nf[, 1] * nf[, 2])
    y1 <- round(y1 * rep(sc, each=dm[1]), 2)
    y2 <- round(y2 * rep(sc, each=dm[1]), 2)
    out <- paste0(y1, ",", y2)
    attributes(out) <- attributes(tdep)
  } else if(method=="MedR") {
    pseudo <- apply(tdep, 1, function(xx){exp(mean(log(as.numeric(xx)[as.numeric(xx) > 0])))})
    nf <- apply(tdep, 2, function(xx){median(as.numeric(xx)/pseudo, na.rm=TRUE)})
    if(verbose) message("\ncalculating normalized depth")
    y1 <- round(y1 / rep(nf, each=dm[1]), 0)
    y2 <- round(y2 / rep(nf, each=dm[1]), 0)
    out <- paste0(y1, ",", y2)
    attributes(out) <- attributes(tdep)
  } else if(method=="QN"){
    # quantile_normalisation() needs a parallel cluster; none is passed into
    # cpm.normal, so a local one is created here (assumed intended behavior)
    cl <- parallel::makeCluster(max(1L, parallel::detectCores() - 1L))
    on.exit(parallel::stopCluster(cl), add=TRUE)
    out <- do.call(cbind, quantile_normalisation(tdep, het.table, verbose=verbose, cl=cl))
  } else if(method=="pca"){
    if(verbose){message("\ncalculating normalized depth")}
    new.mat <- t(tdep)  # transpose so that samples are rows
    colmean <- colMeans(new.mat)
    colsd <- apply(new.mat, 2, sd)
    new.mat <- apply(new.mat, 2, scale, scale=TRUE)  # center and scale; essential before SVD
    test.la.svd <- La.svd(new.mat)
    u <- test.la.svd$u
    d <- test.la.svd$d
    vt <- test.la.svd$vt
    ## number of leading PCs to remove, from the gaps between consecutive singular values
    ddl <- diff(d[seq_len(min(50L, length(d)))])
    ## modified Kaiser's rule: remove the leading components up to the first
    ## point where the gap between consecutive singular values falls below 0.7
    rmpc <- min(which(abs(ddl) < 0.7))
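    ## e.g., with toy singular values (illustration only, not package data):
    ## d = c(40, 12, 3.1, 2.9, ...) gives gaps diff(d) = c(-28, -8.9, -0.2, ...);
    ## the first |gap| < 0.7 is at index 3, so rmpc = 3 and the first three
    ## components are zeroed out below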
    d[1:rmpc] <- 0
    out <- u %*% diag(d) %*% vt
    out <- apply(out, 1, FUN=function(x){round(x*colsd + colmean, 0)})  # back-transform to the depth scale
    out[out < 0] <- 0
    out <- t(out)  # samples x SNPs
    # redistribute each normalized total back to the two alleles in their
    # original proportions, keeping the "ref,alt" format of hetTgen output
    ht <- het.table[, -c(1:4)]
    tout <- NULL
    if(verbose) pb <- txtProgressBar(min=0, max=ncol(ht), style=3, width=50, char="=")
    for(i in 1:ncol(ht)){
      if(verbose) setTxtProgressBar(pb, i)
      tmp <- stringr::str_split_fixed(ht[, i], ",", 2L)
      tt <- matrix(NA, nrow=nrow(tmp), ncol=2)
      tt[, 1] <- as.integer(tmp[, 1])
      tt[, 2] <- as.integer(tmp[, 2])
      tt <- proportions(tt, margin=1)
      tt[is.na(tt)] <- 0
      tt <- tt * out[i, ]
      tt <- paste0(round(tt[, 1], 0), ",", round(tt[, 2], 0))
      tout <- cbind(tout, tt)
    }
    if(verbose) close(pb)
    out <- tout  # SNPs x samples, matching the output format of the other methods
  }
  out <- data.frame(het.table[, c(1:4)], out)
  colnames(out) <- colnames(het.table)
  return(list(AD=out, outliers=data.frame(column=(ot.ind + 4), sample=colnames(tdep)[ot.ind])))
}
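# Illustrative sketch (comments only): inspecting the flagged outliers after
# normalization; ADtable is the example data set used in the roxygen examples.
# res <- cpm.normal(ADtable, method = "MedR", plot = FALSE)
# head(res$AD)    # normalized "ref,alt" depth values per sample
# res$outliers    # samples flagged by boxplot.stats() on total depth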