Nothing
#' Compute p values based on DMC/DVC and joint tests.
#'
#' This function loads beta matrix, covariate matrix and design formula and
#' it computes p values based on DMC/DVC and joint tests.
#'
#' @param beta beta value matrix
#' @param covariate covariate matrix
#' @param npermut number of permutation
#' @param permut.seed random seed used for permutation
#' @param corenumber number of cores used for joint-tests
#' @return A data frame out for p-values.
#' DMCP: p-value from DMC test.
#' DVCP p-value from DVC test.
#' Joint1P: joint test for DMVC (test for
#' hypermethylation and increased variance in cancer).
#' Joint2P: joint test for DMVC (test for differential methylation
#' in both direction and increased variance in cancer).
#' @export
dmvc=function(beta=beta,covariate=covariate,npermut=100,permut.seed=100,corenumber=1)
{
#A function to compute statistics matrix includes coefficients, standard devidation, z-score
fit.model.probe <- function(mat,colData){
# Fit model for each probe from data
# Call function "calculate.stats" to calculate coef & sd & residual values
# Fast version using QR factorization
# Employ the way of how glm handles NA using na.omit() by default
# If our data have NAs, the probes with NAs will be calculated seperately with NA-omitted-data
design=as.formula("~.")
calculate.stats <- function(mat, colData){
# Calculate coef & sd & residual values for selected probes from data
# Only used by function "fit.model.probe"
design.sub <- colData[rownames(colData) %in% colnames(mat),]
#remove factors with 1 level
idx_1lev=1:ncol(design.sub)
idx_1lev=idx_1lev[sapply(idx_1lev,function(j) {length(unique(design.sub[,j]))==1})]
#factors to be removed (0 means nothing to remove)
removed.factors=0
if (length(idx_1lev)>0)
{
design.sub=design.sub[,-idx_1lev]
removed.factors=paste0(idx_1lev,collapse = ";")
}
design.sub <- model.matrix(design, data = design.sub)
# Calculate coef
M <- design.sub[,-2, drop=FALSE]
M.qr <- qr(M)
M.qr.Q <- qr.Q(M.qr)
S <- diag(nrow(M)) - tcrossprod(M.qr.Q)
V <- matrix(design.sub[,2], ncol=1)
SV <- S %*% V
coef <- (mat %*% crossprod(diag(nrow(M)) - tcrossprod(M.qr.Q), design.sub[,2])) / diag(crossprod(V,SV))
# Calculate residuals
qr.X <- qr(design.sub)
qr.X.Q <- qr.Q(qr.X)
resids <- tcrossprod(diag(nrow(design.sub)) - tcrossprod(qr.X.Q), mat)
rownames(resids) <- colnames(mat)
colnames(resids) <- rownames(mat)
#Calculate fitted
fitted =tcrossprod(tcrossprod(qr.X.Q),mat)
rownames(fitted) <- colnames(mat)
# Calculate SE
qr.X.R <- qr.R(qr.X)
SE <- sqrt(chol2inv(qr.X.R)[2,2] * (colSums(resids^2) / (ncol(mat) - M.qr$rank - 1)))
# Calculate p-value
pval= t(2*pt(abs(coef/SE), ncol(mat) - M.qr$rank - 1, lower.tail=FALSE))[1,]
stats.sub <- as.matrix(cbind(coef, SE, coef / SE,pval,removed.factors))
rownames(stats.sub) <- rownames(mat)
colnames(stats.sub) <- c("coef", "sd", "zscore","pval","removedfactors")
#
# # Fill the fitted matrix/resids to full size of all samples. fitted of samples with NAs are NA.
removed.samples <- rownames(colData)[!rownames(colData) %in% colnames(mat)]
removed.mat <- matrix(NA, nrow = length(removed.samples), ncol = ncol(fitted))
rownames(removed.mat) <- removed.samples
fitted <- rbind(fitted, removed.mat)
fitted <- fitted[rownames(colData), , drop = FALSE]
resids <- rbind(resids, removed.mat)
resids <- resids[rownames(colData), , drop = FALSE]
#return(list(stats = stats.sub, residuals = resids))
return(list(stats = stats.sub,fitted=fitted,design.sub=design.sub))
}
colData.factors=NULL
for (i in 1:ncol(colData))
{
if (is.factor(colData[,i]))
{
colData.factors=c(colData.factors,i)
}
}
NA.comb <- apply(mat, 1, function(x){paste(unname(which(is.na(x))), collapse = ';')})
NA.comb.unique <- unique(NA.comb)
stats.res <- lapply(NA.comb.unique, function(x){
mat <- mat[NA.comb == x,, drop = FALSE]
##if(x != "") mat <- mat[, -as.numeric(strsplit(x, ';')[[1]]), drop = FALSE]
##change to, a probe have missing in multiple samples
if(x != "") mat <- mat[, -as.numeric(unlist(strsplit(x, ';'))), drop = FALSE]
stats.res.tmp <- calculate.stats(mat,colData)
return(stats.res.tmp)
})
stats <- NULL
stats$statistics <- do.call("rbind", lapply(stats.res, function(x){x$stats}))
stats$statistics <- stats$statistics[rownames(mat),]
stats$statistics=as.data.frame(stats$statistics)
stats$statistics$coef=as.numeric(stats$statistics$coef)
stats$statistics$sd=as.numeric(stats$statistics$sd)
stats$statistics$zscore=as.numeric(stats$statistics$zscore)
stats$statistics$pval=as.numeric(stats$statistics$pval)
stats$fitted <- do.call("cbind", lapply(stats.res, function(x){x$fitted}))
stats$fitted <- stats$fitted[,rownames(mat)]
stats$fitted=as.data.frame(stats$fitted)
stats$design=lapply(stats.res, function(x){x$design.sub})
#designidx is used to access design matrix. stats$design[[designidx$designidx[i]]] is the design matrix for probe i (mat[i,])
designidx=data.frame(matidx=1:nrow(mat),designidx=NA)
designidx$designidx=match(NA.comb,NA.comb.unique)
stats$designidx=designidx
# stats$residuals <- do.call("cbind", lapply(stats.res, function(x){x$residuals}))
# stats$residuals <- stats$residuals[,rownames(mat)]
return(stats)
}
#preprocess data
# check if covariate matrix has group variable
if (sum(colnames(covariate)=="group")==0)
{
stop("covriate matrix should include group variable")
}
#make group to be the first covariate (or else it could be a problem for rare cases where some factors have one level and get removed for some CpGs due to a lot of missing data)
tmp=colnames(covariate)
tmp=tmp[tmp!="group"]
tmp=c("group",tmp)
covariate=covariate[,tmp]
if (any(!covariate$group %in% c(0,1,NA)))
{
stop("group variable should be coded as 0 and 1")
}
#check if samples in beta match samples in covariate
if ((!all(colnames(beta) %in% rownames(covariate))) | (! all(rownames(covariate) %in% colnames(beta))))
{
warning("There are extra samples in beta or covariate matrices")
}
allsamples=intersect(colnames(beta),rownames(covariate))
if (length(allsamples)==0)
{
stop("There are no common samples in beta and covariate matrices")
}
idx=match(allsamples,colnames(beta))
beta=beta[,idx]
idx=match(allsamples,rownames(covariate))
covariate=covariate[idx,]
#remove NAs in covariate
idx1=stats::complete.cases(covariate)
if (sum(!idx1)>0)
{
warning(paste0("There are missing data in covariate matrix, ",sum(!idx1)," samples were removed"))
beta=beta[,idx1]
covariate=covariate[idx1,]
}
out=data.frame(Mean_normal=rep(NA,nrow(beta)),Mean_tumor=NA,Mean_all=NA,SD_normal=NA,SD_tumor=NA,
SD_all=NA,DMCP=rep(NA,nrow(beta)),DVCP=NA,Joint1P=NA,Joint2P=NA,LRT1=NA,LRT2=NA,pho=NA)
rownames(out)=rownames(beta)
beta[beta <= 0]=0.01
beta[beta >= 1]=0.99
out$Mean_normal=rowMeans(beta[,covariate$group==0],na.rm=T)
out$Mean_tumor=rowMeans(beta[,covariate$group==1],na.rm=T)
out$Mean_all=rowMeans(beta,na.rm=T)
out$SD_normal=matrixStats::rowSds(as.matrix(beta[,covariate$group==0]),na.rm=T)
out$SD_tumor=matrixStats::rowSds(as.matrix(beta[,covariate$group==1]),na.rm=T)
out$SD_all=matrixStats::rowSds(as.matrix(beta),na.rm=T)
#M values
dat=log(beta/(1-beta),base=2)
writeLines (paste0("Analyze ",nrow(beta)," CpGs across ",ncol(beta)," samples at ",as.character(Sys.time())))
writeLines(paste0("Start to compute DMCP pvalues at ",as.character(Sys.time()),"..."))
fit2all=fit.model.probe(mat = as.matrix(dat),colData = covariate)
out$DMCP=fit2all$statistics$pval
writeLines(paste0("Start to compute DVCP pvalues at ",as.character(Sys.time()),"..."))
dat1=as.matrix(dat)
dat1[,covariate$group==1] <- abs(dat1[,covariate$group==1]-matrixStats::rowMedians(dat1[,covariate$group==1],na.rm=T))
dat1[,covariate$group==0] <- abs(dat1[,covariate$group==0]-matrixStats::rowMedians(dat1[,covariate$group==0],na.rm=T))
fit1all=fit.model.probe(mat = dat1,colData = covariate)
out$DVCP=fit1all$statistics$pval
design=as.formula("~.")
designmatrix=model.matrix(design,data=covariate)
idxgroup=which(colnames(designmatrix)=="group")
writeLines(paste0("Start to compute joint-test pvalues at ",as.character(Sys.time()),"..."))
constrainMax <- function(coef.gg,cov.gg){
dd <- 1/(cov.gg[1,1]*cov.gg[2,2]-cov.gg[1,2]^2)
v11 <- cov.gg[2,2]
v12 <- -cov.gg[1,2]
v21 <- -cov.gg[2,1]
v22 <- cov.gg[1,1]
a1 <- v11
b1 <- -2*v11*coef.gg[1]-coef.gg[2]*(v12+v21)
c1 <- v11*coef.gg[1]^2+v22*coef.gg[2]^2+(v12+v21)*coef.gg[1]*coef.gg[2]
a2 <- v22
b2 <- -2*v22*coef.gg[2]-coef.gg[1]*(v12+v21)
c2 <- v22*coef.gg[2]^2+v11*coef.gg[1]^2+(v12+v21)*coef.gg[1]*coef.gg[2]
mlrt1 <- dd*ifelse(b1<0, c1-a1*((-b1)/(2*a1))^2,c1)
mlrt2 <- dd*ifelse(b2<0, c2-a2*((-b2)/(2*a2))^2,c2)
mlrt3 <- dd*(c2-a2*((-b2)/(2*a2))^2)
return(list(lrt1=min(mlrt1,mlrt2),lrt2=mlrt3))
}
#use to permute y
set.seed(permut.seed)
#permutate index used for probes without missing data
permutmat=matrix(NA,nrow=ncol(dat),ncol=npermut)
for (j in 1:npermut)
{
permutmat[,j]=sample(1:ncol(dat))
# permutmat[designmatrix[,idxgroup]==1,j]=sample((1:ncol(dat))[designmatrix[,idxgroup]==1],replace=T)
# permutmat[designmatrix[,idxgroup]==0,j]=sample((1:ncol(dat))[designmatrix[,idxgroup]==0],replace=T)
}
#the permutated matrix
yy1 <- matrix(NA,nrow=ncol(dat),ncol=npermut)
#outbeta <- matrix(0,npermut,2)
#iblock is the index of CpGs to work on at the same time
compute_pho=function(dat,covariate,iblock=c(1,2),permutmat)
{
y0=dat[iblock,]
y00=sapply(1:length(iblock), function(x) y0[x,][permutmat])
npermut=ncol(permutmat)
y10=matrix(y00,ncol=length(iblock)*npermut)
yy10=matrix(NA,nrow=nrow(y10),ncol=ncol(y10))
yy10[covariate$group==1,]=abs(sweep(y10[covariate$group==1,],2,matrixStats::colMedians(y10[covariate$group==1,],na.rm=T)))
yy10[covariate$group==0,]=abs(sweep(y10[covariate$group==0,],2,matrixStats::colMedians(y10[covariate$group==0,],na.rm=T)))
outbeta0 <- matrix(0,npermut*length(iblock),2)
outbeta0[,1] <- matrixStats::colMeans2(yy10[covariate$group==1,],na.rm=T)- matrixStats::colMeans2(yy10[covariate$group==0,],na.rm=T)
outbeta0[,2] <- matrixStats::colMeans2(y10[covariate$group==1,],na.rm=T)- matrixStats::colMeans2(y10[covariate$group==0,],na.rm=T)
pho0=sapply(1:length(iblock),function(x) {
segidx=seq((1+(x-1)*npermut),x*npermut)
cor(outbeta0[segidx,1],outbeta0[segidx,2])})
return(pho0)
}
pho_all=rep(NA,nrow(dat)) #store all pho
#max nblock for 4G memory
#4000*1e6/ncol(dat)/npermut/8
nblock=100 #number of CpGs together for permutation
nprobes=nrow(dat)
if (nprobes<nblock) nblock=nprobes
n.chunk=floor(nprobes/nblock)
ncount <- rep(nblock,n.chunk)
if (nprobes>sum(ncount)) ncount[n.chunk] <- ncount[n.chunk]+ nprobes-sum(ncount)
chunks <- c(0,cumsum(ncount))
if (corenumber>1) #parallel version for permutation
{
numberofcores=floor(parallel::detectCores()-1) #don't use all the cores
if (corenumber>numberofcores) corenumber=numberofcores
if (corenumber>1)
{
cl <- parallel::makeCluster(corenumber)
doParallel::registerDoParallel(cl)
if (corenumber>1) writeLines(paste0("Parallel version for permutation, total number of cores available:",numberofcores,", used cores:",corenumber))
# library(bigstatsr)
# pho_all=bigstatsr::FBM(nrow(dat),1)
pho_all=foreach::foreach(r=1:n.chunk, .combine='c') %dopar% {
iblocks=seq(chunks[r]+1,chunks[r+1])
compute_pho(dat=dat,covariate=covariate,iblock=iblocks,permutmat=permutmat)
}
# writeLines(paste0("Permutation finishes ",as.character(Sys.time())))
parallel::stopCluster(cl)
}
}
if (corenumber<=1) #sequential version for permutation
{
for (r in 1:n.chunk)
{
iblocks=seq(chunks[r]+1,chunks[r+1])
pho_all[iblocks]=compute_pho(dat=dat,covariate=covariate,iblock=iblocks,permutmat=permutmat)
}
# writeLines(paste0("Permutation part finishes ",as.character(Sys.time())))
}
tmp=sapply(1:nrow(dat),function(i){
# for (i in 1:nrow(beta))
# {
out1=rep(NA,5)
## 2 df test estimating covariance ##
pho=pho_all[i]
cov.gg <- matrix(0,2,2)
cov.gg[1,2] <- fit1all$statistics$sd[i]*fit2all$statistics$sd[i]*pho #summary(fit1)$coef[2,2]*summary(fit2)$coef[2,2]*pho
cov.gg[2,1] <- cov.gg[1,2]
cov.gg[1,1] <- (fit1all$statistics$sd[i])^2
cov.gg[2,2] <- (fit2all$statistics$sd[i])^2
coef.gg <- c(fit1all$statistics$coef[i],fit2all$statistics$coef[i])
LRTtmp1=drop(crossprod(coef.gg,solve(cov.gg)) %*% coef.gg)
constrainres=constrainMax(coef.gg,cov.gg)
if (coef.gg[1]>=0 & coef.gg[2]>=0) LRT1 <- LRTtmp1 else LRT1<- LRTtmp1 - constrainres$lrt1
#pho <- cov.gg[1,2]/sqrt(cov.gg[1,1]*cov.gg[2,2])
if (is.numeric(LRT1))
{
out1[1] <- 0.5*pchisq(LRT1,df=1,lower.tail=F)+0.5*(1-(1/pi)*acos(pho))*pchisq(LRT1,df=2,lower.tail=F)
out1[3]=LRT1
}
if (coef.gg[1]>=0) LRT2 <- LRTtmp1 else LRT2<- LRTtmp1 - constrainres$lrt2
if (is.numeric(LRT2))
{
out1[2] <- 0.5*pchisq(LRT2,df=1,lower.tail=F)+0.5*pchisq(LRT2,df=2,lower.tail=F)
out1[4]=LRT2
}
out1[5]=pho
return(out1)
}
)
tmp=t(tmp)
out$Joint1P[1:nrow(tmp)]=tmp[,1]
out$Joint2P[1:nrow(tmp)]=tmp[,2]
out$LRT1[1:nrow(tmp)]=tmp[,3]
out$LRT2[1:nrow(tmp)]=tmp[,4]
out$pho[1:nrow(tmp)]=tmp[,5]
writeLines(paste0("The result is ready at ",as.character(Sys.time())))
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.