R/RRate.R
In RRate: Estimating Replication Rate for Genome-Wide Association Studies

Documented in repRateEst repSampleSizeRR repSampleSizeRR2 SEest

# TODO: Estimating the Replication Rate
# 
# Author: wjiangaa
###############################################################################

#Woolf's method to estimate standard error of log(ORhat) 
SEest<-function(n0,n1,fU,fA){
	return(sqrt(1/(2*n0*fU*(1-fU))+(1/(2*n1*fA*(1-fA)))))
}

#Positive probability for log odds ratio test
posProb<-function(MU,SE,zalpha2){
	return(pnorm(-zalpha2-MU/SE)+pnorm(-zalpha2+MU/SE))
}

#Positive probability for log odds ratio test in the replication study
posProbR<-function(MU,SE2,zalphaR2,z){
	return(pnorm(-zalphaR2+sign(z)*MU/SE2))
}

# Infer the proportion of nonassociated SNPs (NULL hypotheses) using Storey's Method [2003]
# See Storey, J. D., & Tibshirani, R. (2003). 
# Statistical significance for genomewide studies. 
# Proceedings of the National Academy of Sciences, 100(16), 9440-9445.
# Input: P values-pvals[vector]
#        show plot estimation process-showEst[bool, T]
#        show result info-info[bool, T]
#        Output directory for inference plot when showEst=T-outputDir[str, "output"]
# Output: The proportion of nonassociated SNPs (spline)-pi0_1[scalar]
#         The proportion of nonassociated SNPs (lambda=0.5)-pi0_2[scalar]
snpNullS<-function(pval, showEst=TRUE, info=TRUE, outputDir="output"){
#	browser()
	m<-length(pval)
	pvalA<-sort(pval) #Sorting pvals with ascending order
#	lambda<-seq(0.05,0.95,0.05)
	lambda<-seq(0.05,0.8,0.05)
	idx<-1
	pi0HatF<-rep(0,length(lambda))
	for(i in 1:length(lambda)){
		for(idx in idx:m) if(pvalA[idx]>=lambda[i]) break
		pi0HatF[i]<-(m-idx+1)/(m*(1-lambda[i]))
	}
#	library('splines')
	fHat <- lm(pi0HatF ~ ns(lambda, df = 3))
	pi0Pred<- predict(fHat,data.frame(lambda=lambda))
#	pi0<-min(pi0Pred[length(lambda)],1)
	pi0<-min(predict(fHat,data.frame(lambda=1)),1)

#	fHat <- smooth.spline(lambda, pi0HatF, df = 3)
#	pi0Pred <- predict(fHat, x = lambda)$y
##	pi0<-min(pi0Pred[length(lambda)],1)
#	pi0<-min(predict(fHat, x = 1)$y,1)

	if(showEst==TRUE){
		dir.create(outputDir,showWarnings=F)
		pdf(paste(outputDir,"pi0infer.pdf",sep="/"))
		plot(lambda,pi0HatF, xlab = "lambda", ylab = "pi0_hat",main="Estimating pi0")
		lines(lambda, pi0Pred)
		dev.off()
	}
	
#	pi0_2<-min(pi0HatF[51],1)
	if(info) 
		cat("Estimated pi0 (spline):",pi0,"\n")
	return(list(pi0=pi0,pi0Full=pi0HatF,pi0Pred=pi0Pred))
}

#Estimate the variance of log(OR), i.e. sigma0^2
snpMUvar<-function(z,pi0, SE){
	m<-length(z)
	if(pi0==1) return(0)
	return(max(((sum(z^2)-m*pi0)/(1-pi0)-m)/sum(1/SE^2),0))
}

#local fdr based on two parameters model
lfdr2<-function(z,SE,sigma02, pi0){
	f0<-dnorm(z)
	f1<-dnorm(z,0,sqrt(1+sigma02/(SE^2)))
	return(pi0*f0/(pi0*f0+(1-pi0)*f1))
}

#replication rate /true non-replication rate/ average RR
repRate<-function(MUhat,SE, SE2,sigma02,pi0,zalphaR2, output=TRUE, dir='output'){ #num is the number of sampling points
	alphaR<-2*(1-pnorm(zalphaR2))
	lambda<-sigma02/(SE^2+sigma02)
	zstar<-lambda*MUhat/SE2
	sigmaStar2<-1+lambda*(SE^2)/(SE2^2)
	#replication predictive power
	repRateH1<-pnorm((-zalphaR2+sign(MUhat)*zstar)/sqrt(sigmaStar2))

	lfdr<-lfdr2(MUhat/SE,SE,sigma02,pi0)
	RR<-lfdr*alphaR/2+(1-lfdr)*repRateH1
	TNR<-ifelse(RR==1, 0, lfdr*(1-alphaR/2)/(1-RR))
#	prob1<-dnorm((zR-zstar)/sqrt(sigmaStar2))
#	fir<-(1-lfdr)*prob1/(lfdr*dnorm(zR)+(1-lfdr)*prob1)
	avgRR<-mean(RR)
	
	if(output==T){
		dir.create(dir,showWarnings=F)
		write.table(cbind(RR,TNR,lfdr,repRateH1),paste(dir,'rate.txt',sep='/'),
				col.names=c('RR','TNR','lfdr','Pred.Power'),row.names=F,sep='\t',quote=F)
		write.table(avgRR,paste(dir,'summary.txt',sep='/'),
				col.names='Avg.RR',row.names=F,sep='\t',quote=F)
	}
	
	return(list(RR=RR,TNR=TNR,lfdr=lfdr,predPower=repRateH1, GRR=avgRR))
}

#Bootstrap to estimate pi0 and sigma0^2
snpBoot<-function(z,SE,num=100,pval=FALSE){ #,pvalEst=1){ #num is the resampling numbers
	pi0Boot<-rep(0,num)
	sigma02Boot<-rep(0,num)
	if(is.logical(pval)==T && pval==F) 
		pval<-2*(1-pnorm(abs(z)))
	for(i in 1:num){
		repeat{
			idx<-sample.int(length(z),replace=T)
			tmpZ<-z[idx]
			tmpSE<-SE[idx]		
			tmpPval<-pval[idx]
	#		pi0Boot[i]<-switch(pvalEst,snpNullS(tmpPval,showEst=F,info=F)$pi0_1,snpNullS(tmpPval,showEst=F,info=F)$pi0_2)
	#		pi0Boot[i]<-propTrueNull(tmpPval)
			pi0Boot[i]<-snpNullS(tmpPval,showEst=F,info=F)$pi0
			if(pi0Boot[i]!=1) break
		}
		sigma02Boot[i]<-snpMUvar(tmpZ,pi0Boot[i], tmpSE)
#		cat(i,pi0Boot[i],sigma02Boot[i],'\n')
	}
#	pi0BootCI<-quantile(pi0Boot,c(0.025,1-0.025))
#	sigma02BootCI<-quantile(sigma02Boot,c(0.025,1-0.025))
#	idx<-(pi0BootCI[1]<=pi0Boot) & (pi0Boot<=pi0BootCI[2]) & (sigma02BootCI[1]<=sigma02Boot) & (sigma02Boot<sigma02BootCI[2])
	return(list(pi0=pi0Boot,sigma02=sigma02Boot))
}

quantile<-function(x,q,na.rm=TRUE){
	result<-rep(0,length(q))
	if(na.rm) x.sorted<-sort(x[!is.na(x)])
	n<-length(x)
	for(i in 1:length(q)){
		result[i]<-x.sorted[floor(n*q[i])]
	}
	return(result)
}

#Pipeline to estimate RR/TNR/avgRR and their CI
repRateEst<-function(MUhat,SE, SE2,zalpha2,zalphaR2,boot=100,output=TRUE,idx=TRUE, dir='output',info=TRUE){#,pvalEst=1){
	#Point Estimation
	z<-MUhat/SE
#	if(is.logical(pval)==T && pval==F) 
	pval<-2*(1-pnorm(abs(z)))

	pi0hat<-snpNullS(pval,outputDir=dir,showEst=output,info=info)$pi0
	if(info) cat('Estimated Pi0:',pi0hat,'\n')
	sigma02hat<-snpMUvar(z,pi0hat,SE)
	if(info) cat('Estimated sigma0^2:',sigma02hat,'\n')
	sigIdx<-(pval<=2*(1-pnorm(zalpha2))) & idx
#	browser()
	repResult<-repRate(MUhat[sigIdx],SE[sigIdx],SE2[sigIdx],sigma02hat,pi0hat,zalphaR2,output=F)
	
	bootResult<-snpBoot(z,SE,boot,pval)#,pvalEst=pvalEst)
	bootRRdist<-matrix(0,nrow=sum(sigIdx),ncol=boot)
	bootTNRdist<-matrix(0,nrow=sum(sigIdx),ncol=boot)
	bootlfdrDist<-matrix(0,nrow=sum(sigIdx),ncol=boot)
	bootPredPowerDist<-matrix(0,nrow=sum(sigIdx),ncol=boot)
	bootAvgRRdist<-rep(0,boot)
	for(i in 1:boot){
		tmpRepResult<-repRate(MUhat[sigIdx],SE[sigIdx],SE2[sigIdx],
				bootResult$sigma02[i],bootResult$pi0[i],zalphaR2,output=F)
		bootRRdist[,i]<-tmpRepResult$RR
		bootTNRdist[,i]<-tmpRepResult$TNR
		bootlfdrDist[,i]<-tmpRepResult$lfdr
		bootPredPowerDist[,i]<-tmpRepResult$predPower
		bootAvgRRdist[i]<-tmpRepResult$GRR
	}
	bootRRlow<-rep(0,sum(sigIdx))
	bootRRhigh<-rep(0,sum(sigIdx))
	bootTNRlow<-rep(0,sum(sigIdx))
	bootTNRhigh<-rep(0,sum(sigIdx))
	bootlfdrLow<-rep(0,sum(sigIdx))
	bootlfdrHigh<-rep(0,sum(sigIdx))
	bootPredPowerLow<-rep(0,sum(sigIdx))
	bootPredPowerHigh<-rep(0,sum(sigIdx))

	#percentile CI
	for(i in 1:sum(sigIdx)){
		bootRR<-quantile(bootRRdist[i,],c(0.025,1-0.025))
		bootRRlow[i]<-bootRR[1]
		bootRRhigh[i]<-bootRR[2]
		bootTNR<-quantile(bootTNRdist[i,],c(0.025,1-0.025),na.rm = T)
		bootTNRlow[i]<-bootTNR[1] 
		bootTNRhigh[i]<-bootTNR[2]
		bootlfdr<-quantile(bootlfdrDist[i,],c(0.025,1-0.025))
		bootlfdrLow[i]<-bootlfdr[1] 
		bootlfdrHigh[i]<-bootlfdr[2]
		bootPredPower<-quantile(bootPredPowerDist[i,],c(0.025,1-0.025))
		bootPredPowerLow[i]<-bootPredPower[1]
		bootPredPowerHigh[i]<-bootPredPower[2] 
	}
	avgBootRR<-quantile(bootAvgRRdist,c(0.025,1-0.025))
	avgBootRRlow<-avgBootRR[1]
	avgBootRRhigh<-avgBootRR[2]
	
	if(output==T){
		dir.create(dir,showWarnings=F)
		write.table(cbind((1:length(z))[sigIdx], z[sigIdx], pval[sigIdx], repResult$RR, bootRRlow , bootRRhigh),
				paste(dir,'rate.txt',sep='/'), col.names=c('Index','Z','P','RR','RR95%low','RR95%upper'),row.names=F,sep='\t',quote=F)
		write.table(cbind(repResult$GRR,avgBootRRlow,avgBootRRhigh),
				paste(dir,'summary.txt',sep='/'), col.names=c('Avg.RR','95%low','95%upper'),row.names=F,sep='\t',quote=F)
	}
	
	return(list(idx=(1:length(z))[sigIdx], pi0=pi0hat, sigma02=sigma02hat,
				RR=repResult$RR, RRlow=bootRRlow, RRhigh=bootRRhigh, #RRdist=bootRRdist,
				lfdr=repResult$lfdr, lfdrLow=bootlfdrLow, lfdrHigh=bootlfdrHigh, #lfdrDist=bootlfdrDist,
				predPower=repResult$predPower, predPowerLow=bootPredPowerLow, predPowerHigh=bootPredPowerHigh, #predPowerDist=bootPredPowerDist,
				GRR=repResult$GRR,GRRlow=avgBootRRlow,GRRhigh=avgBootRRhigh))#, avgRRdist=bootAvgRRdist))
}

#Bisection to find positive x such that fun(x)=y, fun is a monotonic increasing function
incSearch<-function(fun,y,init=1000,tol=0.01){
	x<-init
	lowSign<-FALSE
	highSign<-FALSE
	repeat{
		currY<-fun(x)
		if(currY<y){
			lowX<-x
			x<-2*x
			lowSign<-TRUE
		}else if(currY>y){
			highX<-x
			x<-x/2
			highSign<-TRUE
		}else return(x)
		if(lowSign && highSign) break
	}
	while(highX-lowX>tol){
		x<-(lowX+highX)/2
		currY<-fun(x)
		if(currY<y){
			lowX<-x
		}else if(currY>y){
			highX<-x
		}else return(x)
	}
	return(x)
}

#Sample size determination for the replication study (Control to case ratio is the same with the original study)
repSampleSizeRR<-function(GRR, n, MUhat,SE,zalpha2,zalphaR2,idx=TRUE){
	z<-MUhat/SE
	pval<-2*(1-pnorm(abs(z)))
	pi0hat<-snpNullS(pval,showEst=FALSE,info=FALSE)$pi0
	#pi0hat<-propTrueNull(pval)
	sigma02hat<-snpMUvar(z,pi0hat,SE)
	sigIdx<-(abs(z)>=zalpha2)& idx
	GRRest<-function(n2){
		SE2<-SE/sqrt(n2/n)
		return(repRate(MUhat[sigIdx],SE[sigIdx], SE2[sigIdx],sigma02hat,pi0hat,zalphaR2, output=F)$GRR)
	}
	n2<-incSearch(GRRest,GRR)
	return(round(n2))
}

#Sample size determination for the replication study (Different control to case ratio)
repSampleSizeRR2<-function(GRR,CCR2, MUhat,SE,fU,fA,zalpha2,zalphaR2, idx=TRUE){
	z<-MUhat/SE
	pval<-2*(1-pnorm(abs(z)))
#	pi0hat<-propTrueNull(pval)
	pi0hat<-snpNullS(pval,showEst=FALSE,info=FALSE)$pi0
	sigma02hat<-snpMUvar(z,pi0hat,SE)
	sigIdx<-(abs(z)>=zalpha2)& idx
	GRRest<-function(n2){
		caseNum<-round(n2/(CCR2+1))
		SE2<-SEest(n2-caseNum,caseNum,fU,fA)
		return(repRate(MUhat[sigIdx],SE[sigIdx],SE2[sigIdx],sigma02hat,pi0hat,zalphaR2, output=F)$GRR)
	}
	n2<-incSearch(GRRest,GRR)
	return(round(n2))
}

Any scripts or data that you put into this service are public.

RRate documentation built on May 1, 2019, 8:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

RRate
Estimating Replication Rate for Genome-Wide Association Studies

R/RRate.R
In RRate: Estimating Replication Rate for Genome-Wide Association Studies

Defines functions SEest posProb posProbR snpNullS snpMUvar lfdr2 repRate snpBoot quantile repRateEst incSearch repSampleSizeRR repSampleSizeRR2

Documented in repRateEst repSampleSizeRR repSampleSizeRR2 SEest

Try the RRate package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

RRate Estimating Replication Rate for Genome-Wide Association Studies

R/RRate.R In RRate: Estimating Replication Rate for Genome-Wide Association Studies

Defines functions SEest posProb posProbR snpNullS snpMUvar lfdr2 repRate snpBoot quantile repRateEst incSearch repSampleSizeRR repSampleSizeRR2

Documented in repRateEst repSampleSizeRR repSampleSizeRR2 SEest

Try the RRate package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

RRate
Estimating Replication Rate for Genome-Wide Association Studies

R/RRate.R
In RRate: Estimating Replication Rate for Genome-Wide Association Studies