R/PQLseq.R

Defines functions PQLseq.AI PQLseq.fit pqlseq

Documented in pqlseq

########################################################################################################################
# Package: PQLseq
# Version: 1.2
# Date   : 2020-05-01
# Title  : Heritability Estimation and Differential Analysis with Generalized Linear Mixed Models in Large-Scale Genomic Sequencing Studies
# Authors: Shiquan Sun, Jiaqiang Zhu, and Xiang Zhou
# Contact: jiaqiang@umich.edu 
#          University of Michigan, Department of Biostatistics
########################################################################################################################


pqlseq <- function(RawCountDataSet, Phenotypes, Covariates=NULL, RelatednessMatrix=NULL, LibSize=NULL, 
                  fit.model="PMM", fit.method = "AI.REML", fit.maxiter=500, fit.tol=1e-5, numCore=1, 
				  filtering=TRUE, verbose=FALSE, ...) {
	# specify the number of cores we want to use
	if(numCore > 1){
		if(numCore>detectCores()){warning("PQLseq:: the number of cores you're setting is larger than detected cores!");numCore = detectCores()-1}
	}

	# registerDoParallel(numCore)

	cl <- makeCluster(numCore)
	registerDoParallel(cl,cores=numCore)
	# on.exit(stopCluster(cl))
	
	# filtering genes/sites
	if (filtering & fit.model == "PMM"){
		unfilterIdx <- apply(RawCountDataSet, 1, function(x){length(x[x>5])>=2} )
		CountData   <- RawCountDataSet[unfilterIdx,]
	}else{
		CountData   <- RawCountDataSet
	}
	rm(RawCountDataSet)
    
	numVar <- dim(CountData)[1]
	numIDV <- dim(CountData)[2]
	
	# remove the intercept
	if(length(unique(Covariates[,1])) == 1){
		Covariates<- Covariates[,-1]
	}

	if(is.null(Covariates)){
		numCov <- 0
	}else{
		numCov     <- dim(Covariates)[2]
		Covariates <- as.matrix(Covariates)
	}
  
	cat(paste("## number of total individuals: ", numIDV,"\n"))
	cat(paste("## number of total genes/sites: ", numVar,"\n"))
	cat(paste("## number of adjusted covariates: ", numCov,"\n"))
  
  
	CountData  <- as.matrix(CountData)
	Phenotypes <- as.matrix(Phenotypes)
  
  
	if(is.null(RelatednessMatrix)){
		stop("PQLseq::please input relatedness matrix!")
	}else{
		RelatednessMatrix <- as.matrix(RelatednessMatrix)
		scalerM           <- diag(numIDV)-(rep(1,numIDV)%*%t(rep(1,numIDV)))/numIDV
		eig               <- eigen(RelatednessMatrix)
		eigval            <- eig$value
		eigvector         <- eig$vectors
		if(any(eigval<1e-10)){ 
			warning("PQLseq::the relatedness matrix is singular, it has been modified!")
			RelatednessMatrix <- as.matrix(nearPD(RelatednessMatrix,corr=T)$mat)	
		}
		rm(scalerM)
		rm(eig)
		rm(eigval)
		rm(eigvector)
	}

	RelatednessMatrix <- list(RelatednessMatrix, diag(numIDV))
  
	#***********************************#
	#       Poisson Mixed Model         #
	#***********************************#
	if(fit.model == "PMM"){
		cat("# fitting Poisson mixed model ... \n")
		if(is.null(LibSize)){
			LibSize <- apply(CountData, 2, sum)
			LibSize <- as.matrix(LibSize)
		}else{
			LibSize <- as.matrix(t(LibSize))
		}
	 	 
	 
		# do parallel using foreach function
		iVar   <- NULL
		resPMM <-foreach(iVar=1:numVar,.combine=rbind)%dopar%{
			numAnalysis <- beta <- tau1 <- tau2 <- se_beta <- pvalue <- converged <- h2 <- sigma2 <- overdisp <- NA
			if(numCov==0){
				model0 <- try(glm(formula = CountData[iVar,]~Phenotypes + offset(log(LibSize)), family = poisson(link="log")))
				idx   <- match(rownames(model.frame(formula = CountData[iVar,]~Phenotypes + offset(log(LibSize)), na.action = na.omit)),
      				      rownames(model.frame(formula = CountData[iVar,]~Phenotypes + offset(log(LibSize)), na.action = na.pass)))
			}else{
				model0 <- try(glm(formula = CountData[iVar,]~Covariates + Phenotypes + offset(log(LibSize)), family = poisson(link="log")))
				idx   <- match(rownames(model.frame(formula = CountData[iVar,]~Covariates + Phenotypes + offset(log(LibSize)), na.action = na.omit)),
                          rownames(model.frame(formula = CountData[iVar,]~Covariates + Phenotypes + offset(log(LibSize)), na.action = na.pass)))
			}
		
			if(verbose) {cat(paste("NO. Gene = ",iVar,"\n"))}
    
			tmpRelatednessMatrix <- RelatednessMatrix
			if(is(tmpRelatednessMatrix,"matrix")) {
				tmpRelatednessMatrix <- tmpRelatednessMatrix[idx, idx]
			}else {
				for(ik in seq_len(length(tmpRelatednessMatrix)) ) {tmpRelatednessMatrix[[ik]] <- tmpRelatednessMatrix[[ik]][idx, idx]}
			}
      
			names(tmpRelatednessMatrix) <- paste("kins", 1:length(tmpRelatednessMatrix), sep="")

			if(class(model0)[1]!="try-error"){
				# t1 <- system.time(model1 <- try(PQLseq.fit(model0, tmpRelatednessMatrix)))
				model1 <- try(PQLseq.fit(model0, tmpRelatednessMatrix))
			}else{
				model1 <- NULL
			}
	
			if(!is.null(model1)&(class(model1)!="try-error")){				
				if(verbose){cat(paste("PQLseq::PMM::tau = ", model1$theta,"\n"))}
				numAnalysis <- length(idx)
				beta        <- model1$coefficients[length(model1$coefficients)]
				se_beta     <- sqrt(diag(model1$cov)[length(model1$coefficients)] )
				pvalue      <- pchisq( (beta/se_beta)^2, 1, lower.tail = F)
				sigma2      <- model1$theta[2]+model1$theta[3]
				h2          <- model1$theta[2]/(sigma2)
				tau1        <- model1$theta[2]
				tau2        <- model1$theta[3]
				converged   <- model1$converged
			}else{converged <- FALSE}
			
			res <- data.frame(numIDV = numAnalysis, beta = beta, se_beta = se_beta, 
							  pvalue = pvalue, h2 = h2, sigma2 = sigma2, 
							  converged = converged) 
		}# end for iVar, parallel
		rm(iVar)
		# closeAllConnections()
		# if(nrow(showConnections())!=0){closeAllConnections()}
		# cons <- suppressWarnings(showConnections(all = TRUE))
		# rm(cons)

		parallel::stopCluster(cl)

		rownames(resPMM) <- rownames(CountData)
		return(resPMM)
	}# end PMM 
	#***********************************#
	#       Binomial Mixed Model        #
	#***********************************#
	if(fit.model == "BMM"){ 
		cat("# fitting binomial mixed model ... \n")
		if(is.null(LibSize)){
			stop("PQLseq::BMM::ERROR: please input the LibSize (total counts) file!!")
		}else{
			LibSize <- as.matrix(LibSize)
		}
  
		ratio               <- CountData/LibSize
		ratio[is.na(ratio)] <- 0
		flag                <- ratio>1.0
		sumflag             <- apply(flag,1, sum)
		idx                 <- which(sumflag>0)

		if (length(idx)>0){
			CountData <- CountData[-idx,]
			LibSize   <- LibSize[-idx,]
		}else{
			CountData <- CountData
			LibSize   <- LibSize
		}
		
		numVar <- dim(CountData)[1]
		numIDV <- dim(CountData)[2]	
		iVar   <- NULL
     
		# do parallel
		resBMM <- foreach(iVar=1:numVar,.combine=rbind)%dopar%{
			numAnalysis <- beta <- tau1 <- tau2 <- se_beta <- pvalue <- converged <- h2 <- sigma2 <- overdisp <- NA
			if(verbose){cat(paste("NO. Gene/Site = ",iVar,"\n"))}
			if(sum(dim(LibSize)==dim(CountData)) != 2){
				stop("PQLseq::BMM::ERROR: the dimensions of read counts and total read counts do not match!")
			}
    
			LibSize <- as.matrix(LibSize)
    
			if(numCov == 0){
				model0 <- glm(formula = CountData[iVar,]/LibSize[iVar,]~Phenotypes, family = binomial(link = "logit"), weights = LibSize[iVar,])
				idx    <- match(rownames(model.frame(formula = CountData[iVar,]/LibSize[iVar,]~Phenotypes, na.action = na.omit)),
                          rownames(model.frame(formula = CountData[iVar,]/LibSize[iVar,]~Phenotypes, na.action = na.pass)))
			}else{
				model0 <- glm(formula = CountData[iVar,]/LibSize[iVar,]~Covariates + Phenotypes, family = binomial(link = "logit"), weights = LibSize[iVar,] )
				idx    <- match(rownames(model.frame(formula = CountData[iVar,]/LibSize[iVar,]~Covariates + Phenotypes, na.action = na.omit)),
						  rownames(model.frame(formula = CountData[iVar,]/LibSize[iVar,]~Covariates + Phenotypes, na.action = na.pass)))
			}
	 	
			model0$numTotal <- LibSize[iVar,idx]
			model0$numSucc  <- CountData[iVar,idx]
     
			redflag <- FALSE
			for( ierr in c(2:dim(model.matrix(model0))[2])){
				if(length(unique(model.matrix(model0)[,ierr])) == 1){
					warning(paste("PQLseq::BMM::the ",ierr-1,"-th column of covariates are the same for gene/site ",rownames(CountData)[iVar],"!",sep = "") )
					redflag <- TRUE
				}
			}
			if(!redflag){
      
				tmpRelatednessMatrix <- RelatednessMatrix
				if(is(tmpRelatednessMatrix,"matrix")) {
					tmpRelatednessMatrix <- tmpRelatednessMatrix[idx, idx]
				}else {
					for(ik in seq_len(length(tmpRelatednessMatrix)) ) {
						tmpRelatednessMatrix[[ik]] <- tmpRelatednessMatrix[[ik]][idx, idx]
					}
				}
				names(tmpRelatednessMatrix) <- paste("kins", 1:length(tmpRelatednessMatrix), sep="")

				# t1 <- system.time(model1 <- try( PQLseq.fit(model0, tmpRelatednessMatrix) ))
				model1 <- try(PQLseq.fit(model0, tmpRelatednessMatrix))
		
				if(class(model1) != "try-error"&!is.null(model1)){
					if(verbose){cat(paste("PQLseq::BMM::tau = ", model1$theta,"\n"))}
					numAnalysis <- length(idx)
					beta        <- model1$coefficients[ length(model1$coefficients) ]# the last one
					se_beta     <- sqrt( diag(model1$cov)[ length(model1$coefficients) ] )
					pvalue      <- pchisq( (beta/se_beta)^2, 1, lower.tail = F)
					sigma2      <- model1$theta[2]+model1$theta[3]
					h2          <- model1$theta[2]/(sigma2)
					tau1        <- model1$theta[2]
					tau2        <- model1$theta[3]
					converged   <- model1$converged
				}else{converged <- FALSE}
		
				res <- data.frame(numIDV = numAnalysis, beta = beta, se_beta = se_beta, 
							pvalue = pvalue, h2 = h2, sigma2 = sigma2, 
							converged = converged)
			}# end for iVar, parallel
		
		}
		rm(iVar)

		# if(nrow(showConnections())!=0){closeAllConnections()}
		# closeAllConnections()
		# cons <- suppressWarnings(showConnections(all = TRUE))
		# rm(cons)

		parallel::stopCluster(cl)

		rownames(resBMM) <- rownames(CountData)
		return(resBMM)
	}# end BMM
	
}# end function PQLseq


##########################################################
#           	   PQLseq FIT FUNCTION					 #
##########################################################

PQLseq.fit <- function(model0, RelatednessMatrix, method = "REML", method.optim = "AI", maxiter = 500, tol = 1e-5, verbose = FALSE) {
	
	names(RelatednessMatrix) <- paste("kins", 1:length(RelatednessMatrix), sep="")
	# if((method.optim == "AI")&(!sum(model0$fitted.values<1e-5))) {
	if(method.optim == "AI") {
		fixtau.old 	<- rep(0, length(RelatednessMatrix)+1)
		# to use average information method to fit alternative model
		model1 		<- PQLseq.AI(model0, RelatednessMatrix, maxiter = maxiter, tol = tol, verbose = verbose)
		fixtau.new 	<- 1*(model1$theta < 1.01 * tol)

		while(any(fixtau.new != fixtau.old)) {
			fixtau.old <- fixtau.new
			model1 	<- PQLseq.AI(model0, RelatednessMatrix, fixtau = fixtau.old, maxiter = maxiter, tol = tol, verbose = verbose)
			fixtau.new <- 1*(model1$theta < 1.01 * tol)
		}
	}else{
		model1 <- NULL
	}
	return(model1)
}

##########################################################
#       PQLseq FIT AVERAGE INFORMATION FUNCTION			 #
##########################################################

PQLseq.AI <- function(model0, RelatednessMatrix, tau = rep(0, length(RelatednessMatrix)+1), fixtau = rep(0, length(RelatednessMatrix)+1), maxiter = 500, tol = 1e-5, verbose = FALSE) {

	if(model0$family$family %in% c("binomial")){
		y <- model0$numSucc
	}else{
		y <- model0$y
	}
	numIDV <- length(y)
	offset <- model0$offset
	if(is.null(offset)) {offset <- rep(0, numIDV)}
	
	family <- model0$family
	eta <- model0$linear.predictors
	mu <- model0$fitted.values
	mu.eta <- family$mu.eta(eta)
    D <- mu.eta/sqrt(model0$family$variance(mu))
  
	if(family$family %in% c("binomial")){
	  mu.eta <- model0$numTotal*mu.eta
	  D <- mu.eta/sqrt(model0$numTotal*model0$family$variance(mu))
	  mu <- model0$numTotal*mu
	}

	Y <- eta - offset + (y - mu)/mu.eta	
	X <- model.matrix(model0)
	alpha <- model0$coef
	
	if(family$family %in% c("poisson", "binomial")) {
		tau[1] <- 1
		fixtau[1] <- 1
	}
	numK <- length(RelatednessMatrix)
	idxtau <- which(fixtau == 0)
	numK2 <- sum(fixtau == 0)
	if(numK2 > 0) {
		tau[fixtau == 0] <- rep(min(0.9,var(Y)/(numK+1)), numK2)  
	  
		H <- tau[1]*diag(1/D^2)
		for(ik in 1:numK) {H <- H + tau[ik+1]*RelatednessMatrix[[ik]]}
	
		Hinv 	<- chol2inv(chol(H))
		HinvX 	<- crossprod(Hinv, X)
		XHinvX 	<- crossprod(X, HinvX)

		P <- try(Hinv - tcrossprod(tcrossprod(HinvX, chol2inv(chol( XHinvX ))), HinvX))
	
		if(is(P,"try-error")){
			stop("Error in P matrix calculation!")
		}

		PY <- crossprod(P, Y)
		tau0 <- tau
		for(ik in 1:numK2) {
		        if(ik == 1 && fixtau[1] == 0) tau[1] <- max(0, tau0[1] + tau0[1]^2 * (sum((PY/D)^2) - sum(diag(P)/D^2))/numIDV)
			else {
				PAPY <- crossprod(P, crossprod(RelatednessMatrix[[idxtau[ik]-1]], PY))
				tau[idxtau[ik]] <- max(0, tau0[idxtau[ik]] + tau0[idxtau[ik]]^2 * (crossprod(Y, PAPY) - sum(P*RelatednessMatrix[[idxtau[ik]-1]]))/numIDV)
			}
		}
	} 
	
	for (iter in seq_len(maxiter)) {	
		alpha0 	<- alpha
		tau0 	<- tau
		model1 	<- AI(Y, X, length(RelatednessMatrix), RelatednessMatrix, D^2, tau, fixtau, tol)
		
		tau <- as.numeric(model1$tau)
		cov <- as.matrix(model1$cov)
		alpha <- as.numeric(model1$alpha)
		eta <- as.numeric(model1$eta) + offset
		
		
		mu <- family$linkinv(eta)
		mu.eta <- family$mu.eta(eta)
		D <- mu.eta/sqrt(family$variance(mu))

		if(family$family %in% c("binomial")){
		  mu.eta <- model0$numTotal*mu.eta
		  D <- mu.eta/sqrt(model0$numTotal*family$variance(mu))
		  mu <- model0$numTotal*mu
		}
		
		Y <- eta - offset + (y - mu)/mu.eta

		if(2*max(abs(alpha - alpha0)/(abs(alpha) + abs(alpha0) + tol), abs(tau - tau0)/(abs(tau) + abs(tau0) + tol)) < tol) {break}
		if(max(tau) > tol^(-2)|any(is.infinite(D))|any(is.infinite(mu))|any(is.infinite(eta)) ) {
			
			iter <- maxiter
			break
		}
	}

	converged <- ifelse(iter < maxiter, TRUE, FALSE)
	res <- y - mu
	P <- model1$P
	return(list(theta = tau, coefficients = alpha, linear.predictors = eta, fitted.values = mu, Y = Y, P = P, residuals = res, cov = cov, converged = converged))
}# end function


#########################################
#             CODE END                  #
#########################################

Try the PQLseq package in your browser

Any scripts or data that you put into this service are public.

PQLseq documentation built on June 6, 2021, 5:06 p.m.