# R/fitMixedModelDE.R


#' Class MArrayLM2
#'
#' Class \code{MArrayLM2} 
#'
#' @name MArrayLM2-class
#' @rdname MArrayLM2-class
#' @exportClass MArrayLM2
setClass("MArrayLM2",
#  Linear model fit
representation("MArrayLM")#, varComp="data.frame", sigGStruct='list')
)

setIs("MArrayLM2","LargeDataObject")
setAs(from='MArrayLM', to='MArrayLM2', function(from){
	structure(from, class="MArrayLM2")
	})


#' Class FMT 
#'
#' Class \code{FMT} 
#'
#' @name FMT-class
#' @rdname FMT-class
#' @exportClass FMT
#' @keywords internal
setClass('FMT', contains='MArrayLM2')

setAs("MArrayLM2", "FMT", function(from, to ){

	res = new( to, from )
	names(res) = names(from)
	res
})

# define S3 version of these functions

#' @export  
residuals.MArrayLM2 = function( object, ...){
	if( is.null(object$residuals) ){
		stop( "Residuals were not computed, must run:\n dream(...,computeResiduals=TRUE)")
	}
	if( nargs() > 1 ){
		warning("\n Second argument is ignored here,\n but can be passed for compatibility with limma.\n Results are the same either way")
	}
	object$residuals
}


# #' @importFrom limma residuals.MArrayLM
# # @export  
# residuals.MArrayLM = function( object, ...){

# 	if( is.null(object$residuals) ){
# 		# use residuals computed by limma
# 		res = limma::residuals.MArrayLM( object, ...)
# 	}else{
# 		# use precomputed residuals
# 		res = object$residuals
# 	}
# 	res
# }


# S4 methods

#' residuals for MArrayLM
#'
#' residuals for MArrayLM
#'
#' @param object MArrayLM object from dream
#' @param ... other arguments, currently ignored
#'
#' @return results of residuals
#' @export
setMethod("residuals", "MArrayLM",
	function( object, ...){
		if( nargs() == 1 ){
			result = object$residuals
		}else{
			result = residuals.MArrayLM(object,...)
		}
		result
	})


#' residuals for MArrayLM2
#'
#' residuals for MArrayLM2
#'
#' @param object MArrayLM2 object from dream
#' @param ... other arguments, currently ignored
#'
#' @return results of residuals
#' @export
setMethod("residuals", "MArrayLM2",
	function( object, ...){
		residuals.MArrayLM2(object,...)
		})






#' Extract contrast matrix for linear mixed model
#' 
#' Extract contrast matrix, L, testing a single variable.  Contrasts involving more than one variable can be constructed by modifying L directly
#'
#' @param exprObj matrix of expression data (g genes x n samples), or \code{ExpressionSet}, or \code{EList} returned by \code{voom()} from the \code{limma} package
#' @param formula specifies variables for the linear (mixed) model.  Must only specify covariates, since the rows of exprObj are automatically used as the response. e.g.: \code{~ a + b + (1|c)}.  Formulas with only fixed effects also work
#' @param data data.frame with columns corresponding to formula 
#' @param coefficient the coefficient to use in the hypothesis test
#' 
#' @return
#' Contrast matrix testing one variable
#'
#' @examples
#'
#' # load simulated data:
#' # geneExpr: matrix of gene expression values
#' # info: information/metadata about each sample
#' data(varPartData)
#' 
#' # get contrast matrix testing if the coefficient for Batch3 is zero 
#' # The variable of interest must be a fixed effect
#' form <- ~ Batch + (1|Individual) + (1|Tissue) 
#' L = getContrast( geneExpr, form, info, "Batch3")
#'
#' # get contrast matrix testing if Batch3 - Batch2 = 0
#' form <- ~ Batch + (1|Individual) + (1|Tissue) 
#' L = getContrast( geneExpr, form, info, c("Batch3", "Batch2"))
#'
#' # To test against Batch1 use the formula:
#' # 	~ 0 + Batch + (1|Individual) + (1|Tissue) 
#' # to estimate Batch1 directly instead of using it as the baseline
#'
#' @export
# @docType methods
#' @rdname getContrast-method
getContrast = function( exprObj, formula, data, coefficient){ 

	if( length(coefficient) > 2){
		stop("Length of coefficient array limited to 2")
	}

	L = .getContrastInit( exprObj, formula, data)

	# assign coefficient coding
	if( any(!coefficient %in% names(L)) ){
		stop("coefficient is not in the formula.  Valid coef are:\n", paste(names(L), collapse=', '))
	}
	L[coefficient[1]] = 1

	if( length(coefficient) == 2){			
		L[coefficient[2]] = -1
	}
	
	L
}
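# A minimal sketch (hypothetical weights): contrasts involving more than one
# variable can also be built by modifying L directly after calling getContrast(),
# e.g. to test Batch3 - 2*Batch2 = 0 with the simulated varPartData objects:
#   L <- getContrast( geneExpr, ~ Batch + (1|Individual) + (1|Tissue), info, "Batch3")
#   L["Batch2"] <- -2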

#' Get all univariate contrasts
#'
#' Get all univariate contrasts
#'
#' @param exprObj matrix of expression data (g genes x n samples), or ExpressionSet, or EList returned by voom() from the limma package
#' @param formula specifies variables for the linear (mixed) model.  Must only specify covariates, since the rows of exprObj are automatically used as the response. e.g.: \code{~ a + b + (1|c)}.  Formulas with only fixed effects also work
#' @param data data.frame with columns corresponding to formula 
#'
#' @return
#'  Matrix testing each variable one at a time.  Contrasts are on rows
#'
#' @keywords internal
.getAllUniContrasts = function( exprObj, formula, data){ 

	Linit = .getContrastInit( exprObj, formula, data)

	Lall = lapply( seq_len(length(Linit)), function(i){
		Linit[i] = 1
		Linit
		})
	names(Lall) = names(Linit)
	Lall = do.call("rbind", Lall)

	# remove intercept contrasts
	# Lall[,-1,drop=FALSE]
	Lall
}
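# Sketch of the returned object: for a formula such as ~ Batch + (1|Individual),
# .getAllUniContrasts() returns an identity-like matrix with one row and one
# column per fixed-effect coefficient (including the intercept), so each
# coefficient is tested against zero by default:
#   Lall <- .getAllUniContrasts( geneExpr, ~ Batch + (1|Individual), info )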

.getContrastInit = function( exprObj, formula, data){ 

	exprObj = as.matrix( exprObj )
	formula = stats::as.formula( formula )

	REML=TRUE
	useWeights=TRUE
	weightsMatrix=NULL
	showWarnings=FALSE
	dream=TRUE
	fxn=identity
	colinearityCutoff=.999
	control = lme4::lmerControl(calc.derivs=FALSE, check.rankX="stop.deficient" )

	# check dimensions of response and covariates
	if( ncol(exprObj) != nrow(data) ){		
		stop( "the number of samples in exprObj (i.e. cols) must be the same as in data (i.e. rows)" )
	}

	# if weightsMatrix is not specified, set useWeights to FALSE
	if( useWeights && is.null(weightsMatrix) ){
		# warning("useWeights was ignored: no weightsMatrix was specified")
		useWeights = FALSE
	}

	# if useWeights, and (weights and expression are the same size)
	if( useWeights && !identical( dim(exprObj), dim(weightsMatrix)) ){
		 stop( "exprObj and weightsMatrix must be the same dimensions" )
	}

	# If samples names in exprObj (i.e. columns) don't match those in data (i.e. rows)
	if( ! identical(colnames(exprObj), rownames(data)) ){
		 warning( "Sample names of responses (i.e. columns of exprObj) do not match\nsample names of metadata (i.e. rows of data).  Recommend consistent\nnames so downstream results are labeled consistently." )
	}

	# add response (i.e. exprObj[j,]) to formula
	# Use term 'responsePlaceholder' to store the value of response j (exprObj[j,])
	# This is not an ideal way to structure the evaluation of response j.
	# 	The formula is evaluated in a different scope (i.e. within lmer()), and there is no need to pass the
	# 	entire exprObj object into that scope.  With lexical scoping, the value of exprObj[j,]
	# 	could differ when evaluated in the lower scope.
	# This placeholder term addresses that issue
	form = paste( "responsePlaceholder$E", paste(as.character( formula), collapse=''))

	# run lmer() to see if the model has random effects
	# if so, run lmer() in the loop
	# else run lm()
	responsePlaceholder = nextElem(exprIter(exprObj, weightsMatrix, useWeights))
	possibleError <- tryCatch( lmer( eval(parse(text=form)), data=data,control=control ), error = function(e) e)

	mesg <- "No random effects terms specified in formula"
	method = ''
	if( inherits(possibleError, "error") && identical(possibleError$message, mesg) ){
		
		design = model.matrix( formula, data)

		L = rep(0, ncol(design))
		names(L) = colnames(design)

		 # detect error when variable in formula does not exist
	}else if( inherits(possibleError, "error") && length( grep("object '.*' not found", possibleError$message)) > 0 ){
		stop("Variable in formula is not found: ", gsub("object '(.*)' not found", "\\1", possibleError$message) )
	}
	else{

		if( inherits(possibleError, "error") && grep('the fixed-effects model matrix is column rank deficient', possibleError$message) == 1 ){
			stop(paste(possibleError$message, "\n\nSuggestion: rescale fixed effect variables.\nThis will not change the variance fractions or p-values."))
		} 		

		fit = lmer( eval(parse(text=form)), data=data,control=control, REML=TRUE )
		
		L = rep(0, length(fixef(fit)))
		names(L) = names(fixef(fit))
	}
	L
}




# Evaluate contrasts for linear mixed model
# 
# Evaluate contrasts for linear mixed model
#
# @param fit model fit
# @param L contrast matrix
# @param ddf Specify "Satterthwaite" or "Kenward-Roger" method to estimate effective degrees of freedom for hypothesis testing in the linear mixed model.  Note that Kenward-Roger is more accurate, but is *much* slower.  Satterthwaite is a good enough approximation for most datasets.
# 
# @return
# df, sigma, beta, SE of model
#
# @details If the Kenward-Roger covariance matrix is not positive definite, the Satterthwaite method is used
#
# @export
# @docType methods
# @rdname eval_lmm-method
.eval_lmm = function( fit, L, ddf ){

	j = 1
	# evaluate each contrast
	# cons = lmerTest::contest(fit, L, ddf=ddf)
	cons = foreach( j = 1:ncol(L), .combine=rbind) %do% {
		lmerTest::contest(fit, L[,j], ddf=ddf)
	}

	df = as.numeric(cons[,'DenDF'])

	if(ddf == "Kenward-Roger"){
		# KR
		V = pbkrtest::vcovAdj.lmerMod(fit, 0)

		# if matrix is not PSD
		if( min(diag(as.matrix(V))) < 0){			
			warning("The adjusted Kenward-Roger covariance matrix is not positive definite.\nUsing Satterthwaite approximation instead")

			# Satterthwaite
			V = vcov(fit)
		}
		# df = pbkrtest::get_Lb_ddf(fit, L)
	}else{
		# Satterthwaite
		V = vcov(fit)
		# df = as.numeric(contest(fit, L, ddf="Sat")['DenDF'])
	}

	sigma = attr(lme4::VarCorr(fit), "sc")

	# get contrasts
	# beta = as.matrix(sum(L * fixef(fit)), ncol=1)
	# colnames(beta) = "logFC"

	beta = foreach( j = 1:ncol(L), .combine=rbind) %do% {
		as.matrix(sum(L[,j] * fixef(fit)), ncol=1)
	}
	colnames(beta) = "logFC"
	rownames(beta) = colnames(L)

	# SE = as.matrix(sqrt(sum(L * (V %*% L))), ncol=1)		
	# colnames(SE) = "logFC"
	SE = foreach( j = 1:ncol(L), .combine=rbind) %do% {
		as.matrix(sqrt(sum(L[,j] * (V %*% L[,j]))), ncol=1)
	}
	colnames(SE) = "logFC"
	rownames(SE) = colnames(L)

	# pValue = 2*pt(as.numeric(abs(beta / SE)), df, lower.tail=FALSE)
	pValue = as.numeric(cons[,'Pr(>F)'])

	list(cons = cons,
		df		= df,
		sigma	= sigma,
		beta	= beta,
		SE		= SE,
		pValue	= pValue,
		vcov 	= V )
}
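# Usage sketch (internal): given an lmerTest fit and a contrast matrix L with
# contrasts in columns,
#   mod <- .eval_lmm( fit, L, ddf="Satterthwaite" )
# returns per-contrast effect sizes (mod$beta), standard errors (mod$SE),
# denominator degrees of freedom (mod$df), p-values (mod$pValue) and the
# fixed-effect covariance matrix (mod$vcov).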



.checkNA = function(exprObj){
	# check if values are NA
	countNA = sum(is.nan(exprObj)) + sum(!is.finite(exprObj))
	if( countNA > 0 ){
		stop("There are ", countNA, " NA/NaN/Inf values in exprObj\nMissing data is not allowed")
	}

	# check if all genes have variance
	rv = apply( exprObj, 1, var)
	if( any( rv == 0) ){
		idx = which(rv == 0)
		stop(paste("Response variable", idx[1], 'has a variance of 0'))
	}		
}

#' Differential expression with linear mixed model
#' 
#' Fit linear mixed model for differential expression and perform hypothesis tests on fixed effects as specified in the contrast matrix L
#'
#' @param exprObj matrix of expression data (g genes x n samples), or ExpressionSet, or EList returned by voom() from the limma package
#' @param formula specifies variables for the linear (mixed) model.  Must only specify covariates, since the rows of exprObj are automatically used as the response. e.g.: \code{~ a + b + (1|c)}.  Formulas with only fixed effects also work, and lmFit() followed by contrasts.fit() are run.
#' @param data data.frame with columns corresponding to formula 
#' @param L contrast matrix specifying a linear combination of fixed effects to test
#' @param ddf Specify "Satterthwaite" or "Kenward-Roger" method to estimate effective degrees of freedom for hypothesis testing in the linear mixed model.  Note that Kenward-Roger is more accurate, but is *much* slower.  Satterthwaite is a good enough approximation for most datasets.
#' @param useWeights if TRUE, analysis uses heteroskedastic error estimates from \code{voom()}.  Value is ignored unless exprObj is an \code{EList()} from \code{voom()} or \code{weightsMatrix} is specified
#' @param weightsMatrix matrix the same dimension as exprObj with observation-level weights from \code{voom()}.  Used only if useWeights is TRUE 
#' @param control control settings for \code{lmer()}
#' @param suppressWarnings if TRUE, do not stop because of warnings or errors in model fit
#' @param quiet suppress message, default FALSE
#' @param BPPARAM parameters for parallel evaluation
#' @param computeResiduals if TRUE, compute residuals and extract with \code{residuals(fit)}.  Setting to FALSE saves memory
#' @param REML use restricted maximum likelihood to fit linear mixed model. default is TRUE.  See Details.
#' @param ... Additional arguments for \code{lmer()} or \code{lm()}
#' 
#' @return 
#' MArrayLM2 object (just like MArrayLM from limma), and the directly estimated p-value (without eBayes)
#'
#' @details 
#' A linear (mixed) model is fit for each gene in exprObj, using formula to specify variables in the regression.  If categorical variables are modeled as random effects (as is recommended), then a linear mixed model is used.  For example, if formula is \code{~ a + b + (1|c)}, then the model is 
#'
#' \code{fit <- lmer( exprObj[j,] ~ a + b + (1|c), data=data)}
#'
#' \code{useWeights=TRUE} causes \code{weightsMatrix[j,]} to be included as weights in the regression model.
#'
#' Note: Fitting the model for 20,000 genes can be computationally intensive.  To accelerate computation, models can be fit in parallel using \code{BiocParallel} to run code in parallel.  Parallel processing must be enabled before calling this function.  See below.
#' 
#' The regression model is fit for each gene separately. Samples with missing values in either gene expression or metadata are omitted by the underlying call to lmer.
#'
#' Hypothesis tests and degrees of freedom are produced by the \code{lmerTest} and \code{pbkrtest} packages
#'
#' While \code{REML=TRUE} is required by \code{lmerTest} when ddf='Kenward-Roger', ddf='Satterthwaite' can be used with \code{REML} as \code{TRUE} or \code{FALSE}.  Since the Kenward-Roger method gave the best power with an accurate control of false positive rate in our simulations, and since the Satterthwaite method with REML=TRUE gives p-values that are slightly closer to the Kenward-Roger p-values, \code{REML=TRUE} is the default.  See Vignette "3) Theory and practice of random effects and REML"
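#'
#' For example, a Kenward-Roger fit can be requested with
#' \code{fit <- dream(geneExpr, form, info, ddf="Kenward-Roger")}
#' (a sketch using the simulated \code{varPartData} objects from the examples below).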
#'
#' @examples
#'
#' # load library
#' # library(variancePartition)
#'
#' # Initialize parallel backend with 4 cores
#' library(BiocParallel)
#' register(SnowParam(4))
#'
#' # load simulated data:
#' # geneExpr: matrix of gene expression values
#' # info: information/metadata about each sample
#' data(varPartData)
#' 
#' form <- ~ Batch + (1|Individual) + (1|Tissue) 
#' 
#' # Fit linear mixed model for each gene
#' # run on just 10 genes for time
#' fit = dream( geneExpr[1:10,], form, info)
#'
#' # view top genes
#' topTable( fit )
#'
#' # get contrast matrix testing if the coefficient for Batch2 is 
#' # different from coefficient for Batch3
#' # The variable of interest must be a fixed effect
#' L = getContrast( geneExpr, form, info, c("Batch2", "Batch3"))
#' 
#' # plot contrasts
#' plotContrasts( L )
#' 
#' # Fit linear mixed model for each gene
#' # run on just 10 genes for time
#' # Note that dream() is not compatible with eBayes()
#' fit2 = dream( geneExpr[1:10,], form, info, L)
#' 
#' # view top genes
#' topTable( fit2 )
#' 
#' # Parallel processing using multiple cores with reduced memory usage
#' param = SnowParam(4, "SOCK", progressbar=TRUE)
#' fit3 = dream( geneExpr[1:10,], form, info, L, BPPARAM = param)
#'
#' # Fit fixed effect model for each gene
#' # Use lmFit in the backend
#' # Need to run eBayes afterward
#' form <- ~ Batch 
#' fit4 = dream( geneExpr[1:10,], form, info)
#' fit4 = eBayes( fit4 )
#' 
#' # view top genes
#' topTable( fit4 )
#'
#' # Compute residuals using dream
#' fit5 = dream( geneExpr[1:10,], form, info, L, BPPARAM = param, computeResiduals=TRUE)
#' 
#' # Return residuals
#' residuals(fit5)
#' 
#' @export
# @docType methods
#' @rdname dream-method
#' @importFrom pbkrtest get_SigmaG
#' @importFrom BiocParallel bpiterate bpparam
#' @importFrom lme4 VarCorr 
#' @importFrom stats hatvalues
#' @import doParallel foreach
#'
dream <- function( exprObj, formula, data, L, ddf = c("Satterthwaite", "Kenward-Roger"), useWeights=TRUE, weightsMatrix=NULL, control = lme4::lmerControl(calc.derivs=FALSE, check.rankX="stop.deficient" ),suppressWarnings=FALSE, quiet=FALSE, BPPARAM=bpparam(), computeResiduals=TRUE, REML=TRUE, ...){ 

	exprObjInit = exprObj
	
	exprObjMat = as.matrix( exprObj )
	formula = stats::as.formula( formula )
	ddf = match.arg(ddf)
	colinearityCutoff = 0.999

	if( ! is.data.frame(data) ){
		stop("data must be a data.frame")
	}

	# check dimensions of response and covariates
	if( ncol(exprObj) != nrow(data) ){		
		stop( "the number of samples in exprObj (i.e. cols) must be the same as in data (i.e. rows)" )
	}

	# assign weightsMatrix from exprObj
	if( is( exprObj, "EList") ){
		if( useWeights ){
			weightsMatrix = exprObj$weights
		}
		.checkNA( exprObj$E )
	}else{
		.checkNA( exprObj )
	}


	if( !(ddf %in% c("Kenward-Roger", 'Satterthwaite')) ){
		stop("Specify ddf correctly")
	}

	if( ddf == "Kenward-Roger" & ! REML ){
		stop("Kenward-Roger must be used with REML")
	}

	# if weightsMatrix is not specified, set useWeights to FALSE
	if( useWeights && is.null(weightsMatrix) ){
		# warning("useWeights was ignored: no weightsMatrix was specified")
		useWeights = FALSE
	}

	# if useWeights, and (weights and expression are the same size)
	if( useWeights && !identical( dim(exprObj), dim(weightsMatrix)) ){
		 stop( "exprObj and weightsMatrix must be the same dimensions" )
	}

	if( ! useWeights ){
		weightsMatrix = NULL	
	}
	
	# If samples names in exprObj (i.e. columns) don't match those in data (i.e. rows)
	if( ! identical(colnames(exprObj), rownames(data)) ){
		 warning( "Sample names of responses (i.e. columns of exprObj) do not match\nsample names of metadata (i.e. rows of data).  Recommend consistent\nnames so downstream results are labeled consistently." )
	}

	if( .isDisconnected() ){
		stop("Cluster connection lost. Either stopCluster() was run too soon,\nor the connection expired")
	}

	# Contrasts
	###########

	univariateContrasts = FALSE
	if( missing(L) ){
		# all univariate contrasts
		L = .getAllUniContrasts( exprObj, formula, data)
		univariateContrasts = TRUE
	}else{
		# format contrasts 
		if( is(L, "numeric") ){
			L = as.matrix(L, ncol=1)
		}else if( is(L, 'data.frame') ){
			L = as.matrix(L)
		}
		if( is.null(colnames(L)) ){
			colnames(L) = paste0('L', seq_len(ncol(L)))
		}

		# check columns that only have a single 1
		tst = apply(L, 2, function(x){
			length(x[x!=0]) == 1
			})
		if( any(tst) ){
			warning("Contrasts with only a single non-zero term are already evaluated by default.")
		}
		# remove univariate contrasts
		# L = L[,!tst,drop=FALSE]

		# add all univariate contrasts
		Luni = .getAllUniContrasts( exprObj, formula, data)
		L = cbind(L, Luni)
	}

	if( ncol(L) == 0){
		stop( "Must include fixed effect in the model for hypothesis testing")
	}

	# check that contrast (column) names are unique
	if( length(unique(colnames(L))) != ncol(L) ){
		stop(paste("Contrast names must be unique: ", paste(colnames(L), collapse=', ')))
	}

	# Evaluate model on each gene
	#############################
	
	if( ! .isMixedModelFormula( formula ) ){
		if( !quiet ){
			message("Fixed effect model, using limma directly...")
			message("User can apply eBayes() afterwards...")
		}

		# weights are always used
		design = model.matrix( formula, data)

		# least squares fit
		ret = lmFit( exprObj, design, weights=weightsMatrix )

		if( computeResiduals ){
			# Evaluate residuals here, since they cannot be evaluated after contrasts.fit() is run
			# add this to the standard MArrayLM object for use by custom residuals function
			ret$residuals = residuals( ret, exprObj )
		}

		# apply contrasts
		if( ! univariateContrasts ){
			ret = contrasts.fit( ret, L)
		}

	}else{

		# add response (i.e. exprObj[j,]) to formula
		# Use term 'responsePlaceholder' to store the value of response j (exprObj[j,])
		# This is not an ideal way to structure the evaluation of response j.
		# 	The formula is evaluated in a different scope (i.e. within lmer()), and there is no need to pass the
		# 	entire exprObj object into that scope.  With lexical scoping, the value of exprObj[j,]
		# 	could differ when evaluated in the lower scope.
		# This placeholder term addresses that issue
		form = paste( "responsePlaceholder$E", paste(as.character( formula), collapse=''))

		# fit first model to initialize other model fits
		# this makes the other models converge faster
		responsePlaceholder = nextElem(exprIter(exprObjMat, weightsMatrix, useWeights))

		timeStart = proc.time()
		fitInit <- lmerTest::lmer( eval(parse(text=form)), data=data,..., REML=REML, control=control )

		# extract covariance matrices  
		sigGStruct = get_SigmaG( fitInit )$G
		
		# check L
		if( ! identical(rownames(L), names(fixef(fitInit))) ){
			stop("Names of entries in L must match fixed effects")
		}
		
		# extract statistics from model
		mod = .eval_lmm( fitInit, L, ddf)
		timediff = proc.time() - timeStart

		# check that model fit is valid, and throw warning if not
		checkModelStatus( fitInit, showWarnings=!suppressWarnings, dream=TRUE, colinearityCutoff=colinearityCutoff )

		a = names(fixef(fitInit))
		b = rownames(L)

		if( ! identical( a,b) ){
			stop("Terms in contrast matrix L do not match model:\n  Model: ", paste(a, collapse=',' ), "\n  L: ", paste(b, collapse=',' ), "\nNote thhat order must be the same")
		}

		# specify gene explicitly in data 
		# required for downstream processing with lmerTest
		data2 = data.frame(data, expr=responsePlaceholder$E, check.names=FALSE)
		form = paste( "expr", paste(as.character( formula), collapse=''))

		pb <- progress_bar$new(format = ":current/:total [:bar] :percent ETA::eta", total = nrow(exprObj), width= 60, clear=FALSE)

		pids = .get_pids()

		timeStart = proc.time()

		# Define function for parallel evaluation
		.eval_models = function(responsePlaceholder, data2, form, REML, theta, control, na.action=stats::na.exclude,...){	
			# modify data2 for this gene
			data2$expr = responsePlaceholder$E
 
			# fit linear mixed model
			suppressWarnings({
				fit <- lmerTest::lmer( eval(parse(text=form)), data=data2, REML=REML,..., weights=responsePlaceholder$weights, control=control,na.action=na.action)
				})

			# extract statistics from model
			mod = .eval_lmm( fit, L, ddf)

			res = NULL
			if( computeResiduals ){
				 res = residuals(fit)
			}

			ret = list(	coefficients 	= mod$beta, 
						design 			= fit@pp$X, 
						df.residual 	= mod$df, 
						Amean 			= mean(fit@frame[,1]), 
						method 			= 'lmer',
						sigma 			= mod$sigma,
						stdev.unscaled 	= mod$SE/mod$sigma,
						pValue 			= mod$pValue,
						residuals 		= res ) 

			# get variance terms for random effects
			varComp <- lapply(lme4::VarCorr(fit), function(x) attr(x, "stddev")^2)
			varComp[['resid']] = attr(lme4::VarCorr(fit), 'sc')^2

			if( univariateContrasts ){
				V = mod$vcov
			}else{
				# do expensive evaluation only when L is defined by the user
				V = crossprod(chol(mod$vcov) %*% L)
			}

			list( 	ret 	= new("MArrayLM", ret),
					varComp = varComp,
					edf		= sum(hatvalues(fit)), # effective degrees of freedom as sum of diagonals of hat matrix
					vcov 	= V)
		}

		.eval_master = function( obj, data2, form, REML, theta, control, na.action=stats::na.exclude,... ){

			lapply(seq_len(nrow(obj$E)), function(j){
				.eval_models( list(E=obj$E[j,], weights=obj$weights[j,]), data2, form, REML, theta, control, na.action,...)
			})
		}

		# Evaluate function
		####################

		it = iterBatch(exprObjMat, weightsMatrix, useWeights, n_chunks = 100)

		if( !quiet ) message(paste0("Dividing work into ",attr(it, "n_chunks")," chunks..."))

		resList <- bpiterate( it, .eval_master, 
			data2=data2, form=form, REML=REML, theta=fitInit@theta, control=control,..., 
			 REDUCE=c,
		    reduce.in.order=TRUE,	
			BPPARAM=BPPARAM)
	
		
		names(resList) = seq_len(length(resList))

		if( !quiet ) message("\nTotal:", paste(format((proc.time() - timeStart)[3], digits=0), "s"))		

		x = 1
		# extract results
		coefficients = foreach( x = resList, .combine=cbind ) %do% {x$ret$coefficients}
		df.residual = foreach( x = resList, .combine=cbind ) %do% {	x$ret$df.residual}
		pValue = foreach( x = resList, .combine=cbind ) %do% { x$ret$pValue }
		stdev.unscaled  = foreach( x = resList, .combine=cbind ) %do% {x$ret$stdev.unscaled} 
		if( computeResiduals ){
			residuals = foreach( x = resList, .combine=cbind ) %do% { x$ret$residuals }
		}

		# transpose
		coefficients = t( coefficients )
		df.residual = t( df.residual )
		pValue = t( pValue )
		stdev.unscaled = t( stdev.unscaled )
		
		if( computeResiduals ){
			residuals = t(residuals)
			rownames(residuals) = rownames(exprObj)
		}

		colnames(coefficients) = colnames(L)
		rownames(coefficients) = rownames(exprObj)

		design = resList[[1]]$ret$design

		colnames(df.residual) = colnames(L)
		rownames(df.residual) = rownames(exprObj)

		colnames(pValue) = colnames(L)
		rownames(pValue) = rownames(exprObj)

		Amean = sapply( resList, function(x) x$ret$Amean) 
		names(Amean ) = rownames(exprObj)
		method = "lmer"
		sigma = sapply( resList, function(x) x$ret$sigma)
		names(sigma) = rownames(exprObj)

		varComp = lapply(resList, function(x) as.data.frame(x$varComp))
		varComp = do.call("rbind", varComp)
		rownames(varComp) = rownames(coefficients)

		edf = sapply(resList, function(x) x$edf)

		colnames(stdev.unscaled) = colnames(L)
		rownames(stdev.unscaled) = rownames(exprObj)

		ret = list( coefficients 	= coefficients,
		 			design 			= design, 
		 			df.residual 	= df.residual, 
		 			Amean 			= Amean, 
		 			method 			= method, 
		 			sigma 			= sigma, 
		 			contrasts 		= L,
		 			stdev.unscaled 	= stdev.unscaled)

		if( 'genes' %in% names(exprObjInit) ){
			ret$genes = exprObjInit$genes
		}

		ret = new("MArrayLM", ret)
		# ret$pValue = pValue

		# set covariance between covariates
		# C = solve(crossprod(ret$design))
		# C = chol2inv(chol(crossprod(ret$design)))
		V = chol2inv(qr(ret$design)$qr)
		rownames(V) = colnames(ret$design)
		colnames(V) = colnames(ret$design)

		# remove intercept term if it exists
		# fnd = colnames(C) == "(Intercept)" 
		# if( any(fnd) ){
		# 	C = C[!fnd,!fnd,drop=FALSE]
		# }

		if( ! univariateContrasts ){
			# do expensive evaluation only when L is defined by the user
			V = crossprod(chol(V) %*% L)
		}

		# covariance used by limma.  
		# Assumes covariance is the same for all genes
		ret$cov.coefficients = V

		# allows covariance to differ for each gene based on variance components
		ret$cov.coefficients.list = lapply(resList, function(x) as.matrix(x$vcov))

		if( computeResiduals ){
			ret$residuals = residuals
		}

		ret = as(ret, "MArrayLM2")
		# add additional information for pinnacle
		# ret = new("MArrayLM2", ret, varComp, sigGStruct)
		attr(ret, "varComp") = varComp
		attr(ret, "sigGStruct") = sigGStruct
		attr(ret, "edf") = edf

		# compute standard values for t/F/p without running eBayes
		# eBayes can be run afterwards, if wanted
		ret = .standard_transform( ret )
	}
	ret
}


#' Compute standard post-processing values
#' 
#' These values are typically computed by eBayes
#' 
#' @param fit result of dream (MArrayLM2)
#'
#' @return MArrayLM2 object with values computed
#' 
#' @importFrom stats pchisq pf
#' @keywords internal
.standard_transform = function(fit){

	# get values
	coefficients <- fit$coefficients
	stdev.unscaled <- fit$stdev.unscaled
	sigma <- fit$sigma
	df.residual <- fit$df.residual
	
	# t-test
	out = fit
	out$t <- coefficients / stdev.unscaled / sigma
	# out$df.total <- df.residual
	out$p.value <- 2*pt(-abs(out$t),df=df.residual)

	# F-test
	if(!is.null(out$design) && is.fullrank(out$design)) {

		# only evaluate F-stat on real coefficients, not contrasts
		realcoef = colnames(out)[colnames(out) %in% colnames(out$design)]
		realcoef = realcoef[realcoef!="(Intercept)"]

		if( is.null(realcoef) || (length(realcoef) == 0) ){

			# this happens when only the intercept term is included
			warning("No testable fixed effects were included in the model.\n  Running topTable() will fail.")
		}else{
			df = rowMeans(out[,realcoef]$df.residual)

			F.stat <- classifyTestsF(out[,realcoef], df=df, fstat.only=TRUE)
			out$F <- as.vector(F.stat)
			df1 <- attr(F.stat,"df1")
			df2 <- attr(F.stat,"df2")
			if(df2[1] > 1e6){ # Work around bug in R 2.1
				out$F.p.value <- pchisq(df1*out$F,df1,lower.tail=FALSE)
			}else{
				out$F.p.value <- pf(out$F,df1,df2,lower.tail=FALSE)
			}
		}
	}

	out
}
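# Note: dream() stores stdev.unscaled = SE/sigma (see .eval_models above), so the
# t-statistic computed here reduces to the ordinary Wald statistic
#   t = coefficients / SE
# evaluated against the Satterthwaite or Kenward-Roger denominator degrees of
# freedom, rather than an empirical Bayes moderated statistic.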




#' Subsetting for MArrayLM2
#'
#' Enable subsetting on MArrayLM2 objects. Same as for MArrayLM, but also subsets df.residual, df.total and cov.coefficients.list
#'
#' @param object MArrayLM2
#' @param i row
#' @param j col
#'
#' @name [.MArrayLM2
#' @return subset
#' @export
# @S3method [ MArrayLM2
#' @rawNamespace S3method("[", MArrayLM2)
#' @importFrom stats p.adjust
#' @rdname subset.MArrayLM2-method
#' @aliases subset.MArrayLM2,MArrayLM2-method
#' @keywords internal
assign("[.MArrayLM2",
	function(object, i, j){
	if(nargs() != 3){
		stop("Two subscripts required",call.=FALSE)
	}

	# apply standard MArrayLM subsetting
	obj = as(object, 'MArrayLM')

	if(!missing(j)){
		obj = obj[,j]
	}
	if(!missing(i)){
		obj = obj[i,]
	}

	# custom code to deal with df.total and df.residual
	if( is.null(ncol(object$df.total)) ){
		if(!missing(i)){
			obj$df.total = object$df.total[i]
		}else{
			obj$df.total = object$df.total
		}
	}else{			
		tmp = object$df.total
		if(!missing(i)){
			tmp = object$df.total[i,,drop=FALSE]
		}		
		if(!missing(j)){
			tmp = tmp[,j,drop=FALSE]
		}	
		obj$df.total = tmp
	}

	if( is.null(ncol(object$df.residual)) ){
		if(!missing(i)){
			obj$df.residual = object$df.residual[i]
		}else{			
			obj$df.residual = object$df.residual
		}
	}else{	
		tmp = object$df.residual
		if(!missing(i)){
			tmp = object$df.residual[i,,drop=FALSE]
		}		
		if(!missing(j)){
			tmp = tmp[,j,drop=FALSE]
		}	
		obj$df.residual = tmp
	}

	# obj$pValue = object$pValue[i,j]
	obj$s2.prior = object$s2.prior
	obj$df.prior = object$df.prior

	obj = as(obj, "MArrayLM2")

	#  copy gene-specific covariance, if it exists
	if( ! is.null(object$cov.coefficients.list) ){
		if(!missing(i)){
			obj$cov.coefficients.list = object$cov.coefficients.list[i]
		}else{
			obj$cov.coefficients.list = object$cov.coefficients.list
		}
	}	

	if( is.null(obj$df.total)){
		obj$df.total = rowMeans(obj$df.residual)
	}

	# the F-statistic and p-value are evaluated when subsetting is applied
	# so need to apply df2 here
	# If columns have been subsetted, need to re-generate F
	if(!is.null(obj[["F"]]) && !missing(j)) {

		F.stat <- classifyTestsF(obj,df=obj$df.total,fstat.only=TRUE)
		obj$F <- as.vector(F.stat)
		df1 <- attr(F.stat,"df1")
		df2 <- attr(F.stat,"df2")
		if (df2[1] > 1e6){ 
			obj$F.p.value <- pchisq(df1*obj$F,df1,lower.tail=FALSE)
		}else{
			obj$F.p.value <- pf(obj$F,df1,df2,lower.tail=FALSE)
		}
	}
	obj	
})
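# Usage sketch: standard two-index subsetting, e.g.
#   fitSub <- fit[1:10, "Batch3"]
# behaves like subsetting an MArrayLM, but also keeps the gene-specific
# df.residual, df.total and cov.coefficients.list entries in sync, and
# re-computes the F-statistic when columns are dropped.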


setGeneric("eBayes", function(fit, proportion = 0.01, stdev.coef.lim = c(0.1, 4), 
    trend = FALSE, robust = FALSE, winsor.tail.p = c(0.05, 0.1) ){

	eBayes(fit, proportion, stdev.coef.lim, trend, robust, winsor.tail.p  )
})


#' eBayes for MArrayLM2
#'
#' eBayes for MArrayLM2.  It simply returns the original input with no change
#'
#' @param fit fit
#' @param proportion proportion
#' @param stdev.coef.lim stdev.coef.lim
#' @param trend trend
#' @param robust robust
#' @param winsor.tail.p winsor.tail.p 
#'
#' @return results of eBayes
#' @details Note that empirical Bayes is problematic for linear mixed models.  This function returns its original input with no change.  It is included here just for compatibility.
#' @export
#' @rdname eBayes-method
#' @aliases eBayes,MArrayLM2-method
setMethod("eBayes", "MArrayLM2",
function(fit, proportion = 0.01, stdev.coef.lim = c(0.1, 4), 
    trend = FALSE, robust = FALSE, winsor.tail.p = c(0.05, 0.1)){

	warning("Empirical Bayes moderated test is no longer supported for dream analysis\nReturning original results for use downstream")

	fit
})
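# Note: for mixed-model fits the moderated test is intentionally skipped, so
#   fit <- eBayes(fit)
# on an MArrayLM2 object returns the fit unchanged (with a warning), and is kept
# only for compatibility with limma-style pipelines.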



# change_t_df
#
# change_t_df
#
# @param t t-statistic
# @param df current degrees of freedom
# @param d_target target degrees of freedom
#
# @return Given a t-statistic with degrees of freedom df, return a new t-statistic corresponding to a target degrees of freedom.  Both t-statistics give the same p-value when compared to their respective degrees of freedom
.change_t_df = function(t, df, d_target){
  p = pt(abs(t), df, lower.tail=FALSE)
  qt(p, d_target, lower.tail=FALSE)
}
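# A minimal check of the invariant: a t-statistic of 3 with 10 degrees of freedom
# maps to the value giving the same tail probability at 50 degrees of freedom:
#   t_new <- .change_t_df(3, df=10, d_target=50)
#   all.equal(pt(3, 10, lower.tail=FALSE), pt(t_new, 50, lower.tail=FALSE))  # TRUE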

# standardized_t_stat
#
# standardized_t_stat
#
# @param fit model fit from dream()
#
# @return a copy of fit in which each t-statistic has been transformed to a common target degrees of freedom.  The transformed t-statistics give the same p-values as the originals
#
# @details Since dream() uses degrees of freedom estimated from the data, the t-statistics across genes are computed with different df values.  Therefore the order of the raw t-statistics does not correspond to the order of the p-values, since the df differs for each gene.  This function resolves the issue by transforming the t-statistics to all have the same df.  Under a fixed effects model, df = N - p, where N is the sample size and p is the number of covariates.  Here, df is the target degrees of freedom used in the transformation
.standardized_t_stat = function( fit ){

  res = data.frame(t=as.numeric(fit$t), df.sat=as.numeric(fit$df.residual))

  # d_target = max(res$df.sat)
  d_target = nrow(fit$design) - ncol(fit$design)

  res$t_dot = sign(res$t) * .change_t_df( res$t, res$df.sat, d_target )

  fit2 = fit
  fit2$t = matrix(res$t_dot, nrow=nrow(fit$t))
  rownames( fit2$t) = rownames( fit$t)
  colnames( fit2$t) = colnames( fit$t)

  # fit2$df.residual = rep(d_target, nrow(res))

  # if df.prior is defined, set new df.total, since df.residual has changed
  if( !is.null(fit2$df.prior) ){
	fit2$df.total = fit2$df.residual + fit2$df.prior
  }

  fit2
}
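# Usage sketch: fitStd <- .standardized_t_stat(fit) returns a copy of fit whose
# t-statistics are all expressed at the fixed-effect degrees of freedom
# d_target = N - p, so ranking genes by |t| agrees with ranking by p-value.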


#' Compare p-values from two analyses
#'
#' Plot -log10 p-values from two analyses and color based on donor component from variancePartition analysis
#'
#' @param p1 p-value from first analysis
#' @param p2 p-value from second analysis
#' @param vpDonor donor component for each gene from variancePartition analysis
#' @param dupcorvalue scalar donor component from duplicateCorrelation
#' @param fraction fraction of highest/lowest values to use for best fit lines
#' @param xlabel label for x-axis
#' @param ylabel label for y-axis
#'
#' @return ggplot2 plot
#'
#' @examples
#'
#' # load library
#' # library(variancePartition)
#'
#' # Initialize parallel backend with 4 cores
#' library(BiocParallel)
#' register(SnowParam(4))
#'
#' # load simulated data:
#' # geneExpr: matrix of gene expression values
#' # info: information/metadata about each sample
#' data(varPartData)
#' 
#' # Perform very simple analysis for demonstration
#' 
#' # Analysis 1
#' form <- ~ Batch 
#' fit = dream( geneExpr, form, info)
#' fit = eBayes( fit )
#' res = topTable( fit, number=Inf, coef="Batch3" )
#' 
#' # Analysis 2
#' form <- ~ Batch + (1|Tissue)
#' fit2 = dream( geneExpr, form, info)
# fitEB2 = eBayes( fit2 )
#' res2 = topTable( fit2, number=Inf, coef="Batch3" )
#' 
#' # Compare p-values
#' plotCompareP( res$P.Value, res2$P.Value, runif(nrow(res)), .3 )
#' 
# # stop cluster
# stopCluster(cl)
#'
#' @export
# @docType methods
#' @rdname plotCompareP-method
plotCompareP = function( p1, p2, vpDonor, dupcorvalue, fraction=.2, xlabel=bquote(duplicateCorrelation~(-log[10]~p)), ylabel=bquote(dream~(-log[10]~p))){

	if( length(unique(c(length(p1), length(p2), length(vpDonor)))) != 1){
		stop("p1, p2 and vpDonor must have the same number of entries")
	}
	if( length(dupcorvalue) != 1){
		stop("dupcorvalue must be a scalar")
	}

	df2 = data.frame(p1=-log10(p1), p2=-log10(p2), vpDonor=vpDonor, delta = vpDonor - dupcorvalue)

	N = nrow(df2)

	c1 = sort(df2$delta)[N*fraction]
	c2 = sort(df2$delta)[length(df2$delta) - N*fraction]

	l1 = lm(p2 ~ p1, df2[df2$delta >= c2,])
	l2 = lm(p2 ~ p1, df2[df2$delta <= c1,])

	df_line = data.frame(rbind(coef(l1), coef(l2)))
	colnames(df_line) = c('a', 'b')
	df_line$type = c('darkred', 'navy')

	lim = c(0, max(max(df2$p1), max(df2$p2)))

	# xlab("duplicateCorrelation (-log10 p)") + ylab("dream (-log10 p)")
	ggplot(df2, aes(p1, p2, color = vpDonor)) + geom_abline() + geom_point(size=2) + theme_bw(17) +  theme(aspect.ratio=1,plot.title = element_text(hjust = 0.5)) + xlim(lim) + ylim(lim) + xlab(xlabel) + ylab(ylabel) + 
		geom_abline( intercept=df_line$a, slope=df_line$b, color=df_line$type, linetype=2) + scale_color_gradientn(name = "Donor", colours = c("blue","green","red"), 
	                       values = rescale(c(0, dupcorvalue, 1)),
	                       guide = "colorbar", limits=c(0, 1))
}

#' Multiple Testing Genewise Across Contrasts
#' 
#'  For each gene, classify a series of related t-statistics as up, down or not significant.
#' 
#' @param object numeric matrix of t-statistics or an 'MArrayLM2' object from which the t-statistics may be extracted.
#' @param ... additional arguments
#' 
#' @details Works like limma::classifyTestsF, except object can have a list of covariance matrices object$cov.coefficients.list, instead of just one in object$cov.coefficients
#' @seealso limma::classifyTestsF
#' @export
setGeneric("classifyTestsF", signature="object",
  function(object, ...)
      standardGeneric("classifyTestsF")
)



#' Multiple Testing Genewise Across Contrasts
#' 
#'  For each gene, classify a series of related t-statistics as up, down or not significant.
#' 
#' @param object numeric matrix of t-statistics or an 'MArrayLM2' object from which the t-statistics may be extracted.
#' @param cor.matrix covariance matrix of each row of t-statistics.  Defaults to the identity matrix.
#' @param df numeric vector giving the degrees of freedom for the t-statistics.  May have length 1 or length equal to the number of rows of tstat.
#' @param p.value numeric value between 0 and 1 giving the desired size of the test
#' @param fstat.only logical, if 'TRUE' then return the overall F-statistic as for 'FStat' instead of classifying the test results
#' 
#' @details Works like limma::classifyTestsF, except object can have a list of covariance matrices object$cov.coefficients.list, instead of just one in object$cov.coefficients
#' @seealso limma::classifyTestsF
#' @importFrom stats qf
#' @export
setMethod("classifyTestsF", "MArrayLM2",
  function(object,cor.matrix=NULL, df=Inf, p.value=0.01, fstat.only=FALSE) {
#	Use F-tests to classify vectors of t-test statistics into outcomes
#	Gordon Smyth
#	20 Mar 2003.  Last revised 6 June 2009.

#	Method intended for MArrayLM objects but accepts unclassed lists as well
	computeCorrMat = is.null(cor.matrix)
	if(is.list(object)) {
		if(is.null(object$t)) stop("tstat cannot be extracted from object")
		if(missing(df) && !is.null(object$df.prior) && !is.null(object$df.residual)) df <- object$df.prior+object$df.residual
		tstat <- as.matrix(object$t)
	} else {
		tstat <- as.matrix(object)
	}
	ngenes <- nrow(tstat)
	ntests <- ncol(tstat)
	if(ntests == 1) {
		if(fstat.only) {
			fstat <- tstat^2
			attr(fstat,"df1") <- 1
			attr(fstat,"df2") <- df
			return(fstat)
		} else {
			p <- 2 * pt(abs(tstat), df, lower.tail=FALSE)
			return(new("TestResults", sign(tstat) * (p < p.value) ))
		}
	}

	# F test of multiple coefficients
	#-------------------------------

	if( ngenes != length(object$cov.coefficients.list) ){
		msg = paste0("Number of genes does not equal number of elements in cov.coefficients.list\n", ngenes, " != ", length(object$cov.coefficients.list))
		stop(msg)
	}

	fstat <- rep(NA, ngenes) 
	names(fstat) = rownames(tstat)

	result <- matrix(0,ngenes,ntests,dimnames=dimnames(tstat))

	for(i in seq_len(ngenes) ){

		if( computeCorrMat ){
			if( is.null(object$cov.coefficients.list) ){
				C <- object$cov.coefficients
			}else{			
				C <- object$cov.coefficients.list[[i]]
			}
			# subset based on coefficient names
			C = C[colnames(object),colnames(object)]
			cor.matrix <- cov2cor( C )
		}else{
			cor.matrix = cov2cor(cor.matrix)
		}

		# cor.matrix is estimated correlation matrix of the coefficients
		# and also the estimated covariance matrix of the t-statistics
		if(is.null(cor.matrix)){
			r <- ntests
			Q <- diag(r)/sqrt(r)
		}else{
			E <- eigen(cor.matrix,symmetric=TRUE)
			r <- sum(E$values/E$values[1] > 1e-8)
			Q <- limma:::.matvec( E$vectors[,1:r], 1/sqrt(E$values[1:r]))/sqrt(r)
		}

		# Return overall moderated F-statistic only
		if(fstat.only) {
			fstat[i] <- drop( (tstat[i,,drop=FALSE] %*% Q)^2 %*% array(1,c(r,1)) )
		}
		if( i == 1){					
			attr(fstat,"df1") <- r
			attr(fstat,"df2") <- df[i]
		}

		# Return TestResults matrix
		qF <- qf(p.value, r, df[i], lower.tail=FALSE)
		if(length(qF)==1) qF <- rep(qF,ngenes) 
		x <- tstat[i,]
		if(any(is.na(x)))
			result[i,] <- NA
		else
			if( crossprod(crossprod(Q,x)) > qF[i] ) {
				ord <- order(abs(x),decreasing=TRUE)
				result[i,ord[1]] <- sign(x[ord[1]])
				for (j in 2:ntests) {
					bigger <- ord[1:(j-1)]
					x[bigger] <- sign(x[bigger]) * abs(x[ord[j]])
					if( crossprod(crossprod(Q,x)) > qF[i] )
						result[i,ord[j]] <- sign(x[ord[j]])
					else
						break
				}
			}
	}

	if(fstat.only){
		return( fstat )
	}

	new("TestResults",result)
})
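# Usage sketch: with fit returned by dream(), the gene-specific covariance
# matrices stored in fit$cov.coefficients.list are used automatically, e.g.
#   Fstat <- classifyTestsF( fit, fstat.only=TRUE )
# returns the per-gene F-statistic with its df1/df2 attributes.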
