# R/fitMixedModelDE.R


#' Class MArrayLM2
#'
#' Class \code{MArrayLM2} 
#'
#' @name MArrayLM2-class
#' @rdname MArrayLM2-class
#' @exportClass MArrayLM2
setClass("MArrayLM2",
#  Linear model fit
representation("MArrayLM")#, varComp="data.frame", sigGStruct='list')
)

setIs("MArrayLM2","LargeDataObject")
setAs(from='MArrayLM', to='MArrayLM2', function(from){
	structure(from, class="MArrayLM2")
	})


#' Class FMT 
#'
#' Class \code{FMT} 
#'
#' @name FMT-class
#' @rdname FMT-class
#' @exportClass FMT
#' @keywords internal
setClass('FMT', contains='MArrayLM2')

setAs("MArrayLM2", "FMT", function(from, to ){

	res = new( to, from )
	names(res) = names(from)
	res
})

# define S3 version of these functions

#' @export  
residuals.MArrayLM2 = function( object, ...){
	if( is.null(object$residuals) ){
		stop( "Residuals were not computed, must run:\n dream(...,computeResiduals=TRUE)")
	}
	if( nargs() > 1 ){
		warning("\n Second argument is ignored here,\n but can be passed for compatibility with limma.\n Results are the same either way")
	}
	object$residuals
}


# #' @importFrom limma residuals.MArrayLM
# # @export  
# residuals.MArrayLM = function( object, ...){

# 	if( is.null(object$residuals) ){
# 		# use residuals computed by limma
# 		res = limma::residuals.MArrayLM( object, ...)
# 	}else{
# 		# use precomputed residuals
# 		res = object$residuals
# 	}
# 	res
# }


# S4 methods

#' residuals for MArrayLM
#'
#' residuals for MArrayLM
#'
#' @param object MArrayLM object from dream
#' @param ... other arguments, currently ignored
#'
#' @return results of residuals
#' @export
setMethod("residuals", "MArrayLM",
	function( object, ...){
		if( nargs() == 1 ){
			result = object$residuals
		}else{
			result = residuals.MArrayLM(object,...)
		}
		result
	})


#' residuals for MArrayLM2
#'
#' residuals for MArrayLM2
#'
#' @param object MArrayLM2 object from dream
#' @param ... other arguments, currently ignored
#'
#' @return results of residuals
#' @export
setMethod("residuals", "MArrayLM2",
	function( object, ...){
		residuals.MArrayLM2(object,...)
		})






#' Extract contrast matrix for linear mixed model
#' 
#' Extract contrast matrix, L, testing a single variable.  Contrasts involving more than one variable can be constructed by modifying L directly
#'
#' @param exprObj matrix of expression data (g genes x n samples), or \code{ExpressionSet}, or \code{EList} returned by \code{voom()} from the \code{limma} package
#' @param formula specifies variables for the linear (mixed) model.  Must only specify covariates, since the rows of exprObj are automatically used as the response. e.g.: \code{~ a + b + (1|c)}.  Formulas with only fixed effects also work
#' @param data data.frame with columns corresponding to formula 
#' @param coefficient the coefficient to use in the hypothesis test
#' 
#' @return
#' Contrast matrix testing one variable
#'
#' @examples
#'
#' # load simulated data:
#' # geneExpr: matrix of gene expression values
#' # info: information/metadata about each sample
#' data(varPartData)
#' 
#' # get contrast matrix testing if the coefficient for Batch3 is zero 
#' # The variable of interest must be a fixed effect
#' form <- ~ Batch + (1|Individual) + (1|Tissue) 
#' L = getContrast( geneExpr, form, info, "Batch3")
#'
#' # get contrast matrix testing if Batch3 - Batch2 = 0
#' form <- ~ Batch + (1|Individual) + (1|Tissue) 
#' L = getContrast( geneExpr, form, info, c("Batch3", "Batch2"))
#'
#' # To test against Batch1 use the formula:
#' # 	~ 0 + Batch + (1|Individual) + (1|Tissue) 
#' # to estimate Batch1 directly instead of using it as the baseline
#'
#' @export
# @docType methods
#' @rdname getContrast-method
getContrast = function( exprObj, formula, data, coefficient){ 

	if( length(coefficient) > 2){
		stop("Length of coefficient array limited to 2")
	}

	L = .getContrastInit( exprObj, formula, data)

	# assign coefficient coding
	if( any(!coefficient %in% names(L)) ){
		stop("coefficient is not in the formula.  Valid coef are:\n", paste(names(L), collapse=', '))
	}
	L[coefficient[1]] = 1

	if( length(coefficient) == 2){			
		L[coefficient[2]] = -1
	}
	
	L
}
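# A minimal sketch (hypothetical weights): contrasts involving more than one
# variable can also be built by modifying L directly after calling getContrast(),
# e.g. to test Batch3 - 2*Batch2 = 0 with the simulated varPartData objects:
#   L <- getContrast( geneExpr, ~ Batch + (1|Individual) + (1|Tissue), info, "Batch3")
#   L["Batch2"] <- -2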

#' Get all univariate contrasts
#'
#' Get all univariate contrasts
#'
#' @param exprObj matrix of expression data (g genes x n samples), or ExpressionSet, or EList returned by voom() from the limma package
#' @param formula specifies variables for the linear (mixed) model.  Must only specify covariates, since the rows of exprObj are automatically used as the response. e.g.: \code{~ a + b + (1|c)}.  Formulas with only fixed effects also work
#' @param data data.frame with columns corresponding to formula 
#'
#' @return
#'  Matrix testing each variable one at a time.  Contrasts are on rows
#'
#' @keywords internal
.getAllUniContrasts = function( exprObj, formula, data){ 

	Linit = .getContrastInit( exprObj, formula, data)

	Lall = lapply( seq_len(length(Linit)), function(i){
		Linit[i] = 1
		Linit
		})
	names(Lall) = names(Linit)
	Lall = do.call("rbind", Lall)

	# remove intercept contrasts
	# Lall[,-1,drop=FALSE]
	Lall
}
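# Sketch of the returned object: for a formula such as ~ Batch + (1|Individual),
# .getAllUniContrasts() returns an identity-like matrix with one row and one
# column per fixed-effect coefficient (including the intercept), so each
# coefficient is tested against zero by default:
#   Lall <- .getAllUniContrasts( geneExpr, ~ Batch + (1|Individual), info )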

.getContrastInit = function( exprObj, formula, data){ 

	exprObj = as.matrix( exprObj )
	formula = stats::as.formula( formula )

	REML=TRUE
	useWeights=TRUE
	weightsMatrix=NULL
	showWarnings=FALSE
	dream=TRUE
	fxn=identity
	colinearityCutoff=.999
	control = lme4::lmerControl(calc.derivs=FALSE, check.rankX="stop.deficient" )

	# check dimensions of response and covariates
	if( ncol(exprObj) != nrow(data) ){		
		stop( "the number of samples in exprObj (i.e. cols) must be the same as in data (i.e. rows)" )
	}

	# if weightsMatrix is not specified, set useWeights to FALSE
	if( useWeights && is.null(weightsMatrix) ){
		# warning("useWeights was ignored: no weightsMatrix was specified")
		useWeights = FALSE
	}

	# if useWeights, and (weights and expression are the same size)
	if( useWeights && !identical( dim(exprObj), dim(weightsMatrix)) ){
		 stop( "exprObj and weightsMatrix must be the same dimensions" )
	}

	# If samples names in exprObj (i.e. columns) don't match those in data (i.e. rows)
	if( ! identical(colnames(exprObj), rownames(data)) ){
		 warning( "Sample names of responses (i.e. columns of exprObj) do not match\nsample names of metadata (i.e. rows of data).  Recommend consistent\nnames so downstream results are labeled consistently." )
	}

	# add response (i.e. exprObj[j,]) to formula
	# Use term 'responsePlaceholder' to store the value of response j (exprObj[j,])
	# This is not an ideal way to structure the evaluation of response j.
	# 	The formula is evaluated in a different scope (i.e. within lmer()), and there is no need to pass the
	# 	entire exprObj object into that scope.  With lexical scoping, the value of exprObj[j,]
	# 	could differ when evaluated in the lower scope.
	# This placeholder term addresses that issue
	form = paste( "responsePlaceholder$E", paste(as.character( formula), collapse=''))

	# run lmer() to see if the model has random effects
	# if so, run lmer() in the loop
	# else run lm()
	responsePlaceholder = nextElem(exprIter(exprObj, weightsMatrix, useWeights))
	possibleError <- tryCatch( lmer( eval(parse(text=form)), data=data,control=control ), error = function(e) e)

	mesg <- "No random effects terms specified in formula"
	method = ''
	if( inherits(possibleError, "error") && identical(possibleError$message, mesg) ){
		
		design = model.matrix( formula, data)

		L = rep(0, ncol(design))
		names(L) = colnames(design)

		 # detect error when variable in formula does not exist
	}else if( inherits(possibleError, "error") && length( grep("object '.*' not found", possibleError$message)) > 0 ){
		stop("Variable in formula is not found: ", gsub("object '(.*)' not found", "\\1", possibleError$message) )
	}
	else{

		if( inherits(possibleError, "error") && grep('the fixed-effects model matrix is column rank deficient', possibleError$message) == 1 ){
			stop(paste(possibleError$message, "\n\nSuggestion: rescale fixed effect variables.\nThis will not change the variance fractions or p-values."))
		} 		

		fit = lmer( eval(parse(text=form)), data=data,control=control, REML=TRUE )
		
		L = rep(0, length(fixef(fit)))
		names(L) = names(fixef(fit))
	}
	L
}




# Evaluate contrasts for linear mixed model
# 
# Evaluate contrasts for linear mixed model
#
# @param fit model fit
# @param L contrast matrix
# @param ddf Specify "Satterthwaite" or "Kenward-Roger" method to estimate effective degrees of freedom for hypothesis testing in the linear mixed model.  Note that Kenward-Roger is more accurate, but is *much* slower.  Satterthwaite is a good enough approximation for most datasets.
# 
# @return
# df, sigma, beta, SE of model
#
# @details If the Kenward-Roger covariance matrix is not positive definite, the Satterthwaite method is used
#
# @export
# @docType methods
# @rdname eval_lmm-method
.eval_lmm = function( fit, L, ddf ){

	j = 1
	# evaluate each contrast
	# cons = lmerTest::contest(fit, L, ddf=ddf)
	cons = foreach( j = 1:ncol(L), .combine=rbind) %do% {
		lmerTest::contest(fit, L[,j], ddf=ddf)
	}

	df = as.numeric(cons[,'DenDF'])

	if(ddf == "Kenward-Roger"){
		# KR
		V = pbkrtest::vcovAdj.lmerMod(fit, 0)

		# if matrix is not PSD
		if( min(diag(as.matrix(V))) < 0){			
			warning("The adjusted Kenward-Roger covariance matrix is not positive definite.\nUsing Satterthwaite approximation instead")

			# Satterthwaite
			V = vcov(fit)
		}
		# df = pbkrtest::get_Lb_ddf(fit, L)
	}else{
		# Satterthwaite
		V = vcov(fit)
		# df = as.numeric(contest(fit, L, ddf="Sat")['DenDF'])
	}

	sigma = attr(lme4::VarCorr(fit), "sc")

	# get contrasts
	# beta = as.matrix(sum(L * fixef(fit)), ncol=1)
	# colnames(beta) = "logFC"

	beta = foreach( j = 1:ncol(L), .combine=rbind) %do% {
		as.matrix(sum(L[,j] * fixef(fit)), ncol=1)
	}
	colnames(beta) = "logFC"
	rownames(beta) = colnames(L)

	# SE = as.matrix(sqrt(sum(L * (V %*% L))), ncol=1)		
	# colnames(SE) = "logFC"
	SE = foreach( j = 1:ncol(L), .combine=rbind) %do% {
		as.matrix(sqrt(sum(L[,j] * (V %*% L[,j]))), ncol=1)
	}
	colnames(SE) = "logFC"
	rownames(SE) = colnames(L)

	# pValue = 2*pt(as.numeric(abs(beta / SE)), df, lower.tail=FALSE)
	pValue = as.numeric(cons[,'Pr(>F)'])

	list(cons = cons,
		df		= df,
		sigma	= sigma,
		beta	= beta,
		SE		= SE,
		pValue	= pValue,
		vcov 	= V )
}
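# Usage sketch (internal): given an lmerTest fit and a contrast matrix L with
# contrasts in columns,
#   mod <- .eval_lmm( fit, L, ddf="Satterthwaite" )
# returns per-contrast effect sizes (mod$beta), standard errors (mod$SE),
# denominator degrees of freedom (mod$df), p-values (mod$pValue) and the
# fixed-effect covariance matrix (mod$vcov).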



.checkNA = function(exprObj){
	# check if values are NA
	countNA = sum(is.nan(exprObj)) + sum(!is.finite(exprObj))
	if( countNA > 0 ){
		stop("There are ", countNA, " NA/NaN/Inf values in exprObj\nMissing data is not allowed")
	}

	# check if all genes have variance
	rv = apply( exprObj, 1, var)
	if( any( rv == 0) ){
		idx = which(rv == 0)
		stop(paste("Response variable", idx[1], 'has a variance of 0'))
	}		
}

#' Differential expression with linear mixed model
#' 
#' Fit linear mixed model for differential expression and perform hypothesis tests on fixed effects as specified in the contrast matrix L
#'
#' @param exprObj matrix of expression data (g genes x n samples), or ExpressionSet, or EList returned by voom() from the limma package
#' @param formula specifies variables for the linear (mixed) model.  Must only specify covariates, since the rows of exprObj are automatically used as the response. e.g.: \code{~ a + b + (1|c)}.  Formulas with only fixed effects also work, and lmFit() followed by contrasts.fit() are run.
#' @param data data.frame with columns corresponding to formula 
#' @param L contrast matrix specifying a linear combination of fixed effects to test
#' @param ddf Specify "Satterthwaite" or "Kenward-Roger" method to estimate effective degrees of freedom for hypothesis testing in the linear mixed model.  Note that Kenward-Roger is more accurate, but is *much* slower.  Satterthwaite is a good enough approximation for most datasets.
#' @param useWeights if TRUE, analysis uses heteroskedastic error estimates from \code{voom()}.  Value is ignored unless exprObj is an \code{EList()} from \code{voom()} or \code{weightsMatrix} is specified
#' @param weightsMatrix matrix the same dimension as exprObj with observation-level weights from \code{voom()}.  Used only if useWeights is TRUE 
#' @param control control settings for \code{lmer()}
#' @param suppressWarnings if TRUE, do not stop because of warnings or errors in model fit
#' @param quiet suppress message, default FALSE
#' @param BPPARAM parameters for parallel evaluation
#' @param computeResiduals if TRUE, compute residuals and extract with \code{residuals(fit)}.  Setting to FALSE saves memory
#' @param REML use restricted maximum likelihood to fit linear mixed model. default is TRUE.  See Details.
#' @param ... Additional arguments for \code{lmer()} or \code{lm()}
#' 
#' @return 
#' MArrayLM2 object (just like MArrayLM from limma), and the directly estimated p-value (without eBayes)
#'
#' @details 
#' A linear (mixed) model is fit for each gene in exprObj, using formula to specify variables in the regression.  If categorical variables are modeled as random effects (as is recommended), then a linear mixed model is used.  For example, if formula is \code{~ a + b + (1|c)}, then the model is 
#'
#' \code{fit <- lmer( exprObj[j,] ~ a + b + (1|c), data=data)}
#'
#' \code{useWeights=TRUE} causes \code{weightsMatrix[j,]} to be included as weights in the regression model.
#'
#' Note: Fitting the model for 20,000 genes can be computationally intensive.  To accelerate computation, models can be fit in parallel using \code{BiocParallel} to run code in parallel.  Parallel processing must be enabled before calling this function.  See below.
#' 
#' The regression model is fit for each gene separately. Samples with missing values in either gene expression or metadata are omitted by the underlying call to lmer.
#'
#' Hypothesis tests and degrees of freedom are produced by the \code{lmerTest} and \code{pbkrtest} packages
#'
#' While \code{REML=TRUE} is required by \code{lmerTest} when ddf='Kenward-Roger', ddf='Satterthwaite' can be used with \code{REML} as \code{TRUE} or \code{FALSE}.  Since the Kenward-Roger method gave the best power with an accurate control of false positive rate in our simulations, and since the Satterthwaite method with REML=TRUE gives p-values that are slightly closer to the Kenward-Roger p-values, \code{REML=TRUE} is the default.  See Vignette "3) Theory and practice of random effects and REML"
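#'
#' For example, a Kenward-Roger fit can be requested with
#' \code{fit <- dream(geneExpr, form, info, ddf="Kenward-Roger")}
#' (a sketch using the simulated \code{varPartData} objects from the examples below).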
#'
#' @examples
#'
#' # load library
#' # library(variancePartition)
#'
#' # Initialize parallel backend with 4 cores
#' library(BiocParallel)
#' register(SnowParam(4))
#'
#' # load simulated data:
#' # geneExpr: matrix of gene expression values
#' # info: information/metadata about each sample
#' data(varPartData)
#' 
#' form <- ~ Batch + (1|Individual) + (1|Tissue) 
#' 
#' # Fit linear mixed model for each gene
#' # run on just 10 genes for time
#' fit = dream( geneExpr[1:10,], form, info)
#'
#' # view top genes
#' topTable( fit )
#'
#' # get contrast matrix testing if the coefficient for Batch2 is 
#' # different from coefficient for Batch3
#' # The variable of interest must be a fixed effect
#' L = getContrast( geneExpr, form, info, c("Batch2", "Batch3"))
#' 
#' # plot contrasts
#' plotContrasts( L )
#' 
#' # Fit linear mixed model for each gene
#' # run on just 10 genes for time
#' # Note that dream() is not compatible with eBayes()
#' fit2 = dream( geneExpr[1:10,], form, info, L)
#' 
#' # view top genes
#' topTable( fit2 )
#' 
#' # Parallel processing using multiple cores with reduced memory usage
#' param = SnowParam(4, "SOCK", progressbar=TRUE)
#' fit3 = dream( geneExpr[1:10,], form, info, L, BPPARAM = param)
#'
#' # Fit fixed effect model for each gene
#' # Use lmFit in the backend
#' # Need to run eBayes afterward
#' form <- ~ Batch 
#' fit4 = dream( geneExpr[1:10,], form, info)
#' fit4 = eBayes( fit4 )
#' 
#' # view top genes
#' topTable( fit4 )
#'
#' # Compute residuals using dream
#' fit5 = dream( geneExpr[1:10,], form, info, L, BPPARAM = param, computeResiduals=TRUE)
#' 
#' # Return residuals
#' residuals(fit5)
#' 
#' @export
# @docType methods
#' @rdname dream-method
#' @importFrom pbkrtest get_SigmaG
#' @importFrom BiocParallel bpiterate bpparam
#' @importFrom lme4 VarCorr 
#' @importFrom stats hatvalues
#' @import doParallel foreach
#'
dream <- function( exprObj, formula, data, L, ddf = c("Satterthwaite", "Kenward-Roger"), useWeights=TRUE, weightsMatrix=NULL, control = lme4::lmerControl(calc.derivs=FALSE, check.rankX="stop.deficient" ),suppressWarnings=FALSE, quiet=FALSE, BPPARAM=bpparam(), computeResiduals=TRUE, REML=TRUE, ...){ 

	exprObjInit = exprObj
	
	exprObjMat = as.matrix( exprObj )
	formula = stats::as.formula( formula )
	ddf = match.arg(ddf)
	colinearityCutoff = 0.999

	if( ! is.data.frame(data) ){
		stop("data must be a data.frame")
	}

	# check dimensions of response and covariates
	if( ncol(exprObj) != nrow(data) ){		
		stop( "the number of samples in exprObj (i.e. cols) must be the same as in data (i.e. rows)" )
	}

	# assign weightsMatrix from exprObj
	if( is( exprObj, "EList") ){
		if( useWeights ){
			weightsMatrix = exprObj$weights
		}
		.checkNA( exprObj$E )
	}else{
		.checkNA( exprObj )
	}


	if( !(ddf %in% c("Kenward-Roger", 'Satterthwaite')) ){
		stop("Specify ddf correctly")
	}

	if( ddf == "Kenward-Roger" & ! REML ){
		stop("Kenward-Roger must be used with REML")
	}

	# if weightsMatrix is not specified, set useWeights to FALSE
	if( useWeights && is.null(weightsMatrix) ){
		# warning("useWeights was ignored: no weightsMatrix was specified")
		useWeights = FALSE
	}

	# if useWeights, and (weights and expression are the same size)
	if( useWeights && !identical( dim(exprObj), dim(weightsMatrix)) ){
		 stop( "exprObj and weightsMatrix must be the same dimensions" )
	}

	if( ! useWeights ){
		weightsMatrix = NULL	
	}
	
	# If samples names in exprObj (i.e. columns) don't match those in data (i.e. rows)
	if( ! identical(colnames(exprObj), rownames(data)) ){
		 warning( "Sample names of responses (i.e. columns of exprObj) do not match\nsample names of metadata (i.e. rows of data).  Recommend consistent\nnames so downstream results are labeled consistently." )
	}

	if( .isDisconnected() ){
		stop("Cluster connection lost. Either stopCluster() was run too soon,\nor the connection expired")
	}

	# Contrasts
	###########

	univariateContrasts = FALSE
	if( missing(L) ){
		# all univariate contrasts
		L = .getAllUniContrasts( exprObj, formula, data)
		univariateContrasts = TRUE
	}else{
		# format contrasts 
		if( is(L, "numeric") ){
			L = as.matrix(L, ncol=1)
		}else if( is(L, 'data.frame') ){
			L = as.matrix(L)
		}
		if( is.null(colnames(L)) ){
			colnames(L) = paste0('L', seq_len(ncol(L)))
		}

		# check columns that only have a single 1
		tst = apply(L, 2, function(x){
			length(x[x!=0]) == 1
			})
		if( any(tst) ){
			warning("Contrasts with only a single non-zero term are already evaluated by default.")
		}
		# remove univariate contrasts
		# L = L[,!tst,drop=FALSE]

		# add all univariate contrasts
		Luni = .getAllUniContrasts( exprObj, formula, data)
		L = cbind(L, Luni)
	}

	if( ncol(L) == 0){
		stop( "Must include fixed effect in the model for hypothesis testing")
	}

	# check that contrast (column) names are unique
	if( length(unique(colnames(L))) != ncol(L) ){
		stop(paste("Contrast names must be unique: ", paste(colnames(L), collapse=', ')))
	}

	# Evaluate model on each gene
	#############################
	
	if( ! .isMixedModelFormula( formula ) ){
		if( !quiet ){
			message("Fixed effect model, using limma directly...")
			message("User can apply eBayes() afterwards...")
		}

		# weights are always used
		design = model.matrix( formula, data)

		# least squares fit
		ret = lmFit( exprObj, design, weights=weightsMatrix )

		if( computeResiduals ){
			# Evaluate residuals here, since they cannot be evaluated after contrasts.fit() is run
			# add this to the standard MArrayLM object for use by custom residuals function
			ret$residuals = residuals( ret, exprObj )
		}

		# apply contrasts
		if( ! univariateContrasts ){
			ret = contrasts.fit( ret, L)
		}

	}else{

		# add response (i.e. exprObj[j,]) to formula
		# Use term 'responsePlaceholder' to store the value of response j (exprObj[j,])
		# This is not an ideal way to structure the evaluation of response j.
		# 	The formula is evaluated in a different scope (i.e. within lmer()), and there is no need to pass the
		# 	entire exprObj object into that scope.  With lexical scoping, the value of exprObj[j,]
		# 	could differ when evaluated in the lower scope.
		# This placeholder term addresses that issue
		form = paste( "responsePlaceholder$E", paste(as.character( formula), collapse=''))

		# fit first model to initialize other model fits
		# this makes the other models converge faster
		responsePlaceholder = nextElem(exprIter(exprObjMat, weightsMatrix, useWeights))

		timeStart = proc.time()
		fitInit <- lmerTest::lmer( eval(parse(text=form)), data=data,..., REML=REML, control=control )

		# extract covariance matrices  
		sigGStruct = get_SigmaG( fitInit )$G
		
		# check L
		if( ! identical(rownames(L), names(fixef(fitInit))) ){
			stop("Names of entries in L must match fixed effects")
		}
		
		# extract statistics from model
		mod = .eval_lmm( fitInit, L, ddf)
		timediff = proc.time() - timeStart

		# check that model fit is valid, and throw warning if not
		checkModelStatus( fitInit, showWarnings=!suppressWarnings, dream=TRUE, colinearityCutoff=colinearityCutoff )

		a = names(fixef(fitInit))
		b = rownames(L)

		if( ! identical( a,b) ){
			stop("Terms in contrast matrix L do not match model:\n  Model: ", paste(a, collapse=',' ), "\n  L: ", paste(b, collapse=',' ), "\nNote thhat order must be the same")
		}

		# specify gene explicitly in data 
		# required for downstream processing with lmerTest
		data2 = data.frame(data, expr=responsePlaceholder$E, check.names=FALSE)
		form = paste( "expr", paste(as.character( formula), collapse=''))

		pb <- progress_bar$new(format = ":current/:total [:bar] :percent ETA::eta", total = nrow(exprObj), width= 60, clear=FALSE)

		pids = .get_pids()

		timeStart = proc.time()

		# Define function for parallel evaluation
		.eval_models = function(responsePlaceholder, data2, form, REML, theta, control, na.action=stats::na.exclude,...){	
			# modify data2 for this gene
			data2$expr = responsePlaceholder$E
 
			# fit linear mixed model
			suppressWarnings({
				fit <- lmerTest::lmer( eval(parse(text=form)), data=data2, REML=REML,..., weights=responsePlaceholder$weights, control=control,na.action=na.action)
				})

			# extract statistics from model
			mod = .eval_lmm( fit, L, ddf)

			res = NULL
			if( computeResiduals ){
				 res = residuals(fit)
			}

			ret = list(	coefficients 	= mod$beta, 
						design 			= fit@pp$X, 
						df.residual 	= mod$df, 
						Amean 			= mean(fit@frame[,1]), 
						method 			= 'lmer',
						sigma 			= mod$sigma,
						stdev.unscaled 	= mod$SE/mod$sigma,
						pValue 			= mod$pValue,
						residuals 		= res ) 

			# get variance terms for random effects
			varComp <- lapply(lme4::VarCorr(fit), function(x) attr(x, "stddev")^2)
			varComp[['resid']] = attr(lme4::VarCorr(fit), 'sc')^2

			if( univariateContrasts ){
				V = mod$vcov
			}else{
				# do expensive evaluation only when L is defined by the user
				V = crossprod(chol(mod$vcov) %*% L)
			}

			list( 	ret 	= new("MArrayLM", ret),
					varComp = varComp,
					edf		= sum(hatvalues(fit)), # effective degrees of freedom as sum of diagonals of hat matrix
					vcov 	= V)
		}

		.eval_master = function( obj, data2, form, REML, theta, control, na.action=stats::na.exclude,... ){

			lapply(seq_len(nrow(obj$E)), function(j){
				.eval_models( list(E=obj$E[j,], weights=obj$weights[j,]), data2, form, REML, theta, control, na.action,...)
			})
		}

		# Evaluate function
		####################

		it = iterBatch(exprObjMat, weightsMatrix, useWeights, n_chunks = 100)

		if( !quiet ) message(paste0("Dividing work into ",attr(it, "n_chunks")," chunks..."))

		resList <- bpiterate( it, .eval_master, 
			data2=data2, form=form, REML=REML, theta=fitInit@theta, control=control,..., 
			 REDUCE=c,
		    reduce.in.order=TRUE,	
			BPPARAM=BPPARAM)
	
		
		names(resList) = seq_len(length(resList))

		if( !quiet ) message("\nTotal:", paste(format((proc.time() - timeStart)[3], digits=0), "s"))		

		x = 1
		# extract results
		coefficients = foreach( x = resList, .combine=cbind ) %do% {x$ret$coefficients}
		df.residual = foreach( x = resList, .combine=cbind ) %do% {	x$ret$df.residual}
		pValue = foreach( x = resList, .combine=cbind ) %do% { x$ret$pValue }
		stdev.unscaled  = foreach( x = resList, .combine=cbind ) %do% {x$ret$stdev.unscaled} 
		if( computeResiduals ){
			residuals = foreach( x = resList, .combine=cbind ) %do% { x$ret$residuals }
		}

		# transpose
		coefficients = t( coefficients )
		df.residual = t( df.residual )
		pValue = t( pValue )
		stdev.unscaled = t( stdev.unscaled )
		
		if( computeResiduals ){
			residuals = t(residuals)
			rownames(residuals) = rownames(exprObj)
		}

		colnames(coefficients) = colnames(L)
		rownames(coefficients) = rownames(exprObj)

		design = resList[[1]]$ret$design

		colnames(df.residual) = colnames(L)
		rownames(df.residual) = rownames(exprObj)

		colnames(pValue) = colnames(L)
		rownames(pValue) = rownames(exprObj)

		Amean = sapply( resList, function(x) x$ret$Amean) 
		names(Amean ) = rownames(exprObj)
		method = "lmer"
		sigma = sapply( resList, function(x) x$ret$sigma)
		names(sigma) = rownames(exprObj)

		varComp = lapply(resList, function(x) as.data.frame(x$varComp))
		varComp = do.call("rbind", varComp)
		rownames(varComp) = rownames(coefficients)

		edf = sapply(resList, function(x) x$edf)

		colnames(stdev.unscaled) = colnames(L)
		rownames(stdev.unscaled) = rownames(exprObj)

		ret = list( coefficients 	= coefficients,
		 			design 			= design, 
		 			df.residual 	= df.residual, 
		 			Amean 			= Amean, 
		 			method 			= method, 
		 			sigma 			= sigma, 
		 			contrasts 		= L,
		 			stdev.unscaled 	= stdev.unscaled)

		if( 'genes' %in% names(exprObjInit) ){
			ret$genes = exprObjInit$genes
		}

		ret = new("MArrayLM", ret)
		# ret$pValue = pValue

		# set covariance between covariates
		# C = solve(crossprod(ret$design))
		# C = chol2inv(chol(crossprod(ret$design)))
		V = chol2inv(qr(ret$design)$qr)
		rownames(V) = colnames(ret$design)
		colnames(V) = colnames(ret$design)

		# remove intercept term if it exists
		# fnd = colnames(C) == "(Intercept)" 
		# if( any(fnd) ){
		# 	C = C[!fnd,!fnd,drop=FALSE]
		# }

		if( ! univariateContrasts ){
			# do expensive evaluation only when L is defined by the user
			V = crossprod(chol(V) %*% L)
		}

		# covariance used by limma.  
		# Assumes covariance is the same for all genes
		ret$cov.coefficients = V

		# allows covariance to differ for each gene based on variance components
		ret$cov.coefficients.list = lapply(resList, function(x) as.matrix(x$vcov))

		if( computeResiduals ){
			ret$residuals = residuals
		}

		ret = as(ret, "MArrayLM2")
		# add additional information for pinnacle
		# ret = new("MArrayLM2", ret, varComp, sigGStruct)
		attr(ret, "varComp") = varComp
		attr(ret, "sigGStruct") = sigGStruct
		attr(ret, "edf") = edf

		# compute standard values for t/F/p without running eBayes
		# eBayes can be run afterwards, if wanted
		ret = .standard_transform( ret )
	}
	ret
}


#' Compute standard post-processing values
#' 
#' These values are typically computed by eBayes
#' 
#' @param fit result of dream (MArrayLM2)
#'
#' @return MArrayLM2 object with values computed
#' 
#' @importFrom stats pchisq pf
#' @keywords internal
.standard_transform = function(fit){

	# get values
	coefficients <- fit$coefficients
	stdev.unscaled <- fit$stdev.unscaled
	sigma <- fit$sigma
	df.residual <- fit$df.residual
	
	# t-test
	out = fit
	out$t <- coefficients / stdev.unscaled / sigma
	# out$df.total <- df.residual
	out$p.value <- 2*pt(-abs(out$t),df=df.residual)

	# F-test
	if(!is.null(out$design) && is.fullrank(out$design)) {

		# only evaluate F-stat on real coefficients, not contrasts
		realcoef = colnames(out)[colnames(out) %in% colnames(out$design)]
		realcoef = realcoef[realcoef!="(Intercept)"]

		if( is.null(realcoef) || (length(realcoef) == 0) ){

			# this happens when only the intercept term is included
			warning("No testable fixed effects were included in the model.\n  Running topTable() will fail.")
		}else{
			df = rowMeans(out[,realcoef]$df.residual)

			F.stat <- classifyTestsF(out[,realcoef], df=df, fstat.only=TRUE)
			out$F <- as.vector(F.stat)
			df1 <- attr(F.stat,"df1")
			df2 <- attr(F.stat,"df2")
			if(df2[1] > 1e6){ # Work around bug in R 2.1
				out$F.p.value <- pchisq(df1*out$F,df1,lower.tail=FALSE)
			}else{
				out$F.p.value <- pf(out$F,df1,df2,lower.tail=FALSE)
			}
		}
	}

	out
}
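# Note: dream() stores stdev.unscaled = SE/sigma (see .eval_models above), so the
# t-statistic computed here reduces to the ordinary Wald statistic
#   t = coefficients / SE
# evaluated against the Satterthwaite or Kenward-Roger denominator degrees of
# freedom, rather than an empirical Bayes moderated statistic.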




#' Subsetting for MArrayLM2
#'
#' Enable subsetting on MArrayLM2 objects. Same as for MArrayLM, but also subsets df.residual, df.total and cov.coefficients.list
#'
#' @param object MArrayLM2
#' @param i row
#' @param j col
#'
#' @name [.MArrayLM2
#' @return subset
#' @export
# @S3method [ MArrayLM2
#' @rawNamespace S3method("[", MArrayLM2)
#' @importFrom stats p.adjust
#' @rdname subset.MArrayLM2-method
#' @aliases subset.MArrayLM2,MArrayLM2-method
#' @keywords internal
assign("[.MArrayLM2",
	function(object, i, j){
	if(nargs() != 3){
		stop("Two subscripts required",call.=FALSE)
	}

	# apply standard MArrayLM subsetting
	obj = as(object, 'MArrayLM')

	if(!missing(j)){
		obj = obj[,j]
	}
	if(!missing(i)){
		obj = obj[i,]
	}

	# custom code to deal with df.total and df.residual
	if( is.null(ncol(object$df.total)) ){
		if(!missing(i)){
			obj$df.total = object$df.total[i]
		}else{
			obj$df.total = object$df.total
		}
	}else{			
		tmp = object$df.total
		if(!missing(i)){
			tmp = object$df.total[i,,drop=FALSE]
		}		
		if(!missing(j)){
			tmp = tmp[,j,drop=FALSE]
		}	
		obj$df.total = tmp
	}

	if( is.null(ncol(object$df.residual)) ){
		if(!missing(i)){
			obj$df.residual = object$df.residual[i]
		}else{			
			obj$df.residual = object$df.residual
		}
	}else{	
		tmp = object$df.residual
		if(!missing(i)){
			tmp = object$df.residual[i,,drop=FALSE]
		}		
		if(!missing(j)){
			tmp = tmp[,j,drop=FALSE]
		}	
		obj$df.residual = tmp
	}

	# obj$pValue = object$pValue[i,j]
	obj$s2.prior = object$s2.prior
	obj$df.prior = object$df.prior

	obj = as(obj, "MArrayLM2")

	#  copy gene-specific covariance, if it exists
	if( ! is.null(object$cov.coefficients.list) ){
		if(!missing(i)){
			obj$cov.coefficients.list = object$cov.coefficients.list[i]
		}else{
			obj$cov.coefficients.list = object$cov.coefficients.list
		}
	}	

	if( is.null(obj$df.total)){
		obj$df.total = rowMeans(obj$df.residual)
	}

	# the F-statistic and p-value are evaluated when subsetting is applied
	# so need to apply df2 here
	# If columns have been subsetted, need to re-generate F
	if(!is.null(obj[["F"]]) && !missing(j)) {

		F.stat <- classifyTestsF(obj,df=obj$df.total,fstat.only=TRUE)
		obj$F <- as.vector(F.stat)
		df1 <- attr(F.stat,"df1")
		df2 <- attr(F.stat,"df2")
		if (df2[1] > 1e6){ 
			obj$F.p.value <- pchisq(df1*obj$F,df1,lower.tail=FALSE)
		}else{
			obj$F.p.value <- pf(obj$F,df1,df2,lower.tail=FALSE)
		}
	}
	obj	
})
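# Usage sketch: standard two-index subsetting, e.g.
#   fitSub <- fit[1:10, "Batch3"]
# behaves like subsetting an MArrayLM, but also keeps the gene-specific
# df.residual, df.total and cov.coefficients.list entries in sync, and
# re-computes the F-statistic when columns are dropped.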


setGeneric("eBayes", function(fit, proportion = 0.01, stdev.coef.lim = c(0.1, 4), 
    trend = FALSE, robust = FALSE, winsor.tail.p = c(0.05, 0.1) ){

	eBayes(fit, proportion, stdev.coef.lim, trend, robust, winsor.tail.p  )
})


#' eBayes for MArrayLM2
#'
#' eBayes for MArrayLM2.  It simply returns the original input with no change
#'
#' @param fit fit
#' @param proportion proportion
#' @param stdev.coef.lim stdev.coef.lim
#' @param trend trend
#' @param robust robust
#' @param winsor.tail.p winsor.tail.p 
#'
#' @return results of eBayes
#' @details Note that empirical Bayes is problematic for linear mixed models.  This function returns its original input with no change.  It is included here just for compatibility.
#' @export
#' @rdname eBayes-method
#' @aliases eBayes,MArrayLM2-method
setMethod("eBayes", "MArrayLM2",
function(fit, proportion = 0.01, stdev.coef.lim = c(0.1, 4), 
    trend = FALSE, robust = FALSE, winsor.tail.p = c(0.05, 0.1)){

	warning("Empirical Bayes moderated test is no longer supported for dream analysis\nReturning original results for use downstream")

	fit
})
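# Note: for mixed-model fits the moderated test is intentionally skipped, so
#   fit <- eBayes(fit)
# on an MArrayLM2 object returns the fit unchanged (with a warning), and is kept
# only for compatibility with limma-style pipelines.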



# change_t_df
#
# change_t_df
#
# @param t t-statistic
# @param df current degrees of freedom
# @param d_target target degrees of freedom
#
# @return Given a t-statistic with degrees of freedom df, return a new t-statistic corresponding to a target degrees of freedom.  Both t-statistics give the same p-value when compared to their respective degrees of freedom
.change_t_df = function(t, df, d_target){
  p = pt(abs(t), df, lower.tail=FALSE)
  qt(p, d_target, lower.tail=FALSE)
}
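# A minimal check of the invariant: a t-statistic of 3 with 10 degrees of freedom
# maps to the value giving the same tail probability at 50 degrees of freedom:
#   t_new <- .change_t_df(3, df=10, d_target=50)
#   all.equal(pt(3, 10, lower.tail=FALSE), pt(t_new, 50, lower.tail=FALSE))  # TRUE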

# standardized_t_stat
#
# standardized_t_stat
#
# @param fit model fit from dream()
#
# @return a copy of fit in which each t-statistic has been transformed to a common target degrees of freedom.  The transformed t-statistics give the same p-values as the originals
#
# @details Since dream() uses degrees of freedom estimated from the data, the t-statistics across genes are computed with different df values.  Therefore the order of the raw t-statistics does not correspond to the order of the p-values, since the df differs for each gene.  This function resolves the issue by transforming the t-statistics to all have the same df.  Under a fixed effects model, df = N - p, where N is the sample size and p is the number of covariates.  Here, df is the target degrees of freedom used in the transformation
.standardized_t_stat = function( fit ){

  res = data.frame(t=as.numeric(fit$t), df.sat=as.numeric(fit$df.residual))

  # d_target = max(res$df.sat)
  d_target = nrow(fit$design) - ncol(fit$design)

  res$t_dot = sign(res$t) * .change_t_df( res$t, res$df.sat, d_target )

  fit2 = fit
  fit2$t = matrix(res$t_dot, nrow=nrow(fit$t))
  rownames( fit2$t) = rownames( fit$t)
  colnames( fit2$t) = colnames( fit$t)

  # fit2$df.residual = rep(d_target, nrow(res))

  # if df.prior is defined, set new df.total, since df.residual has changed
  if( !is.null(fit2$df.prior) ){
	fit2$df.total = fit2$df.residual + fit2$df.prior
  }

  fit2
}
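# Usage sketch: fitStd <- .standardized_t_stat(fit) returns a copy of fit whose
# t-statistics are all expressed at the fixed-effect degrees of freedom
# d_target = N - p, so ranking genes by |t| agrees with ranking by p-value.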


#' Compare p-values from two analyses
#'
#' Plot -log10 p-values from two analyses and color based on donor component from variancePartition analysis
#'
#' @param p1 p-value from first analysis
#' @param p2 p-value from second analysis
#' @param vpDonor donor component for each gene from variancePartition analysis
#' @param dupcorvalue scalar donor component from duplicateCorrelation
#' @param fraction fraction of highest/lowest values to use for best fit lines
#' @param xlabel label for x-axis
#' @param ylabel label for y-axis
#'
#' @return ggplot2 plot
#'
#' @examples
#'
#' # load library
#' # library(variancePartition)
#'
#' # Initialize parallel backend with 4 cores
#' library(BiocParallel)
#' register(SnowParam(4))
#'
#' # load simulated data:
#' # geneExpr: matrix of gene expression values
#' # info: information/metadata about each sample
#' data(varPartData)
#' 
#' # Perform very simple analysis for demonstration
#' 
#' # Analysis 1
#' form <- ~ Batch 
#' fit = dream( geneExpr, form, info)
#' fit = eBayes( fit )
#' res = topTable( fit, number=Inf, coef="Batch3" )
#' 
#' # Analysis 2
#' form <- ~ Batch + (1|Tissue)
#' fit2 = dream( geneExpr, form, info)
# fitEB2 = eBayes( fit2 )
#' res2 = topTable( fit2, number=Inf, coef="Batch3" )
#' 
#' # Compare p-values
#' plotCompareP( res$P.Value, res2$P.Value, runif(nrow(res)), .3 )
#' 
# # stop cluster
# stopCluster(cl)
#'
#' @export
# @docType methods
#' @rdname plotCompareP-method
plotCompareP = function( p1, p2, vpDonor, dupcorvalue, fraction=.2, xlabel=bquote(duplicateCorrelation~(-log[10]~p)), ylabel=bquote(dream~(-log[10]~p))){

	if( length(unique(c(length(p1), length(p2), length(vpDonor)))) != 1){
		stop("p1, p2 and vpDonor must have the same number of entries")
	}
	if( length(dupcorvalue) != 1){
		stop("dupcorvalue must be a scalar")
	}

	df2 = data.frame(p1=-log10(p1), p2=-log10(p2), vpDonor=vpDonor, delta = vpDonor - dupcorvalue)

	N = nrow(df2)

	c1 = sort(df2$delta)[N*fraction]
	c2 = sort(df2$delta)[length(df2$delta) - N*fraction]

	l1 = lm(p2 ~ p1, df2[df2$delta >= c2,])
	l2 = lm(p2 ~ p1, df2[df2$delta <= c1,])

	df_line = data.frame(rbind(coef(l1), coef(l2)))
	colnames(df_line) = c('a', 'b')
	df_line$type = c('darkred', 'navy')

	lim = c(0, max(max(df2$p1), max(df2$p2)))

	# xlab("duplicateCorrelation (-log10 p)") + ylab("dream (-log10 p)")
	ggplot(df2, aes(p1, p2, color = vpDonor)) + geom_abline() + geom_point(size=2) + theme_bw(17) +  theme(aspect.ratio=1,plot.title = element_text(hjust = 0.5)) + xlim(lim) + ylim(lim) + xlab(xlabel) + ylab(ylabel) + 
		geom_abline( intercept=df_line$a, slope=df_line$b, color=df_line$type, linetype=2) + scale_color_gradientn(name = "Donor", colours = c("blue","green","red"), 
	                       values = rescale(c(0, dupcorvalue, 1)),
	                       guide = "colorbar", limits=c(0, 1))
}

#' Multiple Testing Genewise Across Contrasts
#' 
#'  For each gene, classify a series of related t-statistics as up, down or not significant.
#' 
#' @param object numeric matrix of t-statistics or an 'MArrayLM2' object from which the t-statistics may be extracted.
#' @param ... additional arguments
#' 
#' @details Works like limma::classifyTestsF, except object can have a list of covariance matrices object$cov.coefficients.list, instead of just one in object$cov.coefficients
#' @seealso limma::classifyTestsF
#' @export
setGeneric("classifyTestsF", signature="object",
  function(object, ...)
      standardGeneric("classifyTestsF")
)



#' Multiple Testing Genewise Across Contrasts
#' 
#'  For each gene, classify a series of related t-statistics as up, down or not significant.
#' 
#' @param object numeric matrix of t-statistics or an 'MArrayLM2' object from which the t-statistics may be extracted.
#' @param cor.matrix covariance matrix of each row of t-statistics.  Defaults to the identity matrix.
#' @param df numeric vector giving the degrees of freedom for the t-statistics.  May have length 1 or length equal to the number of rows of tstat.
#' @param p.value numeric value between 0 and 1 giving the desired size of the test
#' @param fstat.only logical, if 'TRUE' then return the overall F-statistic as for 'FStat' instead of classifying the test results
#' 
#' @details Works like limma::classifyTestsF, except object can have a list of covariance matrices object$cov.coefficients.list, instead of just one in object$cov.coefficients
#' @seealso limma::classifyTestsF
#' @importFrom stats qf
#' @export
setMethod("classifyTestsF", "MArrayLM2",
  function(object,cor.matrix=NULL, df=Inf, p.value=0.01, fstat.only=FALSE) {
#	Use F-tests to classify vectors of t-test statistics into outcomes
#	Gordon Smyth
#	20 Mar 2003.  Last revised 6 June 2009.

#	Method intended for MArrayLM objects but accepts unclassed lists as well
	computeCorrMat = is.null(cor.matrix)
	if(is.list(object)) {
		if(is.null(object$t)) stop("tstat cannot be extracted from object")
		if(missing(df) && !is.null(object$df.prior) && !is.null(object$df.residual)) df <- object$df.prior+object$df.residual
		tstat <- as.matrix(object$t)
	} else {
		tstat <- as.matrix(object)
	}
	ngenes <- nrow(tstat)
	ntests <- ncol(tstat)
	if(ntests == 1) {
		if(fstat.only) {
			fstat <- tstat^2
			attr(fstat,"df1") <- 1
			attr(fstat,"df2") <- df
			return(fstat)
		} else {
			p <- 2 * pt(abs(tstat), df, lower.tail=FALSE)
			return(new("TestResults", sign(tstat) * (p < p.value) ))
		}
	}

	# F test of multiple coefficients
	#-------------------------------

	if( ngenes != length(object$cov.coefficients.list) ){
		msg = paste0("Number of genes does not equal number of elements in cov.coefficients.list\n", ngenes, " != ", length(object$cov.coefficients.list))
		stop(msg)
	}

	fstat <- rep(NA, ngenes) 
	names(fstat) = rownames(tstat)

	result <- matrix(0,ngenes,ntests,dimnames=dimnames(tstat))

	for(i in seq_len(ngenes) ){

		if( computeCorrMat ){
			if( is.null(object$cov.coefficients.list) ){
				C <- object$cov.coefficients
			}else{			
				C <- object$cov.coefficients.list[[i]]
			}
			# subset based on coefficient names
			C = C[colnames(object),colnames(object)]
			cor.matrix <- cov2cor( C )
		}else{
			cor.matrix = cov2cor(cor.matrix)
		}

		# cor.matrix is estimated correlation matrix of the coefficients
		# and also the estimated covariance matrix of the t-statistics
		if(is.null(cor.matrix)){
			r <- ntests
			Q <- diag(r)/sqrt(r)
		}else{
			E <- eigen(cor.matrix,symmetric=TRUE)
			r <- sum(E$values/E$values[1] > 1e-8)
			Q <- limma:::.matvec( E$vectors[,1:r], 1/sqrt(E$values[1:r]))/sqrt(r)
		}

		# Return overall moderated F-statistic only
		if(fstat.only) {
			fstat[i] <- drop( (tstat[i,,drop=FALSE] %*% Q)^2 %*% array(1,c(r,1)) )
		}
		if( i == 1){					
			attr(fstat,"df1") <- r
			attr(fstat,"df2") <- df[i]
		}

		# Return TestResults matrix
		qF <- qf(p.value, r, df[i], lower.tail=FALSE)
		if(length(qF)==1) qF <- rep(qF,ngenes) 
		x <- tstat[i,]
		if(any(is.na(x)))
			result[i,] <- NA
		else
			if( crossprod(crossprod(Q,x)) > qF[i] ) {
				ord <- order(abs(x),decreasing=TRUE)
				result[i,ord[1]] <- sign(x[ord[1]])
				for (j in 2:ntests) {
					bigger <- ord[1:(j-1)]
					x[bigger] <- sign(x[bigger]) * abs(x[ord[j]])
					if( crossprod(crossprod(Q,x)) > qF[i] )
						result[i,ord[j]] <- sign(x[ord[j]])
					else
						break
				}
			}
	}

	if(fstat.only){
		return( fstat )
	}

	new("TestResults",result)
})
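# Usage sketch: with fit returned by dream(), the gene-specific covariance
# matrices stored in fit$cov.coefficients.list are used automatically, e.g.
#   Fstat <- classifyTestsF( fit, fstat.only=TRUE )
# returns the per-gene F-statistic with its df1/df2 attributes.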
