## Azin's note: error handling code adapted from R core team/Martin Maechler
## as noted below.
##================================================================##
### In longer simulations, aka computer experiments, ###
### you may want to ###
### 1) catch all errors and warnings (and continue) ###
### 2) store the error or warning messages ###
### ###
### Here's a solution (see R-help mailing list, Dec 9, 2010): ###
##================================================================##
##' We want to catch *and* save both errors and warnings, and in the case of
##' a warning, also keep the computed result.
##'
##' @title tryCatch both warnings and errors
##' @param expr the expression to evaluate
##' @return a list with 'result', 'error' and 'warning', where
##' 'result' holds the computed value (or NULL if an error was caught).
##' @author Martin Maechler
# Copyright (C) 2010 The R Core Team
tryCatchWarningsErrors <- function(expr)
{
result = NULL
W = NULL
E = NULL
# Warning Handler
warningHandler = function(w){W <<- w; invokeRestart("muffleWarning")}
executed = withCallingHandlers(tryCatch(expr, error=function(e) e), warning = warningHandler)
if("error" %in% class(executed)) {
E = executed
} else {
result = executed
}
final = list(
result = result,
error = E,
warning = W)
return(final)
}
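##
## Illustrative usage sketch (toy expressions, not part of the pipeline), wrapped
## in if(FALSE) so it does not execute when this file is sourced:
##
if(FALSE) {
ok = tryCatchWarningsErrors(log(2)) # $result = 0.693..., $warning and $error are NULL
warn = tryCatchWarningsErrors(log(-1)) # $result = NaN, $warning holds the "NaNs produced" warning
err = tryCatchWarningsErrors(stop("boom")) # $result = NULL, $error holds the condition object
# The stored messages can be recovered with warn$warning$message and err$error$message
}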
fitModel = function(dat, modelFormula, modelEngine="blme", prior="wishart") {
model = NULL
if(modelEngine == "blme") {
# if weights are specified, fit a weighted model, otherwise fit an unweighted model.
if(is.null(dat$W)) {
model = tryCatchWarningsErrors(
blmer(modelFormula, data=dat, REML=FALSE,
control=lmerControl(optimizer="Nelder_Mead"),
fixef.prior = NULL,
var.prior = NULL,
cov.prior = prior)
)
} else {
# Fit a data-weighted hierarchical model with weakly informative priors.
model = tryCatchWarningsErrors(
blmer(modelFormula, data=dat, weights=W, REML=FALSE,
control=lmerControl(optimizer="Nelder_Mead"),
fixef.prior = NULL,
var.prior = NULL,
cov.prior = prior)
)
}
} else {
### Fit linear mixed model using the lme4 package, without any weakly informative priors
# if weights are specified, fit a weighted model, otherwise fit an unweighted model.
if(is.null(dat$W)) {
# Fit a data-weighted hierarchical model with no (ie: uniform) priors.
model = tryCatchWarningsErrors(
lmer(modelFormula,
data=dat,
control=lmerControl(optimizer="Nelder_Mead"),
REML=FALSE)
)
} else {
# Fit a data-weighted hierarchical model with no (ie: uniform) priors.
model = tryCatchWarningsErrors(
lmer(modelFormula,
data=dat,
weights=W,
control=lmerControl(optimizer="Nelder_Mead"),
REML=FALSE)
)
}
}
return(model)
}
###
# Compare 2 nested models using Likelihood Ratio Test, with error handling
###
fitAnova = function(fit0, fit1) {
anov = NULL
anov = tryCatchWarningsErrors(anova(fit0, fit1))
return(anov)
}
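##
## Sketch of the intended likelihood-ratio comparison of two nested fits, mirroring the
## commented-out calls inside fitModelAndFormatOutput() below. Assumes `dat` is a
## long-format data frame with the expr, timeNum, COVARIATE and cellLineId columns used
## by the model formulas in this file; guarded so it does not run on source.
##
if(FALSE) {
m0 = fitModel(dat, expr ~ 1 + timeNum + (timeNum|cellLineId))
m1 = fitModel(dat, expr ~ 1 + timeNum + timeNum:COVARIATE + (timeNum|cellLineId))
lrt = fitAnova(m0$result, m1$result)
summaryAnova(lrt$result) # chisq, chisq_df and pvalue_lrt for the timeNum:COVARIATE term
}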
###
# Extract and return measures of model fit to data (AIC, log-Likelihood)
# These are useful to compare alternate, possibly non-nested, model structures
###
summaryFitLogLik = function(fit) {
if(is.null(fit)) {
lst = data.frame(AIC=NA, logLik=NA)
} else {
aic = extractAIC(fit)[2]
logL = as.numeric(logLik(fit))
lst = data.frame(AIC=aic, logLik=logL)
}
return(lst)
}
summaryFitEffectCoefs = function(fit) {
colcount = 8
if(is.null(fit)) {
lst = data.frame(matrix(NA, nrow=1, ncol=colcount))
} else {
coefs = coef(summary(fit))
rownames(coefs) = gsub("I\\(.*\\^2\\)", "timeNum2", rownames(coefs))
rownames(coefs) = gsub("[\\(\\)]", "", rownames(coefs))
rownames(coefs) = gsub(":", "_", rownames(coefs))
rownames(coefs) = gsub("COVARIATE2", "", rownames(coefs))
rownames(coefs) = gsub("COVARIATE", "", rownames(coefs))
colnames(coefs) = c("estimate", "SE", "t")
flat = list()
for(i in 1:nrow(coefs)) {
currRow = coefs[i,,drop=F]
colnames(currRow) = paste(rownames(currRow), colnames(currRow), sep="_")
flat[[i]] = currRow
}
lst = do.call(cbind,flat)
cols = tolower(colnames(lst))
cols = gsub("timenum_(estimate|t|se)$", "dropout_rate_\\1", cols)
# All other estimates are differences relative to the baseline covariate level
cols = gsub("timenum_(.*)_(estimate|t|se)$", "dropout_rate_diff_\\1_\\2", cols)
cols = gsub("timenum2_(estimate|t|se)$", "dropout_quadratic_\\1", cols)
# All other estimates are differences relative to the baseline covariate level
cols = gsub("timenum2_(.*)_(estimate|t|se)$", "dropout_quadratic_diff_\\1_\\2", cols)
colnames(lst) = cols
}
return(lst)
}
summaryAnova = function(anov) {
colcount = 3
if(is.null(anov)) {
lst = data.frame(matrix(NA, nrow=1, ncol=colcount))
} else {
chisq = round(anov$`Chisq`[2], digits=2)
dfs = anov$`Chi Df`[2]
pvalue_lrt = anov$`Pr(>Chisq)`[2]
lst = data.frame(chisq, dfs, pvalue_lrt)
}
colnames(lst) = c("chisq", "chisq_df", "pvalue_lrt")
return(lst)
}
###
# Common model fitting function called for any model formula.
# Other functions are responsible for defining the type of model
# to fit, and calling this function to do the grunt work.
###
fitModelAndFormatOutput = function(dat,
modelFormula,
#modelFormulaNoCovariate,
modelType,
pvalueType="gaussian",
modelEngine="blme",
prior="wishart",
outputLevel=c("summary")) {
allLevels = c("minimal", "summary", "full")
requestedLevels = outputLevel
outputLevel = intersect(outputLevel, allLevels)
if(length(outputLevel) < 1) {
specified = paste(requestedLevels, sep="", collapse=", ")
allowed = paste(allLevels, sep="", collapse=", ")
msg = paste("ERROR in fitModelAndFormatOutput():",
"\n\nNo valid output levels specified. Specified levels: ", specified,
"\n\nOutput levels must be some combination of: ", allowed,
"\n\n\n\n", sep = "")
stop(msg)
}
modelTypes = c("isogenicRg", "isogenicGene", "panelGene", "panelRg")
if(!(modelType %in% modelTypes)) {
allowed = paste(modelTypes, sep="", collapse=", ")
msg = paste("ERROR in fitModelAndFormatOutput():",
"\n\nInvalid model type specified: ", modelType,
"\n\nModel type must be one of: ", allowed,
"\n\n\n\n", sep = "")
stop(msg)
}
# Evaluate model fit with and without the timeNum parameter to evaluate
# dropout significance
# f0 = modelFormulaNoCovariate
f1 = modelFormula
# m0 = NA
m1 = NA
minimalSummary = NA
finalSummary = NA
# Weakly informative priors are only supported by the blme package, not lme4, so remove any prior specification
# in case the model engine is different.
if(modelEngine != "blme") {
prior = NULL
}
# While currently disabled, we also have the option of fitting models with and without the covariate
# and perform a LRT to determine significance.
# m0 = fitModel(dat, f0, prior)
m1 = fitModel(dat, f1, modelEngine=modelEngine, prior=prior)
# Extract and capture any warning/error messages that were returned during model fitting process.
warningMsg = ifelse(is.null(m1$warning), "", m1$warning$message)
errorMsg = ifelse(is.null(m1$error), "", m1$error$message)
if(!is.null(m1$result)) {
# anov01 = fitAnova(m0$result, m1$result)
summaryFitMetrics = summaryFitLogLik(m1$result)
summaryFixedEffects = summaryFitEffectCoefs(m1$result)
# P-value calculations based on the t distribution require Denominator Degrees of Freedom (ddf)
# values, which are calculated using a heuristic and are valid only for the default siMEM model structures.
# For analyses containing hundreds or thousands of measurements, the two approaches produce very similar results,
# since the t distribution with high ddf values (> 30 to 50) is well-approximated by a gaussian, at least for the
# vast bulk of non-extreme p-values, which determine the FDR adjustment used to set significance thresholds.
# The extreme p-values will rank at the head of the list and survive FDR adjustment regardless of the approach used.
if(pvalueType=="t") {
ddf = withinBetweenDF(dat, modelType=modelType)
summaryFixedEffectsPValues = getModelPValues(summaryFixedEffects, pvalueType, ddf)
} else {
summaryFixedEffectsPValues = getModelPValues(summaryFixedEffects, pvalueType)
}
# summaryAOV = summaryAnova(anov01$result)
finalSummary = cbind.data.frame(summaryFitMetrics,
summaryFixedEffects,
# summaryAOV,
summaryFixedEffectsPValues)
if("minimal" %in% outputLevel) {
minimalSummary = getMinimalSummary(finalSummary)
}
}
final = list()
if("minimal" %in% outputLevel) {
final[["minimal"]] = minimalSummary
}
if("summary" %in% outputLevel) {
final[["summary"]] = finalSummary
}
if("full" %in% outputLevel) {
final[["full"]] = m1
}
final[["warning"]] = warningMsg
final[["error"]] = errorMsg
return(final)
}
###
#
# Fit a single hairpin model for sets of isogenic screens. Isogenic screens are screens performed on the same
# underlying cell line, e.g., cell line +/- drug, or cell line +/- introduced mutation. The presumption being
# that observed differences are due to the perturbation. We don't group by cell line in an isogenic analysis,
# since the underlying cell line is the same.
#
###
fitIsogenicCovariateRg = function(dat,
formulas=getModelFormulas(),
pvalueType="gaussian",
modelEngine="blme",
outputLevel=c("summary")) {
f1 = formulas$isogenicRg
# look for the "timeNum" parameter in the model formula to determine whether we're doing an end-point
# or short time-course analysis
equationHasTime = grep("timeNum", as.character(f1))
# If the parameter is not found in the equation, integer(0) is returned, and has length 0
equationHasTime = length(equationHasTime) > 0
if(endPoint == TRUE | !equationHasTime) {
prior = "gamma"
} else {
prior = "wishart"
}
final = fitModelAndFormatOutput(dat=dat,
modelFormula=f1,
modelType="isogenicRg",
pvalueType=pvalueType,
modelEngine=modelEngine,
prior=prior,
outputLevel=outputLevel)
return(final)
}
fitIsogenicCovariateGene = function(dat,
formulas=getModelFormulas(),
pvalueType="gaussian",
modelEngine="blme",
outputLevel=c("summary")) {
f1 = formulas$isogenicGene
# look for the "timeNum" parameter in the model formula to determine whether we're doing an end-point
# or short time-course analysis
equationHasTime = grep("timeNum", as.character(f1))
# If the parameter is not found in the equation, integer(0) is returned, and has length 0
equationHasTime = length(equationHasTime) > 0
if(endPoint == TRUE | !equationHasTime) {
prior = "gamma"
} else {
prior = "wishart"
}
final = fitModelAndFormatOutput(dat=dat,
modelFormula=f1,
modelType="isogenicGene",
pvalueType=pvalueType,
modelEngine=modelEngine,
prior=prior,
outputLevel=outputLevel)
return(final)
}
fitPanelCovariateGene = function(dat,
formulas=getModelFormulas(),
pvalueType="gaussian",
modelEngine="blme",
outputLevel=c("summary"),
endPoint=FALSE) {
f1 = formulas$panelGene
# look for the "timeNum" parameter in the model formula to determine whether we're doing an end-point
# or short time-course analysis
equationHasTime = grep("timeNum", as.character(f1))
# If the parameter is not found in the equation, integer(0) is returned, and has length 0
equationHasTime = length(equationHasTime) > 0
if(endPoint==TRUE | !equationHasTime) {
prior = "gamma"
} else {
prior = "wishart"
}
final = fitModelAndFormatOutput(dat=dat,
modelFormula=f1,
modelType="panelGene",
pvalueType=pvalueType,
modelEngine=modelEngine,
prior=prior,
outputLevel=outputLevel)
return(final)
}
###
# Fit a single reagent differential essentiality model assayed in multiple cell lines with heterogenous background
# This produces hairpin-level results
###
fitPanelCovariateRg = function(dat,
formulas=getModelFormulas(),
pvalueType="gaussian",
modelEngine="blme",
outputLevel=c("summary"),
endPoint=FALSE) {
f1 = formulas$panelRg
# look for the "timeNum" parameter in the model formula to determine whether we're doing an end-point
# or short time-course analysis
equationHasTime = grep("timeNum", as.character(f1))
# If the parameter is not found in the equation, integer(0) is returned, and has length 0
equationHasTime = length(equationHasTime) > 0
if(endPoint == TRUE | !equationHasTime) {
prior = "gamma"
} else {
prior = "wishart"
}
final = fitModelAndFormatOutput(dat=dat,
modelFormula=f1,
modelType="panelRg",
pvalueType=pvalueType,
modelEngine=modelEngine,
prior=prior,
outputLevel=outputLevel)
return(final)
}
###
# Combine the fixed effects with different (simpler) random effects structures
# and fit all the alternate model structures using the data.
# Pick the best fitting result that didn't produce a model fitting error or warning
# This should eventually use an anova-based step function, like the LMERConvenienceFunctions
# package, but this package seems to add a lot of unnecessary junk to the model formula
# (redundant ranef statements, etc...) as a result of its forward fitting procedure
###
fitSimplerRandomEffects = function( dat,
fixef1,
randomVector,
modelType,
pvalueType="gaussian",
modelEngine="blme",
prior="wishart",
outputLevel) {
formula1 = paste0(fixef1, " + ", randomVector)
models = cbind.data.frame(formula1, randomEffect=randomVector, stringsAsFactors=F)
allFits = list()
allLogL = list()
for(i in 1:nrow(models)) {
f1 = as.formula(models[i,"formula1"])
currFit = fitModelAndFormatOutput(dat=dat,
modelFormula=f1,
modelType=modelType,
modelEngine=modelEngine,
prior=prior,
outputLevel="summary")
allFits[[i]] = currFit
logLikelihood = ifelse(sum(is.na(currFit$summary)) > 0, NA, currFit[["summary"]][1,"logLik"])
allLogL[[i]] = cbind.data.frame(index=i,
logL=logLikelihood,
warn=currFit$warning,
error=currFit$error,
stringsAsFactors=F)
}
allLogL = do.call(rbind, allLogL)
noErrors = allLogL[allLogL$warn == "" & allLogL$error == "", ]
if(nrow(noErrors) > 0) {
bestModel = noErrors[which.max(noErrors$logL), "index"]
final = allFits[[bestModel]]
randomEffect = models[bestModel, "randomEffect"]
final$warning = paste0("NOTE: Due to error/warning resulting from full model fit, model with simpler random effect structure used: ", randomEffect)
} else {
final = NA
}
return(final)
}
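##
## Usage sketch with hypothetical fixed-effect and random-effect strings (the candidate
## random-effect terms below are illustrative, ordered from more to less complex); `dat`
## follows the same layout assumed elsewhere in this file. Guarded so it does not run on source.
##
if(FALSE) {
fixef1 = "expr ~ 1 + timeNum + timeNum:COVARIATE"
randomVector = c("(timeNum|reagentId) + (1|cellLineId)",
"(timeNum|reagentId)",
"(1|reagentId)")
fallback = fitSimplerRandomEffects(dat, fixef1, randomVector,
modelType="panelGene",
pvalueType="gaussian",
modelEngine="blme",
prior="wishart",
outputLevel="summary")
# fallback$warning notes which simpler random-effect structure was retained
}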
##
# Extract a subset of the columns from the full output, namely:
# - basic identifiers
# - for difference(s) in trends: magnitude & pvalue
# - relative dropout rate if available
##
getMinimalSummary = function(modelSummary, calculateRelativeDropout=FALSE, endPoint=FALSE) {
cols = colnames(modelSummary)
ids = intersect(c("reagentId", "geneId", "symbol"), cols)
if(endPoint==TRUE) {
magnitudes = cols[grep("^.*_estimate$", cols)]
pvalues = cols[grep("^.*_pvalue$", cols)]
# Because of end-point results formatting, need to explicitly exclude intercept values from the summary.
magnitudes = setdiff(magnitudes, "intercept_estimate")
pvalues = setdiff(pvalues, "intercept_pvalue")
minimalSummary = modelSummary[,c(ids, magnitudes, pvalues)]
} else {
dropouts = cols[grep("^dropout_rate_diff_.*estimate$", cols)]
pvalues = cols[grep("^dropout_rate_diff_.*pvalue$", cols)]
if(calculateRelativeDropout) {
relatives = cols[grep("^relative_dropout_rate_.*$", cols)]
minimalSummary = modelSummary[,c(ids, dropouts, relatives, pvalues)]
} else {
minimalSummary = modelSummary[,c(ids, dropouts, pvalues)]
}
}
return(minimalSummary)
}
##
# Instead of absolute difference, examine the difference relative to dropout rate
# for the category used as comparison baseline
# Ensure that the denominator of the ratio has a minimal non-zero value to prevent
# huge or undefined ratios due to flat dropout rate for the baseline category.
##
getRelativeDropoutRates = function(results, geneOrReagent="gene") {
# quant = ifelse(geneOrReagent == "gene", 0.75, 0.5)
# Use the median of the baseline dropout rates as the minimum
quant = 0.5
baselineColumn = "dropout_rate_estimate"
comparisonColumns = colnames(results)[grep("^dropout_rate_diff_.*_estimate$", colnames(results))]
relativeDropoutColumns = gsub("^dropout_rate_diff_(.*)_estimate$", "relative_dropout_rate_\\1", comparisonColumns)
# Calculate relative dropout rates for each non-baseline category and add them
# to the results matrix
for(i in 1:length(comparisonColumns)) {
results[,relativeDropoutColumns[i]] = getRelativeDropout(results[,baselineColumn], results[,comparisonColumns[i]], quant=quant)
}
return(results)
}
getRelativeDropout = function(baseline, delta, quant=0.5) {
if(quant < 0.01 | quant > 0.99) {
quant = 0.5
}
# A minimum value for the ratio denominator so that the ratio doesn't blow up
dropoutFloor = abs(quantile(baseline, probs=quant, na.rm=TRUE))
# More or less lethal?
deltaSign = sign(delta)
alternative = baseline + delta
# Without na.last=T in the sort call, rows containing NAs are dropped by sort(),
# the per-row results have unequal lengths, and the casting to matrix/DF fails.
minMax = t(apply(cbind(baseline, alternative), 1, sort, na.last=T))
minMax = as.data.frame(minMax)
colnames(minMax) = c("numerator", "denominator")
# If the greater of the two trends is above -dropoutFloor, shift both trends down so
# that the greater one sits at -dropoutFloor.
shifts = ifelse(minMax$denominator > -dropoutFloor, -(minMax$denominator+dropoutFloor), 0)
minMax$numerator = minMax$numerator + shifts
minMax$denominator = minMax$denominator + shifts
# Set minimal magnitude of ratio denominator to avoid extreme/undefined ratios
# minMax$denominator = ifelse(minMax$denominator > dropoutFloor, dropoutFloor, minMax$denominator)
# Negative relative dropouts are more essential in the second condition
# whereas positive relative dropouts are more essential in the baseline condition
ratio = deltaSign * abs(minMax$numerator/minMax$denominator)
return(ratio)
}
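##
## Worked toy example with made-up dropout slopes (guarded so it does not run on source):
##
if(FALSE) {
baseline = c(-1.0, -0.2) # dropout rate in the baseline condition
delta = c(-0.5, 0.4) # estimated difference in the comparison condition
# dropoutFloor = |median(baseline)| = 0.6, so the second row's denominator is shifted to -0.6
getRelativeDropout(baseline, delta)
# Expected: approximately -1.50 and +1.67 (negative = more essential in the comparison
# condition, positive = more essential in the baseline condition)
}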
##
# Given the t-values, and calculated degrees of freedom, generate pvalues for
# model fixed effects.
#
# For the t-based p-value calculation, we need to specify the Denominator Degrees of Freedom
# which is specified for the siMEM model structure using the Between-Within heuristic
# (as implemented in the 'nlme' R package)
##
getModelPValues = function(fixedEffects, pvalueType="gaussian", ddf=NULL) {
# Get the t-statistics output by the model
summaryT = unlist(fixedEffects[,grep("_t$", colnames(fixedEffects)), drop=T])
cols = names(summaryT)
# Generate column names for p-values
colsPvalues = gsub("_t$", "_pvalue", cols)
# For the current siMEM model structure, all 3 parameter (intercept, baseline slope, slope difference) ddfs are identical
# This will not be the case, for example, if we have a 4 parameter (+ COVARIATE) model
pvalueCount = length(cols)
if(pvalueType=="t") {
colsDF = gsub("_t$", "_df", cols)
commonDF = ddf[["intercept"]]
matrixDF = rep(commonDF, pvalueCount)
pvalues = 2*pt(-abs(summaryT), df=commonDF)
pvalues = data.frame(t(c(matrixDF, pvalues)))
colnames(pvalues) = c(colsDF, colsPvalues)
} else {
# Here we rely on the Gaussian approximation of the t distribution, which improves as ddf increases past 30-50
pvalues = 2*pnorm(-abs(summaryT))
pvalues = data.frame(t(pvalues))
colnames(pvalues) = colsPvalues
}
return(pvalues)
}
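##
## Toy illustration using a hand-made one-row fixed-effects table (column names are
## hypothetical but follow the "_t" convention produced by summaryFitEffectCoefs());
## guarded so it does not run on source.
##
if(FALSE) {
fx = data.frame(intercept_t = 1.2, dropout_rate_t = -4.1, dropout_rate_diff_mut_t = 2.8)
# Gaussian approximation: no ddf needed
getModelPValues(fx, pvalueType="gaussian")
# t-based p-values need the Between-Within ddf, normally ddf = withinBetweenDF(dat, modelType=...)
getModelPValues(fx, pvalueType="t", ddf=list(intercept=150))
}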
##
# Apply Benjamini-Hochberg adjustment to each P-value column in the results
##
applyFDR = function(results) {
if(is.data.frame(results) && nrow(results) > 1) {
pvals = results[,grep("pvalue", colnames(results)),drop=F]
cols = colnames(pvals)
colsFDR = gsub("pvalue", "fdr", cols)
fdr = apply(pvals, 2, p.adjust, method="BH")
colnames(fdr) = colsFDR
results = cbind.data.frame(results, fdr, stringsAsFactors=F)
}
return(results)
}
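##
## Minimal demonstration on a toy results table (guarded so it does not run on source):
##
if(FALSE) {
res = data.frame(geneId = c("g1", "g2", "g3"),
dropout_rate_diff_mut_pvalue = c(0.001, 0.02, 0.8),
stringsAsFactors = FALSE)
applyFDR(res) # adds dropout_rate_diff_mut_fdr with Benjamini-Hochberg adjusted values
}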
##
# given a list of T values, and associated degrees of freedom, calculate the
# associated pvalues
##
pvaluesT = function(tvalues, dfs) {
# if for whatever reason, the nlme call failed, use the smallest degrees of
# freedom (most "conservative") to get pvalue associated with each T value
mindf = min(dfs)
dfs = ifelse(dfs == -1, mindf, dfs)
tvalues = -abs(tvalues)
vect = cbind.data.frame(tvalues, dfs)
pvalues = apply(vect, 1, function(v){2*pt(v[1], df=v[2])})
return(pvalues)
}
##
# For larger T degrees of freedom (Rule of Thumb, T DoF > 30-50)
# T distribution is well-approximated by Gaussian
# so T statistic is treated as Z statistic and p-value computed using Gaussian
# approximation to T distribution with large degrees of Freedom
#
# For a panel of 20+ screens
# Estimated degrees of freedom for panel analysis range in the hundreds to thousands
# so large-sample approximation is applicable to panel analysis, either at reagent or gene level
##
pvaluesNorm = function(tvalues) {
tvalues = -abs(tvalues)
pvalues = 2*pnorm(tvalues)
return(pvalues)
}
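##
## Quick numeric check of the approximation (guarded so it does not run on source):
##
if(FALSE) {
t_stat = 2.5
2 * pt(-abs(t_stat), df = 200) # ~0.013 with a large-ddf t distribution
2 * pnorm(-abs(t_stat)) # ~0.012 with the gaussian approximation
}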
getVarianceWeights = function(dat, min_var = 0.01, max_var=2) {
dat$grouping = paste(dat$reagentId, dat$cellLineId, dat$timeGroup, dat$COVARIATE, sep="-")
vars = by(dat$expr, list(dat$grouping), var, na.rm=TRUE)
vars = cbind.data.frame(names(vars), as.numeric(vars), stringsAsFactors=F)
colnames(vars) = c("grouping", "var")
dat = merge(dat, vars)
dat$var_bounded = dat$var
dat$var_bounded = ifelse(dat$var_bounded < min_var, min_var, dat$var_bounded)
dat$var_bounded = ifelse(dat$var_bounded > max_var, max_var, dat$var_bounded)
# Weights are inversely proportional to the variance, multiplied by a scaling
# factors s.t. the total sum of weights is still the number of rows/data points
# without this, the significance values change substantially.
dat$W = (nrow(dat)/sum(1/dat$var_bounded, na.rm=TRUE)) * (1/dat$var_bounded)
return(dat)
}
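##
## The scaling above keeps sum(W) equal to the number of data points, so the effective
## sample size is unchanged. A minimal check with toy per-group variances (guarded so it
## does not run on source):
##
if(FALSE) {
v = c(0.05, 0.2, 1.5, 3.0)
v = pmin(pmax(v, 0.01), 2) # same bounding as above (min_var=0.01, max_var=2)
W = (length(v) / sum(1/v)) * (1/v)
sum(W) # equals length(v) = 4
}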
##
# For the siMEM model structures, calculate the Denominator Degrees of Freedom
# using the Between-Within heuristic
#
# DDF = m_i - m_(i-1) - p_i
# by convention, the DDF of the intercept is evaluated at the lowest level
# the time and time:COVARIATE covariates are also evaluated at this level, since there
# are multiple -different- timed measurements associated with each screen, hence
# time and time:COVARIATE are "WITHIN/INNER" to the screenId grouping level
#
##
withinBetweenDF = function(dat, modelType="panelGene") {
ddf = list()
if(modelType == "panelGene") {
denDFi = nrow(dat) - length(unique(dat$cellLineIdUnique)) - length(unique(dat$COVARIATE))
# DDF = m_i - m_(i-1) - p_i
# For Panel analyses, the COVARIATE is a cell-line specific variable, hence changes at
# the level of cellLineId, but not within replicate measures of a given cellLineId
denDFdelta = length(unique(dat$cellLineIdUnique)) - length(unique(dat$reagentId)) - length(unique(dat$COVARIATE)) + 1
ddf[["intercept"]] = denDFi
ddf[["interceptdelta"]] = denDFdelta
ddf[["timenum"]] = denDFi
} else if(modelType == "panelRg") {
denDFi = nrow(dat) - length(unique(dat$cellLineIdUnique)) - length(unique(dat$COVARIATE))
# Here, the m_(i-1) = m_0 = 1 by convention, since the model includes an intercept term.
denDFdelta = length(unique(dat$cellLineIdUnique)) - 1 - length(unique(dat$COVARIATE)) + 1
ddf[["intercept"]] = denDFi
ddf[["interceptdelta"]] = denDFdelta
ddf[["timenum"]] = denDFi
} else if(modelType == "isogenicGene") {
# DDF = m_i - m_(i-1) - p_i
# by convention, the DDF of the intercept is evaluated at the lowest level
# the time and time:COVARIATE covariates are also evaluated at this level, since there
# are multiple -different- array expression values associated with each screen, hence
# time and time:COVARIATE are "WITHIN/INNER" to the screenId grouping level
denDFi = nrow(dat) - length(unique(dat$replicateIdUnique)) - length(unique(dat$COVARIATE))
# For isogenic screens, the COVARIATE variable itself changes at the level of replicateIdUnique,
# so the DDF is the same as for the intercept
denDFdelta = denDFi
ddf[["intercept"]] = denDFi
ddf[["interceptdelta"]] = denDFdelta
ddf[["timenum"]] = denDFi
} else if(modelType == "isogenicRg") {
# DDF = m_i - m_(i-1) - p_i
# by convention, the DDF of the intercept is evaluated at the lowest level
# the time and time:COVARIATE covariates are also evaluated at this level, since there
# are multiple -different- array expression values associated with each screen, hence
# time and time:COVARIATE are "WITHIN/INNER" to the screenId grouping level
denDFi = nrow(dat) - length(unique(dat$replicateIdUnique)) - length(unique(dat$COVARIATE))
# For isogenic screens, the COVARIATE variable itself changes at the level of replicateIdUnique,
# so the DDF is the same as for the intercept
denDFdelta = denDFi
ddf[["intercept"]] = denDFi
ddf[["interceptdelta"]] = denDFdelta
ddf[["timenum"]] = denDFi
} else {
msg = paste("ERROR in withinBetweenDF():",
"\n\nUnrecognized model type Between-Within Degrees of Freedom calculation: ", modelType, sep="")
stop(msg)
}
return(ddf)
}
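##
## Worked example of the Between-Within arithmetic for a hypothetical panel layout
## (5 reagents x 20 cell lines x 2 timepoints x 3 replicates, binary COVARIATE, with
## cellLineIdUnique taken as the reagent/cell-line combination); guarded so it does
## not run on source.
##
if(FALSE) {
toy = expand.grid(reagentId = paste0("rg", 1:5),
cellLineId = paste0("cl", 1:20),
timeGroup = c("T1", "T2"),
replicate = 1:3,
stringsAsFactors = FALSE)
toy$cellLineIdUnique = paste(toy$reagentId, toy$cellLineId, sep="_")
toy$COVARIATE = ifelse(toy$cellLineId %in% paste0("cl", 1:10), "wt", "mut")
withinBetweenDF(toy, modelType="panelGene")
# intercept/timenum ddf: 600 - 100 - 2 = 498; interceptdelta ddf: 100 - 5 - 2 + 1 = 94
}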
##
# Get per-row mean-variance for a set of replicates (eg: same screen, timepoint)
##
getReplicateMeanVar = function(reps) {
if(ncol(reps) > 1) {
mu = rowMeans(reps, na.rm=TRUE)
variance = rowVars(reps, na.rm=T)
} else {
# Since variance is used to weight measurements, setting variance to 1 by default
# will lead to all measurements being equally weighted
mu = reps[,1]
variance = rep(1, length(mu))
}
final = cbind.data.frame(mu=mu, variance=variance)
return(final)
}
##
# smoothing with locfit causes segfault in locfit C code for some data (Project Achilles Cowley 2014 in particular)
##
getMeanVar = function(exprs, pheno, minVar=0.1, smoothing = 0.5, method="loess", variableMap=getDefaultVariableMap(), printProgress=FALSE) {
meanVar = list()
meanVarFit = list()
meanVarPredicted = list()
message("Beginning calculation of replicate means, and smoothed mean-variance function for grouped/replicate samples...")
# Identify the column that identifies biological replicates for the mean-variance
# calculation, and calculate the mean-variance function empirically, using local regression
for(grp in unique(pheno[,variableMap$replicateGroupId])) {
if(printProgress) {
message(grp)
}
# Get the unique sample identifiers for all cell line/timepoint replicates
samples = pheno[pheno[,variableMap$replicateGroupId] == grp, variableMap$sampleId]
sampleDat = exprs[,samples,drop=F]
meanVarTable = getReplicateMeanVar(sampleDat)
if(length(samples) > 1) {
if(method == "locfit") {
# the locfit C code sometimes segfaults - possibly when too many mean-variance pairs are identical
meanVarFitted = locfit(variance ~ mu, data=meanVarTable, family="gamma", alpha=smoothing)
# To get the precision (inverse-variance) weights, first get the smoothed mean-variance fit at the given mean.
meanVarTable$variance_smoothed = fitted(meanVarFitted, data=meanVarTable)
} else {
meanVarFitted = loess(variance ~ mu, data=meanVarTable, span=smoothing)
# To get the precision (inverse-variance) weights, first get the smoothed mean-variance fit at the given mean.
meanVarTable$variance_smoothed = predict(meanVarFitted, newdata=meanVarTable)
}
} else {
# In the case of a single replicate, set the mean as the values of that replicate, and the variance to 1
# inverse-variance weights will then be equal to 1, ie: equal weights.
meanVarFitted = NA
meanVarTable$variance_smoothed = meanVarTable$variance
}
meanVarTable$mu = round(meanVarTable$mu, digits=2)
meanVarTable$variance = round(meanVarTable$variance, digits=3)
meanVarTable$variance_smoothed = round(meanVarTable$variance_smoothed, digits=3)
meanVarTable$variance_smoothed = ifelse(meanVarTable$variance_smoothed < minVar, minVar, meanVarTable$variance_smoothed)
meanVar[[grp]] = meanVarTable
meanVarFit[[grp]] = meanVarFitted
}
final = list()
final[["mean_var"]] = meanVar
final[["mean_var_fit"]] = meanVarFit
return(final)
}
##
# Calculate smoothed mean-variance curves for each set of grouped/replicate samples, and
# assign inverse-variance weights to each data data point in the matrix
##
addPrecisionWeights = function(eset, variableMap = getDefaultVariableMap(), minVar=0.01, smoothing=0.05, printProgress=FALSE) {
shrna = exprs(eset)
pheno = pData(eset)
fdat = fData(eset)
meanVar = getMeanVar(shrna, pheno, minVar=minVar, smoothing=smoothing, variableMap=variableMap, printProgress=printProgress)
message("Finalizing mean-variance function calculations and assigning precision weights to data points...")
groupMeans = lapply(meanVar[[1]], function(dataf) {return(dataf[,"mu"])} )
groupMeans = t(do.call(rbind, groupMeans))
groupMeans = groupMeans[,pheno[,variableMap$replicateGroupId]]
colnames(groupMeans) = pheno[,variableMap$sampleId]
rownames(groupMeans) = rownames(shrna)
groupVars = lapply(meanVar[[1]], function(dataf) {return(dataf[,"variance"])} )
groupVars = t(do.call(rbind, groupVars))
groupVars = groupVars[,pheno[,variableMap$replicateGroupId]]
colnames(groupVars) = pheno[,variableMap$sampleId]
rownames(groupVars) = rownames(shrna)
varWeights = lapply(meanVar[[1]], function(dataf) {return(dataf[,"variance_smoothed"])} )
varWeights = t(do.call(rbind, varWeights))
# duplicate the weights for columns containing replicates for the same timepoint
varWeights = varWeights[,pheno[,variableMap$replicateGroupId]]
colnames(varWeights) = pheno[,variableMap$sampleId]
rownames(varWeights) = rownames(shrna)
varWeights = 1/varWeights
varWeights = round(varWeights, digits=2)
# Add the means and smoothed precision weights to the ExpressionSet in additional assayData slots
# These slots will be accessed if the models are fit with precision weights to mitigate heteroscedasticity.
storageMode(eset) = "environment"
assayData(eset)[["varWeights"]] = varWeights
assayData(eset)[["groupMeans"]] = groupMeans
assayData(eset)[["groupVars"]] = groupVars
storageMode(eset) = "lockedEnvironment"
message("Done.")
return(eset)
}
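##
## Usage sketch (assumes `screens` is a Biobase ExpressionSet whose phenoData columns
## match getDefaultVariableMap()); guarded so it does not run on source.
##
if(FALSE) {
library(Biobase)
screens = addPrecisionWeights(screens, minVar=0.01, smoothing=0.05)
ls(assayData(screens)) # now includes "varWeights", "groupMeans" and "groupVars"
}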
##
# Fit a local regression to estimate the posterior probability of "signal"
# as a function of measurement intensity
##
getSignalProb = function(signalProb) {
# Fit a natural spline to the (intensity, probability) pairs and return the resulting
# function, which interpolates the posterior probability for intensity values not
# explicitly covered in the input signalProb table (column 1: intensity, column 2: probability).
fit = splinefun(signalProb[,1], signalProb[,2], method="natural")
return(fit)
}
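##
## Toy interpolation example with a hypothetical intensity/probability table
## (guarded so it does not run on source):
##
if(FALSE) {
sp = data.frame(intensity = c(2, 4, 6, 8, 10),
prob = c(0.05, 0.30, 0.80, 0.95, 0.99))
posterior = getSignalProb(sp)
posterior(c(3, 5, 7)) # spline-interpolated signal probabilities between tabulated intensities
}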
addSignalProbWeights = function(eset, signalProbTable, variableMap=getDefaultVariableMap()) {
posterior = getSignalProb(signalProbTable)
groupMeans = assayData(eset)[["groupMeans"]]
pheno = pData(eset)
# If the per-group means have not yet been added to the ExpressionSet, compute them first
# (addPrecisionWeights() above populates the "groupMeans" assayData slot using the variable map).
if(is.null(groupMeans)) {
eset = addPrecisionWeights(eset, variableMap=variableMap)
groupMeans = assayData(eset)[["groupMeans"]]
}
if(length(grep(variableMap$screenId, colnames(pheno))) == 0) {
pheno[,variableMap$screenId] = paste(pheno[,variableMap$cellLineId], pheno[,variableMap$condition], sep="_")
pheno[,variableMap$screenId] = gsub("_$", "", pheno[,variableMap$screenId])
print(table(pheno[,variableMap$screenId]))
}
# Order the samples by sample id, then timepoint
meta = pheno[order(pheno[,variableMap$screenId], pheno[,variableMap$timeGroup]),]
tempMx = matrix(1, nrow=nrow(groupMeans), ncol=ncol(groupMeans))
rownames(tempMx) = rownames(groupMeans)
colnames(tempMx) = colnames(groupMeans)
message("Begin calculating signal probability weights...")
for(screenId in unique(meta[,variableMap$screenId])) {
message(screenId)
# Reset screen-specific intermediate variables
posteriorProd = rep(1, nrow(groupMeans))
previousProb = rep(1, nrow(groupMeans))
# previousProb = 1:nrow(groupMeans)
timePoints = unique(meta[ meta[,variableMap$screenId] == screenId, variableMap$timeGroup])
for(timept in timePoints) {
samples = meta[ meta[,variableMap$screenId] == screenId & meta[,variableMap$timeGroup] == timept, variableMap$sampleId]
repeated = matrix(rep(previousProb, times=length(samples)),
nrow=length(previousProb), ncol=length(samples))
colnames(repeated) = samples
# Store signal weights for this timepoint
tempMx[,samples] = repeated
# Get the posterior signal using the mean of all replicates
# The mean is repeated for all replicate samples, so just take 1st one
posteriorProb = posterior(groupMeans[,samples[1],drop=T])
# Keep a running product of probabilities from all previous timepoints
previousProb = previousProb * posteriorProb
previousProb = ifelse(previousProb < 0.01, 0.01, previousProb)
}
# stop("...")
}
# Ensure that the rows and columns match, so we don't incorrectly map weights
# table(rownames(tempMx) == rownames(groupMeans))
# table(colnames(tempMx) == colnames(groupMeans))
message("Finalizing signal probability weights calculations ...")
tempMx = round(tempMx, digits=3)
storageMode(eset) = "environment"
assayData(eset)[["signalProbWeights"]] = tempMx
storageMode(eset) = "lockedEnvironment"
return(eset)
}
###
# Store/access model formulas for later retrieval.
# The list returned by this function can be passed as a parameter to simem(),
# making it easier to run models with alternate structures.
###
getModelFormulas = function(type="") {
# Gene-level models
# For end-point analyses
# f1 = as.formula("expr ~ 1 + COVARIATE + (1|reagentId/cellLineId)")
# CROSSED random effects
# f1 = as.formula("expr ~ 1 + COVARIATE + (1|reagentId) + (1|cellLineId)")
# Reagent nested in cell line
# f1 = as.formula("expr ~ 1 + COVARIATE + (1|cellLineId/reagentId)")
# For short time-course analyses
# f1 = as.formula("expr ~ 1 + timeNum + timeNum:COVARIATE + (timeNum|reagentId/cellLineId)")
# CROSSED random effects
# f1 = as.formula("expr ~ 1 + timeNum + timeNum:COVARIATE + (timeNum|reagentId) + (timeNum|cellLineId)")
# Reagent nested in cell line
# f1 = as.formula("expr ~ 1 + timeNum + timeNum:COVARIATE + (timeNum|cellLineId/reagentId)")
modelMap = list(
panelGene = as.formula("expr ~ 1 + timeNum + timeNum:COVARIATE + (timeNum|reagentId/cellLineId)"),
panelRg = as.formula("expr ~ 1 + timeNum + timeNum:COVARIATE + (timeNum|cellLineId)"),
isogenicGene = as.formula("expr ~ 1 + timeNum + timeNum:COVARIATE + (timeNum|reagentId) + (0+timeNum|reagentId/replicateId)"),
isogenicRg = as.formula("expr ~ 1 + timeNum + timeNum:COVARIATE + (0+timeNum|replicateId)")
)
if(type == "endpoint") {
modelMap$panelGene = as.formula("expr ~ 1 + COVARIATE + (1|reagentId/cellLineId)")
modelMap$panelRg = as.formula("expr ~ 1 + COVARIATE + (1|cellLineId)")
}
return(modelMap)
}
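##
## Sketch of customizing the formula list before an analysis. The crossed random-effects
## alternative is taken from the commented-out variants above; the simem() call signature
## is assumed, not defined in this file. Guarded so it does not run on source.
##
if(FALSE) {
formulas = getModelFormulas()
formulas$panelGene = as.formula("expr ~ 1 + timeNum + timeNum:COVARIATE + (timeNum|reagentId) + (timeNum|cellLineId)")
# results = simem(..., formulas=formulas) # hypothetical argument name
}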