##################################################################
# Requirements for generic data extraction function:
# - extract arbitrary subset of screens, identified by:
# cellLineId, condition, any combination of filtering criteria organized as a list (variable => vars)
#
# - optionally: add data column(s) to meta-data extracted from eSet pData()
# these will be merged with the metadata using unique "sampleId"
# filters can apply to these as well
# COVARIATE column may or may not be added using this additional column
#
# QUESTION: how to reconcile entering an "COVARIATE" matrix such as per-gene CN vs.
# adding multi-covariate matrix?
#
# IMPORTANT: check that
#
# - optionally: specify list of reagentIds with weights [0;1].
# Reagents with weight 0 are to be excluded.
# Total number of non-zero-weight reagents stored.
# Data weights renormalized to sum of non-zero weights, after T_i-1, variance weighting
#
# - optionally: specify a data frame of per-gene or per-reagent values, that will be slotted
# as the COVARIATE column
#
#
# INPUT ARGUMENTS:
# eset - eset containing array or sequencing-based quantification of screens
# idType - "gene" or "reagent"
# id - a variable with an id of the appropriate type
# covariate - variable column, contained in either the eset phenoData matrix or the
# "annotations" variable, to include as additional covariate in the
# model.
# reagentWeights - a data frame, which MUST have as 1st column the unique reagent
# identifier, with header the value of variableMap$reagentId
# (by default, "trcn_id"). Additionally, at least one column
# of the input data frame must be labeled "weight".
# At its simplest, labeling each reagent with 0 or 1 indicates
# which reagents to exclude from the analysis by some previous
# filtering or QC analysis. Data points with a weight of 0
# are ignored by the mixed-effect model. By default, all data
# points are assumed to have equal weight. This can be changed,
# for example, by using inverse-variance weights, to incorporate
# additional information.
# annotationsPerCellLine - additional columns that associate values with the different
# Cell Lines in the experiment.
# IMPORTANT: this data frame MUST have as 1st column
# the value of variableMap$sampleId (by default, "sample_id")
# annotationsPerId - additional matrix, indexed by samples for columns, and genes/reagents
# on rows, that provides gene/reagent specific values per sample.
# for example, these can be copy number status, or expression value
# IMPORTANT: this data frame MUST start with either:
# - the value of variableMap$geneId (by default, "gene_id")
# if the value of idType parameter is "gene"
# - the value of variableMap$reagentId (by default, "trcn_id")
# if the value of idType parameter is "reagent"
# variableMap - identifies the column names in the eSet phenoData and annotations
# table that correspond to the data format required for the Mixed-Effect,
# and other, R modeling packages.
##################################################################
extractData = function(eset,
idType,
id,
covariate=NULL,
annotationsPerCellLine=NULL,
annotationsPerId=NULL,
covariateFactorOrder=NULL,
reagentWeights=NULL,
cellLineWeights=NULL,
inverseVarianceWeights=TRUE,
signalProbWeights=FALSE,
variableMap = getDefaultVariableMap()
) {
# Load eset data
shrna = exprs(eset)
pheno = pData(eset)
fdat = fData(eset)
if(inverseVarianceWeights) {
varWeights = assayData(eset)[["varWeights"]]
}
if(signalProbWeights) {
signalWeights = assayData(eset)[["signalProbWeights"]]
}
##################################################################
# BEGIN - Input variable sanity check
##################################################################
# TEST that key columns are present in eset phenoData
phenoColsToKeep = c(variableMap$sampleId,
variableMap$cellLineId,
variableMap$replicateId,
variableMap$timeGroup,
variableMap$timeNum)
if(length(intersect(phenoColsToKeep, colnames(pheno))) != length(phenoColsToKeep)) {
expected = paste(phenoColsToKeep, sep="", collapse=", ")
observed = paste(intersect(phenoColsToKeep, colnames(pheno)), sep="", collapse=", ")
msg = paste("ERROR in extractData():\n\n",
"eset phenoData data frame must have, at a minimum, the following columns: ", expected,
"However, only the following were observed: ", observed,
"\n\nRequired column names can be customized using the 'variableMap' parameter",
"\n\n\n\n", sep="")
stop(msg)
}
if(!(idType %in% c("gene", "reagent"))) {
msg = paste("ERROR in extractData():\n\n",
"Incorrect 'idType' value (either 'gene' or 'reagent' allowed): ", idType,
"\n\n\n\n", sep="")
stop(msg)
}
if(!is.data.frame(variableMap) || nrow(variableMap) != 1) {
msg = paste("ERROR in extractData():\n\n",
"Incorrect 'variableMap' input (data frame with single row required)",
"\n\n\n\n", sep="")
stop(msg)
}
# TEST sampleId labeling
# check that annotations table, if specified, has sampleId as first column
if(is.data.frame(reagentWeights) &&
(colnames(reagentWeights)[1] != variableMap$reagentId ||
length(grep("weight", colnames(reagentWeights))) == 0)) {
msg = paste("ERROR in extractData():\n\n",
"Reagent weights table must have as first column: ",
variableMap$reagentId,
"\n\nThe data frame must also contain a 'weight' column.",
"\n\nReagent identifier can be customized using the 'variableMap' parameter", "\n\n\n\n", sep="")
stop(msg)
}
# TEST sampleId labeling
# check that annotations table, if specified, has sampleId as first column
if(is.data.frame(annotationsPerCellLine) && colnames(annotationsPerCellLine)[1] != variableMap$cellLineId) {
msg = paste("ERROR in extractData():\n\n",
"Additional 'annotationsPerCellLine' table must have as first column ", variableMap$cellLineId,
"\n\nRequired column names can be customized using the 'variableMap' parameter",
"\n\n\n\n", sep="")
stop(msg)
}
# If the additional annotations per cell line table has non-id columns that overlap
# with the pheno column, warn of this and drop the additional columns from the pheno matrix
# TEST sampleId labeling
# check that annotations table, if specified, has sampleId as first column
if(is.data.frame(annotationsPerCellLine)) {
overlap = setdiff(
intersect(colnames(pheno), colnames(annotationsPerCellLine)),
variableMap$cellLineId)
# Warn of duplicate column names, and drop them from the pheno matrix.
if(length(overlap) > 0) {
colsToRemove = paste(overlap, sep="", collapse="|")
superceded = paste(overlap, sep="", collapse=", ")
pheno = pheno[,-grep(colsToRemove, colnames(pheno))]
msg = paste0("WARNING in extractData():\n\n",
"Duplicate columns other than cell line identifier shared between eset phenoData and 'annotationsPerCellLine'.\n",
"The following columns of phenoData were superceded by 'annotationsPerCellLine' input to avoid data merging issues:\n",
superceded,
"\n\n\n\n")
warning(msg)
}
}
# If we're specifying a per gene/reagent set of annotations (eg: Copy Number status)
# ensure that, if we're looking at
# gene-level data, the first column of annotationsById is "gene_id"
# reagent-level data, the first column of annotationsById is "reagent_id"
####if(is.data.frame(annotationsPerId) &&
#### ( ((idType == "gene") && (colnames(annotationsPerId)[1] != variableMap$geneId)) ||
#### ((idType == "reagent") && (colnames(annotationsPerId)[1] != variableMap$reagentId)) )) {
#### msg = paste("ERROR in extractData():\n\n",
#### "Additional 'annotationsPerId' table must have as first column either ",
#### variableMap$geneId, " or ", variableMap$reagentId,
#### "\n\nRequired column names can be customized using the 'variableMap' parameter", "\n\n\n\n", sep="")
#### stop(msg)
####}
if(is.data.frame(annotationsPerId) && (colnames(annotationsPerId)[1] != variableMap$geneId)) {
msg = paste("ERROR in extractData():\n\n",
"Additional 'annotationsPerId' table must have as first column ", variableMap$geneId,
"\n\nRequired column names can be customized using the 'variableMap' parameter", "\n\n\n\n", sep="")
stop(msg)
}
# Ensure that the id we're extracting data for has a unique entry in annotationsPerId
if(is.data.frame(annotationsPerId)) {
idVar = variableMap$geneId
if(idType == "reagent") {
gene = fdat[fdat[,variableMap$reagentId] == id, variableMap$geneId]
} else {
gene = id
}
checkIdMatch = which(annotationsPerId[,variableMap$geneId] == gene)
matchCount = length(checkIdMatch)
if(matchCount != 1) {
msg = paste("ERROR in extractData():\n\n",
"While checking 'annotationsPerId' data frame for unique matching gene or reagent id.",
"\n\nid specified: ", id,
"\n\nNumber of matching rows found for id in 'annotationsPerId': ", matchCount,
"\n\nPlease ensure that the first column of 'annotationsPerId' contains an entry for the specified id.",
"\n\n\n\n", sep="")
stop(msg)
}
}
# TEST sampleId labeling for the phenoData of the eset
if(length(intersect(colnames(pheno), variableMap$sampleId)) != 1) {
observed = paste(colnames(pheno), sep="", collapse=", ")
msg = paste("ERROR in extractData():\n\n",
"At least one of the eset phenoData columns must be: ", variableMap$sampleId,
"\n\nWe observe the following columns in the phenoData: ", observed,
"\n\nRequired column names can be customized using the 'variableMap' parameter", "\n\n\n\n", sep="")
stop(msg)
}
# TEST gene/reagent ids
# test that gene and reagent ids are properly set in the fData of the eset
idVars = c(variableMap$reagentId, variableMap$geneId, variableMap$symbol)
idMatch = intersect(colnames(fdat), idVars)
if(length(idMatch) != length(idVars)) {
expected = paste(idVars, sep="", collapse=", ")
observed = paste(colnames(fdat), sep="", collapse=", ")
msg = paste("ERROR in extractData():\n\n",
"Cannot find reagent id, gene id or gene symbol in eset featureData.",
"\n\nWe expect to find the following columns: ", expected,
"\n\nWe observe the following columns in the featureData: ", observed,
"\n\nId variable columns can be customized using the 'variableMap' parameter", "\n\n\n\n", sep="")
stop(msg)
}
#############################################
# MERGE ADDITIONAL ANNOTATIONS into the pheno matrix if theye're provided
#############################################
if(is.data.frame(annotationsPerCellLine)) {
# add additional columns to the pheno on the fly
pheno = merge(pheno, annotationsPerCellLine)
# Merge may not preserve row names, and these are used later on
rownames(pheno) = pheno[,variableMap$sampleId]
}
# If a per-id annotation table is supplied, pull out the annotation matching
# the specified id, and put the "per-id" values into the "COVARIATE" column
if(is.data.frame(annotationsPerId)) {
idVar = variableMap$geneId
if(idType == "reagent") {
gene = fdat[fdat[,variableMap$reagentId] == id, variableMap$geneId]
} else {
gene = id
}
values = annotationsPerId[annotationsPerId[,idVar] == gene,-1,drop=F]
values = t(values)
values = cbind.data.frame(rownames(values), values, stringsAsFactors=F)
colnames(values)[1:2] = c(variableMap$cellLineId, "COVARIATE")
# add additional columns to the pheno on the fly
pheno = merge(pheno, values)
rownames(pheno) = pheno[,variableMap$sampleId]
}
# Ensure covariate variable exists in either, provided it hasn't been extracted
# from annotationsPerId just above
if(is.character(covariate) && !is.data.frame(annotationsPerId) && length(intersect(colnames(pheno), covariate)) != length(covariate)) {
msg = paste("ERROR in extractData():\n\n",
"Cannot find this 'covariate' column in the eset phenoData (or annotationsPerCellLine, if specified): ", covariate,
"\n\n\n\n", sep="")
stop(msg)
}
# Test whether filter variables match those found in the data (pheno+annotations)
# if(is.list(filters)) {
# filterVars = names(filters)
# matchedVars = intersect(filterVars, colnames(pheno))
# mismatchedVars = setdiff(filterVars, colnames(pheno))
#
# if(length(mismatchedVars) > 0) {
# expected = paste(matchedVars, sep="", collapse=", ")
# observed = paste(mismatchedVars, sep="", collapse=", ")
# msg = paste("ERROR in extractData():\n\n",
# "Some specified filter variables not found in phenoData or additional annotations.",
# "\n\nFilter variables matching existing phenoData variables: ", expected,
# "\n\nFilter variables missing from phenoData: ", observed, "\n\n\n\n", sep="")
# stop(msg)
# }
# }
if(!is.null(covariate) && is.data.frame(annotationsPerId)) {
msg = paste("WARNING in extractData(): Both 'covariate' and 'annotationsPerId' parameters were specified.",
"\n\nAs a result, 'covariate' parameter will be ignored and the id-specific values pulled from annotationsPerId will be used instead.",
"\n\n\n\n", sep="")
warning(msg)
}
##################################################################
# END - Input variable sanity check
##################################################################
##################################################################
# BEGIN - Subset samples to analyze using filters, keep only relevant columns
##################################################################
# # include all samples by default
# sampleFilter = rep(TRUE, nrow(pheno))
# # APPLY FILTER - get the samples satisfying all filtering criteria
# if(is.list(filters)) {
# for(filterVar in names(filters)) {
# # AND each new filter condition with previous ones
# sampleFilter = sampleFilter & (!is.na(pheno[,filterVar]) & (pheno[,filterVar] %in% filters[[filterVar]]))
# }
# }
# # Restrict to a subset of samples, and to relevant columns
# # Keep basic information, plus any variables filtered on, as well as the
# # COVARIATE variable (the covariate, such as ERBB2 amp)
# if(is.list(filters)) {
# phenoColsToKeep = union(phenoColsToKeep, names(filters))
# }
# We took the row from the annotationsPerId table corresponding to our entry
if(is.data.frame(annotationsPerCellLine)) {
phenoColsToKeep = union(phenoColsToKeep, colnames(annotationsPerCellLine))
}
# We took the row from the annotationsPerId table corresponding to our entry
if(is.data.frame(annotationsPerId)) {
phenoColsToKeep = union(phenoColsToKeep, "COVARIATE")
}
# We preferentially extract the covariate column from the perId annotations table
# if specified, otherwise use the "covariate" variable if specified
# if(is.character(covariate) && !is.data.frame(annotationsPerId)) {
# phenoColsToKeep = union(phenoColsToKeep, covariate)
# }
# Keep only the columns required for the analysis
pheno = pheno[,phenoColsToKeep]
# keep only a subset of samples, and columns relevant to the analysis
# pheno = pheno[sampleFilter, phenoColsToKeep]
##################################################################
# END - Subset samples to analyze using filters, keep only relevant columns
##################################################################
##################################################################
# BEGIN - Subset shRNA intensity/count data to gene/reagent of interest
##################################################################
# extract rows corresponding to gene or reagent of interest
if(idType == "reagent") {
filtered = which(fdat[,variableMap$reagentId] == id)
} else if(idType == "gene") {
filtered = which(fdat[,variableMap$geneId] == id)
} else {
msg = paste("ERROR in extractData():\n\n",
idType, " id value specified does not match any in the eset featureData: ", id,
"\n\n\n\n", sep="")
stop(msg)
}
gene_id = unique(fdat[filtered,variableMap$geneId])
symbol = unique(fdat[filtered,variableMap$symbol])
# extract reagent/gene ids for subset of screens matching our filters
# before merging data with annotations, ensure that properly labeled sample id
# column exists
tempWide = shrna[filtered,rownames(pheno),drop=F]
temp = t(tempWide)
temp = cbind.data.frame(rownames(temp), temp, stringsAsFactors=F)
colnames(temp)[1] = variableMap$sampleId
##################################################################
# END - Subset shRNA intensity/count data to extract gene/reagent of interest
##################################################################
##################################################################
# BEGIN - reshape data frame to associate annotations with each intensity measurement
##################################################################
# merge on sampleId, assuming column/sample names in intensity matrix don't match
# column names in phenoData (sample names can reasonably be assumed distinct from
# "doublings", "cell_line" and other such phenoData column names)
dat = merge(pheno, temp)
melted = melt(dat, id.vars=phenoColsToKeep, variable_name=variableMap$reagentId)
# Make the variable name more generic, in case we want to use TRPS
melted$geneId = rep(gene_id, nrow(melted))
melted$symbol = rep(symbol, nrow(melted))
# Standardize column names so that model formulae can be specified using these
# consistent column names
cols = standardizeColumnNames(colnames(melted), variableMap=variableMap)
# If we require an covariate column (covariate input param is set)
# and the COVARIATE column isn't already specified (eg: from annotationsPerId)
if(is.character(covariate) && length(covariate) == 1 && length(grep("COVARIATE", cols)) == 0) {
cols = gsub(paste0("^",covariate,"$"), "COVARIATE", cols)
# Alternatively, if 2+ covariates were specified, and no annotationsPerId value was specified,
} else if(is.character(covariate) && length(covariate) > 1 && length(grep("COVARIATE", cols)) == 0) {
cols = gsub(paste0("^",covariate[1],"$"), "COVARIATE", cols)
cols = gsub(paste0("^",covariate[2],"$"), "COVARIATE2", cols)
# Alternatively, if annotationsPerId value was specified, the per-gene values will already
# have been placed into the COVARIATE column, if an additional covariate has been specified
# rename that column as COVARIATE2. If length(covariate) argument > 1, ignore further values
} else if(is.character(covariate) && length(grep("COVARIATE", cols)) == 1) {
cols = gsub(paste0("^",covariate[1],"$"), "COVARIATE2", cols)
} else if(is.character(covariate) && length(grep("COVARIATE", cols)) == 1) {
cols = gsub(paste0("^",covariate[1],"$"), "COVARIATE2", cols)
}
# else {
# # WARNING? Or just remove this clause?
# warning("\n\nUnhandled condition in if-else block that sets COVARIATE columns...")
# }
# the melt() function puts the intensity into a column named "value"
cols = gsub("^value$", "expr", cols)
colnames(melted) = cols
melted$cellLineIdUnique = paste(melted$reagentId, melted$cellLineId, sep="-")
melted$replicateIdUnique = paste(melted$reagentId, melted$cellLineId, melted$replicateId, sep="-")
##################################################################
# END - reshape data frame to associate annotations with each intensity measurement
##################################################################
meltedVarWts = NA
meltedNoiseWts = NA
if(inverseVarianceWeights) {
varWeights = assayData(eset)[["varWeights"]]
# extract reagent/gene ids for subset of screens matching our filters
# before merging data weights with annotations, ensure that properly labeled
# sample id column exists
varWts = varWeights[filtered,rownames(pheno),drop=F]
varWts = t(varWts)
varWts = cbind.data.frame(rownames(varWts), varWts, stringsAsFactors=F)
colnames(varWts)[1] = variableMap$sampleId
datVarWts = merge(pheno, varWts)
meltedVarWts = melt(datVarWts, id.vars=phenoColsToKeep, variable_name=variableMap$reagentId)
meltedVarWts = standardizeColumnNames(meltedVarWts, variableMap=variableMap)
colnames(meltedVarWts) = gsub("^value$", "variance_weights", colnames(meltedVarWts))
}
if(signalProbWeights) {
signalWeights = assayData(eset)[["signalProbWeights"]]
# extract reagent/gene ids for subset of screens matching our filters
# before merging data weights with annotations, ensure that properly labeled
# sample id column exists
signalWts = signalWeights[filtered,rownames(pheno),drop=F]
signalWts = t(signalWts)
signalWts = cbind.data.frame(rownames(signalWts), signalWts, stringsAsFactors=F)
colnames(signalWts)[1] = variableMap$sampleId
datNoiseWts = merge(pheno, signalWts)
meltedNoiseWts = melt(datNoiseWts, id.vars=phenoColsToKeep, variable_name=variableMap$reagentId)
meltedNoiseWts = standardizeColumnNames(meltedNoiseWts, variableMap=variableMap)
colnames(meltedNoiseWts) = gsub("^value$", "signal_weights", colnames(meltedNoiseWts))
}
# CALCULATE WEIGHTS, using a combination of reagent-specific, cell line-specific
# inverse-variance weights and microarray noise (signal probability) weights
melted = assignWeightsToObservations(dat=melted,
reagentWeights=reagentWeights,
cellLineWeights=cellLineWeights,
inverseVarianceWeights=inverseVarianceWeights,
signalProbWeights=signalProbWeights,
varWeightsDF=meltedVarWts,
signalWeightsDF=meltedNoiseWts,
variableMap=variableMap)
colSort = sort(colnames(melted))
melted = melted[,colSort]
# If the covariate is categorical, cast it as a Factor to facilitate modeling
if(!is.null(melted$COVARIATE) && is.character(melted$COVARIATE)) {
observedCovariateValues = unique(melted$COVARIATE)
#if necessary, reorder the factor levels in COVARIATE, which are by default alphabetical
# However, both vectors must be characters, and all observed COVARIATE values must be
# accounted for in the covariateFactorOrder
if(is.character(covariateFactorOrder) && is.character(melted$COVARIATE)
&& length(intersect(covariateFactorOrder, melted$COVARIATE)) == length(observedCovariateValues)) {
subsetOrder = covariateFactorOrder[covariateFactorOrder%in% observedCovariateValues]
# Reset factor levels, for example, if the first level should be "NON-AMP"
# and we want to compare the covariate of "AMP"
melted$COVARIATE = factor(melted$COVARIATE, levels=subsetOrder)
} else {
melted$COVARIATE = factor(melted$COVARIATE)
}
}
# If we have 2 covariates, for now the second covariate will be handled alphabetically
if(!is.null(melted$COVARIATE2) && is.character(melted$COVARIATE2)) {
melted$COVARIATE2 = factor(melted$COVARIATE2)
}
return(melted)
}
# Map column names in data to standard ones to be used in model fitting process
standardizeColumnNames = function(dat, variableMap=getDefaultVariableMap()) {
if(is.vector(dat)) {
cols = dat
} else if(is.matrix(dat) | is.data.frame(dat)) {
cols = colnames(dat)
}
cols = gsub(paste0("^",variableMap$sampleId,"$"), "sampleId", cols)
cols = gsub(paste0("^",variableMap$cellLineId,"$"), "cellLineId", cols)
cols = gsub(paste0("^",variableMap$timeGroup,"$"), "timeGroup", cols)
cols = gsub(paste0("^",variableMap$timeNum,"$"), "timeNum", cols)
cols = gsub(paste0("^",variableMap$replicateId,"$"), "replicateId", cols)
cols = gsub(paste0("^",variableMap$reagentId,"$"), "reagentId", cols)
if(is.vector(dat)) {
dat = cols
} else if(is.matrix(dat) | is.data.frame(dat)) {
colnames(dat) = cols
}
return(dat)
}
# The reagent/cell line weights are extracted from input data frames if required
# inverse variance and signal weights have already been extracted and added as
# columns to 'dat', if the options were specified
assignWeightsToObservations = function(dat,
reagentWeights,
cellLineWeights,
inverseVarianceWeights,
varWeightsDF,
signalProbWeights,
signalWeightsDF,
variableMap) {
##################################################################
# BEGIN - Calculate final weight of each data point in the dataset
# Each type of weight should be renormalized so the total weight
# of all observations is identical to the number of observations.
# The alternative is to assig absolute weights, which could lead
# to implausible results.
##################################################################
weightRG = is.data.frame(reagentWeights)
weightCL = is.data.frame(cellLineWeights)
weightVar = is.data.frame(varWeightsDF)
weightSignal = is.data.frame(signalWeightsDF)
if((weightRG | weightCL | weightVar | weightSignal) & nrow(dat) > 0) {
# Default all data points to equal value
dat$W=1
#If specified, attach per-reagent weights to the data
if(weightRG) {
# By this point, the reagent column is identified by the string "reagentId"
# regardless of the original column label
rgs = unique(dat$reagentId)
colnames(reagentWeights)[1] = "reagentId"
rgWeights = reagentWeights[reagentWeights$reagentId %in% rgs, c("reagentId", "weight"), drop=F]
colnames(rgWeights)[2] = "reagent_weights"
#append reagent-specific weights
dat = merge(dat, rgWeights, by="reagentId", all.x=T)
#remove measurements associated with reagents of weight 0.
toKeepRG = !(is.na(dat$reagent_weights)) & dat$reagent_weights > 0
dat = dat[toKeepRG,]
}
#If specified, attach per-reagent weights to the data
if(weightCL) {
# By this point, the reagent column is identified by the string "reagentId"
# regardless of the original column label
cellLines = unique(dat$cellLineId)
colnames(cellLineWeights)[1] = "cellLineId"
clWeights = cellLineWeights[cellLineWeights$cellLineId %in% cellLines, c("cellLineId", "weight"), drop=F]
colnames(clWeights)[2] = "cell_line_weights"
#append reagent-specific weights
dat = merge(dat, clWeights, by="cellLineId", all.x=T)
#remove measurements associated with cell lines of weight 0.
toKeepCL = !(is.na(dat$cell_line_weights)) & dat$cell_line_weights > 0
dat = dat[toKeepCL,]
}
if(weightVar) {
dat = merge(dat, varWeightsDF)
dat = dat[!is.na(dat$variance_weights) & dat$variance_weights > 0,]
}
if(weightSignal) {
dat = merge(dat, signalWeightsDF)
dat = dat[!is.na(dat$signal_weights) & dat$signal_weights > 0,]
}
# Total number of non-zero observations
totalObservations = nrow(dat)
if(weightRG) {
totalRG = sum(dat$reagent_weights)
rescalingRG = totalObservations/totalRG
dat$reagent_weights = dat$reagent_weights * rescalingRG
dat$W = dat$W * dat$reagent_weights
}
if(weightCL) {
totalCL = sum(dat$cell_line_weights)
rescalingCL = totalObservations/totalCL
dat$cell_line_weights = dat$cell_line_weights * rescalingCL
dat$W = dat$W * dat$cell_line_weights
}
if(weightVar) {
totalVar = sum(dat$variance_weights)
rescalingVar = totalObservations/totalVar
dat$variance_weights = dat$variance_weights * rescalingVar
dat$W = dat$W * dat$variance_weights
}
if(weightSignal) {
totalSignal = sum(dat$signal_weights)
rescalingSignal = totalObservations/totalSignal
dat$signal_weights = dat$signal_weights * rescalingSignal
dat$W = dat$W * dat$signal_weights
}
# Rescale total weight sum to be equal to original number of data points
# Otherwise, one can arbitrarily increase significance of regression
# parameters simply by multiplying all weightings by 2, or 100, for example.
# if total = sum(weights)
# then multiplying all weights by desired/total ensures
# that the data point weights sum to the desired value,
# in this case the original number of data points.
total = sum(dat$W)
rescaling = totalObservations/total
dat$W = dat$W * rescaling
# If certain cell lines/reagents are entirely excluded from the data
# reset the factor information so that factor levels with no data are excluded
dat = resetFactorLevels(dat)
}
return(dat)
}
###
# If we subset the model data's data frame, some variables will be factors where some factor levels
# will have no associated measurements. Having factor levels with no associated measurements messes up
# the model fitting, since the model will try to fit data for -each- factor level. The reset
# resolves this issue.
###
resetFactorLevels = function(dat) {
# Make sure no non-represented factor values exist
dat$reagentId = factor(as.character(dat$reagentId))
dat$cellLineId = factor(as.character(dat$cellLineId))
dat$cellLineIdUnique = factor(as.character(dat$cellLineIdUnique))
dat$replicateIdUnique = factor(as.character(dat$replicateIdUnique))
return(dat)
}
getDefaultVariableMap = function() {
# screenId - a unique identifier for each screen. Cell lines screened under different
# perturbations must have distinct screenIds (eg: cell line + perturbation)
# cellLineId - cell line name. The same cell line can be perturbed in different
# ways (Eg: drug screen), leading to multiple different screenId
# associated with the same cell line
variableMap = data.frame(geneId = "gene_id",
symbol="symbol",
reagentId="trcn_id",
cellLineId="cell_line",
sampleId="sample_id",
replicateId="replicate",
timeGroup="timepointStd",
timeNum="timepointIndex",
condition="condition",
screenId="screen_id",
replicateGroupId="replicate_group",
stringsAsFactors=F)
return(variableMap)
}
getMeanSd = function(dat, meanSdTable, variableMap=getDefaultVariableMap()) {
# check that 2 columns are passed - means, and timepoints associated with means
colsDat = c("expr", variableMap$timeGroup)
cols = c("mu", "sigma", variableMap$timeGroup)
# colsDat = c("expr", "timepoint")
# cols = c("mu", "sigma", "timepoint")
if(length(intersect(colnames(dat), colsDat)) != 2) {
expected = paste(colsDat, sep="", collapse=", ")
observed = paste(colnames(dat), sep="", collapse=", ")
msg = paste("ERROR in getMeanSd():\n\n",
"The following 2 columns are expected in the 'dat' parameter: ", expected,
"\n\nHowever, we observe the following columns in 'dat': ", observed,
"\n\nThe 'expr' column must be named as such and is required.",
"\n\nThe ", variableMap$timeGroup ," column name can be changed using the 'variableMap' parameter.",
"\n\n\n\n", sep="")
stop(msg)
}
if(length(intersect(colnames(meanSdTable), cols)) != 3) {
expected = paste(cols, sep="", collapse=", ")
observed = paste(colnames(meanSdTable), sep="", collapse=", ")
msg = paste("ERROR in getMeanSd():\n\n",
"The following 3 columns are expected in the meanSdTable: ", expected,
"\n\nHowever, we observe the following columns in the meanSdTable: ", observed,
"\n\nThe 'mu' and 'sigma' columns must be named as such and are required.",
"\n\nThe ", variableMap$timeGroup ," column name can be changed using the 'variableMap' parameter.",
"\n\n\n\n", sep="")
stop(msg)
}
# Get the timepoint values closest to those available in the specified table
datTime = sort(unique(dat[,variableMap$timeGroup]))
meanSdTime = sort(unique(meanSdTable[,variableMap$timeGroup]))
timeMap = sapply(datTime, function(v, times) {times[which.min(abs(times-v))]}, times=meanSdTime)
timeMap = cbind.data.frame(datTime=datTime, meanSdTime=meanSdTime)
dat2 = merge(dat, timeMap, all.x=T, by.x=variableMap$timeGroup, by.y=datTime, sort=F)
print(table(dat$expr == dat2$expr, useNA="always"))
mapped = list()
# perform mean-SD mapping for each timepoint separately
# eg: if mean-SD function provided for times 0,1,2, perform 3 mapping iterations
for(onetime in meanSdTime) {
# get values to map, and mapping table, for the given time value
onetimeDat = dat2[dat2$datTime == onetime,]
onetimeTab = meanSdTime[variableMap$timeGroup == onetime,]
# get table lookup indices for the mean expression values by mapping each
# to the closest 'mean' value entry in the mean-SD mapping table
meanIndices = sapply(onetimeDat$expr,
function(v, increments) {which.min(abs(increments-x))},
increments=unlist(onetimeTab[,variableMap$timeGroup]))
# for the mapped indices, return the matching Std. Deviation
# This sd value is presumably calculated as a smoothed version of the
# individual probe replicate mean-SD values
onetimeDat$sd = onetimeTab[meanIndices, "sigma"]
# Append the dat table, with Std. Dev. values added, to the list
mapped = c(mapped, list(onetimeDat))
}
final = do.call(rbind, mapped)
return(final)
}
# Load empirical mean-variance tables and convert them to functions using spline interpolation
loadMeanVar = function(meanVarFile, minVar = 0.1) {
meanVar = read.delim(meanVarFile, header=T, as.is=T, check.names=F)
# Limit how low the median variance goes
minSd = minVar^(1/2)
meanVar$variance = ifelse(meanVar$variance < minVar, minVar, meanVar$variance)
meanVar$sigma = ifelse(meanVar$sigma < minSd, minSd, meanVar$sigma)
timeGroups = unique(meanVar$timeGroup)
fns = list()
meanVarFns = list()
meanSdFns = list()
for(timept in timeGroups) {
dat = meanVar[meanVar$timeGroup == timept, c("mu", "sigma", "variance")]
meanVarFns[[as.character(timept)]] = splinefun(x=dat$mu, y=dat$variance, method="natural")
meanSdFns[[as.character(timept)]] = splinefun(x=dat$mu, y=dat$sigma, method="natural")
}
fns[["sd"]] = meanSdFns
fns[["var"]] = meanVarFns
return(fns)
}
formatOutput = function(outputDF) {
# rename dropout_rate_estimate as baseline_rate
# rename dropout_rate_diff_X_estimate as difference_X
# rename relative_dropout_rate_X as rdr_
# rename_dropout_rate_diff_X_[pvalue|fdr] as [pvalue|fdr]_X
colnames(outputDF) = gsub("dropout_rate_estimate", "baseline_trend", colnames(outputDF))
colnames(outputDF) = gsub("dropout_rate_(se|df|t|pvalue|fdr)$", "baseline_\\1", colnames(outputDF))
colnames(outputDF) = gsub("dropout_rate_diff_(.*)_estimate", "difference_\\1", colnames(outputDF))
colnames(outputDF) = gsub("dropout_rate_diff_(.*)_(se|df|t|pvalue|fdr)$", "\\2_\\1", colnames(outputDF))
# In the case of end-point columns, we have columns of the format X_[estimate|pvalue|fdr]
colnames(outputDF) = gsub("intercept_estimate", "intercept", colnames(outputDF))
colnames(outputDF) = gsub("([a-zA-Z0-9]*)_estimate", "difference_\\1", colnames(outputDF))
colnames(outputDF) = gsub("([a-zA-Z0-9]*)_(se|df|t|pvalue|fdr)$", "\\2_\\1", colnames(outputDF))
# If column name ends with _, remove it. (This occurs if the covariate is continuous, e.g., expression)
colnames(outputDF) = gsub("_$", "", colnames(outputDF))
newOrder = c(
setdiff(colnames(outputDF), c("warnings", "errors")),
c("warnings", "errors")
)
final = outputDF[,newOrder]
return(final)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.