### Class definition
setClass("csDEXdataSet",
slots=c(
dataType="character", # Type of the dataset (count/PSI).
exprData="matrix", # alias for either counts or PSI.
rowData="data.frame", # bin names, precision, etc.
colData="data.frame", # design files on conditions
cpmData="matrix" # computed cpm per group (gene)
))
### Getters
setGeneric("exprData", function(obj, normalized=FALSE) standardGeneric("exprData"))
setMethod("exprData", "csDEXdataSet",
function(obj, normalized=FALSE){
if(normalized){
if(is.null(csDEX::colData(obj)$size.factor)){
message("Size factors not provided yet. Run estimateSizeFactors first.\n")
NULL
} else {
sweep(obj@exprData, 2, colData(obj)$size.factor, "/")
}
} else {
obj@exprData
}})
setGeneric("rowData", function(obj) standardGeneric("rowData"))
setMethod("rowData", "csDEXdataSet", function(obj) obj@rowData)
setGeneric("dataType", function(obj) standardGeneric("dataType"))
setMethod("dataType", "csDEXdataSet", function(obj) obj@dataType)
setGeneric("colData", function(obj) standardGeneric("colData"))
setMethod("colData", "csDEXdataSet", function(obj) obj@colData)
setGeneric("cpmData", function(obj) standardGeneric("cpmData"))
setMethod("cpmData", "csDEXdataSet", function(obj) obj@cpmData)
### Setters
setGeneric("exprData<-", function(obj, value) standardGeneric("exprData<-"))
setReplaceMethod("exprData", "csDEXdataSet",
function(obj, value) {obj@exprData <- value; validObject(obj); obj})
setGeneric("rowData<-", function(obj, value) standardGeneric("rowData<-"))
setReplaceMethod("rowData", "csDEXdataSet",
function(obj, value) {obj@rowData <- value; validObject(obj); obj})
setGeneric("colData<-", function(obj, value) standardGeneric("colData<-"))
setReplaceMethod("colData", "csDEXdataSet",
function(obj, value) {obj@colData <- value; validObject(obj); obj})
setGeneric("cpmData<-", function(obj, value) standardGeneric("cpmData<-"))
setReplaceMethod("cpmData", "csDEXdataSet",
function(obj, value) {obj@cpmData <- value; validObject(obj); obj})
### Data generating functions
# Return a csDEXdataSet with estimated size Factors
setGeneric("estimateSizeFactors",
function(obj) standardGeneric("estimateSizeFactors"))
setMethod("estimateSizeFactors", "csDEXdataSet",
function(obj) {
csDEX::colData(obj)$size.factor = 1.0 / c(edgeR::calcNormFactors(obj@exprData))
obj
})
# Return a csDEXdataSet with estimated precisions
setGeneric("estimatePrecisions",
function(obj) standardGeneric("estimatePrecisions"))
setMethod("estimatePrecisions", "csDEXdataSet",
function(obj) {
expr = csDEX::exprData(obj, normalized = TRUE)
csDEX::rowData(obj)$precision = 1.0 / edgeR::estimateDisp(expr)$tagwise.dispersion
obj
})
# Estimate *maximum* CPM in bins inside a gene
# Required for filtering untestable genes in particular conditions
# Implemented as a separate function to avoid unnecesary increase of memory
setGeneric("estimateGeneCPM",
function(obj) standardGeneric("estimateGeneCPM"))
setMethod("estimateGeneCPM", "csDEXdataSet",
function(obj) {
if(is.null(colData(obj)$lib.size)){
message("Design file must provide input.read.count (library size) per sample.\n")
NULL
} else {
expr = csDEX::exprData(obj)
lib.size = csDEX::colData(obj)$lib.size
cpms = edgeR::cpm(expr, lib.size=lib.size)
groups = unique(csDEX::rowData(obj)$groupID)
cpm.max = matrix(0, nrow=length(groups), ncol=ncol(expr))
colnames(cpm.max) = colnames(expr)
row.names(cpm.max) = groups
for (i in 1:length(groups))
cpm.max[i, ] = apply(cpms[csDEX::rowData(obj)$groupID == groups[i],], 2, max)
csDEX::cpmData(obj) = cpm.max
obj}
})
# Class should define way to handle replicates, etc.
# Enable passing a custom aggregation function, such as geom. mean, simple mean, sum, etc.
csDEXdataSet <- function(data.dir, design.file, type="count",
col.condition="Experiment.target",
col.replicate="File.accession",
col.testable="testable",
col.read.count="input.read.count",
col.additional=c(),
data.file.ext="txt",
aggregation=NULL,
min.bin.count=NULL,
min.gene.count=NULL,
zero.out=NULL){
### Input checks ###
if(!(type %in% c("count", "PSI")))
stop("type must be one of ('count', 'PSI')")
### Set recommended parameters
if(is.null(min.bin.count)) min.bin.count = 0
if(type == "PSI"){
if(is.null(aggregation)) aggregation = mean
if(is.null(zero.out)) zero.out = TRUE
if(is.null(min.gene.count)) min.gene.count = 1
} else if (type == "count"){
if(is.null(aggregation)) aggregation = sum
if(is.null(zero.out)) zero.out = FALSE
if(is.null(min.gene.count)) min.gene.count = 0
}
### Private functions ###
zeroOut <- function(repData, min.count=0){
# Zero-out non-expressed genes in individual replicates
genes = unlist(lapply(row.names(repData),
function(x) strsplit(x, ":")[[1]][1]))
for(g in unique(genes)){
inxs = genes == g
zeros = which(apply(repData[inxs, ], 2, sum) < min.count)
repData[inxs, zeros] = NA
}
return(repData)
}
# Parse design file and ensure columns are present
design = read.csv(design.file, sep="\t", header=TRUE)
stopifnot(col.condition %in% colnames(design))
stopifnot(col.replicate %in% colnames(design))
# Input check if additional columns have been requested
# Assert values of additional columns can be uniquely mapped to conditions
if(length(col.additional) > 0){
for(col.name in col.additional){
cond2add = aggregate(design[,col.name],
by=list(condition=design[,col.condition]),
FUN=function(x) length(unique(x)) == 1)
if(!all(cond2add$x))
stop(sprintf("Values of field %s are not unique for one or more conditions",
col.name))
}
}
# Parse condition but retain order of rowss
conditions = sort(unique(design[,col.condition]))
n.con = length(conditions)
# Parse expression data
message("Processing expression data")
exprData = NULL
lib.sizes = NULL
testable = rep(TRUE, n.con)
for (i in 1:length(conditions)){
cond = conditions[i]
message(sprintf("Condition %s", cond))
replicates = design[design[,col.condition] == cond, col.replicate]
# Merge data on read counts if available
# Aggregated in the same manner than counts
cond.lib.size = NULL
if (col.read.count %in% colnames(design)){
cond.lib.size = aggregation(design[design[,col.condition] == cond, col.read.count])
lib.sizes = c(lib.sizes, cond.lib.size)
}
# Information on testability if available.
if (col.testable %in% colnames(design)){
testable[i] = as.logical(sum(design[design[,col.condition] == cond, col.testable]))
}
# Merge data on replicates
repData = NULL
n.rep = length(replicates)
for (j in 1:length(replicates)){
rep = replicates[j]
rep.path = file.path(data.dir, paste(rep, data.file.ext, sep="."))
y = read.table(rep.path, header=FALSE, comment.char="_")
y$V1 = as.character(y$V1)
n.row = nrow(y)
message(sprintf(" Replicate %s, num. rows: %d", rep.path, n.row))
if(is.null(repData)){
repData = matrix(0, ncol=n.rep, nrow=n.row)
row.names(repData) = y$V1
}
repData[y$V1, j] = y$V2
}
# Zero out on individual bin level
repData[repData < min.bin.count] = 0
# Zero out on a gene level and merge replicates
if(zero.out || min.gene.count > 0)
repData = zeroOut(repData, min.gene.count)
repVec = suppressWarnings(apply(repData, 1, aggregation, na.rm=TRUE))
repVec[is.infinite(repVec)] = 0
repVec[is.na(repVec)] = 0
if(is.null(exprData)){
exprData = matrix(0, ncol=n.con, nrow=n.row)
row.names(exprData) = row.names(repData)
colnames(exprData) = conditions
}
exprData[,i] = repVec
}
# Order of passed data files must match order of rows in design file
rowData = data.frame(
featureID = row.names(exprData), # gene: exon
groupID = unlist(lapply(row.names(repData), # gene
function(x) strsplit(x, ":")[[1]][1])),
binID = unlist(lapply(row.names(repData), # exon
function(x) strsplit(x, ":")[[1]][2]))
)
rowData$featureID = as.character(rowData$featureID)
rowData$groupID = as.character(rowData$groupID)
rowData$binID = as.character(rowData$binID)
row.names(rowData) = rowData$featureID
colData = data.frame(condition=colnames(exprData))
row.names(colData) <- colData$condition
if(!is.null(lib.sizes)) colData$lib.size = lib.sizes
# Append additional columns
colData$testable = testable
for (col.name in col.additional){
for (cond in colData$condition){
val = unique(design[design[,col.condition] == cond, col.name])
colData[cond, col.name] = val
}
}
new("csDEXdataSet", exprData=exprData, rowData=rowData, colData=colData, dataType=type)
}
waldTest <- function(mm0, model0, alpha=0.05){
# Wald test for Beta model coefficients
# Reduce non-significant coefficients to zero for efficient computation
beta0 = model0$coefficients$mean
# Chi2 test for coefficients greater than zero
scores = diag(model0$vcov)[1:length(beta0)]
wald = rep(0, length(scores))
names(wald) = names(scores)
# Test statistic
for (i in 1:length(wald)){
w = aod::wald.test(b=beta0[i], scores[i], T=1)
wald[i] = w$result$chi2["P"]
}
# (Non-)significant coefficients
sig = names(beta0)[wald < alpha]
not.sig = names(beta0)[wald >= alpha]
# Merge prior coefficients for correct computation of likelihood
# New model matrix yields same likelihood
if(length(not.sig) > 1){
mu.prior = mm0[,not.sig] %*% beta0[not.sig]
colnames(mu.prior) = "mu.prior"
mm.prior = cbind(mu.prior, mm0[,sig])
coef.prior = c(1, beta0[sig])
} else {
mm.prior = mm0
coef.prior = beta0
}
return(list(sig=sig, not.sig=not.sig, mm.prior=mm.prior, coef.prior=coef.prior))
}
geneModel <- function(input, min.cpm=NULL, tmp.dir=NULL, dist="count",
alpha.wald=NULL, formula=y~featureID+condition){
# Writing to file
write.gene.file <- function(tmp.dir, results, gene){
if(!is.null(tmp.dir)){
fname = file.path(tmp.dir, sprintf("%s.csv", gene))
write.csv(results, fname, quote=TRUE, row.names=FALSE)}
}
# Extract input values
expr = input$expr
rowdata = input$rowdata
coldata = input$coldata
gene = rowdata$groupID[1]
# Prepare a results data frame
results = reshape::melt(expr)
colnames(results) = c("featureID", "condition", "y")
results = merge(results, rowdata, by="featureID")
results[,colnames(coldata)] = coldata[as.character(results$condition), colnames(coldata)]
results$featureID = droplevels(as.factor(results$featureID))
results$condition = droplevels(as.factor(results$condition))
results[,c("cpm", "pvalue", "fdr", "fdr.all", "padj", "residual", "loglik", "ddev", "time", "nrow", "ncol", "msg")] = NA
# Check variance
if(var(results$y) == 0){
results$testable = FALSE
results$msg = "zero gene variance"
write.gene.file(tmp.dir, results, gene)
return(results)}
# Define testability based on min.cpm
if(!is.null(min.cpm)){
gene.cpm = input$gene.cpm
results$testable = results$testable & (gene.cpm[as.character(results$condition)] >= min.cpm)
results$cpm = gene.cpm[as.character(results$condition)]
results[results$cpm < min.cpm, "msg"] = "low cpm"
}
# Correct values to (0, 1) for the beta model
N = nrow(results)
if(dist == "PSI")
results$y = ((N - 1) * results$y + 0.5) / N
if(dist == "count")
results$y = round(results$y)
# Null hypothesis
frm0 = formula
mm0 = model.matrix(frm0, results)
# Fit null model and store coefficients for better initialization
if(dist == "count"){
model0 = tryCatch(csDEX::nbreg.fit(X=mm0,
y=as.vector(results$y),
phi=results$precision,
tol=0.0001),
warning = function(w) w,
error = function(e) e)
} else if(dist == "PSI"){
model0 = tryCatch(
betareg::betareg.fit(x=mm0, y=as.vector(results$y)),
warning = function(w) w,
error = function(e) e)
}
if(class(model0)[1] != "list") {
results$msg[is.na(results$msg)] = sprintf("null model fit error: %s",
gsub("[\r\n\t]", "", model0))
results$testable = FALSE
write.gene.file(tmp.dir, results, gene)
return(results)
} else {
start.coefs = model0$coefficients$mean
}
# Filter testable values according to fit residual
results$residual = results$y - model0$fitted.values
# Compute Wald test statistics if alpha.wald
results$interaction = 0
if(is.null(alpha.wald)){
# Full alternative model for gene
frm1 = as.formula(paste0(deparse(formula), "+interaction"))
mm1 = model.matrix(frm1, results)
} else {
# Zero coefficients merged into one column
wt.stats = csDEX::waldTest(mm0, model0, alpha=alpha.wald)
start.coefs = wt.stats$coef.prior
mm1 = cbind(wt.stats$mm.prior, as.vector(results$interaction))
colnames(mm1)[ncol(mm1)] = "interaction"
}
# Store design matrix dimensions
results$nrow = nrow(mm1)
results$ncol = ncol(mm1)
# Compute deviance and results
for (i in 1:nrow(results)){
if(!results[i, "testable"]) next
mm1[, "interaction"] = 0
mm1[i, "interaction"] = 1
start.time = Sys.time()
if(dist == "count"){
coefs.nbinom.init = c(start.coefs, 0)
model1 = tryCatch(csDEX::nbreg.fit(X = mm1,
y=as.vector(results$y),
phi = results$precision,
tol=0.0001,
beta.init = coefs.nbinom.init,
verbose=FALSE),
warning = function(w) w,
error = function(e) e)
} else if(dist == "PSI"){
coefs.beta.init = model0$coefficients
coefs.beta.init$mean = c(start.coefs, 0)
model1 = tryCatch(
betareg::betareg.fit(x=mm1, y=as.vector(results$y),
control=betareg::betareg.control(start=coefs.beta.init)),
warning = function(w) w,
error = function(e) e)
}
if(class(model1)[1] != "list") {
results$msg[i] = sprintf("fit error: %s",
gsub("[\r\n\t]", "", model1))
results$testable[i] = FALSE
next
}
# Compute difference of deviance (likelihood ratio) test statistic
results$time[i] = Sys.time() - start.time
results$loglik[i] = model1$loglik
# Likelihood ratio-test pvalue
ddev = 2 * (model1$loglik - model0$loglik)
results$ddev[i] = ddev
if (ddev > 0){
results$pvalue[i] = 1.0 - pchisq(ddev, 1)
} else {
results$pvalue[i] = 1
}
}
# Correct for multiple testing
results$interaction = NULL
results$padj = p.adjust(results$pvalue, method="bonferroni")
results$fdr = p.adjust(results$pvalue, method="BH")
results = results[order(results$pvalue),]
# Write intermediate results
write.gene.file(tmp.dir, results, gene)
return(results)
}
testForDEU <- function(cdx, workers=1, tmp.dir=NULL, min.cpm=NULL, alpha.wald=NULL,
formula=y~featureID+condition){
# Test for differential exon usage given a csDEX data object file,
# with calculated precisions and size factors. Only test genes
# if they have any reads mapped onto them
gene.set = unique(csDEX::rowData(cdx)$groupID)
inputs = list()
dist = csDEX::dataType(cdx)
# Check input for NBINOM
if(dist == "count"){
stopifnot(!is.null(csDEX::colData(cdx)$size.factor))
stopifnot(!is.null(csDEX::rowData(cdx)$precision))
}
# Add to queue if gene has not equal count at all cells
gene.cpm = NULL
message("Constructing inputs ... \n")
for (g in gene.set){
inxs = which(csDEX::rowData(cdx)$groupID == g)
gene.expr = csDEX::exprData(cdx, normalized=(dist=="count"))[inxs,]
gene.rowdata = csDEX::rowData(cdx)[inxs,]
if(!is.null(min.cpm)) gene.cpm = csDEX::cpmData(cdx)[g,]
inputs[[length(inputs)+1]] = list(expr=gene.expr,
rowdata=gene.rowdata,
coldata=csDEX::colData(cdx),
gene.cpm=gene.cpm)
}
# If workers == 1, run within the same process
if(workers > 1){
message(sprintf("Dispatching on %d workers, cache directory %s. \n", workers, tmp.dir))
jobs = mclapply(inputs, csDEX::geneModel,
min.cpm, tmp.dir, dist, alpha.wald, formula,
mc.preschedule=FALSE, mc.cores=workers, mc.silent=TRUE)
} else {
jobs = list()
for(inp in inputs)
jobs[[length(jobs)+1]] = csDEX::geneModel(inp, min.cpm, tmp.dir,
dist, alpha.wald, formula)
}
if(is.null(tmp.dir)){
results = data.frame()
for (i in 1:length(jobs)){
if (!class(jobs[[i]]) == "try-error")
results = rbind(results, jobs[[i]])
}
results$fdr.all = p.adjust(results$pvalue, "BH")
results=results[order(results$pvalue),]
return(results)
} else {
message(sprintf("Results for written to %s", tmp.dir))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.