Nothing
#Some of these functions were based on similar functions created for the DEXSeq package.
#
# Note that DEXSeq is licensed under the GPL v3. Therefore this
# code packaged together is licensed under the GPL v3, as noted in the LICENSE file.
# Some code snippets are "united states government work" and thus cannot be
# copyrighted. See the LICENSE file for more information.
#
# The current versions of the original functions upon which these were based can be found
# here: http://github.com/Bioconductor-mirror/DEXSeq
#
# Updated Authorship and license information can be found here:
# here: http://github.com/Bioconductor-mirror/DEXSeq/blob/master/DESCRIPTION
##########################################################################
##########################################################################
##########################################################################
######### Front-end Interface functions:
##########################################################################
##########################################################################
##########################################################################
INTERNALDEBUGMODE = FALSE
##########################################################################
######### Running JunctionSeq Analyses:
##########################################################################
runJunctionSeqAnalyses <- function(sample.files, sample.names, condition,
flat.gff.file = NULL,
analysis.type = c("junctionsAndExons","junctionsOnly","exonsOnly"),
meanCountTestableThreshold = "auto",
nCores = 1,
use.covars = NULL,
test.formula0 = formula(~ sample + countbin),
test.formula1 = formula(~ sample + countbin + condition : countbin),
effect.formula = formula(~ condition + countbin + condition : countbin),
geneLevel.formula = formula(~ condition),
use.exons = NULL, use.junctions = NULL,
use.known.junctions = TRUE, use.novel.junctions = TRUE,
use.multigene.aggregates = FALSE,
gene.names = NULL,
method.GLM = c(c("advanced","DESeq2-style"), c("simpleML","DEXSeq-v1.8.0-style")),
method.dispFit = c("parametric", "local", "mean"),
method.dispFinal = c("shrink","max","fitted","noShare"),
method.sizeFactors = c("byGenes","byCountbins"),
method.countVectors = c("geneLevelCounts","sumOfAllBinsForGene","sumOfAllBinsOfSameTypeForGene"),
method.expressionEstimation = c("feature-vs-gene","feature-vs-otherFeatures"),
method.cooksFilter = TRUE,
optimizeFilteringForAlpha = 0.01,
fitDispersionsForExonsAndJunctionsSeparately = TRUE,
keep.hypothesisTest.fit = FALSE,
keep.estimation.fit = FALSE,
replicateDEXSeqBehavior.useRawBaseMean = FALSE,
verbose = TRUE, debug.mode = FALSE
){
keep.debug.model.data <- TRUE
gtf.format <- TRUE
analysis.type <- match.arg(analysis.type)
method.GLM <- match.arg(method.GLM)
method.dispFit <- match.arg(method.dispFit)
method.dispFinal <- match.arg(method.dispFinal)
method.sizeFactors <- match.arg(method.sizeFactors)
method.countVectors <- match.arg(method.countVectors)
method.expressionEstimation <- match.arg(method.expressionEstimation)
callStack <- list(deparse(match.call(), nlines=1))
if(is.null(use.junctions) && is.null(use.exons)){
if(analysis.type == "junctionsAndExons"){
use.junctions <- TRUE
use.exons <- TRUE
} else if(analysis.type == "junctionsOnly"){
use.junctions <- TRUE
use.exons <- FALSE
} else if(analysis.type == "exonsOnly"){
use.junctions <- FALSE
use.exons <- TRUE
}
} else {
if(is.null(use.junctions) || is.null(use.exons)){
stop(paste0("Illegal syntax! If parameter use.junctions or use.exons are used, then BOTH must be set!\n use.junctions = '",use.junctions,"', use.exons = '",use.exons,"'"))
}
if(isTRUE(use.junctions) && isTRUE(use.exons)){
analysis.type <- "junctionsAndExons"
} else if(isTRUE(use.junctions) && isTRUE(! use.exons)){
analysis.type <- "junctionsOnly"
} else if(isTRUE(! use.junctions) && isTRUE(use.exons)){
analysis.type <- "exonsOnly"
} else {
stop("Illegal syntax! Parameters use.exons and use.junctions cannot both be false!")
}
}
if(isTRUE(verbose)){
message("> STARTING runJunctionSeqAnalyses (v",packageVersion("JunctionSeq"),") (",date(),")")
message(paste("> rJSA: sample.files: ", paste0(sample.files,collapse=", ")))
message(paste("> rJSA: sample.names: ", paste0(sample.names,collapse=", ")))
message(paste("> rJSA: condition: ", paste0(condition,collapse=", ")))
message(paste("> rJSA: analysis.type: ",analysis.type))
message(paste("> rJSA: use.junctions: ",use.junctions))
message(paste("> rJSA: use.novel.junctions: ",use.novel.junctions))
message(paste("> rJSA: use.exons: ",use.exons))
message(paste("> rJSA: nCores: ",nCores))
message(paste("> rJSA: use.covars: ",use.covars))
message(paste("> rJSA: test.formula0: ",paste0(test.formula0,collapse=" ")))
message(paste("> rJSA: test.formula1: ",paste0(test.formula1,collapse=" ")))
message(paste("> rJSA: use.multigene.aggregates: ", use.multigene.aggregates))
}
#require("DEXSeq")
if(! is.factor(condition)){
condition <- factor(condition, levels = sort(unique(condition)))
}
design <- data.frame(condition = condition)
if(! is.null(use.covars)){
message(paste0("> rJSA: using covars:"," ",date()))
if(class(use.covars) != "data.frame"){
stop(paste0("FATAL ERROR: use.covars must be a data.frame! Instead it appears to be: ",class(use.covars)))
}
for(i in 1:ncol(use.covars)){
if(! is.factor(use.covars[[i]]) ){
use.covars[[i]] <- factor(use.covars[[i]], levels = sort(unique(use.covars[[i]])) )
}
}
design <- data.frame(cbind(design,use.covars))
for(i in 1:length(names(use.covars))){
message(paste0(" covar: ",names(use.covars)[i]))
message(paste0(c(" ",paste0(use.covars[,i],collapse=", "))))
}
names(design) <- c("condition",names(use.covars))
}
row.names(design) <- sample.names
if(isTRUE(verbose)) { message(paste0("> rJSA: Reading Count files..."," ",date(),".")) }
jscs = readJunctionSeqCounts(countfiles = as.character(sample.files),
samplenames = sample.names,
design = design,
flat.gff.file = flat.gff.file,
verbose = verbose,
use.junctions = use.junctions,
use.novel.junctions = use.novel.junctions,
use.known.junctions = use.known.junctions,
use.exons = use.exons,
use.multigene.aggregates = use.multigene.aggregates,
nCores = nCores,
method.countVectors = method.countVectors,
test.formula1 = test.formula1,
gene.names = gene.names
)
attr(jscs,"CallStack") <- c(list(deparse(match.call())), attr(jscs,"CallStack"))
if(isTRUE(verbose)) { message(paste0("> rJSA: Count files read."," ",date(),".")) }
if(isTRUE(debug.mode)) reportMem(jscs)
if(isTRUE(verbose)) { message(paste0("> rJSA: Estimating Size Factors..."," ",date(),".")) }
jscs <- estimateJunctionSeqSizeFactors(jscs, method.sizeFactors = method.sizeFactors, replicateDEXSeqBehavior.useRawBaseMean = replicateDEXSeqBehavior.useRawBaseMean, verbose = verbose)
if(isTRUE(verbose)) { message(paste0("> rJSA: Size Factors Done. Size Factors are:","."))
message(paste0("> rJSA: ",paste0(names(sizeFactors(jscs)),collapse=",")))
message(paste0("> rJSA: ",paste0(sizeFactors(jscs),collapse=",")) )
if(isTRUE(debug.mode)) reportMem(jscs)
message(paste0("> rJSA: Estimating Dispersions..."," ",date(),"."))
}
jscs <- estimateJunctionSeqDispersions(jscs,
method.GLM = method.GLM,
nCores = nCores,
test.formula1=test.formula1,
meanCountTestableThreshold = meanCountTestableThreshold,
verbose = verbose)
if(isTRUE(verbose)) { message(paste0("> rJSA: Dispersions estimated."," ",date(),".")) }
if(isTRUE(debug.mode)) reportMem(jscs)
if(isTRUE(verbose)) { message(paste0("> rJSA: Fitting Dispersion Fcn..."," ",date(),".")) }
jscs <- fitJunctionSeqDispersionFunction(jscs,
method.GLM = method.GLM,
method.dispFit = method.dispFit,
method.dispFinal = method.dispFinal,
verbose = verbose,
fitDispersionsForExonsAndJunctionsSeparately = fitDispersionsForExonsAndJunctionsSeparately)
if(isTRUE(verbose)) { message(paste0("> rJSA: Dispersions Fcn Fitted."," ",date(),".")) }
if(isTRUE(debug.mode)) reportMem(jscs)
if(isTRUE(verbose)) { message(paste0("> rJSA: Testing for DEU..."," ",date(),".")) }
jscs <- testForDiffUsage(jscs,
nCores = nCores,
test.formula0 = test.formula0,
test.formula1 = test.formula1,
method.GLM = method.GLM,
keep.hypothesisTest.fit = keep.hypothesisTest.fit,
meanCountTestableThreshold = meanCountTestableThreshold,
method.cooksFilter = method.cooksFilter,
optimizeFilteringForAlpha = optimizeFilteringForAlpha,
verbose = verbose);
if(isTRUE(verbose)) { message(paste0("> rJSA: DEU tests complete."," ",date(),".")) }
if(isTRUE(debug.mode)) reportMem(jscs)
if(isTRUE(verbose)) { message(paste0("> rJSA: Estimating effect sizes using effects models..."," ",date(),".")) }
jscs <- estimateEffectSizes( jscs ,
effect.formula = effect.formula,
geneLevel.formula = geneLevel.formula,
method.expressionEstimation = method.expressionEstimation,
keep.estimation.fit = keep.estimation.fit,
nCores = nCores)
if(isTRUE(verbose)) { message(paste0("> rJSA: Effect Sizes estimated.",".")) }
if(isTRUE(debug.mode)) reportMem(jscs)
if(isTRUE(verbose)) { message(paste0("> rJSA: Generating results table..."," ",date(),".")) }
sampleNames(jscs) = as.character(sample.names)
if(isTRUE(debug.mode)) reportMem(jscs)
if(isTRUE(verbose)) message("> FINISHED runJunctionSeqAnalyses (",date(),")")
return(jscs)
}
##############################################################################################################################################################################################################################
#########
##############################################################################################################################################################################################################################
writeCompleteResults <- function(jscs, outfile.prefix,
gzip.output = TRUE, FDR.threshold = 0.01,
save.allGenes = TRUE, save.sigGenes = TRUE, save.fit = FALSE, save.VST = FALSE,
save.bedTracks = TRUE,
save.jscs = FALSE,
bedtrack.format = c("BED","GTF","GFF3"),
verbose = TRUE
){
bedtrack.format <- match.arg(bedtrack.format)
if(isTRUE(verbose)){
message("> STARTING writeCompleteResults (v",packageVersion("JunctionSeq"),") (",date(),")")
message(paste("> wcr: outfile.prefix: ",outfile.prefix))
message(paste("> wcr: FDR.threshold: ",FDR.threshold))
message(paste("> wcr: save.allGenes: ",save.allGenes))
message(paste("> wcr: save.sigGenes: ",save.sigGenes))
message(paste("> wcr: save.fit: ",save.fit))
message(paste("> wcr: save.VST: ",save.VST))
message(paste("> wcr: bedtrack.format: ",bedtrack.format))
}
keep.debug.model.data <- TRUE
if(isTRUE(save.jscs)) save( jscs, file = paste0(outfile.prefix, "jscs.RData") )
if(isTRUE(verbose)) message("> wcr: Writing sizeFactors.")
writeSizeFactors(jscs, file = paste0(outfile.prefix,"sizeFactors.txt"))
countVectors <- jscs@countVectors
sampleNames <- sampleNames(jscs@phenoData)
colnames(countVectors) <- c(paste0("rawCounts_",sampleNames), paste0("rawGeneCounts_",sampleNames))
expression.data <- cbind.data.frame(featureID = rownames(fData(jscs)), do.call(cbind.data.frame,jscs@plottingEstimates) , countVectors )
colnames(expression.data) <- c("featureID", unlist( lapply( jscs@plottingEstimates, colnames ) ), colnames(countVectors) )
if(isTRUE(save.VST)){
expression.data.vst <- cbind.data.frame(featureID = rownames(fData(jscs)), do.call(cbind.data.frame,lapply(jscs@plottingEstimates, function(pe){ vst(pe,jscs) })) )
colnames(expression.data.vst) <- c("featureID", paste0( unlist( lapply( jscs@plottingEstimates, colnames ) ), "VST" ) )
}
if(isTRUE(save.allGenes) && isTRUE(verbose)) message("> wcr: Writing results for ",length(unique(as.character(fData(jscs)$geneID)))," genes.")
if(isTRUE(save.allGenes) && isTRUE(verbose)) message("> wcr: Found ",nrow(fData(jscs))," counting bins belonging to these genes.")
if(isTRUE(save.allGenes)) write.simple.table.gz(expression.data, file=paste0(outfile.prefix,"allGenes.expression.data.txt"), use.gzip=gzip.output,row.names=FALSE,col.names=TRUE,quote=FALSE, sep = '\t')
if(isTRUE(save.allGenes) && isTRUE(save.VST)) write.simple.table.gz(expression.data.vst, file=paste0(outfile.prefix,"allGenes.expression.data.VST.txt"), use.gzip=gzip.output,row.names=FALSE,col.names=TRUE,quote=FALSE, sep = '\t')
if(isTRUE(save.allGenes)) write.table.gz(fData(jscs), file=paste0(outfile.prefix,"allGenes.results.txt"), use.gzip=gzip.output,row.names="featureID",col.names=TRUE,quote=FALSE)
modelFitForHypothesisTest <- jscs@modelFitForHypothesisTest
modelFitForEffectSize <- jscs@modelFitForEffectSize
if(isTRUE(save.fit)) save( modelFitForHypothesisTest, file = paste0(outfile.prefix, "modelFitForHypothesisTest.RData") )
if(isTRUE(save.fit)) save( modelFitForEffectSize, file = paste0(outfile.prefix, "modelFitForEffectSize.RData") )
sig.features <- which( f.na(fData(jscs)$padjust < FDR.threshold) )
gene.list <- unique(as.character(fData(jscs)$geneID[sig.features]))
if(isTRUE(save.sigGenes)){
if(length(sig.features) == 0){
if(isTRUE(verbose)) message("> wcr: Zero Significant Features! (at adjusted-p-value threshold ",FDR.threshold,")")
} #else {
tryCatch({
genewiseTable <- makeGeneWiseTable(jscs, gene.list, FDR.threshold = FDR.threshold, verbose = verbose)
write.simple.table.gz(pData(genewiseTable),file=paste0(outfile.prefix,"sigGenes.genewiseResults.txt"), use.gzip=gzip.output,row.names=FALSE,col.names=TRUE,quote=FALSE, sep = '\t')
}, error = function(e){
warning(" Warning! Caught error while attempting to compile optional genewise table.\n Error text:",e,"");
})
if(isTRUE(verbose)) message("> wcr: Writing results for ", length(gene.list), " genes with 1 or more significant junctions (at adjusted-p-value threshold ", FDR.threshold,")")
sig.rows <- which( as.character(fData(jscs)$geneID) %in% gene.list )
if(isTRUE(verbose)) message("> wcr: Found ", length(sig.rows), " counting bins belonging to those genes.")
if(isTRUE(verbose)) message("> wcr: Writing sigGenes.expression.data.txt")
write.simple.table.gz(expression.data[sig.rows,, drop=FALSE], file=paste0(outfile.prefix,"sigGenes.expression.data.txt"), use.gzip=gzip.output,row.names=FALSE,col.names=TRUE,quote=FALSE, sep = '\t')
if(isTRUE(save.VST)) write.simple.table.gz(expression.data.vst[sig.rows,, drop=FALSE], file=paste0(outfile.prefix,"sigGenes.expression.data.VST.txt"), use.gzip=gzip.output,row.names=FALSE,col.names=TRUE,quote=FALSE, sep = '\t')
if(isTRUE(verbose)) message("> wcr: Writing sigGenes.results.txt")
write.table.gz(fData(jscs)[sig.rows,, drop=FALSE], file=paste0(outfile.prefix,"sigGenes.results.txt"), use.gzip=gzip.output,row.names="featureID",col.names=TRUE,quote=FALSE)
#}
}
if(isTRUE(gzip.output)){
bedFileExt <- ".bed.gz"
} else {
bedFileExt <- ".bed"
}
if(isTRUE(verbose)) message("> wcr: Writing bed tracks.")
if(isTRUE(save.bedTracks)){
if(isTRUE(save.allGenes)){
if(any(fData(jscs)$featureType == "splice_site" | fData(jscs)$featureType == "novel_splice_site")){
writeExprBedTrack(paste0(outfile.prefix,"allGenes.junctionCoverage",bedFileExt),
jscs = jscs, only.with.sig.gene = FALSE,
plot.exons = FALSE, plot.junctions = TRUE,
output.format = bedtrack.format, verbose = verbose, use.gzip = gzip.output,
trackLine = paste0("track name='JctExprAll' description='Splice Junction Coverage Estimates, by group' itemRgb='On' visibility=3"))
}
if(any(fData(jscs)$featureType == "exonic_part")){
writeExprBedTrack(paste0(outfile.prefix,"allGenes.exonCoverage",bedFileExt),
jscs = jscs, only.with.sig.gene = FALSE,
plot.exons = TRUE, plot.junctions = FALSE,
output.format = bedtrack.format, verbose = verbose, use.gzip = gzip.output,
trackLine = paste0("track name='ExonExprAll' description='Exonic Region Coverage Estimates, by group' itemRgb='On' visibility=3"))
}
}
if(isTRUE(save.sigGenes)){
sig.features <- which( f.na(fData(jscs)$padjust < FDR.threshold) )
#if(length(sig.features) > 0){
if(any(fData(jscs)$featureType == "splice_site" | fData(jscs)$featureType == "novel_splice_site")){
writeExprBedTrack(paste0(outfile.prefix,"sigGenes.junctionCoverage",bedFileExt),
jscs = jscs, only.with.sig.gene = TRUE,
plot.exons = FALSE, plot.junctions = TRUE,
output.format = bedtrack.format, verbose = verbose, FDR.threshold= FDR.threshold, use.gzip = gzip.output,
trackLine = paste0("track name='JctExprAll' description='Sig genes splice Junction Coverage Estimates, by group' itemRgb='On' visibility=3"))
}
if(any(fData(jscs)$featureType == "exonic_part")){
writeExprBedTrack(paste0(outfile.prefix,"sigGenes.exonCoverage",bedFileExt),
jscs = jscs, only.with.sig.gene = TRUE,
plot.exons = TRUE, plot.junctions = FALSE,
output.format = bedtrack.format, verbose = verbose, FDR.threshold= FDR.threshold, use.gzip = gzip.output,
trackLine = paste0("track name='ExonExprAll' description='Sig genes exonic Region Coverage Estimates, by group' itemRgb='On' visibility=3"))
}
writeSigBedTrack(paste0(outfile.prefix,"sigGenes.pvalues",bedFileExt),
jscs = jscs,
output.format = bedtrack.format, verbose = verbose, FDR.threshold= FDR.threshold, use.gzip = gzip.output,
trackLine = paste0("track name='JctPvals' description='Significant Splice Junctions' useScore=1 visibility=3")
)
#}
}
}
if(isTRUE(verbose)) message("> DONE writeCompleteResults (",date(),")")
}
writeSizeFactors <- function(jscs, file = NULL){
if( all( is.na( sizeFactors( jscs )) ) ){
stop("No size factors to write.\n")
}
sf <- data.frame(sample.ID = names(sizeFactors(jscs)), size.factor = sizeFactors(jscs))
if(ncol(jscs@altSizeFactors) > 0){
sf <- cbind.data.frame(sf, jscs@altSizeFactors)
}
if(! is.null(file)){
write.table(sf,file = file, quote=FALSE,row.names=FALSE,col.names=TRUE,sep='\t')
}
return(sf)
}
##########################################################################
##########################################################################
##########################################################################
######### Top-Level Coefficient calculation functions
##########################################################################
##########################################################################
##########################################################################
generateAllExpressionEstimates.v2 <- function(jscs,nCores = 1,fitExpToVar="condition",verbose=TRUE){
myApply <- getMyApply(nCores)
levelCt <- length(levels(jscs@phenoData$condition))
sampleCt <- length(sampleNames(jscs@phenoData))
out <- getAllData2(jscs, runOnFeatures = seq_len(nrow(fData(jscs))),
fitExpToVar=fitExpToVar,
formula1 = formula(paste0("~ ",fitExpToVar," + countbin + ",fitExpToVar," : countbin")),
nCores=nCores, dispColumn="dispersion",verbose = verbose)
out <- cbind.data.frame(featureID = rownames(fData(jscs)),
geneID = fData(jscs)$geneID,
countbinID = fData(jscs)$countbinID,
out)
out
}
##############################################################################################################################################################################################################################
#########
##############################################################################################################################################################################################################################
generateAllExpressionEstimates <- function(jscs,nCores = 1,fitExpToVar="condition",verbose=TRUE){
myApply <- getMyApply(nCores)
levelCt <- length(levels(jscs@phenoData$condition))
sampleCt <- length(sampleNames(jscs@phenoData))
out <- rbind( c(0.0,0.0,0.0,rep(0.0,levelCt),rep(0.0,levelCt),rep(0.0,sampleCt),rep(0.0,levelCt),rep(0.0,levelCt),rep(0.0,sampleCt)))
#out <- rbind( c(0,0,0,rep(0,levelCt),rep(0,levelCt),rep(0,sampleCt),rep(0,levelCt),rep(0,levelCt),rep(0,sampleCt)))
out <- data.frame(out)[0,]
names.out <- c("featureID","geneID","countbinID",
paste("exprVST",(levels(jscs@phenoData$condition)),sep='_'),
paste("expr",(levels(jscs@phenoData$condition)),sep='_'),
paste("rExprVST",(levels(jscs@phenoData$condition)),sep='_'),
paste("rExpr",(levels(jscs@phenoData$condition)),sep='_'),
paste("normCountVST",sampleNames(jscs@phenoData),sep='_'),
paste("normCount",sampleNames(jscs@phenoData),sep='_'),
paste("rawCount",sampleNames(jscs@phenoData),sep='_')
)
gene.list <- unique(jscs@featureData$geneID)
blank.line.length <- length(levels(jscs@phenoData$condition)) * 4
if(isTRUE(verbose)){
message(paste("Starting expression estimate generation on",length(gene.list),"top-level features (eg genes)."))
}
get.one.genes.data <- function(geneIndex){
geneID <- gene.list[geneIndex]
es <- fitAndArrangeCoefs( jscs, geneID, frm=as.formula(paste("count ~", fitExpToVar, "* countbin")) )
if(! is.null(es)){
#If the model converged properly, then output the results:
curr <- data.frame(getAllData(jscs,es,geneID))
if(geneIndex - 1 %% 10000 == 0 & verbose){
message("colnames:")
message(paste0(colnames(curr),collapse=" "))
}
names(curr) <- names.out
return( list(TRUE , curr) )
} else {
#If the model does not converge, return NA for all fitted values.
if(isTRUE(verbose)){
message(paste("glm fit failed for gene", geneID, " index: ",geneIndex))
}
curr <- featureData(jscs)@data[featureData(jscs)@data$geneID == geneID,c("geneID","countbinID"), drop=FALSE]
rn <- row.names(curr)
curr <- cbind.data.frame(rn,curr)
names(curr)[1] <- "featureID"
exonCt <- dim(curr)[1]
NA.buffer <- matrix(rep(NA,blank.line.length * exonCt), nrow=exonCt, ncol=blank.line.length )
norCounts <- cbind.data.frame(get.norcounts.data(jscs,geneID),get.norcounts.data(jscs,geneID,vst.xform=FALSE), get.rawcounts.data(jscs,geneID))
curr <- cbind.data.frame(curr,NA.buffer,norCounts)
names(curr) <- names.out
row.names(curr) <- rn
return( list(FALSE, curr) )
}
}
pair.buffer <- myApply(as.list(1:length(gene.list)), FUN=get.one.genes.data)
genes.data.list <- lapply(pair.buffer, "[[", 2)
did.gene.converge <- sapply(pair.buffer, "[[", 1)
if(isTRUE(verbose)){
message(paste("generated expression data for: ",length(genes.data.list),"top-level features (ie genes). ",sum(did.gene.converge)," converged. ",sum(! did.gene.converge)," failed."))
}
out <- do.call(rbind.data.frame,genes.data.list)
names(out) <- names.out
#############################
#NEW CODE: save data to jscs!
message("Saving data to JunctionSeqCountSet. (",date(),")")
conditionLevels <- levels(jscs@phenoData$condition)
names(conditionLevels) <- conditionLevels
sampleNames <- colnames(counts(jscs))
exprEstimate <- out[, strStartsWith(names(out), "expr_") ]
exprEstimateVST <- out[, strStartsWith(names(out), "exprVST_") ]
otherExprEstimate <- do.call(cbind.data.frame, lapply( conditionLevels, function(x){ rep(NA,nrow(jscs))} ) )
otherExprEstimateVST <- do.call(cbind.data.frame, lapply( conditionLevels, function(x){ rep(NA,nrow(jscs))} ) )
relExprEstimate <- out[, strStartsWith(names(out), "rExpr_") ]
relExprEstimateVST <- out[, strStartsWith(names(out), "rExprVST_") ]
normCounts <- cbind( out[, strStartsWith(names(out),"normCount_") ], do.call(cbind.data.frame, lapply( 1:sampleCt, function(x){ rep(NA,nrow(jscs))} ) ) )
normCountsVST <- cbind( out[, strStartsWith(names(out),"normCountVST") ], do.call(cbind.data.frame, lapply( 1:sampleCt, function(x){ rep(NA,nrow(jscs))} ) ) )
colnames(otherExprEstimate) <- paste0("geneExpr","_",conditionLevels)
colnames(otherExprEstimateVST) <- paste0("geneExprVST","_",conditionLevels)
colnames(relExprEstimate) <- paste0("relExpr","_",conditionLevels)
colnames(relExprEstimateVST) <- paste0("relExprVST","_",conditionLevels)
colnames(normCounts) <- c(paste0("normCount_",sampleNames), paste0("normGeneCount_",sampleNames))
colnames(normCountsVST) <- c(paste0("normCountsVST_",sampleNames), paste0("normGeneCountsVST_",sampleNames))
jscs@plottingEstimates <- list(exprEstimate = exprEstimate,
geneExprEstimate = otherExprEstimate,
relExprEstimate = relExprEstimate,
normCounts = normCounts)
jscs@plottingEstimatesVST <- list(exprEstimateVST = exprEstimateVST,
geneExprEstimateVST = otherExprEstimateVST,
relExprEstimateVST = relExprEstimateVST,
normCountsVST = normCountsVST)
message("Done saving data to JunctionSeqCountSet. (",date(),")")
#############################
if(isTRUE(verbose)){
message(paste("Collapsed expression data into a single data frame. generateAllExpressionEstimates() complete."))
}
list(jscs, out)
}
##############################################################################################################################################################################################################################
#########
##############################################################################################################################################################################################################################
readJunctionSeqCounts <- function(countfiles = NULL, countdata = NULL,
samplenames, design,
flat.gff.file=NULL,
test.formula1 = formula(~ sample + countbin + condition : countbin),
analysis.type = c("junctionsAndExons","junctionsOnly","exonsOnly"),
nCores = 1,
use.exons = NULL, use.junctions = NULL,
use.known.junctions = TRUE,
use.novel.junctions = TRUE,
use.multigene.aggregates = FALSE,
gene.names = NULL,
verbose = TRUE,
method.countVectors = c("geneLevelCounts","sumOfAllBinsForGene","sumOfAllBinsOfSameTypeForGene"),
noDESeqMatrix = FALSE)
{
method.countVectors <- match.arg(method.countVectors)
if(isTRUE(verbose)) {
message("-> STARTING readJunctionSeqCounts (",date(),")")
}
analysis.type <- match.arg(analysis.type)
if(is.null(use.junctions) && is.null(use.exons)){
if(analysis.type == "junctionsAndExons"){
use.junctions <- TRUE
use.exons <- TRUE
} else if(analysis.type == "junctionsOnly"){
use.junctions <- TRUE
use.exons <- FALSE
} else if(analysis.type == "exonsOnly"){
use.junctions <- FALSE
use.exons <- TRUE
}
} else {
if(is.null(use.junctions) || is.null(use.exons)){
stop(paste0("Illegal syntax! If parameter use.junctions or use.exons are used, then BOTH must be set!\n use.junctions = '",use.junctions,"', use.exons = '",use.exons,"'"))
}
if(use.junctions && use.exons){
analysis.type <- "junctionsAndExons"
} else if(use.junctions && (! use.exons)){
analysis.type <- "junctionsOnly"
} else if((! use.junctions) && use.exons){
analysis.type <- "exonsOnly"
} else {
stop("Illegal syntax! Parameters use.exons and use.junctions cannot both be false!")
}
}
if(isTRUE(verbose)){
message("---> RJSC; (v",packageVersion("JunctionSeq"),")")
message("---> RJSC: samplenames: ",paste0(samplenames, collapse=","))
message("---> RJSC: flat.gff.file: ",flat.gff.file)
message("---> RJSC: use.exons:",use.exons)
message("---> RJSC: use.junctions:",use.junctions)
message("---> RJSC: use.novel.junctions:",use.novel.junctions)
}
if((is.null(countfiles) && is.null(countdata))){
stop("Fatal error: Either countfiles OR countdata must be set! Both are null!")
}
if( (!is.null(countfiles)) && (!is.null(countdata)) ){
stop("Fatal error: Either countfiles OR countdata must be set! Both are non-null!")
}
stopifnot( class(design) == "data.frame" )
for(i in 1:ncol(design)){
if( ! is.factor(design[[i]])){
stop("ERROR: design must be a data.frame composed entirely of factors!")
}
}
if(! is.null(countfiles)){
lf <- lapply( countfiles, function(x)
read.table( x, header=FALSE,stringsAsFactors=FALSE ) )
} else {
lf <- countdata
}
if( !all( sapply( lf[-1], function(x) all( x$V1 == lf[1]$V1 ) ) ) )
stop( "Count files have differing gene ID column." )
if(isTRUE(verbose)) message("---> File read complete.");
dcounts <- sapply( lf, `[[`, "V2" )
rownames(dcounts) <- lf[[1]][,1]
dcounts <- dcounts[ substr(rownames(dcounts),1,1)!="_", ]
bin.type <- sapply( rownames(dcounts),
function(x){
substr(strsplit(x, ":",fixed=TRUE)[[1]][2],1,1)
})
raw.geneID <- sapply( rownames(dcounts),
function(x){
strsplit(x, ":",fixed=TRUE)[[1]][1]
})
if(isTRUE(verbose)) message(paste0("---> Extracted counts. Found ",dim(dcounts)[1]," features so far."));
original.dcount.nrow <- nrow(dcounts);
geneCountTable <- dcounts[bin.type == "A",, drop=FALSE]
rownames(geneCountTable) <- sapply(strsplit(rownames(geneCountTable), ":"),"[[",1)
colnames(geneCountTable) <- as.character(samplenames)
use.bins <- bin.type != "A"
if(isTRUE(verbose)) message(paste0("---> Extracted gene-level counts. Found: ",dim(geneCountTable)[1], " genes and aggregate-genes."))
if(isTRUE(verbose)) message(paste0("---> Removed gene features. Found: ",sum(use.bins), " features to be included so far."))
if(isTRUE(! use.junctions) && isTRUE(! use.exons)){
stop("FATAL ERROR: At least one of: use.junctions or use.exons must be set to TRUE. Otherwise you've got no data to test!")
}
if(isTRUE(! use.junctions)){
use.bins <- use.bins & bin.type != "J" & bin.type != "N"
if(isTRUE(verbose)) message(paste0("---> Removed splice junction features. Found: ",sum(use.bins), " features to be included so far."))
}
if(isTRUE(! use.novel.junctions)){
use.bins <- use.bins & bin.type != "N"
if(isTRUE(verbose)) message(paste0("---> Removed novel splice junction features. Found: ",sum(use.bins), " features to be included so far."))
}
if(isTRUE(! use.known.junctions)){
use.bins <- use.bins & bin.type != "J"
if(isTRUE(verbose)) message(paste0("---> Removed known splice junction features. Found: ",sum(use.bins), " features to be included so far."))
}
if(isTRUE(! use.exons)){
use.bins <- use.bins & bin.type != "E"
if(isTRUE(verbose)) message(paste0("---> Removed exon features. Found: ",sum(use.bins), " features to be included so far."))
}
is.multiGene.aggregate <- grepl("+", raw.geneID, fixed=TRUE)
multiGene.aggregate.IDs <- unique(raw.geneID[is.multiGene.aggregate & use.bins])
multiGene.aggregate.ct <- length(multiGene.aggregate.IDs)
multiGene.aggregate.geneCt <- length(unlist( strsplit(multiGene.aggregate.IDs, "+",fixed=TRUE)))
if(isTRUE(verbose)) message("---> Note: ",sum(is.multiGene.aggregate[use.bins])," counting bins from overlapping genes")
if(isTRUE(verbose)) message("---> There are ",multiGene.aggregate.ct, " multigene aggregates.")
if(isTRUE(verbose)) message("---> There are ",multiGene.aggregate.geneCt," genes that are part of an aggregate.")
if(isTRUE(! use.multigene.aggregates)){
use.bins <- use.bins & (! is.multiGene.aggregate)
if(isTRUE(verbose)) message(paste0("---> Removed multigene-aggregate features. Found: ",sum(use.bins), " features to be included so far."))
}
if(isTRUE(verbose)) message(paste0("---> Final feature count: ",sum(use.bins), " features to be included in the analysis."))
dcounts <- dcounts[use.bins,, drop=FALSE]
bin.type <- bin.type[use.bins]
if(isTRUE(verbose)) message("---> Extracted feature counts.");
colnames(dcounts) <- as.character(samplenames)
splitted <- strsplit(rownames(dcounts), ":")
exons <- sapply(splitted, "[[", 2)
genesrle <- sapply( splitted, "[[", 1)
if(isTRUE(verbose)) message("---> counts complete.");
if(! is.null(flat.gff.file)){
if(isTRUE(verbose)) message("-----> reading annotation...");
anno.data <- readAnnotationData(flat.gff.file)
if(isTRUE(verbose)) message("-----> formatting annotation...");
#featureName featureType chrom start end strand gene_id part_number transcripts
exoninfo <- data.frame(chr = anno.data$chrom, start = anno.data$start, end = anno.data$end, strand = anno.data$strand)
if(isTRUE(verbose)) message("-----> initial generation...");
rownames(exoninfo) <- anno.data$featureName
transcripts <- anno.data$transcripts
transcripts <- gsub("\\+", ";", transcripts)
names(transcripts) <- rownames(exoninfo)
matching <- match(rownames(dcounts), rownames(exoninfo))
if(any(is.na(matching))){
message("FATAL ERROR! Annotation file appears to be missing information! More information below:");
message(" Count files have: ",original.dcount.nrow," rows.");
message(" Flat GFF file has: ",nrow(exoninfo)," rows.");
if(original.dcount.nrow != nrow(exoninfo)) message(" WARNING: number of count file rows does not match flattned GFF file! One of the files may have been truncated, or may have been created using different strandedness rules.")
nomatch <- which(is.na(matching));
message(" Missing information on ",length(nomatch),"/",nrow(dcounts)," features.");
nomatchSubset <- nomatch[1:min(c(10,length(nomatch)))]
message(" A few of the missing feature ID's: [\"",paste0(rownames(dcounts)[nomatchSubset],collapse="\",\""),"\"]");
stop("FATAL ERROR! Annotation file appears to be missing information! Are you sure you are using the correct flattened annotation file, created by the QoRTs QC command?")
}
if(isTRUE(verbose)) message("-----> creating jscs...");
jscs <- newJunctionSeqCountSet(countData=dcounts, geneCountData = geneCountTable, design=design, geneIDs=genesrle, countbinIDs=exons, featureIntervals=exoninfo[matching,], transcripts=transcripts[matching])
jscs@annotationFile <- flat.gff.file
jscs@flatGffData <- anno.data
jscs@flatGffGeneData <- readGeneInfo(flat.gff.file)
jscs <- mapGeneNames(jscs, gene.names)
} else {
if(isTRUE(verbose)) message("-> FINISHED readJunctionSeqCounts (",date(),")");
message("Warning: flat gff annotation not set (via parameter flat.gff.file)! While technically optional, running without the annotation data may make interpretation of the data difficult. Much of the plotting functionality will not work!")
warning("Warning: flat gff annotation not set (via parameter flat.gff.file)! While technically optional, running without the annotation data may make interpretation of the data difficult. Much of the plotting functionality will not work!")
jscs <- newJunctionSeqCountSet(countData=dcounts, geneCountData = geneCountTable, design=design, geneIDs=genesrle, countbinIDs=exons);
}
attr(jscs,"AltMethods") <- c(attr(jscs,"AltMethods"), method.countVectors = method.countVectors)
attr(jscs,"CallStack") <- list(deparse(match.call()))
jscs@analysisType <- analysis.type
featureChar <- substr(fData(jscs)$countbinID,1,1)
fData(jscs)[["featureType"]] <- ifelse(featureChar == "E","exonic_part",ifelse(featureChar == "J", "splice_site", "novel_splice_site"))
varMetadata( featureData(jscs) )[ "featureType", "labelDescription" ] <- "The type of feature (exonic_part,, splice_site, or novel_splice_site)."
pData(jscs)$countfiles <- countfiles
if(isTRUE(verbose)) message("-----> generating count vectors... (",date(),")")
jscs@countVectors <- getAllJunctionSeqCountVectors(jscs, nCores = nCores, method.countVectors); #use.alternate.method = use.alternate.method)
if(isTRUE(verbose)) message("-----> count vectors generated (",date(),")");
if(! noDESeqMatrix){
if(isTRUE(verbose)) message("-----> generating DESeqDataSet... (",date(),")")
jscs <- makeDESeqDataSetFromJSCS(jscs, test.formula1 = test.formula1)
if(isTRUE(verbose)) message("-----> DESeqDataSet generated (",date(),")")
fData(jscs)[["allZero"]] <- (rowSums(counts(jscs)) == 0) |
(rowSums(counts(jscs@DESeqDataSet)[, colData(jscs@DESeqDataSet)$countbin == "others"]) ==0)
mcols(jscs@DESeqDataSet)$allZero <- fData(jscs)[["allZero"]]
}
return(jscs)
if(isTRUE(verbose)) message("-> FINISHED readJunctionSeqCounts (",date(),")");
}
##############################################################################################################################################################################################################################
#########
##############################################################################################################################################################################################################################
mapGeneNames <- function(jscs, gene.names = NULL, gene.name.separator = "+", gene.multimap.separator = ","){
if(is.null(gene.names)){
jscs@flatGffGeneData$gene_name <- jscs@flatGffGeneData$geneID;
return(jscs)
}
if(class(gene.names) == "character"){
gene.names <- read.table(gene.names, sep='\t',header=TRUE,stringsAsFactors=FALSE);
}
if(class(gene.names) != "data.frame"){
stop("Error: gene.names must be a filename or a data frame!")
}
if(names(gene.names)[1] != "geneID"){
message(" (Assuming \"",names(gene.names)[1],"\" column is geneID)");
}
if(names(gene.names)[2] != "gene_name"){
message(" (Assuming \"",names(gene.names)[2],"\" column is gene_name)");
}
jscs@flatGffGeneData$gene_name <- mapGeneNamesToList(jscs@flatGffGeneData$geneID,
gene.names = gene.names,
gene.name.separator = gene.name.separator,
gene.multimap.separator = gene.multimap.separator)
#For adding future functionality. Currently not used for anything.
if(ncol(gene.names) > 2){
for(i in 3:ncol(gene.names)){
jscs@flatGffGeneData[[names(gene.names)[i]]] <- mapGeneNamesToList(jscs@flatGffGeneData$geneID,
gene.names = gene.names,
gene.name.separator = gene.name.separator,
gene.multimap.separator = gene.multimap.separator,
oldID.column = 1, newID.column = i);
}
}
return(jscs)
}
mapGeneNamesToList <- function(geneIDs, gene.names = NULL, gene.name.separator = "+", gene.multimap.separator = ",", oldID.column = 1, newID.column = 2){
if(is.null(gene.names)){
out = geneIDs
names(out) = geneIDs
return(out)
}
if(class(gene.names) != "data.frame"){
stop("Error: gene.names must be a filename or a data frame!")
}
oldIDs <- as.character(geneIDs)
oldIDs.map <- as.character(gene.names[,oldID.column])
newIDs <- as.character(gene.names[,newID.column])
oldID.list <- strsplit(oldIDs, "+", fixed=TRUE)
out <- sapply(oldID.list, function(gs){
gs <- as.character(gs)
paste0(sapply(gs, function(g){
idx <- which(as.character(oldIDs.map) == g)
if(length(idx) == 0){
g
} else if(length(idx) == 1){
newIDs[idx]
} else {
paste0(newIDs[idx], collapse= gene.multimap.separator)
}
}), collapse= gene.name.separator)
})
return(out)
}
##############################################################################################################################################################################################################################
#########
##############################################################################################################################################################################################################################
readAnnotationData <- function(flat.gff.file){
aggregates<-read.delim(flat.gff.file, stringsAsFactors=FALSE, header=FALSE)
colnames(aggregates)<-c("chr", "source", "class", "start", "end", "ex", "strand", "ex2", "attr")
aggregates<-aggregates[which(aggregates$class != "aggregate_gene"),]
aggregates$attr <- gsub("\"|=|;", "", aggregates$attr)
gene_id <- sub(".*gene_id\\s(\\S+).*", "\\1", aggregates$attr)
transcripts <- gsub(".*tx_set\\s(\\S+).*", "\\1", aggregates$attr)
part_number <- gsub(".*num\\s(\\S+).*", "\\1", aggregates$attr)
featureCode <- ifelse(aggregates$class == "exonic_part","E", ifelse(aggregates$class == "splice_site","J", ifelse(aggregates$class == "novel_splice_site","N","?")))
featureName <- paste0(gene_id,":",featureCode,part_number)
out <- data.frame(featureName = featureName,
featureType = aggregates$class,
chrom = aggregates$chr,
start = aggregates$start,
end = aggregates$end,
strand = aggregates$strand,
gene_id = gene_id,
part_number = part_number,
transcripts = transcripts
)
out$start <- as.integer(out$start - 1)
return(out)
}
##############################################################################################################################################################################################################################
#########
##############################################################################################################################################################################################################################
readGeneInfo <- function(flat.gff.file){
aggregates<-read.delim(flat.gff.file, stringsAsFactors=FALSE, header=FALSE)
colnames(aggregates)<-c("chr", "source", "class", "start", "end", "ex", "strand", "ex2", "attr")
aggregates <- aggregates[which(aggregates$class == "aggregate_gene"),]
attrSimple <- gsub("\"|=|;", "", aggregates$attr)
attrCells <- strsplit(attrSimple,"\\s")
#Backwards compatibility with older versions of QoRTs:
if( ! grepl(".*tx_set\\s\\S+.*",attrSimple[[1]])){
gene_id <- sub(".*gene_id\\s(\\S+).*", "\\1", attrSimple)
part_number <- gsub(".*num\\s(\\S+).*", "\\1", attrSimple)
out <- data.frame(geneID = gene_id,num = part_number, stringsAsFactors=FALSE)
} else {
gene_id <- sub(".*gene_id\\s(\\S+).*", "\\1", attrSimple)
transcripts <- gsub(".*tx_set\\s(\\S+).*", "\\1", attrSimple)
part_number <- gsub(".*num\\s(\\S+).*", "\\1", attrSimple)
tx_strands <- gsub(".*tx_strands\\s(\\S+).*", "\\1", attrSimple)
aggregateGeneStrand <- gsub(".*aggregateGeneStrand\\s(\\S+).*", "\\1", attrSimple)
out <- data.frame(geneID = gene_id,aggregateGeneStrand = aggregateGeneStrand, tx_set = transcripts, num = part_number, tx_strands = tx_strands, stringsAsFactors=FALSE)
}
return(out)
}
##############################################################################################################################################################################################################################
#########
##############################################################################################################################################################################################################################
getColCt <- function(geneID, merged.data,
plot.exon.results, plot.junction.results, plot.novel.junction.results,
plot.untestable.results){
if(is.null(plot.exon.results)){
plot.exon.results <- any( merged.data$featureType == "exonic_part" )
}
if(is.null(plot.junction.results)){
plot.junction.results <- any( merged.data$featureType == "splice_site" | merged.data$featureType == "novel_splice_site" )
}
if(is.null(plot.novel.junction.results)){
if(plot.junction.results){
plot.novel.junction.results <- any( merged.data$featureType == "novel_splice_site" )
} else {
plot.novel.junction.results <- FALSE
}
}
rt <- merged.data$geneID == geneID
if(isTRUE(! plot.exon.results)){
rt <- rt & merged.data$featureType != "exonic_part"
}
if(isTRUE(! plot.junction.results)){
rt <- rt & merged.data$featureType != "splice_site"
}
if(isTRUE(! plot.novel.junction.results)){
rt <- rt & merged.data$featureType != "novel_splice_site"
}
if(isTRUE(! plot.untestable.results)){
rt <- rt & merged.data$testable
}
return(sum(rt))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.