require(R6)
require(DESeq2)
#' Class a simple interface to deseq data
#'
#' @docType class
#' @importFrom R6 R6Class
#' @import grDevices
#' @import graphics
#' @import stats
#' @import DESeq2
#' @import dplyr
#' @import tidyverse
#' @import methods
#' @import RColorBrewer
#' @import RCurl
#' @import ggplot2
#' @import pheatmap
#' @export
#' @keywords Deseq
#' @return Object of \code{\link{R6Class}} to store Deseq data.
#' @format \code{\link{R6Class}} object.
#' @examples
#' dnmt <- deseqAbs$new("DNMT1KO",fc.file)
#' dnmt$filename
#' head(dnmt$rawfile)
#' head(dnmt$pos)
#' head(dnmt$length)
#' head(dnmt$rawCounts)
#' ## sampleNames
#' dnmt$sampleNames <- colnames(dnmt$rawCounts)
#' ## define coldata conditions
#' dnmt$colData <- data.frame(condition=c(rep("CTR",3),rep("KO",3)))
#' dnmt$makeDESeq()
#' dnmt$deseq
#' dnmt$makeDiffex()
#' dnmt$test
#' dnmt$makeVST()
#' dnmt$makeFPKM()
#' @field name the name of the experiment
#' @field filename the name of the raw featurecount output
#' @field rawfile the raw featurecount file
#' @field rawCounts the count matrix (removing position and length info column 1-6) with ID rownames
#' @field baseMean list of data.frames: Mean and SD: mean and SD of normalized count data
#' @field FPKMMean list of data.frames: Mean and SD: mean and SD of FPKM
#' @field geneID geneIDs
#' @field colData a data.frame with condition information
#' @field sampleNames a vector given by user to provide suitable sampleNames to replace filenames from featureCounts
#' @field VST the output from varianceStabilizingTransformation(dds)
#' @field deseq the output from DESeq(dds)
#' @field FPKM matrix with FPKM values for all genes
#' @field test output from diffex analysis results(dds)
#' @field pos position data for each gene
#' @field length of each gene
#' @export
deseqAbs <- R6Class("deseqAbs",
public = list(
name = "character",
filename = NULL,
rawfile = NULL,
rawCounts = NULL,
normCounts = NULL,
design = NULL,
baseMean = NULL,
FPKMMean = NULL,
geneID = NULL,
colData = NULL,
sampleNames = NULL,
VST = NULL,
deseq = NULL,
FPKM = NULL,
test = NULL,
pos = NULL,
length = NULL,
initialize = function(name = NULL,filename = NULL,colData = NULL,design=NULL) {
### Check if all required parameters are set!
if(is.null(filename)) {
cat(">ERROR: No filename (with full path) given! \n")
cat("--- Please provide name (and full path) of raw featureCounts output file upon creating this object\n\n")
}
if(is.null(colData)) {
cat(">ERROR: No colData given! \n")
} else {
if(is.null(colData$condition)) {
cat(">ERROR: colData has no 'condition' column. \n")
}
if(is.null(colData$samples)) {
cat(">ERROR: colData has no 'samples' column. \n")
}
}
## If all parameters are set, initialize
if(!is.null(filename) & !is.null(colData) & !is.null(colData$condition) & !is.null(colData$samples)) {
# message
self$greet()
self$name <- name
self$filename <- filename
#make sure to factorize colData
cols <- names(colData)
colData[,cols] <- lapply(colData[,cols],factor)
self$colData <- colData
# sample names, must be unique
colnames <- paste(colData$condition,colData$samples,sep = ": ")
if(length(which(duplicated(colnames)))>0){
self$sampleNames <- make.names(colnames,unique = T)
} else {
self$sampleNames <- colnames
}
self$test <- list()
self$read_file(filename)
self$geneID <- as.character(self$rawfile[,1])
self$getPos()
self$getRawCounts()
colnames(self$rawCounts) <- self$sampleNames
if(!is.null(design)){
self$design <- design
} else { self$design <- formula(~condition)}
}
else {
cat(">ERROR: Could not initialize object..\n")
}
},
read_file = function(filename) {
if(!is.na(filename)) {
cat(">>Reading featureCount file\n")
self$rawfile <- read.csv(filename,header=T,sep = "\t",skip=1)
cat("- ..featureCount file reading done. Access rawdata with $rawfile\n")
} else {
cat(">ERROR: You must add name of raw featurecount file.\n")
cat("- > dnmt$filename <- \"<path>/<featureCountOutput>\"\n")
}
},
sampleQC = function() {
par(mar=c(4,4,4,4))
if(is.null(self$VST) | is.null(self$deseq)) {
cat(">>First: Run fullAuto() for normalization and diffex testing.. \n")
self$fullAuto()
cat("- ..complete! fullAuto() compelted, now proceding with sampleQC().\n")
}
if(!is.null(self$VST) & !is.null(self$deseq)) {
cat(">>Plotting reads assigned to database. Also showing non-mapped reads.\n")
self$readsAssigned(nonAssigned=TRUE)
cat(">>Plotting reads assigned to database. Percentage of all reads mapping to genome.\n")
self$readsAssigned(nonAssigned=F)
cat(">>Plotting PCA. Top 1000 most variable genes used.\n")
self$pca(ntop = 1000)
cat(">>Plotting most variable genes.\n")
self$mostVariableHeat()
cat(">>Plotting sample to sample distances.\n")
self$sampleToSample()
cat(">>Plotting significantly changed genes.. These changes are defined by the default DESeq results() test..\n")
self$significantHeat()
}else {
cat(">ERROR! could not run QC.. $VST or $deseq is NULL!\n")
}
par(mar=c(4,4,4,4))
},
significantHeat = function(test=self$test$Default,ntop=50) {
mostSignificantHeat(data=assay(self$VST),test=test,ntop=ntop)
},
mostVariableHeat = function(ntop=50) {
mostVariableHeat(self$VST,ntop=ntop)
},
sampleToSample = function() {
sampleToSample(self$VST,samples = paste(self$colData$samples,self$colData$condition,sep=" : "))
},
# provide specific summary file name if not identical name as count file with *.summary suffix
# if nonAssigned = T, then also plot all reads that are NOT assigned to the given database
readsAssigned = function(summaryFile=NULL,nonAssigned=F) {
if (is.null(summaryFile)) {
sum <- read.delim(paste(self$filename, ".summary", sep = ""))
} else {
sum <- read.delim(summaryFile)
}
if (!is.null(self$colData$samples)) {
colnames(sum) <- c("a", as.character(self$colData$samples))
}
tot.map <- colSums(sum[, -1])
assigned <- sum[1, -1]
notassigned <- tot.map - assigned
if (nonAssigned) {
plot <- as.matrix(rbind(assigned = assigned, not.assigned = notassigned))
x <- barplot(plot, col = c("blue", "grey80"), ylim = c(0,max(tot.map) * 1.2), ylab = "total read number",las = 2)
legend("topleft", legend = c("not assigned", "assigned"),
fill = c("grey80", "blue"),bty='n')
title(main = "Reads assigned to annotation out of all mapped reads")
} else {
plot <- as.matrix(assigned * 100/(notassigned + assigned))
x <- barplot(plot, col = c("blue", "grey80"), ylim = c(0,max(plot) * 2), ylab = "percentage reads assigned / reads to genome", las = 2)
title(main = "Reads assigned to annotation out of all mapped reads")
options(scipen=-10)
text(x = x-.4, y = plot * 1.1, labels = format(assigned,scientific=T),srt=90,pos = 4)
options(scipen=0)
}
return(plot)
},
getPos = function() {
cat(">>Fetching Positional info from file\n")
self$length <- self$rawfile[,6]
names(self$length) <- self$rawfile[,1]
self$pos <- self$rawfile[,2:5]
rownames(self$pos) <- self$rawfile[,1]
cat("- ..complete! Get position of genes with $pos\n")
},
getAverage = function() {
self$getAverageReads()
self$getAverageFPKM()
},
getAverageReads = function() {
if(is.null(self$deseq)) {
cat(">ERROR: You must run deseq to get normalized read counts..\n")
self$makeDESeq()
}
if(length(levels(self$deseq$condition))>sum(duplicated(self$deseq$condition))) {
cat(">Warning: Some of your levels do not have replicates.. \n")
} else {
## normalized counts
cat(">>Computing mean normalized counts of each condition\n")
baseMeanPerLvl <- sapply( levels(self$deseq$condition), function(lvl) rowMeans( counts(self$deseq,normalized=TRUE)[,self$deseq$condition == lvl] ) )
baseSDPerLvl <- sapply( levels(self$deseq$condition), function(lvl) apply( counts(self$deseq,normalized=TRUE)[,self$deseq$condition == lvl],1,sd ) )
baseSEPerLvl <- sapply( levels(self$deseq$condition),
function(lvl) apply( counts(self$deseq,normalized=TRUE)[,self$deseq$condition == lvl],1,
function(dat) sd(dat)/sqrt(length(dat))))
colnames(baseSDPerLvl) <- colnames(baseSDPerLvl)
self$baseMean <- list(Mean=baseMeanPerLvl,SD=baseSDPerLvl,SE=baseSEPerLvl)
cat("- ..complete! Mean normalized counts computed for each condition. access mean with $baseMean$Mean, and st.dev with $baseMean$SD \n")
}
},
getAverageFPKM = function() {
## FPKM
if(is.null(self$FPKM)) {self$makeFPKM() }
if(length(levels(self$deseq$condition))>sum(duplicated(self$deseq$condition))) {
cat(">Warning: Some of your levels do not have replicates..\n")
} else {
if(!is.null(self$FPKM)) {
cat(">>Computing mean FPKM of each condition\n")
baseMeanPerLvl <- sapply( levels(factor(self$colData$condition)), function(lvl) rowMeans( self$FPKM[,self$colData$condition == lvl] ) )
baseSDPerLvl <- sapply( levels(factor(self$colData$condition)), function(lvl) apply( self$FPKM[,self$colData$condition == lvl],1,sd ) )
baseSDPerLvl <- sapply( levels(self$colData$condition), function(lvl) apply( self$FPKM[,self$colData$condition == lvl],1,sd) )
baseSEPerLvl <- sapply( levels(self$colData$condition),
function(lvl) apply( self$FPKM[,self$colData$condition == lvl],1,
function(dat) sd(dat)/sqrt(length(dat))))
colnames(baseSDPerLvl) <- colnames(baseSDPerLvl)
self$FPKMMean <- list(Mean=baseMeanPerLvl,SD=baseSDPerLvl,SE=baseSEPerLvl)
cat("- ..complete! Mean normalized expression computed for each condition. access mean with $baseMean$Mean, and st.dev with $baseMean$SD \n")
}
}
},
getRawCounts = function() {
cat(">>Getting countData matrix\n")
self$rawCounts <- self$rawfile[,-c(1:6)]
rownames(self$rawCounts) <- self$geneID
if(!is.null(self$sampleNames)) {
colnames(self$rawCounts) <- self$sampleNames
} else if(!is.null(self$colData)) {
colnames(self$rawCounts) <- make.names(names = self$colData$condition,unique = T)
}
cat("- ..complete! Access raw countData with $rawCounts\n")
},
greet = function() {
cat(">>>Creating deseqAbs object\n")
},
makeDESeq = function() {
if(is.null(self$deseq)) {
if(!is.data.frame(self$colData)) {
cat(">ERROR: Add colData data.frame! E.g: >deseqAbs$colData <- data.frame(condition = x)\n")
} else {
cat(">>Running DESeq")
dds <- DESeqDataSetFromMatrix(countData = self$rawCounts,
colData = self$colData,
design = self$design)
self$deseq <- DESeq(dds)
self$normCounts <- counts(self$deseq,normalized=T)
cat("- ..complete! Access object with $deseq \n")
}
}else{
cat("> DESeq already run on this object, not needed to run again.. \n")
}
},
makeVST = function(blind=NULL) {
if(is.null(self$deseq)){
cat(">> The $deseq object is not initialized on this instance.. This has to be done before varianceStablizingTransformation..\n")
self$makeDESeq()
}
if(is.null(blind)) {
cat(">ERROR: You need to define if you want to do blind dispersion estimates!\n")
cat("== set blind=FALSE if you expect a large fraction of genes to have large differences in counts explainable by experimental design!\n")
cat("== set blind=TRUE otherwise.\n")
cat("== If you are not sure, try both, and compare clustring results\n")
} else {
bl <- "blind"
if(!blind) { bl <- "not-blind" }
cat(paste(">>Performing ",bl," variance stabilizing transformation \n",sep = ""))
self$VST <- varianceStabilizingTransformation(self$deseq,blind = blind)
# add colnames of VST assay dataframe
# if SampleNames given (which they should be upon initialization, set to colnames)
if(!is.null(self$sampleNames)) {
# add condition to sample names
colnames <- paste(self$colData$condition,self$sampleNames,sep=" : ")
# if there will be duplicates, make unique
if(length(which(duplicated(colnames)))>0) {
names(assay(self$VST)) <- make.names(colnames,unique = T)
} else {
names(assay(self$VST)) <- colnames
}
# if No sampleNames exist, use colData conditions only.
}else if(!is.null(self$colData$condition)) {
names(assay(self$VST)) <- make.names(names = as.character(self$colData$condition),unique = T)
}
cat(paste(" - ..complete! ",bl," variance stabilizing transformation \n",sep = ""))
}
},
# name: name of test
# c1: condition 1, has to be of the form of output of resultsNames(dds)
# c2: condition 2
# n1: condition 1, as index (integer) of the condition in vector from resultsNames(x$deseq)
# n1: condition 2, as index (integer) of the condition in vector from resultsNames(x$deseq)
makeDiffex = function(name=NULL,c1=NULL,c2=NULL,n1=NULL,n2=NULL) {
if(is.null(self$deseq)){
cat(">> DESeq is not run on this object. This has to be done before running diffex..")
self$makeDESeq()
}
# if no specific conditions input, do default conditions
if(is.null(n1) & is.null(c1)) {
cat(">>Testing for differential expression..\n")
self$test[["Default"]] <- results(self$deseq)
cat("- ..complete! Diffex completed with default values. Access with $test.\n")
} else if(!is.null(n1) & !is.null(n2)) {
c1 <- gsub(pattern = "condition",replacement = "",resultsNames(self$deseq)[n1])
c2 <- gsub(pattern = "condition",replacement = "",resultsNames(self$deseq)[n2])
cat(">>Testing for differential expression..\n")
cat(paste("-- Testing ",c1," vs. ",c2,"..\n",sep = ""))
if(!is.null(name)) {
self$test[[name]] <- results(self$deseq,contrast = c("condition",c1,c2))
} else {
self$test[[paste("Test:",c1,"_vs._",c2,"",sep = "")]] <- results(self$deseq,contrast = c("condition",c1,c2))
}
cat(paste("- ..complete! Test completed for ",c1," vs. ",c2,"..\n",sep = ""))
} else if(!is.null(c1) & !is.null(c2)) {
c1 <- gsub(pattern = "condition",replacement = "",c1)
c2 <- gsub(pattern = "condition",replacement = "",c2)
cat(">>Testing for differential expression..\n")
cat(paste("-- Testing ",c1," vs. ",c2,"..\n",sep = ""))
if(!is.null(name)) {
self$test[[name]] <- results(self$deseq,contrast = c("condition",c1,c2))
} else {
self$test[[paste("Test:",c1,"_vs._",c2,"",sep = "")]] <- results(self$deseq,contrast = c("condition",c1,c2))
}
cat(paste("- ..Complete! Test completed for ",c1," vs. ",c2,"..\n",sep = ""))
}
},
makeFPKM = function() {
cat(">>Computing FPKM..\n")
self$FPKM <- 10^9*t(t(self$rawCounts/as.numeric(self$length))/as.numeric(colSums(self$rawCounts)))
rownames(self$FPKM) <- self$geneID
if(!is.null(self$sampleNames)) {
names(self$FPKM) <- make.names(names = self$sampleNames,unique = T)
}else if(!is.null(self$colData)) {
names(self$FPKM) <- make.names(names = self$colData$condition,unique = T)
}
cat("- ..complete! FPKM computed. Access with $FPKM.\n")
},
pca = function(ntop=1000,title=NULL,label=NULL,col=self$colData$condition,shape=NULL) {
if(is.null(title)) {title = "PCA"}
if(is.null(label)) {label = rep("",length(self$colData$condition))}
if(is.null(shape)) {
PCAplotter(dat = self$VST,
color = col,
title = title,
ntop = ntop,
label = label)
} else {
PCAplotter(dat = self$VST,
color = col,
shape = shape,
title = title,
ntop = ntop,
label = label)
}
},
fullAuto = function() {
if(!is.null(self$colData) & !is.null(self$rawCounts)) {
if(!is.null(self$sampleNames)) {
colnames(self$rawCounts) <- make.names(self$sampleNames,unique = T)
}else if(!is.null(self$colData)) {
colnames(self$rawCounts) <- make.names(names = self$colData$condition,unique = T)
}
par(mar=c(4,4,4,4))
self$makeDESeq()
self$makeDiffex()
self$makeVST(blind = F)
self$makeFPKM()
self$getAverage()
par(mar=c(4,4,4,4))
}else {
cat(">ERROR: Cannot do this yet.. \n")
}
if(is.null(self$colData)){
cat("- You need to make colData data.frame!\n")
}
if(is.null(self$rawCounts)) {
cat("- You need to add rawCounts matrix!\n")
}
},
## get most variable genes
getVariable = function(ntop=50,sdcut = 0) {
sd <- apply(assay(self$VST),1,sd)
# if user specifices cutoff of sd for genes to return
if(sdcut > 0) {
return(rownames(self$VST[sd>sdcut,]))
} else { ## if no sdcut defined, return top 50 (or user specified ntop genes)
tmp <- assay(self$VST)
sd.ordered <- tmp[order(-apply(tmp,1,sd)),]
return(rownames(sd.ordered[1:ntop,]))
}
},
meanBars = function(genes,cond=NULL,FPKM=FALSE,points=F) {
if(FPKM) {
if(is.null(self$FPKMMean)) {
self$getAverageFPKM()
}
} else {
if(is.null(self$baseMean)) {
self$getAverageReads()
}
}
meanBar(deseqAbs = self,genes = genes,cond = cond,FPKM = FPKM,points=points)
}
))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.