````r cat("") cat("") cat("") cat("") cat("") cat("")
```r library(googleVis) op <- options(gvis.plot.tag='chart') options(stringsAsFactors=FALSE) library(RNASeqUtility) require(rtracklayer) library("gplots") library("RColorBrewer") library(ggplot2) library(DESeq2) library("genefilter") # library(ReportingTools) library(rCharts)
#This following analysis is based on: http://www.bioconductor.org/help/workflows/rnaseqGene/ load("../Annotation.rda") setwd(rootDir) args <- commandArgs(TRUE) if( length(args) == 0 ){ load( file.path(getwd(),"Configuration.rda") ) } else{ load(file.path(args[1],"Configuration.rda")) } # load(file.path( rootDir,"Annotation.rda"))# groupsForPCAplot = "condition" firstConditionLvl = as.character(samplesInfo$condition[1]) #It will be convenient to make sure that untrt is the first level in the dex factor, so that the default log2 fold changes are calculated as treated over untreated numberOfGenesToReport=50
########################################################### # Filtering by low expressed reads and subsetting by Annotation! ########################################################### #Filter ENSEMBL ANNOTATION BY BIOTYPE ensReadsCountDF_clustered_filt = ensReadsCountDF_clustered[ which( !ensReadsCountDF_clustered$gene_biotype %in% c("miRNA","snoRNA") ), ] countsEns = apply(ensReadsCountDF_clustered_filt[,(which(colnames(ensReadsCountDF_clustered_filt) == "UID")+1) : (which(colnames(ensReadsCountDF_clustered_filt) == "mapStart")-1)],2,as.numeric) rownames(countsEns) = ensReadsCountDF_clustered_filt$UID keep = rowSums(countsEns == 0) <= (dim(countsEns)[2]/2)+1 countsEns = countsEns[keep,] countsContigs = apply(contigAnnotTable_fin[,grep("^[0-9]+$",colnames(contigAnnotTable_fin))],2,as.numeric) rownames(countsContigs) = contigAnnotTable_fin$contigID keep = rowSums(countsContigs == 0) <= (dim(countsContigs)[2]/2)+1 countsContigs = countsContigs[keep,] if( dim(otherReadsCountDF)[1] > 0 ){ countsOther = apply(otherReadsCountDF[,(which(colnames(otherReadsCountDF) == "UID")+1) : (which(colnames(otherReadsCountDF) == "mapStart")-1)],2,as.numeric) rownames(countsOther) = otherReadsCountDF$UID keep = rowSums(countsOther == 0) <= (dim(countsOther)[2]/2)+1 countsOther = countsOther[keep,] counts = rbind( countsEns, countsContigs) counts = rbind( counts, countsOther) } else{ counts = rbind( countsEns, countsContigs) } head(counts)
dds <- DESeqDataSetFromMatrix(countData = counts, colData = samplesInfo, design = ~ condition)
colData(dds)
The transformation is done by regularized-logarithm transformation (rlog) in order to derive approximate homoskedastic data
rld <- rlog(dds) head(assay(rld))
par( mfrow = c( 1, 2 ) ) dds <- estimateSizeFactors(dds) plot( log2( 1 + counts(dds, normalized=TRUE)[ , 1:2] ), col=rgb(0,0,0,.2), pch=16, cex=0.3 ) plot( assay(rld)[ , 1:2], col=rgb(0,0,0,.2), pch=16, cex=0.3 ) par( mfrow = c(1,1) )
sampleDists <- dist( t( assay(rld) ) ) sampleDistMatrix <- as.matrix( sampleDists ) rownames(sampleDistMatrix) <- dds$sampleName colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) hc <- hclust(sampleDists) heatmap.2( sampleDistMatrix, Rowv=as.dendrogram(hc), symm=TRUE, trace="none", col=colors, margins=c(2,10), labCol=FALSE )
This plot shows the similarity of the samples based on their expression values (rlog transformed)
plotPCA(rld, intgroup = groupsForPCAplot) #second possibility to plot... # mds <- data.frame(cmdscale(sampleDistMatrix)) # mds <- cbind(mds, colData(rld)) # qplot(X1,X2,color=condition,data=mds)
The data points (i.e., here, the samples) are projected onto the 2D plane such that they spread out in the two directions which explain most of the differences in the data. The x-axis is the direction (or principal component) which separates the data points the most. The amount of the total variance which is contained in the direction is printed in the axis label.
dds$condition <- relevel(dds$condition, firstConditionLvl) dds <- DESeq(dds) res <- results(dds) resOrdered <- res[order(res$padj),] #DIFFERENT CONTRASTS MAY BE DEFINED AND ADDED TO THIS TEMPLATE HERE: #results(dds, contrast=c("cell", "N061011", "N61311"))
mcols(res, use.names=TRUE)
Of course, this estimate has an uncertainty associated with it, which is available in the column lfcSE, the standard error estimate for the log2 fold change estimate.
summary(res)
sum(res$padj < 0.1, na.rm=TRUE)
resSig <- subset(res, padj < 0.1) if( dim(resSig)[1] == 0 ){ resSig = resOrdered[1:10,] } head(resSig) topGene = which( rownames(res) %in% rownames(head(resSig)) )
tmp = resSig[ order( resSig$log2FoldChange ), ] tmp$UID = rownames(tmp) tmp = merge(as.data.frame(tmp), minimalAnnotationDF, by="UID", sort=FALSE) head(tmp)
tmp = resSig[ order( -resSig$log2FoldChange ), ] tmp$UID = rownames(tmp) tmp = merge(as.data.frame(tmp), minimalAnnotationDF, by="UID", sort=FALSE) head(tmp)
plotMA(res) with(res[topGene, ], { points(baseMean, log2FoldChange, col="dodgerblue", cex=1.5, lwd=2) text(baseMean, log2FoldChange, rownames(res[topGene, ]), pos=2, col="dodgerblue", cex=0.5) })
### Creating a report of the top differentially expressed genes # des2Report <- HTMLReport(shortName = "tmpReport", # title = "Differentially Expressed genes", # reportDirectory = "./reports") # #Top 50 genes are reported # publish(dds, des2Report, pvalueCutoff=1,n = numberOfGenesToReport, # factor = colData(dds)$condition, # reportDir="./reports") #Otherwise the annotation.db is not needed -> only rownames are displayed # # finish(des2Report)
#The report that contains the read count distribution for each gene can be accessed here # cat( paste0("<a href=\"reports/tmpReport.html\"> ", "Top ",numberOfGenesToReport," differentially expressed genes </a>") )
Whether a gene is called significant depends not only on its LFC but also on its within-group variability, which DESeq2 quantifies as the dispersion. For strongly expressed genes, the dispersion can be understood as a squared coefficient of variation: a dispersion value of 0.01 means that the gene's expression tends to differ by typically sqrt(0.01) = 10% between samples of the same treatment group. For weak genes, the Poisson noise is an additional source of noise.
plotDispEsts(dds)
The black points are the dispersion estimates for each gene as obtained by considering the information from each gene separately. Unless one has many samples, these values fluctuate strongly around their true values. Therefore, we fit the red trend line, which shows the dispersions' dependence on the mean, and then shrink each gene's estimate towards the red line to obtain the final estimates (blue points) that are then used in the hypothesis test. The blue circles above the main "cloud" of points are genes which have high gene-wise dispersion estimates which are labelled as dispersion outliers. These estimates are therefore not shrunk toward the fitted trend line.
hist(res$pvalue[res$baseMean > 1], breaks=20, col="grey50", border="white", main = "Pvalue Distribution", xlab="pvalue")
Normally one should see a peak on the left side (genes with very small counts are excluded)
topVarGenes <- head(order(-rowVars(assay(rld))),100)
colors <- colorRampPalette( rev(brewer.pal(9, "PuOr")) )(255) sidecols <- c("grey","dodgerblue")[ rld$condition ] mat <- assay(rld)[ topVarGenes, ] mat <- mat - rowMeans(mat) colnames(mat) <- rld$sampleName heatmap.2(mat, trace="none", col=colors, ColSideColors=sidecols, labRow=FALSE, mar=c(10,2), scale="row")
colors <- colorRampPalette( rev(brewer.pal(9, "PuOr")) )(255) sidecols <- c("grey","dodgerblue")[ rld$condition ] signCand = na.omit(which(resOrdered$pvalue < 0.05)) mat <- assay(rld)[ rownames(resOrdered[signCand,]), ] mat <- mat - rowMeans(mat) colnames(mat) <- rld$sampleName tmp = resOrdered[signCand,] tmp$UID = rownames(tmp) tmp = merge( as.data.frame(tmp), minimalAnnotationDF, by="UID", sort=FALSE) rownames(mat) = tmp$Name # heatmap.2(mat, trace="none", col=colors, ColSideColors=sidecols, # labRow=TRUE, mar=c(10,2), scale="row") heatmap.2(mat, trace="none", col=colors, ColSideColors=sidecols, mar=c(10,15), scale="row")
colors <- colorRampPalette( rev(brewer.pal(9, "PuOr")) )(255) sidecols <- c("grey","dodgerblue")[ rld$condition ] signCand = na.omit(which(resOrdered$padj < 0.1)) if( length(signCand) > 1 ){ mat <- assay(rld)[ rownames(resOrdered[signCand,]), ] mat <- mat - rowMeans(mat) colnames(mat) <- rld$sampleName tmp = resOrdered[signCand,] tmp$UID = rownames(tmp) tmp = merge( as.data.frame(tmp), minimalAnnotationDF, by="UID", sort=FALSE) rownames(mat) = tmp$Name # heatmap.2(mat, trace="none", col=colors, ColSideColors=sidecols, # labRow=TRUE, mar=c(10,2), scale="row") heatmap.2(mat, trace="none", col=colors, ColSideColors=sidecols, mar=c(10,15), scale="row") } else{ message("No candidates below adj.pval 0.1") }
col.concent <- c("gold" , "red") barplot(colSums(counts(dds))*1e-6, names = colnames(counts), ylab="Library size (millions)", col = col.concent[factor(samplesInfo$condition)], las = 2, cex.names = 0.5) legend("topleft", legend = levels(factor(samplesInfo$condition)), col = col.concent, pch = 15, cex = 0.7)
````r
dat = resOrdered dat$UID = rownames(dat) dat = merge(as.data.frame(dat), minimalAnnotationDF, by="UID", sort=FALSE) write.table(dat, "DETable.csv", sep="\t", col.names=TRUE, row.names=FALSE)
```r dat$biotypeSummary = ifelse( !dat$biotype %in% c("antisense", "lincRNA", "miRNA", "snoRNA", "snRNA", "tRNA", "unknown", "rRNA"), "other", dat$biotype) # dat$biotypeSummary = ifelse( !dat$biotype %in% c("antisense", "lincRNA", "tRNA", "rRNA"), "other", dat$biotype) g1 = ggplot( dat, aes(x=log2(baseMean), y=log2FoldChange, color=biotypeSummary), environment=environment() ) + geom_point() + labs(title="MAPlot (Averaged over all arrays)", x="Intensity (A)", y="log2 Fold Change (M)") + geom_text( data = dat[1:20,],aes(label=Name), show_guide = FALSE, size=2, color="black") g1
dat$logPrev = -log10(dat$pvalue) g = ggplot( dat, aes(x=log2FoldChange, y=logPrev, colour = biotypeSummary), environment=environment() ) + geom_point() + labs(title="Vulcano Plot (Averaged over all arrays)", x="log2 Fold Change (M)", y="-log10(p-value)") + geom_hline(aes(yintercept=-log10(0.05)), color="red") + geom_text( data = dat[1:20,],aes(label=Name), show_guide = FALSE, size=2.5, color="black") g # scale_color_manual(values=c( "red","lightgrey")) + # geom_text( data = labelDF.tt, aes( x= logFC, y=logPrev, label=Name), show_guide = FALSE, size=2, color="black") #-0.07 geom_text( data = labelDF.tt, aes( x= logFC, y=logPrev, label=Name), show_guide = FALSE, size=2, color="black") #-0.07
ggplot(dat, aes(x=biotype) ) + geom_histogram(stat="bin") + coord_flip()
dat.hist = data.frame("Biotype"=names(table(dat$biotype)), "value"=as.numeric(table(dat$biotype))) dat.box1 = dat[,c("biotypeSummary","baseMean")] dat.box1$baseMean = log2(dat.box1$baseMean) dat.box1$group = "ReadCount" colnames(dat.box1) = c("biotype","value","group") dat.box2 = dat[,c("biotypeSummary","log2FoldChange")] dat.box2$group = "Log2FoldChange" colnames(dat.box2) = c("biotype","value","group") dat.box = rbind(dat.box1, dat.box2) g = ggplot( dat.box, aes(factor(biotype), y=value, fill=factor(biotype)), environment=environment() ) + geom_boxplot() + facet_grid(group~., scales="free") g # scale_color_manual(values=c( "red","lightgrey")) + # geom_text( data = labelDF.tt, aes( x= logFC, y=logPrev, label=Name), show_guide = FALSE, size=2, color="black") #-0.07 geom_text( data = labelDF.tt, aes( x= logFC, y=logPrev, label=Name), show_guide = FALSE, size=2, color="black") #-0.07
````r save.image("workspace.rda")
````r ##Interactive Plots (Javascript needs to be enabled) ### Interactive MA Plot of the top 500 genes # dataToPlot = as.data.frame(resOrdered[1:500,]) # dataToPlot$ID = rownames(dataToPlot) # # n2 = nPlot( log2FoldChange ~ baseMean, data=dataToPlot, type='scatterChart', group='chipType' ) # n2$chart(tooltipContent= "#! function(key, x, y, e){ # return '<b>Name: </b> ' + e.point.ID + '<br/>' + # '<b>PValue: </b> ' + e.point.pvalue + '<br/>' + # '<b>adjPValue: </b>' + e.point.padj # } !#") # n2$print("chart1")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.