````r cat("") cat("") cat("") cat("") cat("") cat("")

```r
library(googleVis)
op <- options(gvis.plot.tag='chart')
options(stringsAsFactors=FALSE)
library(RNASeqUtility)
require(rtracklayer)
library("gplots")
library("RColorBrewer")
library(ggplot2)
library(DESeq2)
library("genefilter")
# library(ReportingTools)
library(rCharts)
#This following analysis is based on: http://www.bioconductor.org/help/workflows/rnaseqGene/
load("../Annotation.rda")
setwd(rootDir)
args <- commandArgs(TRUE)
if( length(args) == 0 ){
  load( file.path(getwd(),"Configuration.rda") )
} else{
  load(file.path(args[1],"Configuration.rda"))
}
# load(file.path( rootDir,"Annotation.rda"))#

groupsForPCAplot = "condition"
firstConditionLvl = as.character(samplesInfo$condition[1]) #It will be convenient to make sure that untrt is the first level in the dex factor, so that the default log2 fold changes are calculated as treated over untreated
numberOfGenesToReport=50

Differential Expression Analysis

Count Table creation

###########################################################
#       Filtering by low expressed reads and subsetting by Annotation!
###########################################################

#Filter ENSEMBL ANNOTATION BY BIOTYPE
ensReadsCountDF_clustered_filt = ensReadsCountDF_clustered[ which( !ensReadsCountDF_clustered$gene_biotype %in% c("miRNA","snoRNA") ), ]

countsEns = apply(ensReadsCountDF_clustered_filt[,(which(colnames(ensReadsCountDF_clustered_filt) == "UID")+1) : (which(colnames(ensReadsCountDF_clustered_filt) == "mapStart")-1)],2,as.numeric)
rownames(countsEns) = ensReadsCountDF_clustered_filt$UID
keep = rowSums(countsEns == 0) <= (dim(countsEns)[2]/2)+1
countsEns = countsEns[keep,]

countsContigs = apply(contigAnnotTable_fin[,grep("^[0-9]+$",colnames(contigAnnotTable_fin))],2,as.numeric)
rownames(countsContigs) = contigAnnotTable_fin$contigID
keep = rowSums(countsContigs == 0) <= (dim(countsContigs)[2]/2)+1
countsContigs = countsContigs[keep,]

if( dim(otherReadsCountDF)[1] > 0 ){
  countsOther = apply(otherReadsCountDF[,(which(colnames(otherReadsCountDF) == "UID")+1) : (which(colnames(otherReadsCountDF) == "mapStart")-1)],2,as.numeric)
  rownames(countsOther) = otherReadsCountDF$UID
  keep = rowSums(countsOther == 0) <= (dim(countsOther)[2]/2)+1
  countsOther = countsOther[keep,]

  counts = rbind( countsEns, countsContigs)
  counts = rbind( counts, countsOther)
} else{
  counts = rbind( countsEns, countsContigs)
}

head(counts)

DESeq2 - Creating the DESeqDataSet

dds <- DESeqDataSetFromMatrix(countData = counts,
                                 colData = samplesInfo,
                                 design = ~ condition)

Sample Info Table

colData(dds)

Transformation and inspection of count data

The transformation is done by regularized-logarithm transformation (rlog) in order to derive approximate homoskedastic data

rld <- rlog(dds)
head(assay(rld))
par( mfrow = c( 1, 2 ) )
dds <- estimateSizeFactors(dds)
plot( log2( 1 + counts(dds, normalized=TRUE)[ , 1:2] ),
     col=rgb(0,0,0,.2), pch=16, cex=0.3 )
plot( assay(rld)[ , 1:2],
     col=rgb(0,0,0,.2), pch=16, cex=0.3 )
par( mfrow = c(1,1) )

Sample to Sample distances by euclidean distance between samples using the rlog transformed data

sampleDists <- dist( t( assay(rld) ) )
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- dds$sampleName
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
hc <- hclust(sampleDists)
heatmap.2( sampleDistMatrix, Rowv=as.dendrogram(hc),
          symm=TRUE, trace="none", col=colors,
          margins=c(2,10), labCol=FALSE )

This plot shows the similarity of the samples based on their expression values (rlog transformed)

Sample to Sample distances with principal component analysis

plotPCA(rld, intgroup = groupsForPCAplot)
#second possibility to plot...
# mds <- data.frame(cmdscale(sampleDistMatrix))
# mds <- cbind(mds, colData(rld))
# qplot(X1,X2,color=condition,data=mds)

The data points (i.e., here, the samples) are projected onto the 2D plane such that they spread out in the two directions which explain most of the differences in the data. The x-axis is the direction (or principal component) which separates the data points the most. The amount of the total variance which is contained in the direction is printed in the axis label.

Differential Expression Analysis

Calculation by DESeq

dds$condition <- relevel(dds$condition, firstConditionLvl)
dds <- DESeq(dds)
res <- results(dds)
resOrdered <- res[order(res$padj),]
#DIFFERENT CONTRASTS MAY BE DEFINED AND ADDED TO THIS TEMPLATE HERE: 
#results(dds, contrast=c("cell", "N061011", "N61311"))

Explanation of the columns in the result table

mcols(res, use.names=TRUE)

Of course, this estimate has an uncertainty associated with it, which is available in the column lfcSE, the standard error estimate for the log2 fold change estimate.

Summary statistics of the result

summary(res)

Number of differentially expressed genes

sum(res$padj < 0.1, na.rm=TRUE)

Top differentially expressed genes (sorted by p-value)

resSig <- subset(res, padj < 0.1)
if( dim(resSig)[1] == 0 ){
  resSig = resOrdered[1:10,]
} 
head(resSig)
topGene = which( rownames(res) %in%  rownames(head(resSig)) )

Top down-regulated genes

tmp = resSig[ order( resSig$log2FoldChange ), ]
tmp$UID = rownames(tmp)
tmp = merge(as.data.frame(tmp), minimalAnnotationDF, by="UID", sort=FALSE)
head(tmp)

Top up-regulated genes

tmp = resSig[ order( -resSig$log2FoldChange ), ]
tmp$UID = rownames(tmp)
tmp = merge(as.data.frame(tmp), minimalAnnotationDF, by="UID", sort=FALSE)
head(tmp)

MA Plot highlighting the top 6 genes (sorted by adjusted p-value)

plotMA(res)
with(res[topGene, ], {
  points(baseMean, log2FoldChange, col="dodgerblue", cex=1.5, lwd=2)
  text(baseMean, log2FoldChange, rownames(res[topGene, ]), pos=2, col="dodgerblue", cex=0.5)
})
### Creating a report of the top differentially expressed genes
# des2Report <- HTMLReport(shortName = "tmpReport",
#                            title = "Differentially Expressed genes",
#                            reportDirectory = "./reports")
# #Top 50 genes are reported
# publish(dds, des2Report, pvalueCutoff=1,n = numberOfGenesToReport,
#            factor = colData(dds)$condition,
#           reportDir="./reports") #Otherwise the annotation.db is not needed -> only rownames are displayed
# 
# finish(des2Report)
#The report that contains the read count distribution for each gene can be accessed here

# cat(  paste0("<a href=\"reports/tmpReport.html\"> ", "Top ",numberOfGenesToReport," differentially expressed genes  </a>") )

Diagnostic Plots

Whether a gene is called significant depends not only on its LFC but also on its within-group variability, which DESeq2 quantifies as the dispersion. For strongly expressed genes, the dispersion can be understood as a squared coefficient of variation: a dispersion value of 0.01 means that the gene's expression tends to differ by typically sqrt(0.01) = 10% between samples of the same treatment group. For weak genes, the Poisson noise is an additional source of noise.

Dispersion estimation

plotDispEsts(dds)

The black points are the dispersion estimates for each gene as obtained by considering the information from each gene separately. Unless one has many samples, these values fluctuate strongly around their true values. Therefore, we fit the red trend line, which shows the dispersions' dependence on the mean, and then shrink each gene's estimate towards the red line to obtain the final estimates (blue points) that are then used in the hypothesis test. The blue circles above the main "cloud" of points are genes which have high gene-wise dispersion estimates which are labelled as dispersion outliers. These estimates are therefore not shrunk toward the fitted trend line.

Histogram of the p-values

hist(res$pvalue[res$baseMean > 1], breaks=20, col="grey50", border="white", main = "Pvalue Distribution", xlab="pvalue")

Normally one should see a peak on the left side (genes with very small counts are excluded)

Gene Clustering

This heatmap shows the clustering of the 100 genes with the highest variance across samples

topVarGenes <- head(order(-rowVars(assay(rld))),100)
colors <- colorRampPalette( rev(brewer.pal(9, "PuOr")) )(255)
sidecols <- c("grey","dodgerblue")[ rld$condition ]
mat <- assay(rld)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
colnames(mat) <- rld$sampleName
heatmap.2(mat, trace="none", col=colors, ColSideColors=sidecols,
          labRow=FALSE, mar=c(10,2), scale="row")

This heatmap shows the clustering of top differentially expressed p-value < 0.05 genes

colors <- colorRampPalette( rev(brewer.pal(9, "PuOr")) )(255)
sidecols <- c("grey","dodgerblue")[ rld$condition ]
signCand = na.omit(which(resOrdered$pvalue < 0.05))

mat <- assay(rld)[ rownames(resOrdered[signCand,]), ]
mat <- mat - rowMeans(mat)
colnames(mat) <- rld$sampleName
tmp = resOrdered[signCand,]
tmp$UID = rownames(tmp)
tmp = merge( as.data.frame(tmp), minimalAnnotationDF, by="UID", sort=FALSE)
rownames(mat) = tmp$Name
# heatmap.2(mat, trace="none", col=colors, ColSideColors=sidecols,
#           labRow=TRUE, mar=c(10,2), scale="row")
heatmap.2(mat, trace="none", col=colors, ColSideColors=sidecols,
          mar=c(10,15), scale="row")

This heatmap shows the clustering of top differentially expressed adjusted p-value < 0.05 genes

colors <- colorRampPalette( rev(brewer.pal(9, "PuOr")) )(255)
sidecols <- c("grey","dodgerblue")[ rld$condition ]
signCand = na.omit(which(resOrdered$padj < 0.1))

if( length(signCand) > 1 ){
  mat <- assay(rld)[ rownames(resOrdered[signCand,]), ]
  mat <- mat - rowMeans(mat)
  colnames(mat) <- rld$sampleName
  tmp = resOrdered[signCand,]
  tmp$UID = rownames(tmp)
  tmp = merge( as.data.frame(tmp), minimalAnnotationDF, by="UID", sort=FALSE)
  rownames(mat) = tmp$Name
  # heatmap.2(mat, trace="none", col=colors, ColSideColors=sidecols,
  #           labRow=TRUE, mar=c(10,2), scale="row")
  heatmap.2(mat, trace="none", col=colors, ColSideColors=sidecols,
            mar=c(10,15), scale="row")
} else{
  message("No candidates below adj.pval 0.1")
}

Various Plots

Library Size Disttribution

col.concent <- c("gold" , "red")
barplot(colSums(counts(dds))*1e-6, names = colnames(counts), ylab="Library size (millions)",
          col = col.concent[factor(samplesInfo$condition)], las = 2, cex.names = 0.5)
legend("topleft", legend = levels(factor(samplesInfo$condition)), col = col.concent, pch = 15, cex = 0.7)

MA Plot showing the biotype distribution

````r

Saving results

dat = resOrdered dat$UID = rownames(dat) dat = merge(as.data.frame(dat), minimalAnnotationDF, by="UID", sort=FALSE) write.table(dat, "DETable.csv", sep="\t", col.names=TRUE, row.names=FALSE)

```r
dat$biotypeSummary = ifelse( !dat$biotype %in% c("antisense", "lincRNA", "miRNA", "snoRNA", "snRNA", "tRNA", "unknown", "rRNA"), "other", dat$biotype)
# dat$biotypeSummary = ifelse( !dat$biotype %in% c("antisense", "lincRNA", "tRNA", "rRNA"), "other", dat$biotype)
g1 = ggplot( dat, aes(x=log2(baseMean), y=log2FoldChange, color=biotypeSummary), environment=environment() ) + geom_point() + 
  labs(title="MAPlot (Averaged over all arrays)", x="Intensity (A)", y="log2 Fold Change (M)") + 
  geom_text( data = dat[1:20,],aes(label=Name), show_guide = FALSE, size=2, color="black")
g1

Vulcano Plot of the resulting Data

dat$logPrev = -log10(dat$pvalue)
g = ggplot( dat, aes(x=log2FoldChange, y=logPrev, colour = biotypeSummary), environment=environment() ) + geom_point() +
  labs(title="Vulcano Plot (Averaged over all arrays)", x="log2 Fold Change (M)", y="-log10(p-value)") + 
  geom_hline(aes(yintercept=-log10(0.05)), color="red") + 
  geom_text( data = dat[1:20,],aes(label=Name), show_guide = FALSE, size=2.5, color="black")
g
# scale_color_manual(values=c( "red","lightgrey")) + 
#  geom_text( data = labelDF.tt, aes( x= logFC, y=logPrev, label=Name), show_guide = FALSE, size=2, color="black") #-0.07   geom_text( data = labelDF.tt, aes( x= logFC, y=logPrev, label=Name), show_guide = FALSE, size=2, color="black") #-0.07

Histogram of the ncRNA classes

ggplot(dat, aes(x=biotype) ) + geom_histogram(stat="bin") + coord_flip()
dat.hist = data.frame("Biotype"=names(table(dat$biotype)), "value"=as.numeric(table(dat$biotype)))

dat.box1 = dat[,c("biotypeSummary","baseMean")]
dat.box1$baseMean = log2(dat.box1$baseMean)
dat.box1$group = "ReadCount"
colnames(dat.box1) = c("biotype","value","group")
dat.box2 = dat[,c("biotypeSummary","log2FoldChange")]
dat.box2$group = "Log2FoldChange"
colnames(dat.box2) = c("biotype","value","group")
dat.box = rbind(dat.box1, dat.box2)

g = ggplot( dat.box, aes(factor(biotype), y=value, fill=factor(biotype)), environment=environment() ) + geom_boxplot() + facet_grid(group~., scales="free")
g
# scale_color_manual(values=c( "red","lightgrey")) + 
#  geom_text( data = labelDF.tt, aes( x= logFC, y=logPrev, label=Name), show_guide = FALSE, size=2, color="black") #-0.07   geom_text( data = labelDF.tt, aes( x= logFC, y=logPrev, label=Name), show_guide = FALSE, size=2, color="black") #-0.07

````r save.image("workspace.rda")

````r

##Interactive Plots (Javascript needs to be enabled)

### Interactive MA Plot of the top 500 genes
# dataToPlot = as.data.frame(resOrdered[1:500,])
# dataToPlot$ID = rownames(dataToPlot)
# 
# n2 = nPlot( log2FoldChange ~ baseMean, data=dataToPlot, type='scatterChart', group='chipType' )
# n2$chart(tooltipContent= "#! function(key, x, y, e){
#   return '<b>Name: </b> ' + e.point.ID + '<br/>' + 
#          '<b>PValue: </b> ' + e.point.pvalue + '<br/>' + 
#          '<b>adjPValue: </b>' + e.point.padj
# } !#")
# n2$print("chart1")


SimonSchafferer/RNASeqUtility documentation built on May 9, 2019, 1:34 p.m.