#' Comparison quantification with the expected spike-in concentrations.
#'
#' Loads and processes the RNAseq data before comparing it with the spike-in concentrations.
#' Will produce a number of benchmarking plots and files in the current working directory.
#'
#' @param ANALYSIS_NAME A string indicating the name of the analysis/pipeline. Will be used in filenames, plot titles, etc.
#' @param rnaseq The path to the gene-level RNAseq expression matrix. If not given, will look for relevant files in the working directory. The expression matrix should have gene symbols in the first column/row.names, and sample names (e.g. 'AJ80' for the 12-samples dataset, "A_1" for SEQC, etc) as column headers.
#' @param qt A string indicating the unit of the expression matrix (either "FPKM", "TPM" or "COUNTS").
#' @param fc.undetected The foldchange to assign to undetected spike-ins (should be either 1 or NA, default 1)
#'
#' @return Nothing, but produces many files in the working directory...
#'
#' @examples
#' # first we create a directory and put the example quantification file in it:
#' data(exampledata)
#' dir.create("example")
#' write.table(exampleGeneLevel,"w12.genes.quant",sep="\t",quote=FALSE)
#' # then we run the function, giving a name to the analysis,
#' # specifying the file and type of quantification:
#' analyzeSpikein("tophat.featureCount", "w12.genes.quant", qt="COUNTS")
#'
#' @export
analyzeSpikein <- function(ANALYSIS_NAME, rnaseq=NULL, qt, fc.undetected=1){
sl <- list(analysis=ANALYSIS_NAME)
qt <- match.arg(toupper(qt),c("FPKM","TPM","COUNTS"))
if(is.null(rnaseq)){
sp <- .getSpikeinQt()
}else{
sp <- read.delim(rnaseq,header=TRUE,row.names=1)
if("DQ854994" %in% row.names(sp)) sp <- .getSpikeinsFromTranscripts(sp)
}
dinf <- .checkDataset(sp, ANALYSIS_NAME)
if(dinf[["dataset"]] == "sim") stop("There are no spike-ins to analyze in the simulated data.")
dinf$ptitle <- paste(dinf$dataset,":",dinf$analysis," (spike-ins)",sep="")
dinf$fname <- .plfilename(dinf$dataset,dinf$analysis,"spikein")
data("ercc")
if(dinf$level=="gene"){
row.names(ercc) <- ercc$ERCC.ID
}else{
row.names(ercc) <- ercc$TX.ID
}
if(qt=="COUNTS"){
# values are counts - we first convert to FPKM
sp <- counts2fpkmWrapper(sp,dinf$level,uniquelyMappableLengths=FALSE)
qt <- "FPKM"
}
if(qt != "TPM"){
for(i in 1:ncol(sp)) sp[,i] <- fpkm2tpm(sp[,i])
qt <- "TPM"
}
# ensure that all spike-ins are there, set to 0 the missing ones
sp <- merge(sp,ercc,by="row.names",all.y=TRUE)[,1:(ncol(sp)+1)]
row.names(sp) <- sp$Row.names
sp$Row.names <- NULL
sp <- as.matrix(sp)
sp[which(is.na(sp) | is.nan(sp) | is.infinite(sp) | sp<0)] <- 0
# put everything in the same order
genes <- row.names(sp)[which(row.names(sp) %in% row.names(ercc))]
sp <- as.data.frame(sp[genes,dinf$samples])
ercc <- ercc[genes,]
# in case it is unknown, determine which sample has which mix
df <- data.frame(row.names=names(sp))
df$mix1.cor <- apply(sp,2,FUN=function(x){ cor(as.numeric(x),as.numeric(ercc$concentration.in.Mix.1..attomoles.ul.))})
df$mix2.cor <- apply(sp,2,FUN=function(x){ cor(as.numeric(x),as.numeric(ercc$concentration.in.Mix.2..attomoles.ul.))})
df$best.matching.mix <- apply(df,1,FUN=function(x){ order(as.numeric(x),decreasing=TRUE)[1]})
# renormalize samples according to their expected spike-in amounts,
# thereby avoiding discrepancies due to the different amounts of spike-ins put in the cells.
# We first make a copy of the spike-in mesurements, which we'll multiply by FC between mixes to make an artificial situation were the samples have the same mix:
sp2 <- sp
for(i in which(df$best.matching.mix==2)) sp2[,i] <- as.numeric(sp2[,i])*as.numeric(ercc$expected.fold.change.ratio)
# we then use this to get linear normalizers
nf <- getLinearNormalizers(sp2)
# which we can then apply to the original measurements
spn <- t(t(sp)*nf)
# to check that everything worked as planned, we can compare the sum of euclidean distance between the normalized and un-normalized matrices:
# sum(dist(t(sp))) > sum(dist(t(spn)))
# alternatively, we check that mix assignment worked:
if(dinf$dataset=="w12"){
mix1 <- c("AJ86","AJ90","AJ91")
}else{
if(dinf$dataset=="w6"){
mix1 <- c("CFG.sh2","c3391S.sh2","w306o.shCTR","w306o.sh2")
}else{
mix1 <- paste("A",1:5,sep="_")
}
}
df$expected.mix <- ifelse(row.names(df) %in% mix1, 1, 2)
if(any(df$best.matching.mix != df$expected.mix)){
warning(paste(ANALYSIS_NAME,"There appears to be a problem with the spike-in quantification or normalization, because samples do not correlate with the correct spike-in mixes. This can happen with very poor quantifications, can be due to a mis-labeling of the columns, or wrong specification of the samples having mix1. Analysis will proceed, but you should double-check the samples' mix assignment:",sep=" - "),immediate.=TRUE)
print(df)
}
# next, we must create the x values (expected spike-in concentrations), according to the mix injected in each sample
xdf <- ercc[,df$expected.mix+4]
op <- options(warn=-1)
on.exit(options(op))
#calculate by-sample correlation
sc <- data.frame(analysis=ANALYSIS_NAME, sample=colnames(spn))
sc$cor <- sapply(1:ncol(spn),FUN=function(i){
x <- as.numeric(xdf[,i])
y <- as.numeric(spn[,i])
w <- which(x>0 & y>0)
cor(x[w],y[w])
})
write.table(sc,paste(dinf$fname,".perSampleCorrelation.tab",sep=""),col.names=TRUE,row.names=TRUE,sep="\t",quote=FALSE)
x <- as.numeric(t(xdf))
# to include also undetected genes in the plot, put them at half of the minimum detected FPKM
med <- median(x)
finc <- 0.001
y <- as.numeric(t(spn))
w <- which(y>0) # w indicates the genes that were detected by RNA-seq
# calculate correlations
sl[["cor.detected"]] <- cor(x[w],y[w],use="pairwise")
sl[["log.cor.detected"]] <- cor(log10(x[w]),log10(y[w]),use="pairwise")
sl[["cor.all"]] <- cor(x,y,use="pairwise")
sl[["cor.detected.abovemedian"]] <- try(cor(x[which(y>med)], y[which(y>med)],use="pairwise"),silent=TRUE)
sl[["cor.detected.belowmedian"]] <- try(cor(x[which(y>0 & y<=med)], y[which(y>0 & y<=med)],use="pairwise"),silent=TRUE)
sl[["prop.undetected"]] <- 1-sum(spn>0)/ncol(spn)/92
ww <- which(!as.logical(lapply(sl,FUN=is.numeric)))
for(wo in ww) sl[[wo]] <- NA
simStats(x,y,dinf$ptitle,paste(dinf$fname,"expr",sep="."),norm=TRUE)
# We can now plot the correlation between (normalized) FPKM and expected values
samples_colors <- sapply(dinf$sample_colors, alpha=150, FUN=maketrans)
png(paste(dinf$fname,"cor.png",sep="."),width=700,height=600)
plot(x,y+finc,xlab="Expected spike-in concentration (attomoles/ul)", ylab=qt, col=samples_colors,pch=20,cex=1.5,log="xy",main=dinf$ptitle)
legend("topleft",bty="n",legend=paste(names(sl[2:6]),lapply(sl[2:6],digits=3,FUN=round),sep=": "))
legend("bottomright",bty="n",legend=c(paste(round(100*sl[["prop.undetected"]],2),"% undetected",sep="")))
dev.off()
# we average the samples according to their mix, and calculate the observed foldchange between mixes
d <- data.frame(expected=ercc$expected.fold.change.ratio)
d$avg.mix1=apply(spn[,df$expected.mix==1],1,na.rm=TRUE,FUN=mean)
d$avg.mix2=apply(spn[,df$expected.mix==2],1,na.rm=TRUE,FUN=mean)
d$observed <- foldchange(d$avg.mix2,d$avg.mix1,fc.undetected)
w <- isCleanData(d$observed,d$expected)
sl[["cor.foldchange"]] <- cor(d$observed[w], d$expected[w])
w <- isCleanData(log2(d$observed),log2(d$expected))
sl[["cor.log2.foldchange"]] <- cor(log2(d$observed[w]), log2(d$expected[w]), use="pairwise")
simStats(d$observed,d$expected,dinf$ptitle,paste(dinf$fname,"foldchanges",sep="."),norm=FALSE)
# we calculate statistical significance of the difference between mixes
pvals <- apply(spn, 1, FUN=function(x){ t.test(as.numeric(x[df$expected.mix==1]),as.numeric(x[df$expected.mix==2]))$p.value})
sl[["significant"]] <- sum(pvals < 0.05 & ercc$expected.fold.change.ratio != 1, na.rm=TRUE)
sl[["false.positives"]] <- sum(pvals < 0.05 & ercc$expected.fold.change.ratio == 1, na.rm=TRUE)
# for plotting, we will color the transcripts according to their real concentration
genes_colors <- plColorMap(log10(rowMeans(ercc[,5:6],na.rm=TRUE)))
# plot observed vs expected foldchanges
png(paste(dinf$fname,"corFC.png",sep="."),width=500,height=500)
plot(log2(d$expected), log2(d$observed), pch=20, cex=1.2, col=genes_colors, xlab="Exepected log2 foldchange (mix1/mix2)", ylab="Observed log2 foldchange",main=dinf$ptitle)
abline(a=0,b=1,lty="dashed",col="grey")
legend("bottomright",bty="n",legend=c( paste("foldchanges cor:",round(sl[["cor.foldchange"]],3)),
paste("log2(foldchanges) cor:",round(sl[["cor.log2.foldchange"]],3)),
"",
paste(round(100*sl[["significant"]]/sum(ercc$expected.fold.change.ratio!=1)),"% of real differences have p<0.05",sep="")
))
dev.off()
ld <- log2(d$expected)-log2(d$observed)
png(paste(dinf$fname,"FC_deviations.png",sep="."),width=500,height=500)
layout(matrix(1:2,nrow=2))
boxplot(ld,horizontal=TRUE,xlab="Deviation in log2(Foldchange)",main=dinf$ptitle)
hist(ld[which(ld >-3 & ld < 3)],xlim=c(-3,3),breaks=seq(from=-3,to=3,by=0.2), main="", xlab="Deviation in log2(foldchange), excluding outliers")
dev.off()
write.table(as.data.frame(sl),paste(dinf$fname,"stats",sep="."),col.names=TRUE,row.names=FALSE,sep="\t",quote=FALSE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.