knitr::opts_chunk$set(dpi=300, fig.pos="H", fig.width=8, fig.height=6, dev=c('png', 'pdf'), echo=FALSE, warning=FALSE, message=FALSE); # Loading inputs from yaml file # fn.yaml<-"~/R/source/DEGandMore/examples/IdentifyOutlier/identify_outlier.yml"; # yml <- yaml::yaml.load_file(fn.yaml); # writeLines(yaml::as.yaml(yml), '~/R/source/DEGandMore/examples/IdentifyOutlier/identify_outlier.yml') expr<-readRDS(yml$input$data); anno<-readRDS(yml$input$annotation); grps<-readRDS(yml$input$group); gset<-readRDS(yml$input$geneset); prms<-yml$parameter; smp2grp<-rep(names(grps), sapply(grps, length)); names(smp2grp)<-unlist(grps, use.names=FALSE); gset.src<-sort(unique(gset[[1]]$Source)); library(awsomics); library(DEGandMore); library(gplots); library(knitr); library(rmarkdown);
The goal of this procedure is to identify potential outlier samples that have lower inter-sample correlation than expected. It is based on the assumption that the outliers have poorer correlation to other samples in the same group. This procedure includes the following steps:
This procedure requires the following inputs:
lns<-lapply(names(yml$project), function(nm) { c(paste('##', nm), '\n', yml$project[[nm]], '\n'); }); lns<-paste(do.call('c', lns), collapse='\n');
r lns
The data set includes r nrow(expr)
genes and r ncol(expr)
samples in r length(grps)
groups.
IdentifyOutlierByCorr<-function(d, RMX=0.99, NMN=4, NSD=3) { # d Data matrix, rows are genes or other variables and columns are samples # RMX Maximal mean correlation coefficient for a sample to be an outlier # NMN Minimal number of samples # NSD Number of standard deviation if (NMN < 3) NMN<-3; if (ncol(d) < NMN) NA else { corr<-lapply(1:ncol(d), function(i) cor(d[, i], d[, -i], use='pair')); names(corr)<-colnames(d); ms<-sapply(corr, mean); nsd<-sapply(1:length(ms), function(i) (ms[i]-mean(ms[-i]))/sd(ms[-i])); names(nsd)<-colnames(d); out<-list(outlier=nsd[nsd<=-1*abs(NSD) & ms<RMX], nsd=nsd, mean=ms, corr=corr, parameters=c(RMX=RMX, NMN=NMN, NSD=NSD)); } } # Run IdentifyOutlierByCorr() function recursively until no more outliers were identified or there are no enough samples IdentifyOutlierByCorr.2<-function(d, RMX=0.99, NMN=4, NSD=3) { # d Data matrix, rows are genes or other variables and columns are samples # RMX Maximal mean correlation coefficient for a sample to be an outlier # NMN Minimal number of samples # NSD Number of standard deviation if (ncol(d) < NMN) NA else { out <- list(IdentifyOutlierByCorr(d, RMX, NMN, NSD)); d <- d[, !(colnames(d) %in% names(out[[1]][[1]]))]; while (length(out[[length(out)]][[1]])>0 & ncol(d)>=NMN) { # outlier identified, is there more out[[length(out)+1]]<-IdentifyOutlierByCorr(d, RMX, NMN, NSD); d <- d[, !(colnames(d) %in% names(out[[length(out)]][[1]]))]; } names(out)<-paste('Round', 1:length(out), sep=''); nsd<-lapply(out, function(x) x$nsd); mns<-lapply(out, function(x) x$mean); summ<-cbind(Round=rep(1:length(nsd), sapply(nsd, length)), NSD=as.vector(unlist(nsd)), Mean=as.vector(unlist(mns))); summ<-data.frame(Sample=as.vector(unlist(lapply(nsd, names))), round(summ, 6), stringsAsFactors = FALSE); summ$Outlier<-summ$NSD< -1*abs(NSD) & summ$Mean<RMX list(summary=summ, all=out); } }
The following steps were applied to identify outlier samples:
r prms$max.corr
and r prms$n.sd
standard deviations below mean(Ms[-i])out<-lapply(grps, function(g) IdentifyOutlierByCorr.2(expr[, g], RMX=prms$max.corr, NMN=prms$min.n, NSD=prms$n.sd)); smm<-lapply(out, function(x) x$summary); grp<-rep(names(smm), sapply(smm, nrow)); smm<-data.frame(Group=grp, do.call('rbind', smm), stringsAsFactors = FALSE); fn.stat<-CreateDatatable(smm, paste(yml$output, 'stat_table.html', sep='/'), rownames = FALSE); outliers<-smm[smm$Outlier, , drop=FALSE]; grps.filtered<-lapply(grps, function(g) g[!(g %in% outliers$Sample)]);
As a result, r nrow(outliers)
outliers were identified
r paste(unique(outliers$Sample), collapse=', ')
par(mar=c(5,5,2,2)); plot(smm$Mean, smm$NSD, pch=19, col=c('#0000FF88', '#FF000088')[smm$Outlier+1], cex=2, cex.lab=1.5, xlim=c(min(smm$Mean), 1), xlab="Mean correlation coefficient to other samples", ylab="Number of standard deviations"); abline(v=prms$max.corr, h=-1*abs(prms$n.sd), lty=2, lwd=2, col='darkgrey'); points(smm$Mean, smm$NSD, cex=.2);
Figure 1 There were r length(unique(outliers$Sample))
outliers identified from r length(unique(outliers$Group))
groups (highlighted in red) based on inter-sample correlation.
Each outlier was compared to non-outlier samples in the same group to calculate differential expression.
if (nrow(outliers) > 0) { d0<-lapply(outliers$Sample, function(s) expr[, grps.filtered[[smp2grp[s]]], drop=FALSE]); d1<-lapply(outliers$Sample, function(s) expr[, s]); names(d0)<-names(d1)<-outliers$Sample; path.fig<-paste(yml$output, 'scatterplot', sep='/'); if (!file.exists(path.fig)) dir.create(path.fig, recursive = TRUE); fn<-sapply(1:nrow(outliers), function(i) { id<-outliers$Sample[i]; fn<-paste(path.fig, '/', id, '.pdf', sep=''); pdf(fn, w=8, h=4); par(mar=c(5,5,2,2), mfrow=c(1, 2)); r<-sapply(1:ncol(d0[[i]]), function(j) cor(d0[[i]][, j], rowMeans(d0[[i]][, -j, drop=FALSE]))); ind<-which(r==max(r))[1]; x<-cbind(rowMeans(d0[[i]]), d1[[i]]); y<-cbind(rowMeans(d0[[i]][, -ind]), d0[[i]][, ind]); plot(x, pch=19, cex=0.5, col='#88888888', xlim=c(min(x), max(x)), ylim=c(min(x), max(x)), xlab='Others', ylab=id, cex.lab=1.5); abline(0, 1, col=4); plot(y, pch=19, cex=0.5, col='#88888888', xlim=c(min(x), max(x)), ylim=c(min(x), max(x)), xlab='Others', ylab=colnames(d0[[i]])[ind], cex.lab=1.5); abline(0, 1, col=4); dev.off(); fn<-ConvertPdf2Png(fn); fn; }); fn<-sub(yml$output, '', fn); fn<-sub('^/', '', fn); lns<-paste(' - [', outliers$Sample, '](', fn, ')', sep=''); lns<-paste(lns, collapse='\n'); } else { lns<-' - No outliers were identified' }
Click on sample ID to view scatterplots that compare the global differential expression of an outlier vs. non-outliers (left) and a non-outliers vs. other non-outliers (right).
r lns
The differential expression of each outlier vs. its corresponding non-outliers was compared to each other. High correlation of differential expression between 2 samples suggests that the 2 outliers could be caused by the same reason.
if (nrow(outliers) == 0) lns<-'No outliers found' else if (nrow(outliers) < 3) lns<-'No enough outliers for clustering analysis' else { dff<-sapply(1:nrow(outliers), function(i) d1[[i]]-rowMeans(d0[[i]])); colnames(dff)<-outliers$Sample; hc<-hclust(as.dist(1-cor(dff))); plot(hc, main='Differential expression: outlier vs. non-outlier', xlab='Outlier', sub=''); abline(h=1-prms$cut.corr, lty=2, col='#0000FF88'); ct<-cutree(hc, h=1-prms$cut.corr); g<-split(names(ct), ct); g<-g[sapply(g, length)>1]; if (length(g) == 0) lns<-'No outliers have similar differential expression' else { lns<-as.vector(sapply(g, function(g) paste(g, collapse='; '))); lns<-paste(' -', lns); lns<-paste(lns, collapse='\n'); } }
Figure 2 Clustering of outliers based on their differential expression to corresponding non-outliers.
The following subset(s) of outliers had similar differential expression:
r lns
Functional categorization of genes differentially expressed between outliers and corresponding non-outliers could be very suggestive of the cause leading to the outlier. The following steps were used to select DEGs:
r prms$deg$nsd
standard deviations of the non-outliersr prms$max
with the largest difference between the outlier and non-outliers r prms$deg$de
, or the top r prms$deg$min
genes having the largest differential expression, which ever gives morepath.deg<-paste(yml$output, 'deg', sep='/'); if (!file.exists(path.deg)) dir.create(path.deg, recursive = TRUE); if (nrow(outliers) == 0) lns<-'No outliers found' else { # select DEGs degs<-lapply(1:nrow(outliers), function(i) { dff<-d1[[i]]-rowMeans(d0[[i]]); nsd<-dff/apply(d0[[i]], 1, sd); deg<-dff[abs(nsd)>=abs(prms$deg$nsd)]; deg<-list(hi=deg[deg>0], lo=deg[deg<0]); deg<-lapply(deg, function(d) d[rev(order(abs(d)))]); deg<-lapply(deg, function(d) { if (length(d) > prms$deg$min) { d<-d[1:min(prms$deg$max, length(d))]; if (abs(prms$deg$de > abs(d[prms$deg$min]))) d[1:prms$deg$min] else d[abs(d) > abs(prms$deg$de)]; } else d; }); }); names(degs)<-outliers$Sample; # Prepare HTML tables tbls<-lapply(1:length(degs), function(i) lapply(degs[[i]], function(d) { x0<-d0[[i]][names(d), , drop=FALSE]; x1<-d1[[i]][names(d)]; dff<-x1-rowMeans(x0); sd<-apply(x0, 1, sd); stat<-round(cbind(x1, rowMeans(x0), dff, dff/sd), 4); colnames(stat)<-c(outliers$Sample[i], 'Non_Outliers', 'DE', 'NSD'); cbind(stat, anno[rownames(stat), ]); })); names(tbls)<-names(degs)<-outliers$Sample; # Write tables to HTML fn<-lapply(names(tbls), function(nm) { f<-paste(path.deg, paste(nm, '_', c('Higher', 'Lower'), '.html', sep=''), sep='/') CreateDatatable(tbls[[nm]][[1]], f[1], caption = nm); CreateDatatable(tbls[[nm]][[2]], f[2], caption = nm); f; }); # Write summary table fn<-t(sapply(fn, function(f) sub(paste(yml$output, '/', sep=''), '', f))); n<-t(sapply(degs, function(x) sapply(x, length))); rownames(n)<-rownames(fn)<-outliers$Sample; colnames(n)<-colnames(fn)<-c('Higher in outlier', 'Low in outlier'); tbl<-t(sapply(rownames(n), function(nm) paste('[', n[nm, ], '](', fn[nm, ], ')', sep=''))); dimnames(tbl)<-dimnames(n); lns<-kable(tbl, align=c('r', 'r')); }
The numbers of DEGs selected from each outlier (click on numbers to view full lists):
r lns
DEGs having higher or lower expression in each outliers were mapped to a collection of predefined genesets downloaded from:
r paste(paste(' -', gset.src), collapse='\n')
Enrichment of DEGs in each gene set was tested by Hypergeometric test. Gene set collection of a few model animals can be downloaded from: https://github.com/zhezhangsh/DEGandMore/tree/master/examples/geneset_collections
path.gse<-paste(yml$output, 'gse', sep='/'); if (!file.exists(path.gse)) dir.create(path.gse, recursive = TRUE); if (nrow(outliers) == 0) ln.hi<-ln.lo<-'No outliers found' else { gses<-lapply(names(degs), function(nm) { cat(nm, '\n'); gse<-lapply(degs[[nm]], function(gs) TestGSE(names(gs), rownames(anno), gset[[2]])[[1]]); hi<-WrapGSE(gse[[1]], gset[[1]], paste(path.gse, paste(nm, 'Higher', sep='_'), sep='/')); lo<-WrapGSE(gse[[2]], gset[[1]], paste(path.gse, paste(nm, 'Lower' , sep='_'), sep='/')); list(hi, lo); }); fn.hi<-sapply(gses, function(g) g[[1]]$file[gset.src]); fn.lo<-sapply(gses, function(g) g[[2]]$file[gset.src]); fn.hi<-sub(paste(yml$output, '/', sep=''), '', fn.hi); fn.lo<-sub(paste(yml$output, '/', sep=''), '', fn.lo); n.hi<-sapply(gses, function(g) sapply(g[[1]]$formatted[gset.src], nrow)); n.lo<-sapply(gses, function(g) sapply(g[[2]]$formatted[gset.src], nrow)); tbl.hi<-t(matrix(paste('[', n.hi, '](', fn.hi, ')', sep=''), nc=nrow(outliers))); tbl.lo<-t(matrix(paste('[', n.lo, '](', fn.lo, ')', sep=''), nc=nrow(outliers))); dimnames(tbl.hi)<-dimnames(tbl.lo)<-list(outliers$Sample, gset.src); ln.hi<-kable(tbl.hi, align = rep('r', ncol(tbl.hi))); ln.lo<-kable(tbl.lo, align = rep('r', ncol(tbl.lo))); }
Results from gene set enrichment analysis of genes having higher expression in outliers:
r ln.hi
Results from gene set enrichment analysis of genes having higher expression in outliers:
r ln.lo
fn<-paste(yml$output, 'outliers.rds', sep='/'); if (nrow(outliers) == 0) saveRDS(list(stat=smm), fn) else saveRDS(list(stat=smm, deg=degs, gse=gses), fn);
Results were saved to r fn
.
END OF DOCUMENT
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.