knitr::opts_chunk$set(dpi=300, fig.pos="H", fig.width=8, fig.height=6, dev=c('png', 'pdf'), echo=FALSE, warning=FALSE, message=FALSE); library(awsomics); library(gplots); library(knitr); library(rmarkdown); # fn.yaml<-"ClReport_Gas1_vs_control.yml"; # comment it out if running through knitr::knit function # fn.yaml<-"/nas/is1/zhangz/projects/barr/2015-11_Rat_Brain_Development/result/gene_clustering/Juvenile_SC_injury/ClReport.yml"; # Loading inputs from yaml file # if (!exists('fn.yaml')) stop('Input file not found\n'); # writeLines(yaml::as.yaml(yml), fn.yaml); # yml <- yaml::yaml.load_file(fn.yaml); expr<-readRDS(yml$input$data); anno<-readRDS(yml$input$annotation); smpl<-readRDS(yml$input$sample); gset<-readRDS(yml$input$geneset); if (!setequal(rownames(expr), rownames(anno))) stop("Data matrix and gene annotation have different row names.\n"); if (!setequal(colnames(expr), rownames(smpl))) stop("Data matrix and sample metadata have different names.\n"); expr<-expr[rownames(anno), rownames(smpl)]; prms<-yml$parameter;
lns<-lapply(names(yml$project), function(nm) { c(paste('##', nm), '\n', yml$project[[nm]], '\n'); }); lns<-paste(do.call('c', lns), collapse='\n');
r lns
CreateDatatable(smpl, paste(yml$output, 'samples.html', sep='/')); lns<-lapply(names(yml$methods), function(nm) { c(paste('##', nm), '\n', yml$methods[[nm]], '\n'); }); lns<-paste(do.call('c', lns), collapse='\n');
There are a total of r ncol(expr)
samples. Click here to view full table of samples
r lns
f<-as.factor(smpl[, prms$term[1]]); l<-unique(smpl[, prms$term[1]]); grp<-split(rownames(smpl), as.vector(f))[l]; m<-sapply(grp, function(s) rowMeans(expr[, s, drop=FALSE])); mn<-apply(expr, 1, min); mx<-apply(expr, 1, max); p<-apply(expr, 1, function(d) summary(aov(d ~ f))[[1]][1, 5]); q<-p.adjust(p, method='BH'); stat<-cbind(m, Min=mn, Max=mx, Range=mx-mn, pANOVA=p, FDR=q); CreateDatatable(cbind(anno, FormatNumeric(stat)), paste(yml$output, 'anova.html', sep='/'), caption="ANOVA results")
We applied ANOVA analysis on each gene to test its differential expression across samples of different groups.
r nrow(expr)
total genesr ncol(expr)
total samplesr length(l)
sample groups: r paste(l, collapse=', ')
Click here to view full ANOVA result table.
Select genes using the following steps:
r prms$selection$fdr
r prms$selection$min
genes were left; otherwise, select those with ANOVA p values less than r prms$selection$p
r prms$selection$min
genes were left; otherwise, select those with range (max-min) greater than r prms$selection$range
r prms$selection$max
genes left, select the top r prms$selection$max
with the biggest rangese<-expr[q<=prms$selection$fdr, , drop=FALSE]; if (nrow(e) > prms$selection$min) { # continue if there are more selected genes than minimum s<-stat[rownames(e), , drop=FALSE]; s<-s[order(s[, 'pANOVA']), , drop=FALSE]; e<-e[rownames(s), , drop=FALSE]; e<-e[1:max(prms$selection$min, max(which(s[, 'pANOVA']<=prms$selection$p))), , drop=FALSE]; if (nrow(e) > prms$selection$min) { # continue if there are more selected genes than minimum s<-stat[rownames(e), , drop=FALSE]; s<-s[rev(order(s[, 'Range'])), , drop=FALSE]; e<-e[rownames(s), , drop=FALSE]; e<-e[1:max(prms$selection$min, max(which(s[, 'Range']>=prms$selection$range))), , drop=FALSE]; } if (nrow(e) > prms$selection$max) e<-e[1:prms$selection$max, , drop=FALSE]; # trim if more selected genes than maximum }
A total of r nrow(e)
genes were selected. The genes were used for the next step to identify gene clusters
plot(hclust(as.dist(1-cor(expr))), main='Sample clustering using all genes', xlab='Sample', sub='');
Figure 1 Hierarchical clustering of samples using all genes.
plot(hclust(as.dist(1-cor(e))), main='Sample clustering using selected significant genes', xlab='Sample', sub='');
Figure 2 Hierarchical clustering of samples using just selected genes.
Genes selected by the last step were clustering using the following steps:
r prms$cluster$height
, which will classify genes into clusters. Then apply the following steps to refine the clustersr prms$cluster$corr
r 100*prms$cluster$size
% of the expected size (the expected size is 50 if there are 500 genes and 10 clusters)r length(grp)
(the number of sample groups) clusters left, reduce the height cutoff by 0.05 and repeat this step until there are enough clustersr prms$cluster$corr
. Repeat this step until no 2 clusters are that similarh<-prms$cluster$height; sz<-prms$cluster$size; # minimal size of a cluster, ratio against expected size (ex. if size=0.2, minimal cluster size is 0.2*500/10 when there are 500 genes and 10 clusters in total) # Hierarchical clustering hc<-hclust(as.dist(1-cor(t(e)))); ct<-cutree(hc, h=h); cl<-split(names(ct), ct); cl<-sapply(cl, function(c) { x<-e[c, ]; md<-apply(x, 2, median); r<-as.vector(cor(md, t(x))[1, ]); # remove outliers c[r>=prms$cluster$corr] }); cl<-cl[sapply(cl, length)>=sz*nrow(e)/length(cl)]; # Reduce cutoff if number of clusters is less than number of groups while(length(cl) < length(grp)) { h<-h-0.05; ct<-cutree(hc, h=h); cl<-split(names(ct), ct); cl<-sapply(cl, function(c) { x<-e[c, ]; md<-apply(x, 2, median); r<-as.vector(cor(md, t(x))[1, ]); # remove outliers c[r>=prms$cluster$corr] }); cl<-cl[sapply(cl, length)>=sz*nrow(e)/length(cl)]; } # merge similar clusters flag<-TRUE; while(flag) { cat('Number of clusters ', length(cl), '\n'); ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); tr<-cutree(hclust(as.dist(1-cor(ms))), k=length(cl)-1); # find the 2 most similar clusters i<-tr[duplicated(tr)]; c<-ms[, tr==i]; r<-cor(c[, 1], c[, 2]); p<-cor.test(c[, 1], c[, 2])$p.value[[1]]; if (r>prms$cluster$merge$corr & p<prms$cluster$merge$p) { cl[tr==i][[1]]<-as.vector(unlist(cl[tr==i])); cl<-cl[names(cl)!=names(i)]; } else flag<-FALSE; } # Sort clusters m<-sapply(cl, function(cl) colMeans(expr[cl, ])); #m<-sapply(grp, function(g) colMeans(m[g, ])); ind<-apply(m, 2, function(x) which(x==max(x))); cl<-cl[order(ind)]; names(cl)<-paste('Cluster', 1:length(cl), sep='_'); # preparing for the next blocks ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); ms<-data.frame(t(ms)); colnames(ms)<-colnames(e); rownames(ms)<-paste(rownames(ms), ' (n=', sapply(cl, length), ')', sep=''); sortColumn<-function(ms, g) { corr<-as.vector(cor(rowMeans(ms[, g[[2]], drop=FALSE]), ms[, g[[1]], drop=FALSE])); g[[1]]<-g[[1]][order(corr)]; for (i in 2:length(g)) { corr<-as.vector(cor(rowMeans(ms[, g[[i-1]], drop=FALSE]), ms[, grp[[i]], drop=FALSE])); g[[i]]<-g[[i]][rev(order(corr))]; } ms[, as.vector(unlist(g))]; } n<-sapply(cl, length); d<-expr[stat[, 'pANOVA']<=prms$recluster$p, , drop=FALSE];
The initial analysis identified r length(cl)
gene clusters using just selected significant genes. A total of r sum(n)
genes were clustered.
if (prms$plot$rescale) x<-t(scale(t(ms))) else x<-ms; x<-sortColumn(x, grp); PlotColoredBlock(x, min=-1*max(abs(x), na.rm=TRUE), max=max(abs(x), na.rm=TRUE), key='Average expression', groups=grp);
Figure 3 This plot shows below the average expression levels of each cluster. Values indicate number of standard deviation from mean of all samples.
We further refine the gene clusters with the following steps:
r nrow(d)
genes with p values less than r prms$recluster$p
to continuer nrow(d)
X r length(cl)
matrixr prms$recluster$r
, and the correlation coefficient to any other cluster is at least r prms$recluster$diff
lessr prms$recluster$times
times unless the reclustering convergedreCl<-function(d, cl, r, dif) { md<-sapply(cl, function(cl) apply(d[cl[cl %in% rownames(d)], , drop=FALSE], 2, median)); corr<-cor(t(d), md); c<-lapply(1:ncol(corr), function(i) { mx<-apply(corr[, -i, drop=FALSE], 1, max); rownames(corr)[corr[, i]>=r & (corr[, i]-mx)>dif]; }); c; } cls<-list(cl, reCl(d, cl, prms$recluster$r, prms$recluster$diff)); while (!identical(cls[[length(cls)]], cls[[length(cls)-1]]) & length(cls)<=prms$recluster$times) { cls[[length(cls)+1]]<-reCl(d, cls[[length(cls)]], prms$recluster$r, prms$recluster$diff); cat("Reclustering #", length(cls)-1, '\n'); } cl<-cls[[length(cls)]]; names(cl)<-paste('Cluster', 1:length(cl), sep='_'); # summary cluster sizes n<-sapply(cls, function(c) sapply(c, length)); n<-data.frame(n); colnames(n)<-paste('Cycle #', 0:(ncol(n)-1), sep=''); ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); ms<-data.frame(t(ms)); colnames(ms)<-colnames(expr); rownames(ms)<-paste(rownames(ms), ' (n=', sapply(cl, length), ')', sep=''); if(identical(cls[[length(cls)]], cls[[length(cls)-1]])) msg<-paste("The reclustering converged after", length(cls)-1, 'cycles') else msg<-paste("The reclustering didn't converge after", length(cls)-1, 'cycles')
r msg
. A total of r sum(sapply(cls[[length(cls)]], length))
genes were clustered.
if (prms$plot$rescale) x<-t(scale(t(ms))) else x<-ms; x<-sortColumn(x, grp); PlotColoredBlock(x, min=-1*max(abs(x), na.rm=TRUE), max=max(abs(x), na.rm=TRUE), key='Average expression', groups=grp);
Figure 4 This plot shows below the average expression levels of each cluster after r ncol(n)-1
cycles of reclustering. Values indicate number of standard deviation from mean of all samples.
ids<-as.vector(unlist(cls[[length(cls)]])); c<-rep(names(cl), sapply(cl, length)); tbl1<-cbind(anno[ids, ], Cluster=c, FormatNumeric(expr[ids, ])); tbl2<-cbind(anno[ids, ], Cluster=c, FormatNumeric(stat[ids, ])); fn1<-CreateDatatable(tbl1, paste(yml$output, 'clustered_data.html', sep='/'), caption="Expression data of clustered genes"); fn2<-CreateDatatable(tbl2, paste(yml$output, 'clustered_stat.html', sep='/'), caption="ANOVA statistical results of clustered genes"); write.csv2(tbl1, paste(yml$output, 'clustered_data.csv', sep='/')); write.csv2(tbl2, paste(yml$output, 'clustered_stat.csv', sep='/')); write.csv2(cbind(expr, anno), paste(yml$output, 'all_genes_data.csv', sep='/')); write.csv2(cbind(stat, anno), paste(yml$output, 'all_genes_stat.csv', sep='/')); saveRDS(tbl1, paste(yml$output, 'clustered_data.rds', sep='/')); saveRDS(tbl2, paste(yml$output, 'clustered_stat.rds', sep='/')); saveRDS(cbind(expr, anno), paste(yml$output, 'all_genes_data.rds', sep='/')); saveRDS(cbind(stat, anno), paste(yml$output, 'all_genes_stat.rds', sep='/')); saveRDS(gset, paste(yml$output, 'all_genesets.rds', sep='/'));
Result tables:
fn.temp<-paste(yml$output, 'ClDetail.Rmd', sep='/'); if (yml$input$remote) { if (!RCurl::url.exists(yml$input$subtemplate)) stop("Template Rmd file ', yml$input$subtemplate, ' not exists\n"); writeLines(RCurl::getURL(yml$input$subtemplate)[[1]], fn.temp); } else { file.copy(yml$input$subtemplate, fn.temp); } fns<-sapply(1:length(cl), function(i) { mtrx<-expr[cl[[i]], , drop=FALSE]; name<-names(cl)[i]; grps<-grp; univ<-rownames(anno); path<-paste(yml$output, name, sep='/'); home<-"../index.html"; if (prms$plot$zero) mtrx<-mtrx-rowMeans(mtrx[, grp[[1]], drop=FALSE]); if (!file.exists(path)) dir.create(path, recursive = TRUE); fn.html<-paste(name, '.html', sep=''); rmarkdown::render(fn.temp, output_file=fn.html, output_dir=paste(yml$output, name, sep='/'), quiet=TRUE); });
lnk<-paste(names(cl), paste(names(cl), '.html', sep=''), sep='/'); lns<-paste(' - [', names(cl), '](', lnk, ')', sep=''); lns<-paste(lns, collapse='\n');
Click cluster names to see more detaied information about individual clusters
r lns
ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); if (prms$plot$rescale) ms<-t(scale(t(ms))); m<-sapply(grp, function(g) colMeans(ms[g, , drop=FALSE])); se<-sapply(grp, function(g) apply(ms[g, , drop=FALSE], 2, function(x) sd(x)/sqrt(length(x)))); if (prms$plot$zero) m<-m-m[, 1]; PlotSeries(m, se, c('', prms$plot$ylab));
Figure 5 Plot the group means and standard errors of all cluster as data series
ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); if (prms$plot$rescale) ms<-t(scale(t(ms))); m<-sapply(grp, function(g) colMeans(ms[g, , drop=FALSE])); colnames(m)<-names(grp); rownames(m)<-names(cl); PlotColoredBlock(m, min=-1*max(abs(m), na.rm=TRUE), max=max(abs(m), na.rm=TRUE), key='Cluster average', groups=split(colnames(m), colnames(m)));
Figure 6 Heatmap of group and cluster means
d<-expr[unlist(cl, use.names = FALSE), , drop=FALSE]; d<-sapply(grp, function(g) rowMeans(d[, g, drop=FALSE])); if (prms$plot$rescale) d<-t(scale(t(d))); colnames(d)<-names(grp); PlotColoredBlock(d, min=-1*max(abs(d), na.rm=TRUE), max=max(abs(d), na.rm=TRUE), num.breaks = 127, groups=split(colnames(d), colnames(d)), key='Group average'); abline(h=c(nrow(d), nrow(d)-cumsum(sapply(cl, length)), lwd=1));
Figure 7 Group means of all clustered genes
d<-expr[unlist(cl, use.names = FALSE), , drop=FALSE]; d<-sortColumn(d, grp); #rownames(d)<-paste(rownames(d), anno[rownames(d), 1], sep=':'); if (prms$plot$rescale) d<-t(scale(t(d))); PlotColoredBlock(d, min=-1*max(abs(d), na.rm=TRUE), max=max(abs(d), na.rm=TRUE), num.breaks = 127, groups=grp, key='Expression level'); abline(h=c(nrow(d), nrow(d)-cumsum(sapply(cl, length)), lwd=1)); abline(v=c(0, cumsum(sapply(grp, length))), lwd=1);
Figure 8 All samples and all clustered genes
rownames(n)<-paste(rownames(n), ' (n=', n[, 1], ' to ', n[, ncol(n)], ')', sep='') PlotColoredBlock(n, key='Cluster size');
Figure 9 This plot traces the change of cluster sizes after each reclustering cycle.
x<-strsplit(yml$output, '/')[[1]]; fn.zip<-paste(x[length(x)], '.zip', sep=''); fn.zip.fn<-paste(yml$output, fn.zip, sep='/'); if (file.exists(fn.zip.fn)) file.remove(fn.zip.fn); zip(fn.zip, yml$output, "-rj9X", zip='zip'); file.rename(fn.zip, fn.zip.fn);
END OF DOCUMENT
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.