In zhezhangsh/DEGandMore:

knitr::opts_chunk$set(dpi=300, fig.pos="H", fig.width=8, fig.height=6, dev=c('png', 'pdf'), echo=FALSE, warning=FALSE, message=FALSE);

library(awsomics);
library(gplots);
library(knitr);
library(rmarkdown); 

# fn.yaml<-"ClReport_Gas1_vs_control.yml"; # comment it out if running through knitr::knit function
# fn.yaml<-"/nas/is1/zhangz/projects/barr/2015-11_Rat_Brain_Development/result/gene_clustering/Juvenile_SC_injury/ClReport.yml"; 

# Loading inputs from yaml file
# if (!exists('fn.yaml')) stop('Input file not found\n'); 
# writeLines(yaml::as.yaml(yml), fn.yaml);
# yml <- yaml::yaml.load_file(fn.yaml);  

expr<-readRDS(yml$input$data); 
anno<-readRDS(yml$input$annotation);
smpl<-readRDS(yml$input$sample);
gset<-readRDS(yml$input$geneset); 

if (!setequal(rownames(expr), rownames(anno))) stop("Data matrix and gene annotation have different row names.\n");
if (!setequal(colnames(expr), rownames(smpl))) stop("Data matrix and sample metadata have different names.\n");
expr<-expr[rownames(anno), rownames(smpl)]; 

prms<-yml$parameter;

**_[Go back to project home](`r yml$project$Home`)_**

Project background

lns<-lapply(names(yml$project), function(nm) {
  c(paste('##', nm), '\n', yml$project[[nm]], '\n'); 
});
lns<-paste(do.call('c', lns), collapse='\n');

r lns

**_[Go back to project home](`r yml$project$Home`)_**

Samples and methods

CreateDatatable(smpl, paste(yml$output, 'samples.html', sep='/'));
lns<-lapply(names(yml$methods), function(nm) {
  c(paste('##', nm), '\n', yml$methods[[nm]], '\n'); 
});
lns<-paste(do.call('c', lns), collapse='\n');

Samples

There are a total of r ncol(expr) samples. Click here to view full table of samples

r lns

Analysis and results

**_[Go back to project home](`r yml$project$Home`)_**

ANOVA analysis of each gene across sample

f<-as.factor(smpl[, prms$term[1]]); 
l<-unique(smpl[, prms$term[1]]);
grp<-split(rownames(smpl), as.vector(f))[l];
m<-sapply(grp, function(s) rowMeans(expr[, s, drop=FALSE])); 
mn<-apply(expr, 1, min);
mx<-apply(expr, 1, max); 

p<-apply(expr, 1, function(d) summary(aov(d ~ f))[[1]][1, 5]); 
q<-p.adjust(p, method='BH');
stat<-cbind(m, Min=mn, Max=mx, Range=mx-mn, pANOVA=p, FDR=q); 

CreateDatatable(cbind(anno, FormatNumeric(stat)), paste(yml$output, 'anova.html', sep='/'), caption="ANOVA results")

We applied ANOVA analysis on each gene to test its differential expression across samples of different groups.

There are r nrow(expr) total genes
There are r ncol(expr) total samples
There are r length(l) sample groups: r paste(l, collapse=', ')

Click here to view full ANOVA result table.

**_[Go back to project home](`r yml$project$Home`)_**

Gene clustering

Select genes differentially expressed across groups

Select genes using the following steps:

Select genes with FDR less than r prms$selection$fdr
Stop if less than r prms$selection$min genes were left; otherwise, select those with ANOVA p values less than r prms$selection$p
Stop if less than r prms$selection$min genes were left; otherwise, select those with range (max-min) greater than r prms$selection$range
If there are still more than r prms$selection$max genes left, select the top r prms$selection$max with the biggest ranges

e<-expr[q<=prms$selection$fdr, , drop=FALSE]; 
if (nrow(e) > prms$selection$min) { # continue if there are more selected genes than minimum
  s<-stat[rownames(e), , drop=FALSE];
  s<-s[order(s[, 'pANOVA']), , drop=FALSE];
  e<-e[rownames(s), , drop=FALSE]; 
  e<-e[1:max(prms$selection$min, max(which(s[, 'pANOVA']<=prms$selection$p))), , drop=FALSE]; 

  if (nrow(e) > prms$selection$min) { # continue if there are more selected genes than minimum 
    s<-stat[rownames(e), , drop=FALSE];
    s<-s[rev(order(s[, 'Range'])), , drop=FALSE];
    e<-e[rownames(s), , drop=FALSE]; 
    e<-e[1:max(prms$selection$min, max(which(s[, 'Range']>=prms$selection$range))), , drop=FALSE]; 
  }

  if (nrow(e) > prms$selection$max) e<-e[1:prms$selection$max, , drop=FALSE]; # trim if more selected genes than maximum
}

A total of r nrow(e) genes were selected. The genes were used for the next step to identify gene clusters

plot(hclust(as.dist(1-cor(expr))), main='Sample clustering using all genes', xlab='Sample', sub='');

Figure 1 Hierarchical clustering of samples using all genes.

plot(hclust(as.dist(1-cor(e))), main='Sample clustering using selected significant genes', xlab='Sample', sub='');

Figure 2 Hierarchical clustering of samples using just selected genes.

Cluster selected genes

Genes selected by the last step were clustering using the following steps:

Create a hierchical tree based on gene-gene correlation
Cut the tree at height r prms$cluster$height, which will classify genes into clusters. Then apply the following steps to refine the clusters
- Calculate correlation of each gene to the centroid (median) of its cluster. Remove the genes if the correlation is less than r prms$cluster$corr
- Remove clusters with size less than r 100*prms$cluster$size% of the expected size (the expected size is 50 if there are 500 genes and 10 clusters)
- If there are less than r length(grp) (the number of sample groups) clusters left, reduce the height cutoff by 0.05 and repeat this step until there are enough clusters
Merge the 2 most similar clusters if the correlation of their centroids is greater than or equal to r prms$cluster$corr. Repeat this step until no 2 clusters are that similar

h<-prms$cluster$height;
sz<-prms$cluster$size; # minimal size of a cluster, ratio against expected size (ex. if size=0.2, minimal cluster size is 0.2*500/10 when there are 500 genes and 10 clusters in total)

# Hierarchical clustering
hc<-hclust(as.dist(1-cor(t(e)))); 
ct<-cutree(hc, h=h);
cl<-split(names(ct), ct); 
cl<-sapply(cl, function(c) {
  x<-e[c, ]; 
  md<-apply(x, 2, median); 
  r<-as.vector(cor(md, t(x))[1, ]); # remove outliers 
  c[r>=prms$cluster$corr]
}); 
cl<-cl[sapply(cl, length)>=sz*nrow(e)/length(cl)];

# Reduce cutoff if number of clusters is less than number of groups
while(length(cl) < length(grp)) {
    h<-h-0.05;
    ct<-cutree(hc, h=h);
    cl<-split(names(ct), ct); 
    cl<-sapply(cl, function(c) {
    x<-e[c, ]; 
    md<-apply(x, 2, median); 
    r<-as.vector(cor(md, t(x))[1, ]); # remove outliers 
    c[r>=prms$cluster$corr]
  }); 
  cl<-cl[sapply(cl, length)>=sz*nrow(e)/length(cl)];
}

# merge similar clusters
flag<-TRUE;
while(flag) {
  cat('Number of clusters ', length(cl), '\n'); 
  ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); 
  tr<-cutree(hclust(as.dist(1-cor(ms))), k=length(cl)-1); # find the 2 most similar clusters
  i<-tr[duplicated(tr)];
  c<-ms[, tr==i]; 
  r<-cor(c[, 1], c[, 2]); 
  p<-cor.test(c[, 1], c[, 2])$p.value[[1]]; 
  if (r>prms$cluster$merge$corr & p<prms$cluster$merge$p) {
    cl[tr==i][[1]]<-as.vector(unlist(cl[tr==i]));
    cl<-cl[names(cl)!=names(i)]; 
  } else flag<-FALSE;
}

# Sort clusters
m<-sapply(cl, function(cl) colMeans(expr[cl, ])); 
#m<-sapply(grp, function(g) colMeans(m[g, ])); 
ind<-apply(m, 2, function(x) which(x==max(x)));  
cl<-cl[order(ind)]; 
names(cl)<-paste('Cluster', 1:length(cl), sep='_'); 

# preparing for the next blocks
ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); 
ms<-data.frame(t(ms));
colnames(ms)<-colnames(e); 
rownames(ms)<-paste(rownames(ms), ' (n=', sapply(cl, length),  ')', sep=''); 

sortColumn<-function(ms, g) {
  corr<-as.vector(cor(rowMeans(ms[, g[[2]], drop=FALSE]), ms[, g[[1]], drop=FALSE]));
  g[[1]]<-g[[1]][order(corr)];
  for (i in 2:length(g)) {
    corr<-as.vector(cor(rowMeans(ms[, g[[i-1]], drop=FALSE]), ms[, grp[[i]], drop=FALSE]));
    g[[i]]<-g[[i]][rev(order(corr))]; 
  }
  ms[, as.vector(unlist(g))]; 
}

n<-sapply(cl, length); 
d<-expr[stat[, 'pANOVA']<=prms$recluster$p, , drop=FALSE];

The initial analysis identified r length(cl) gene clusters using just selected significant genes. A total of r sum(n) genes were clustered.

if (prms$plot$rescale) x<-t(scale(t(ms))) else x<-ms;
x<-sortColumn(x, grp);
PlotColoredBlock(x, min=-1*max(abs(x), na.rm=TRUE), max=max(abs(x), na.rm=TRUE), key='Average expression', groups=grp);

Figure 3 This plot shows below the average expression levels of each cluster. Values indicate number of standard deviation from mean of all samples.

Refine clusters

We further refine the gene clusters with the following steps:

Select all r nrow(d) genes with p values less than r prms$recluster$p to continue
Assign selected genes to clusters:
- Calculate centroid (median expression level of all genes in the cluster) of each cluster
- Calculate correlation coefficient of each gene to centroid of each cluster to get a r nrow(d) X r length(cl) matrix
- Assign a gene to a cluster if its correlation coefficient to the cluster is greater than r prms$recluster$r, and the correlation coefficient to any other cluster is at least r prms$recluster$diff less
- Repeat this reclustering steps for r prms$recluster$times times unless the reclustering converged

reCl<-function(d, cl, r, dif) {
  md<-sapply(cl, function(cl) apply(d[cl[cl %in% rownames(d)], , drop=FALSE], 2, median));
  corr<-cor(t(d), md); 
  c<-lapply(1:ncol(corr), function(i) {
    mx<-apply(corr[, -i, drop=FALSE], 1, max);
    rownames(corr)[corr[, i]>=r & (corr[, i]-mx)>dif]; 
  });
  c;
}

cls<-list(cl, reCl(d, cl, prms$recluster$r, prms$recluster$diff)); 

while (!identical(cls[[length(cls)]], cls[[length(cls)-1]]) & length(cls)<=prms$recluster$times) {
  cls[[length(cls)+1]]<-reCl(d, cls[[length(cls)]], prms$recluster$r, prms$recluster$diff); 
  cat("Reclustering #", length(cls)-1, '\n'); 
}

cl<-cls[[length(cls)]]; 
names(cl)<-paste('Cluster', 1:length(cl), sep='_'); 

# summary cluster sizes
n<-sapply(cls, function(c) sapply(c, length)); 
n<-data.frame(n); 
colnames(n)<-paste('Cycle #', 0:(ncol(n)-1), sep=''); 

ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); 
ms<-data.frame(t(ms));
colnames(ms)<-colnames(expr); 
rownames(ms)<-paste(rownames(ms), ' (n=', sapply(cl, length),  ')', sep=''); 

if(identical(cls[[length(cls)]], cls[[length(cls)-1]])) msg<-paste("The reclustering converged after", length(cls)-1, 'cycles') else msg<-paste("The reclustering didn't converge after", length(cls)-1, 'cycles')

r msg. A total of r sum(sapply(cls[[length(cls)]], length)) genes were clustered.

if (prms$plot$rescale) x<-t(scale(t(ms))) else x<-ms;
x<-sortColumn(x, grp);
PlotColoredBlock(x, min=-1*max(abs(x), na.rm=TRUE), max=max(abs(x), na.rm=TRUE), key='Average expression', groups=grp);

Figure 4 This plot shows below the average expression levels of each cluster after r ncol(n)-1 cycles of reclustering. Values indicate number of standard deviation from mean of all samples.

ids<-as.vector(unlist(cls[[length(cls)]])); 
c<-rep(names(cl), sapply(cl, length)); 
tbl1<-cbind(anno[ids, ], Cluster=c, FormatNumeric(expr[ids, ])); 
tbl2<-cbind(anno[ids, ], Cluster=c, FormatNumeric(stat[ids, ])); 

fn1<-CreateDatatable(tbl1, paste(yml$output, 'clustered_data.html', sep='/'), caption="Expression data of clustered genes");
fn2<-CreateDatatable(tbl2, paste(yml$output, 'clustered_stat.html', sep='/'), caption="ANOVA statistical results of clustered genes"); 

write.csv2(tbl1, paste(yml$output, 'clustered_data.csv', sep='/')); 
write.csv2(tbl2, paste(yml$output, 'clustered_stat.csv', sep='/')); 
write.csv2(cbind(expr, anno), paste(yml$output, 'all_genes_data.csv', sep='/')); 
write.csv2(cbind(stat, anno), paste(yml$output, 'all_genes_stat.csv', sep='/')); 

saveRDS(tbl1, paste(yml$output, 'clustered_data.rds', sep='/')); 
saveRDS(tbl2, paste(yml$output, 'clustered_stat.rds', sep='/')); 
saveRDS(cbind(expr, anno), paste(yml$output, 'all_genes_data.rds', sep='/')); 
saveRDS(cbind(stat, anno), paste(yml$output, 'all_genes_stat.rds', sep='/')); 

saveRDS(gset, paste(yml$output, 'all_genesets.rds', sep='/'));

Result tables:

Normalized expressiond data of clustered genes
- Click here to view online
- Click here to download table
Statistical results of clustered genes
- Click here to view online
- Click here to download table

**_[Go back to project home](`r yml$project$Home`)_**

Analysis of individual clusters

fn.temp<-paste(yml$output, 'ClDetail.Rmd', sep='/'); 
if (yml$input$remote) {
  if (!RCurl::url.exists(yml$input$subtemplate)) stop("Template Rmd file ', yml$input$subtemplate, ' not exists\n");
  writeLines(RCurl::getURL(yml$input$subtemplate)[[1]], fn.temp);
} else {
  file.copy(yml$input$subtemplate, fn.temp); 
}

fns<-sapply(1:length(cl), function(i) {
  mtrx<-expr[cl[[i]], , drop=FALSE]; 
  name<-names(cl)[i]; 
  grps<-grp;
  univ<-rownames(anno); 
  path<-paste(yml$output, name, sep='/'); 
  home<-"../index.html";

  if (prms$plot$zero) mtrx<-mtrx-rowMeans(mtrx[, grp[[1]], drop=FALSE]); 

  if (!file.exists(path)) dir.create(path, recursive = TRUE); 

  fn.html<-paste(name, '.html', sep=''); 

  rmarkdown::render(fn.temp, output_file=fn.html, output_dir=paste(yml$output, name, sep='/'), quiet=TRUE); 
});

lnk<-paste(names(cl), paste(names(cl), '.html', sep=''), sep='/'); 
lns<-paste('  - [', names(cl), '](', lnk, ')', sep='');
lns<-paste(lns, collapse='\n');

Click cluster names to see more detaied information about individual clusters

r lns

Go back to project home

Extra plots

ms<-sapply(cl, function(cl) colMeans(expr[cl, ]));
if (prms$plot$rescale) ms<-t(scale(t(ms))); 
m<-sapply(grp, function(g) colMeans(ms[g, , drop=FALSE]));
se<-sapply(grp, function(g) apply(ms[g, , drop=FALSE],  2, function(x) sd(x)/sqrt(length(x)))); 
if (prms$plot$zero) m<-m-m[, 1];
PlotSeries(m, se, c('', prms$plot$ylab));

Figure 5 Plot the group means and standard errors of all cluster as data series

ms<-sapply(cl, function(cl) colMeans(expr[cl, ]));
if (prms$plot$rescale) ms<-t(scale(t(ms))); 
m<-sapply(grp, function(g) colMeans(ms[g, , drop=FALSE]));
colnames(m)<-names(grp);
rownames(m)<-names(cl); 
PlotColoredBlock(m, min=-1*max(abs(m), na.rm=TRUE), max=max(abs(m), na.rm=TRUE), key='Cluster average', groups=split(colnames(m), colnames(m)));

Figure 6 Heatmap of group and cluster means

d<-expr[unlist(cl, use.names = FALSE), , drop=FALSE];
d<-sapply(grp, function(g) rowMeans(d[, g, drop=FALSE])); 
if (prms$plot$rescale) d<-t(scale(t(d))); 
colnames(d)<-names(grp); 
PlotColoredBlock(d, min=-1*max(abs(d), na.rm=TRUE), max=max(abs(d), na.rm=TRUE), num.breaks = 127, groups=split(colnames(d), colnames(d)), key='Group average'); 
abline(h=c(nrow(d), nrow(d)-cumsum(sapply(cl, length)), lwd=1));

Figure 7 Group means of all clustered genes

d<-expr[unlist(cl, use.names = FALSE), , drop=FALSE];
d<-sortColumn(d, grp); 
#rownames(d)<-paste(rownames(d), anno[rownames(d), 1], sep=':'); 
if (prms$plot$rescale) d<-t(scale(t(d))); 
PlotColoredBlock(d, min=-1*max(abs(d), na.rm=TRUE), max=max(abs(d), na.rm=TRUE), num.breaks = 127, groups=grp, key='Expression level'); 
abline(h=c(nrow(d), nrow(d)-cumsum(sapply(cl, length)), lwd=1)); 
abline(v=c(0, cumsum(sapply(grp, length))), lwd=1);

Figure 8 All samples and all clustered genes

rownames(n)<-paste(rownames(n), ' (n=', n[, 1], ' to ', n[, ncol(n)], ')', sep='')
PlotColoredBlock(n, key='Cluster size');

Figure 9 This plot traces the change of cluster sizes after each reclustering cycle.

x<-strsplit(yml$output, '/')[[1]]; 
fn.zip<-paste(x[length(x)], '.zip', sep='');
fn.zip.fn<-paste(yml$output, fn.zip, sep='/');
if (file.exists(fn.zip.fn)) file.remove(fn.zip.fn); 
zip(fn.zip, yml$output, "-rj9X", zip='zip'); 
file.rename(fn.zip, fn.zip.fn);