knitr::opts_chunk$set(dpi=300, fig.pos="H", fig.width=8, fig.height=6, dev=c('png', 'pdf'), echo=FALSE, warning=FALSE, message=FALSE);

library(awsomics);
library(gplots);
library(knitr);
library(rmarkdown); 

# fn.yaml<-"ClReport_Gas1_vs_control.yml"; # comment it out if running through knitr::knit function
# fn.yaml<-"/nas/is1/zhangz/projects/barr/2015-11_Rat_Brain_Development/result/gene_clustering/Juvenile_SC_injury/ClReport.yml"; 

# Loading inputs from yaml file
# if (!exists('fn.yaml')) stop('Input file not found\n'); 
# writeLines(yaml::as.yaml(yml), fn.yaml);
# yml <- yaml::yaml.load_file(fn.yaml);  

expr<-readRDS(yml$input$data); 
anno<-readRDS(yml$input$annotation);
smpl<-readRDS(yml$input$sample);
gset<-readRDS(yml$input$geneset); 

if (!setequal(rownames(expr), rownames(anno))) stop("Data matrix and gene annotation have different row names.\n");
if (!setequal(colnames(expr), rownames(smpl))) stop("Data matrix and sample metadata have different names.\n");
expr<-expr[rownames(anno), rownames(smpl)]; 

prms<-yml$parameter;
**_[Go back to project home](`r yml$project$Home`)_**

Project background

lns<-lapply(names(yml$project), function(nm) {
  c(paste('##', nm), '\n', yml$project[[nm]], '\n'); 
});
lns<-paste(do.call('c', lns), collapse='\n'); 

r lns

**_[Go back to project home](`r yml$project$Home`)_**

Samples and methods

CreateDatatable(smpl, paste(yml$output, 'samples.html', sep='/'));
lns<-lapply(names(yml$methods), function(nm) {
  c(paste('##', nm), '\n', yml$methods[[nm]], '\n'); 
});
lns<-paste(do.call('c', lns), collapse='\n'); 

Samples

There are a total of r ncol(expr) samples. Click here to view full table of samples

r lns

Analysis and results

**_[Go back to project home](`r yml$project$Home`)_**

ANOVA analysis of each gene across sample

f<-as.factor(smpl[, prms$term[1]]); 
l<-unique(smpl[, prms$term[1]]);
grp<-split(rownames(smpl), as.vector(f))[l];
m<-sapply(grp, function(s) rowMeans(expr[, s, drop=FALSE])); 
mn<-apply(expr, 1, min);
mx<-apply(expr, 1, max); 

p<-apply(expr, 1, function(d) summary(aov(d ~ f))[[1]][1, 5]); 
q<-p.adjust(p, method='BH');
stat<-cbind(m, Min=mn, Max=mx, Range=mx-mn, pANOVA=p, FDR=q); 

CreateDatatable(cbind(anno, FormatNumeric(stat)), paste(yml$output, 'anova.html', sep='/'), caption="ANOVA results")

We applied ANOVA analysis on each gene to test its differential expression across samples of different groups.

Click here to view full ANOVA result table.

**_[Go back to project home](`r yml$project$Home`)_**

Gene clustering

Select genes differentially expressed across groups

Select genes using the following steps:

  1. Select genes with FDR less than r prms$selection$fdr
  2. Stop if less than r prms$selection$min genes were left; otherwise, select those with ANOVA p values less than r prms$selection$p
  3. Stop if less than r prms$selection$min genes were left; otherwise, select those with range (max-min) greater than r prms$selection$range
  4. If there are still more than r prms$selection$max genes left, select the top r prms$selection$max with the biggest ranges
e<-expr[q<=prms$selection$fdr, , drop=FALSE]; 
if (nrow(e) > prms$selection$min) { # continue if there are more selected genes than minimum
  s<-stat[rownames(e), , drop=FALSE];
  s<-s[order(s[, 'pANOVA']), , drop=FALSE];
  e<-e[rownames(s), , drop=FALSE]; 
  e<-e[1:max(prms$selection$min, max(which(s[, 'pANOVA']<=prms$selection$p))), , drop=FALSE]; 

  if (nrow(e) > prms$selection$min) { # continue if there are more selected genes than minimum 
    s<-stat[rownames(e), , drop=FALSE];
    s<-s[rev(order(s[, 'Range'])), , drop=FALSE];
    e<-e[rownames(s), , drop=FALSE]; 
    e<-e[1:max(prms$selection$min, max(which(s[, 'Range']>=prms$selection$range))), , drop=FALSE]; 
  }

  if (nrow(e) > prms$selection$max) e<-e[1:prms$selection$max, , drop=FALSE]; # trim if more selected genes than maximum
}

A total of r nrow(e) genes were selected. The genes were used for the next step to identify gene clusters

plot(hclust(as.dist(1-cor(expr))), main='Sample clustering using all genes', xlab='Sample', sub='');

Figure 1 Hierarchical clustering of samples using all genes.

plot(hclust(as.dist(1-cor(e))), main='Sample clustering using selected significant genes', xlab='Sample', sub='');

Figure 2 Hierarchical clustering of samples using just selected genes.

Cluster selected genes

Genes selected by the last step were clustering using the following steps:

h<-prms$cluster$height;
sz<-prms$cluster$size; # minimal size of a cluster, ratio against expected size (ex. if size=0.2, minimal cluster size is 0.2*500/10 when there are 500 genes and 10 clusters in total)

# Hierarchical clustering
hc<-hclust(as.dist(1-cor(t(e)))); 
ct<-cutree(hc, h=h);
cl<-split(names(ct), ct); 
cl<-sapply(cl, function(c) {
  x<-e[c, ]; 
  md<-apply(x, 2, median); 
  r<-as.vector(cor(md, t(x))[1, ]); # remove outliers 
  c[r>=prms$cluster$corr]
}); 
cl<-cl[sapply(cl, length)>=sz*nrow(e)/length(cl)];

# Reduce cutoff if number of clusters is less than number of groups
while(length(cl) < length(grp)) {
    h<-h-0.05;
    ct<-cutree(hc, h=h);
    cl<-split(names(ct), ct); 
    cl<-sapply(cl, function(c) {
    x<-e[c, ]; 
    md<-apply(x, 2, median); 
    r<-as.vector(cor(md, t(x))[1, ]); # remove outliers 
    c[r>=prms$cluster$corr]
  }); 
  cl<-cl[sapply(cl, length)>=sz*nrow(e)/length(cl)];
}

# merge similar clusters
flag<-TRUE;
while(flag) {
  cat('Number of clusters ', length(cl), '\n'); 
  ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); 
  tr<-cutree(hclust(as.dist(1-cor(ms))), k=length(cl)-1); # find the 2 most similar clusters
  i<-tr[duplicated(tr)];
  c<-ms[, tr==i]; 
  r<-cor(c[, 1], c[, 2]); 
  p<-cor.test(c[, 1], c[, 2])$p.value[[1]]; 
  if (r>prms$cluster$merge$corr & p<prms$cluster$merge$p) {
    cl[tr==i][[1]]<-as.vector(unlist(cl[tr==i]));
    cl<-cl[names(cl)!=names(i)]; 
  } else flag<-FALSE;
}

# Sort clusters
m<-sapply(cl, function(cl) colMeans(expr[cl, ])); 
#m<-sapply(grp, function(g) colMeans(m[g, ])); 
ind<-apply(m, 2, function(x) which(x==max(x)));  
cl<-cl[order(ind)]; 
names(cl)<-paste('Cluster', 1:length(cl), sep='_'); 

# preparing for the next blocks
ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); 
ms<-data.frame(t(ms));
colnames(ms)<-colnames(e); 
rownames(ms)<-paste(rownames(ms), ' (n=', sapply(cl, length),  ')', sep=''); 

sortColumn<-function(ms, g) {
  corr<-as.vector(cor(rowMeans(ms[, g[[2]], drop=FALSE]), ms[, g[[1]], drop=FALSE]));
  g[[1]]<-g[[1]][order(corr)];
  for (i in 2:length(g)) {
    corr<-as.vector(cor(rowMeans(ms[, g[[i-1]], drop=FALSE]), ms[, grp[[i]], drop=FALSE]));
    g[[i]]<-g[[i]][rev(order(corr))]; 
  }
  ms[, as.vector(unlist(g))]; 
}

n<-sapply(cl, length); 
d<-expr[stat[, 'pANOVA']<=prms$recluster$p, , drop=FALSE]; 

The initial analysis identified r length(cl) gene clusters using just selected significant genes. A total of r sum(n) genes were clustered.

if (prms$plot$rescale) x<-t(scale(t(ms))) else x<-ms;
x<-sortColumn(x, grp);
PlotColoredBlock(x, min=-1*max(abs(x), na.rm=TRUE), max=max(abs(x), na.rm=TRUE), key='Average expression', groups=grp); 

Figure 3 This plot shows below the average expression levels of each cluster. Values indicate number of standard deviation from mean of all samples.

Refine clusters

We further refine the gene clusters with the following steps:

reCl<-function(d, cl, r, dif) {
  md<-sapply(cl, function(cl) apply(d[cl[cl %in% rownames(d)], , drop=FALSE], 2, median));
  corr<-cor(t(d), md); 
  c<-lapply(1:ncol(corr), function(i) {
    mx<-apply(corr[, -i, drop=FALSE], 1, max);
    rownames(corr)[corr[, i]>=r & (corr[, i]-mx)>dif]; 
  });
  c;
}

cls<-list(cl, reCl(d, cl, prms$recluster$r, prms$recluster$diff)); 

while (!identical(cls[[length(cls)]], cls[[length(cls)-1]]) & length(cls)<=prms$recluster$times) {
  cls[[length(cls)+1]]<-reCl(d, cls[[length(cls)]], prms$recluster$r, prms$recluster$diff); 
  cat("Reclustering #", length(cls)-1, '\n'); 
}

cl<-cls[[length(cls)]]; 
names(cl)<-paste('Cluster', 1:length(cl), sep='_'); 

# summary cluster sizes
n<-sapply(cls, function(c) sapply(c, length)); 
n<-data.frame(n); 
colnames(n)<-paste('Cycle #', 0:(ncol(n)-1), sep=''); 

ms<-sapply(cl, function(cl) colMeans(expr[cl, ])); 
ms<-data.frame(t(ms));
colnames(ms)<-colnames(expr); 
rownames(ms)<-paste(rownames(ms), ' (n=', sapply(cl, length),  ')', sep=''); 

if(identical(cls[[length(cls)]], cls[[length(cls)-1]])) msg<-paste("The reclustering converged after", length(cls)-1, 'cycles') else msg<-paste("The reclustering didn't converge after", length(cls)-1, 'cycles')

r msg. A total of r sum(sapply(cls[[length(cls)]], length)) genes were clustered.

if (prms$plot$rescale) x<-t(scale(t(ms))) else x<-ms;
x<-sortColumn(x, grp);
PlotColoredBlock(x, min=-1*max(abs(x), na.rm=TRUE), max=max(abs(x), na.rm=TRUE), key='Average expression', groups=grp);

Figure 4 This plot shows below the average expression levels of each cluster after r ncol(n)-1 cycles of reclustering. Values indicate number of standard deviation from mean of all samples.

ids<-as.vector(unlist(cls[[length(cls)]])); 
c<-rep(names(cl), sapply(cl, length)); 
tbl1<-cbind(anno[ids, ], Cluster=c, FormatNumeric(expr[ids, ])); 
tbl2<-cbind(anno[ids, ], Cluster=c, FormatNumeric(stat[ids, ])); 

fn1<-CreateDatatable(tbl1, paste(yml$output, 'clustered_data.html', sep='/'), caption="Expression data of clustered genes");
fn2<-CreateDatatable(tbl2, paste(yml$output, 'clustered_stat.html', sep='/'), caption="ANOVA statistical results of clustered genes"); 

write.csv2(tbl1, paste(yml$output, 'clustered_data.csv', sep='/')); 
write.csv2(tbl2, paste(yml$output, 'clustered_stat.csv', sep='/')); 
write.csv2(cbind(expr, anno), paste(yml$output, 'all_genes_data.csv', sep='/')); 
write.csv2(cbind(stat, anno), paste(yml$output, 'all_genes_stat.csv', sep='/')); 

saveRDS(tbl1, paste(yml$output, 'clustered_data.rds', sep='/')); 
saveRDS(tbl2, paste(yml$output, 'clustered_stat.rds', sep='/')); 
saveRDS(cbind(expr, anno), paste(yml$output, 'all_genes_data.rds', sep='/')); 
saveRDS(cbind(stat, anno), paste(yml$output, 'all_genes_stat.rds', sep='/')); 

saveRDS(gset, paste(yml$output, 'all_genesets.rds', sep='/')); 

Result tables:

**_[Go back to project home](`r yml$project$Home`)_**

Analysis of individual clusters

fn.temp<-paste(yml$output, 'ClDetail.Rmd', sep='/'); 
if (yml$input$remote) {
  if (!RCurl::url.exists(yml$input$subtemplate)) stop("Template Rmd file ', yml$input$subtemplate, ' not exists\n");
  writeLines(RCurl::getURL(yml$input$subtemplate)[[1]], fn.temp);
} else {
  file.copy(yml$input$subtemplate, fn.temp); 
}

fns<-sapply(1:length(cl), function(i) {
  mtrx<-expr[cl[[i]], , drop=FALSE]; 
  name<-names(cl)[i]; 
  grps<-grp;
  univ<-rownames(anno); 
  path<-paste(yml$output, name, sep='/'); 
  home<-"../index.html";

  if (prms$plot$zero) mtrx<-mtrx-rowMeans(mtrx[, grp[[1]], drop=FALSE]); 

  if (!file.exists(path)) dir.create(path, recursive = TRUE); 

  fn.html<-paste(name, '.html', sep=''); 

  rmarkdown::render(fn.temp, output_file=fn.html, output_dir=paste(yml$output, name, sep='/'), quiet=TRUE); 
}); 
lnk<-paste(names(cl), paste(names(cl), '.html', sep=''), sep='/'); 
lns<-paste('  - [', names(cl), '](', lnk, ')', sep='');
lns<-paste(lns, collapse='\n'); 

Click cluster names to see more detaied information about individual clusters

r lns

Go back to project home

Extra plots

ms<-sapply(cl, function(cl) colMeans(expr[cl, ]));
if (prms$plot$rescale) ms<-t(scale(t(ms))); 
m<-sapply(grp, function(g) colMeans(ms[g, , drop=FALSE]));
se<-sapply(grp, function(g) apply(ms[g, , drop=FALSE],  2, function(x) sd(x)/sqrt(length(x)))); 
if (prms$plot$zero) m<-m-m[, 1];
PlotSeries(m, se, c('', prms$plot$ylab)); 

Figure 5 Plot the group means and standard errors of all cluster as data series

ms<-sapply(cl, function(cl) colMeans(expr[cl, ]));
if (prms$plot$rescale) ms<-t(scale(t(ms))); 
m<-sapply(grp, function(g) colMeans(ms[g, , drop=FALSE]));
colnames(m)<-names(grp);
rownames(m)<-names(cl); 
PlotColoredBlock(m, min=-1*max(abs(m), na.rm=TRUE), max=max(abs(m), na.rm=TRUE), key='Cluster average', groups=split(colnames(m), colnames(m))); 

Figure 6 Heatmap of group and cluster means

d<-expr[unlist(cl, use.names = FALSE), , drop=FALSE];
d<-sapply(grp, function(g) rowMeans(d[, g, drop=FALSE])); 
if (prms$plot$rescale) d<-t(scale(t(d))); 
colnames(d)<-names(grp); 
PlotColoredBlock(d, min=-1*max(abs(d), na.rm=TRUE), max=max(abs(d), na.rm=TRUE), num.breaks = 127, groups=split(colnames(d), colnames(d)), key='Group average'); 
abline(h=c(nrow(d), nrow(d)-cumsum(sapply(cl, length)), lwd=1)); 

Figure 7 Group means of all clustered genes

d<-expr[unlist(cl, use.names = FALSE), , drop=FALSE];
d<-sortColumn(d, grp); 
#rownames(d)<-paste(rownames(d), anno[rownames(d), 1], sep=':'); 
if (prms$plot$rescale) d<-t(scale(t(d))); 
PlotColoredBlock(d, min=-1*max(abs(d), na.rm=TRUE), max=max(abs(d), na.rm=TRUE), num.breaks = 127, groups=grp, key='Expression level'); 
abline(h=c(nrow(d), nrow(d)-cumsum(sapply(cl, length)), lwd=1)); 
abline(v=c(0, cumsum(sapply(grp, length))), lwd=1); 

Figure 8 All samples and all clustered genes

rownames(n)<-paste(rownames(n), ' (n=', n[, 1], ' to ', n[, ncol(n)], ')', sep='')
PlotColoredBlock(n, key='Cluster size'); 

Figure 9 This plot traces the change of cluster sizes after each reclustering cycle.

x<-strsplit(yml$output, '/')[[1]]; 
fn.zip<-paste(x[length(x)], '.zip', sep='');
fn.zip.fn<-paste(yml$output, fn.zip, sep='/');
if (file.exists(fn.zip.fn)) file.remove(fn.zip.fn); 
zip(fn.zip, yml$output, "-rj9X", zip='zip'); 
file.rename(fn.zip, fn.zip.fn); 
**_[Go back to project home](`r yml$project$Home`)_**

END OF DOCUMENT



zhezhangsh/DEGandMore documentation built on Sept. 22, 2022, 9:55 a.m.