In zhezhangsh/DEGandMore:

cat("Setting up and loading data\n");

knitr::opts_chunk$set(eval=TRUE, dpi=300, fig.pos="H", fig.width=8, fig.height=6, echo=FALSE, warning=FALSE, message=FALSE);

library(MASS);
library(gplots);
library(knitr);
library(rmarkdown); 
library(yaml); 
library(awsomics);
library(rchive);
library(DEGandMore);
library(pathview); 

#fn.yaml<-"/nas/is1/zhangz/projects/falk/2015-01_Worm_Antioxidant_Drugs/R/pairwise/MS010_A-vs-dTPP_A.yml"; # comment it out if running through knitr::knit function

# Loading inputs from yaml file
# if (!exists('fn.yaml')) stop('Input file not found\n'); 
# writeLines(yaml::as.yaml(yml), fn.yaml);
#yml <- yaml.load_file(fn.yaml);  

inputs<-yml$input;
if (class(inputs$anno) == 'character') inputs$anno<-readRDS(inputs$anno); 
if (class(inputs$expr) == 'character') inputs$expr<-readRDS(inputs$expr); 
if (class(inputs$geneset) == 'character') inputs$geneset<-readRDS(inputs$geneset); 

# All input variables
anno<-inputs$anno;
expr<-as.matrix(inputs$expr);
geneset<-inputs$geneset;
genome<-inputs$genome;
paired<-inputs$paired;
penalty<-inputs$penalty;
grps<-inputs$comparison;
g1.ind<-grps[[1]];
g2.ind<-grps[[2]];
g1.name<-names(grps)[1]
g2.name<-names(grps)[2];
g1.name<-gsub('-', '_', g1.name);
g2.name<-gsub('-', '_', g2.name);

# parameters of differential expression
deg.method<-inputs$deg$method;
cutoff.l2r<-inputs$deg$cutoff.l2r;
cutoff.p<-inputs$deg$cutoff.p;
cutoff.fdr<-inputs$deg$cutoff.fdr;
num.top<-inputs$deg$num.top;
nperm<-inputs$deg$nperm;
logged<-inputs$deg$logged;

# Check validity of inputs
if (!identical(rownames(expr), rownames(anno))) stop('Row names of annotation and gene expression data are not identical');
if (is.null(paired)) paired<-FALSE; 
if (is.null(g1.ind)) stop('Error: Index of samples in group 1 unknown');
if (is.null(g2.ind)) stop('Error: Index of samples in group 2 unknown');
if (is.null(g1.name)) stop('Error: Name of group 1 unknown');
if (is.null(g2.name)) stop('Error: Name of group 2 unknown');
if (length(g1.ind)<2) stop('Error: Not enough samples in group ', g1.name, ', minimum=2');
if (length(g2.ind)<2) stop('Error: Not enough samples in group ', g2.name, ', minimum=2');
if (paired & length(g1.ind)!=length(g2.ind)) stop('Error: Unequal sample sizes for paired comparison, ', length(g1.ind), ' vs. ', length(g2.ind));

# path to output files 
path.output<-yml$output;
path0<-paste(g1.name, g2.name, sep='-vs-');
path<-paste(path.output, path0, sep='/');
path.figure<-paste(path, 'figure', sep='/');
path.table <-paste(path, 'table',  sep='/');
if (!file.exists(path.output)) dir.create(path.output, recursive = TRUE);
if (!file.exists(path)) dir.create(path, recursive = TRUE);
if (!file.exists(path.figure)) dir.create(path.figure, recursive = TRUE);
if (!file.exists(path.table )) dir.create(path.table , recursive = TRUE);

# Re-process gene expression matrix
e1<-expr[, g1.ind, drop=FALSE];
e2<-expr[, g2.ind, drop=FALSE];
e1.2<-cbind(e1, e2); 
pctl<-apply(e1.2, 2, function(e) 100*rank(e)/length(e)); # percentile

inputs$expr<-e1.2;
fn.fig<-c();  # Figure file name and index
res<-list(inputs=inputs);  # Result set

cat('Analyze data variance\n');

fn.fig['variance']<-paste(path.figure, 'variance.pdf', sep='/'); 
pdf(fn.fig['variance'], width=6.4, height=8);
par(mfrow=c(3,1), mar=c(4,5,2,2));

# Distribution of average expression level
m<-rowMeans(e1.2);
d<-density(m);
x<-d$x;
y<-d$y;
plot(d, type='n', yaxs='i', xaxs='i', xlim=c(min(x), max(x)), ylim=c(0, 1.1*max(y)), xlab='', ylab='Density', main='A. Distribution of Expression Measurement', cex.lab=2, cex.main=1.5);
title(xlab='Gene Expression Measurement (sample average)', line=2, cex=2);

x0<-as.vector(summary(m))[2:5];
y0<-sapply(x0, function(x0, x, y) y[which(abs(x-x0)==min(abs(x-x0)))], x=x, y=y);
col<-c('blue', 'red', 'orange', 'green');
segments(x0, 0, x0, y0, lty=1, col=col, lwd=1);
lines(d, col='darkgrey', lwd=4); 
text(x0, y0/2, srt=90, labels=round(x0, 3));
legend(min(x)+0.75*(max(x)-min(x)), max(y), legend=c('First quantile', 'Median', 'Mean', 'Third quantile'), lty=1, col=col, bty='n', lwd=2);

# Distribution of standard deviation across samples
sd<-apply(e1.2, 1, sd);
d<-density(sd); 
x<-d$x;
y<-d$y;
plot(d, type='n', yaxs='i', xaxs='i', xlim=c(min(x), max(x)), ylim=c(0, 1.1*max(y)), xlab='', ylab='Density', main='B. Distribution of Variance', cex.lab=2, cex.main=1.5);
title(xlab='Gene Standard Deviation (between samples)', line=2, cex=2);

x0<-as.vector(summary(sd))[2:5];
y0<-sapply(x0, function(x0, x, y) y[which(abs(x-x0)==min(abs(x-x0)))], x=x, y=y);
col<-c('blue', 'red', 'orange', 'green');
segments(x0, 0, x0, y0, lty=1, col=col, lwd=1);
lines(d, col='darkgrey', lwd=4);
text(x0, y0/2, srt=90, labels=round(x0, 3));
legend(min(x)+0.75*(max(x)-min(x)), max(y), legend=c('First quantile', 'Median', 'Mean', 'Third quantile'), lty=1, col=col, bty='n', lwd=2);

plot(m, sd, xlab='', ylab='Standard Deviation', main='C. Variance vs. Expression Measurement', cex.lab=2, cex.main=1.5, cex=0.5, col='darkgrey');
title(xlab='Expression Measurement', line=2, cex=2);
lines(lowess(m, sd), lwd=3, col=2);

dev.off();

cat('Plot variance boxplot\n');

fn.fig['boxplot']<-paste(path.figure, 'boxplot.pdf', sep='/');
pdf(fn.fig['boxplot'], width=6.4, height=4); 
par(mfrow=c(1,1), mai=c(0.7, 0.6, .1,.1))
plot(0, type='n', xlim=c(0.5, 0.5+ncol(e1.2)), ylim=c(min(e1.2), max(e1.2)), ylab='', xlab='', xaxt='n'); 
lines(1:ncol(e1.2), apply(e1.2, 2, median), col='green', lwd=1);
title(ylab='Expression Level', line=2);
cex<-max(strwidth(colnames(e1.2), unit='in'));
boxplot(data.frame(e1.2), las=3, notch=TRUE, boxwex=.6, col=c(rep('lightgrey', ncol(e1)), rep('lightblue', ncol(e2))), names=colnames(e1.2), cex.axis=min(1, 0.4/cex), yaxt='n', add=TRUE, pch=19, cex=0.4)->x;
dev.off();

cat('Unsupervised hierarchical clustering\n');

fn.fig['clustering']<-paste(path.figure, 'clustering.pdf', sep='/');
pdf(fn.fig['clustering'], width=6.4, height=4.8);
par(mfrow=c(1,1), mai=c(0.2, 0.8, .2 , .2));
plot(hclust(dist(t(e1.2))), main='', ylab='', xlab='', sub='', cex=min(1, 5/ncol(e1.2)/0.12), frame.plot=TRUE);
title(ylab='Distance', line=2.5, cex.lab=2);
dev.off();

cat('Plot PCA\n'); 

fn.fig['pca']<-paste(path.figure, 'pca.pdf', sep='/');
pdf(fn.fig['pca'], width=6.4, height=4.8);

pca<-prcomp(t(e1.2));

X<-pca$x[,1];
Y<-pca$x[,2];
per<-round(summary(pca)$importance[2, 1:2]*100, 2);
pca$importance<-summary(pca)$importance;
res$pca<-pca;

col<-rep(c('darkgreen', 'deeppink1'), c(length(g1.ind), length(g2.ind)));

layout(matrix(1:2, nrow=1), width=c(3,1));
par(mai=c(1,1,.25,.25));

cx<-max(1.25, min(4, 24/length(X)))
plot(X, Y, col=col, pch=19, cex=cx, xlim=c(min(X)*1.1, max(X)*1.1), ylim=c(min(Y)*1.1, max(Y)*1.1), 
    xlab=paste('PC1', ', ', per[1], '%', sep=''), ylab=paste('PC2', ', ', per[2], '%', sep=''), cex.lab=1.5);

text(X, Y, label=1:length(X), col='white', cex=.3*cx);

par(mai=c(1, 0, 0.25, 0));
plot(0, type='n', xlim=c(0, 100), ylim=c(1, 100), axes=FALSE, bty='n', xaxs='i', yaxs='i', xlab='', ylab='');

w<-strwidth(1:length(X), cex=1.2);
w0<-1.2*80/max(w, 80);
h0<-1.2*(100/length(X))/4.0;
cex<-min(1.2, w0, h0); 

points(rep(5, length(X)), 100-(1:length(X))*4.0*cex, col=col, pch=19, cex=1.5*cex);
text(5+cex*5, 100-(1:length(X))*4.0*cex, labels=colnames(e1.2), adj=0, col=col, pch=19, cex=cex);
text(rep(5, length(labels)), 100-(1:length(X))*4.0*cex, labels=1:length(X), col='white', cex=0.8*cex);

dev.off();

cat('DEG analysis\n'); 

pth.deg<-paste(path, '/DEG', sep='');
if (!file.exists(pth.deg)) dir.create(pth.deg);

means<-cbind(rowMeans(expr[, g1.ind]), rowMeans(expr[, g2.ind]));
colnames(means)<-c(g1.name, g2.name);
if (logged) l2r<-means[,2]-means[,1] else l2r<-log2(means[,2]/means[,1]);
l2r[is.na(l2r)]<-0;
fc<-exp(l2r*log(2));
mn<-cbind(means, l2r, fc);
colnames(mn)<-c(paste('Mean', names(grps), sep='_'), 'LogFC', 'FoldChange');

# Adjust expr matrix if a penalty is given for large between-sample variance
if(penalty>0) {
  if (penalty>1) penalty<-1;
  colnames(means)<-paste('Mean_', c(g1.name, g2.name), sep='');
  if (paired) sd<-apply(expr[, g2.ind]-expr[, g1.ind], 1, sd) else {
    v1<-apply(expr[, g1.ind], 1, var);
    v2<-apply(expr[, g2.ind], 1, var);
    df1<-length(g1.ind)-1;
    df2<-length(g2.ind)-1;
    sd<-sqrt((df1*v1+df2*v2)/(df1+df2));
    sd[sd==0]<-min(sd[sd>0])/2;
   }
  pnl<-quantile(sd, probs=seq(0, 1, 0.01))[100*round(1-penalty,2)+1];
  e<-expr; # save the original
  e[, g2.ind]<-apply(e[, g2.ind], 2, function(d) means[,1]+(d-means[,1])/(pnl+sd));
} else e<-expr;

############################################################################################
de<-DeWrapper(e, grps, deg.method, list(nperm=nperm, paired=paired, logged=logged));
############################################################################################

de$results$stat[, c(1,2,4)]<-mn[, 1:3];
de$results$stat[, 3]<-de$results$stat[, 2]-de$results$stat[, 1]; 
de$data<-expr; 
saveRDS(de, file=paste(pth.deg, 'deg.rds', sep='/')); 
if (deg.method == DeMethods()[3]) { # Rank Product methods
  s<-de$results$stat; 
  rp<-de$results$rp$summarized; 
  out<-lapply(rp, function(x) x[, c('Rank', 'Pvalue', 'FDR')]);
  out[[3]]<-cbind(out[[1]], out[[2]]);
  cnm<-rep(paste(c('Higher', 'Lower'), 'in', g2.name, sep='_'), each=3);
  colnames(out[[3]])<-paste(colnames(out[[3]]), cnm, sep='_')
  out<-lapply(out, function(out) cbind(mn[rownames(out), , drop=FALSE], out));
  names(out)<-c(paste(c('Higher', 'Lower'), '_in_', g2.name, sep=''), 'Single_rank');
  degs<-lapply(out[1:2], function(x) {
    deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'FDR']<=cutoff.fdr & abs(x[, 'LogFC'])>=cutoff.l2r, , drop=FALSE];
    if (nrow(deg) < num.top) deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'Rank']<=num.top, , drop=FALSE]; 
    deg;
  }); 
  degs<-lapply(degs, function(d) d[order(d[, 'Rank']), ]);
  stat<-out[[3]];
} else {
  out<-cbind(mn[rownames(de$results$stat), , drop=FALSE], de$results$stat[, 5:ncol(s)]);
  degs<-lapply(c(1, -1), function(di) {
    x<-out[sign(out[, 'LogFC'])==di, , drop=FALSE];
    deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'FDR']<=cutoff.fdr & abs(x[, 'LogFC'])>=cutoff.l2r, , drop=FALSE];
    if (nrow(deg) < num.top) deg<-x[x[, 'Pvalue']<=cutoff.p & abs(x[, 'LogFC'])>=sort(abs(x[, 'LogFC']), decreasing = TRUE)[min(num.top, nrow(x))], , drop=FALSE]; 
    deg;
  }); 
  degs<-lapply(degs, function(d) d[order(-1*abs(d[, 'LogFC'])), , drop=FALSE]);
  stat<-out;
}

de$DEG<-degs;
up<-degs[[1]];
dn<-degs[[2]];

stat.table<-cbind(anno[rownames(stat), ], stat);
stat.formatted<-awsomics::FormatNumeric(cbind(ID=rownames(stat.table), stat.table));
stat.formatted$ID<-awsomics::AddHref(stat.formatted$ID, awsomics::UrlEntrezGene(stat.formatted$ID));
CreateDatatable(stat.formatted, fn = paste(path, 'DEG', 'all_genes.html', sep='/'), rownames = FALSE, caption = "Differential expression of all genes");

cat('Summarizing DEG analysis\n'); 

fn.fig['deg']<-paste(path.figure, 'deg.pdf', sep='/');
pdf(fn.fig['deg'], width=6.4, height=4.8);
par(mai=c(0.4, 0.5, 0.3, 0.05), mfrow=c(2,2));

#if (class(out)=='list') d<-out[[length(out)]] else d<-out;
d<-cbind(mn, de$results$stat[, 5:6]);

# M-A Plot
plot(rowMeans(d[,1:2]), d[,'LogFC'], main='M-A Plot', pch=19, col='darkgrey',  cex=.25, xlab='', ylab='', cex.axis=.75);
abline(h=0, lwd=1, col=1);
lines(lowess(d[,'LogFC']~rowMeans(d[,1:2])), lwd=2, col=3);
ylab<-paste('Log2(', g2.name, '/', g1.name, ')', sep='');
xlab<-paste('Log2(', g1.name, ')/2 + Log2(', g2.name, ')/2', sep='');
title(xlab=xlab, ylab=ylab, line=1.6, cex.lab=.75);

#Volcano Plot
x<-d[,'LogFC'];
p<-d[,'Pvalue'];
p[p==0]<-min(p[p>0])/exp(abs(d[p==0,3])*log(2));
y<--1*log10(p);
z<-sqrt(abs(x*y));
cx<-z/(max(z));
plot(x, y, main='Volcano Plot', pch=19, col='darkgrey',  cex=0.8*cx, xlab='', ylab='', cex.axis=.75, ylim=c(0, 1.1*max(y)), yaxs='i', xlim=c(-1*max(abs(x)), max(abs(x))), yaxt='n');
xlab<-paste('Log2(', g2.name, '/', g1.name, ')', sep='');
title(xlab=xlab, ylab='P value', line=1.6, cex.lab=.75);
axis(2, at=0:ceiling(max(y)), labels=10^(-1*(0:ceiling(max(y)))));
abline(h=-1*log10(0.05), v=c(-1,1), col=4, lty=3);
box();

# P Value Distribution
hist(p[p<1], br=seq(0, 1, 0.01), cex.axis=.75, main='P Value Distribution');
title(xlab='P value', ylab='Count', line=1.6, cex.lab=.75);
box();

# FDR
q<-round(d[, 'FDR'], 2);
n<-sapply(seq(min(q), 1, 0.01), function(x) length(q[q<=x]));
plot(1, type='n', log='y', xlab='', ylab='', cex.axis=.75, xlim=0.01*c(1, 100), ylim=c(1, nrow(expr)), main='FDR');
abline(v=seq(0, 1, .05), lty=3, col=8);
lines(seq(min(q), 1, 0.01), n, lwd=2, col=2);
title(xlab='FDR cutoff', ylab='Count', line=1.6, cex.lab=.75);
box();

dev.off();

#FDR counts
c<-c(0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.25);
n.fdr<-sapply(c, function(c) sapply(c(1, -1), function(di) nrow(d[sign(d[, 'LogFC'])==di & d[, 'FDR']<=c, , drop=FALSE])));
fdr.table<-cbind(c, t(n.fdr), colSums(n.fdr)); 
colnames(fdr.table)<-c('FDR', paste(c('Higher', 'Lower'), 'in', g2.name, sep='_'), 'Total')

# Plot top genes
fn.fig['top']<-paste(path.figure, 'top.pdf', sep='/');
pdf(fn.fig['top'], width=6.4, height=2.4);
par(mfrow=c(1,2), mai=c(0.7, 0.35, 0.3, 0.05));
ind<-c(rownames(up)[1], rownames(dn)[1]);

cex<-min(0.75, min(0.5/max(strwidth(colnames(e1.2), unit='in'))));
col<-rep(c('grey', 'lightblue'), c(length(g1.ind), length(g2.ind)));
for (ii in 1:2) barplot(e1.2[ind[ii], ], las=3, cex.names=cex, main=CleanHtmlTags(anno[ind[ii], 1], FALSE), cex.main=.75, cex.axis=.75, col=col);
dev.off();

# Plot heatmap of top genes
fn.fig['heatmap']<-paste(path.figure, 'heatmap.pdf', sep='/');
pdf(fn.fig['heatmap'], width=6.4, height=6.4);
par(mfrow=c(1,1));
goi<-e1.2[c(rownames(up), rownames(dn)), ];
rownames(goi)<-as.vector(anno[rownames(goi), 1]);
DegHeatmap(goi,  col=rep(c('chartreuse1', 'deeppink1'), c(length(g1.ind), length(g2.ind))), plot.new=FALSE);
dev.off();

## Gene counts by FDR cutoffs
fdr<-c(0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.25);
if (deg.method == DeMethods()[3]) {
  c1<-sapply(fdr, function(fdr) nrow(stat[stat[, 3]>0 & stat[, 7]<=fdr, , drop=FALSE]));
  c2<-sapply(fdr, function(fdr) nrow(stat[stat[, 3]<0 & stat[, 10]<=fdr, , drop=FALSE]));
} else {
  c1<-sapply(fdr, function(fdr) nrow(stat[stat[, 'LogFC']>0 & stat[, 'FDR']<=fdr, , drop=FALSE]));
  c2<-sapply(fdr, function(fdr) nrow(stat[stat[, 'LogFC']<0 & stat[, 'FDR']<=fdr, , drop=FALSE]));
} 
fdr.table<-cbind(fdr, c1, c2, c1+c2);
colnames(fdr.table)<-c('FDR cutoff', names(degs), 'Total');

# pre-ranking of genes
rnk<-sign(d[, 'LogFC'])*sqrt(d[,'LogFC']^2+log10(p)^2);

cat('Write out DEG results\n');

pth.deg1<-paste(pth.deg, '/Higher_in_', g2.name, sep='');
pth.deg2<-paste(pth.deg, '/Lower_in_', g2.name, sep='');
pth.deg1.bars<-paste(pth.deg1, '/bars', sep='');
pth.deg2.bars<-paste(pth.deg2, '/bars', sep='');

if (!exists(pth.deg)) dir.create(pth.deg, showWarnings = FALSE);
if (!exists(pth.deg1.bars)) dir.create(pth.deg1.bars, showWarnings = FALSE, recursive = TRUE);
if (!exists(pth.deg2.bars)) dir.create(pth.deg2.bars, showWarnings = FALSE, recursive = TRUE);
ids<-c(up=rownames(up), dn=rownames(dn));
names(ids)<-paste(rep(c(pth.deg1.bars, pth.deg2.bars), c(nrow(up), nrow(dn))), '/', ids, '.pdf', sep='');

col<-rep(c('lightgrey', 'lightblue'), c(length(g1.ind), length(g2.ind)));
wid<-min(1.8, 1.2/(0.1*max(nchar(colnames(expr[, c(g1.ind, g2.ind)])))));
fn.barplot<-sapply(names(ids), function(nm) {
    pdf(nm, w=8, h=6); 
    par(mai=c(1.2, 1, 0.6, 0.2));
    barplot(expr[ids[nm], c(g1.ind, g2.ind)], las=3, col=col, ylab='Expression level', cex.lab=wid, cex.names=wid);
    title(main=paste(ids[nm], CleanHtmlTags(as.vector(anno[ids[nm], 1])), sep=' - '), cex.main=2);
    #plot.new();
    par(mai=c(1.2, 1, 0.6, 0.2));
    barplot(pctl[ids[nm], c(g1.ind, g2.ind)], las=3, col=col, ylab='Percentile (%)', ylim=c(0, 100), cex.lab=2, cex.names=wid);
    title(main=paste(ids[nm], CleanHtmlTags(as.vector(anno[ids[nm], 1])), sep=' - '), cex.main=2);

    dev.off();
    nm;
});

# Write index tables of DEGs
cnm<-c(colnames(anno), colnames(up), 'Samples');
up.tbl<-data.frame(anno[rownames(up), , drop=FALSE], up, rep('Figure', nrow(up)));
colnames(up.tbl)<-cnm;
up.tbl$Samples<-AddHref(up.tbl$Samples, paste('./bars/', rownames(up), '.pdf', sep='')); 
up.formatted<-GeneList2Datatable(FormatNumeric(up.tbl), paste(pth.deg1, 'index.html', sep='/'), col.symbol=1, genome=genome, title=paste('Genes with higher expression in', g2.name));

dn.tbl<-data.frame(anno[rownames(dn), , drop=FALSE], dn, rep('Figure', nrow(dn)));
colnames(dn.tbl)<-cnm;
dn.tbl$Samples<-AddHref(dn.tbl$Samples, paste('./bars/', rownames(dn), '.pdf', sep='')); 
dn.formatted<-GeneList2Datatable(FormatNumeric(dn.tbl), paste(pth.deg2, 'index.html', sep='/'), col.symbol=1, genome=genome, title=paste('Genes with lower expression in', g2.name));

cat('ORA analysis\n');

########################################################################################################
ora<-lapply(degs, function(gs) awsomics::TestGSE(rownames(gs), rownames(expr), geneset$list));
########################################################################################################

path.ora<-paste(path, 'ORA', sep='/');
if (!file.exists(path.ora)) dir.create(path.ora);
sapply(names(degs), function(nm) if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/'), recursive = TRUE))->x;
names(ora)<-names(degs);
saveRDS(ora, paste(path.ora, 'ora_full.rds', sep='/'));

ora.table<-lapply(ora, function(ora) {
  g<-ora$stat;
  lst<-ora$list[rownames(g)];
  rand<-sapply(lst, function(l) { 
    mtrx<-e1.2[rownames(e1.2) %in% l, , drop=FALSE];
    flexclust::randIndex(table(kmeans(t(mtrx), 2)[[1]], rep(1:2, sapply(grps, length))));
  })
  an<-geneset[[1]][rownames(g), ];
  data.frame(row.names = rownames(g), stringsAsFactors = TRUE, Source=an[,1], Collection=an[,2], Term=an[, 'Name'], 
                Total=rowSums(ora$size[rownames(g), 3:4]), Within=ora$size[rownames(g), 4], Enrichment=g[, 'Odds_Ratio'], 
                Rand=rand, Pvalue=g[, 'P_HyperGeo'], FDR=g[, 'FDR_BH']);
}); 

########################################################################################################
# Biclustering
bic<-lapply(ora, function(g) if (nrow(g[[1]])<5) NA else BiclusterFromList(g[[2]][rownames(g[[1]])[1:min(250, nrow(g[[1]]))]]));
ora$bicluster<-bic;

bic2gs<-lapply(bic, function(bic) lapply(bic[[2]], rownames));
gs2bic<-lapply(bic2gs, function(b) lapply(split(rep(1:length(b), sapply(b, length)), unlist(b, use.names=FALSE)), function(x) sort(unique(x))));
gs2bic<-lapply(gs2bic, function(b) sapply(b, function(b) paste(b, collapse=';')));

########################################################################################################
# Output tables
ora.table<-lapply(names(ora.table), function(nm) {
  b<-gs2bic[[nm]];
  b<-b[rownames(ora.table[[nm]])];
  b[is.na(b)]<-'';
  ora.table[[nm]]$Cluster<-b;
  ora.table[[nm]];
});
names(ora.table)<-names(bic);

cat('Write out ORA results\n');

path.ora<-paste(path, 'ORA', sep='/');
if (!file.exists(path.ora)) dir.create(path.ora, recursive = TRUE);

# Split results into tables by collections
ora.wrapped<-lapply(names(ora.table), function(nm) {
  pth<-paste(path.ora, nm, 'table', sep='/');
  if (!file.exists(pth)) dir.create(pth, recursive = TRUE); 
  WrapGSE(ora.table[[nm]][, -(1:3)], geneset$meta, pth, TRUE); 
});
names(ora.wrapped)<-names(ora.table);

# create index.html file
ora.tbl<-lapply(ora.wrapped, function(o) {
  n<-lapply(o$formatted, function(o) sapply(o, nrow)); 
  fn<-unlist(o$file, use.names=FALSE); 
  fn<-sub(paste(path.ora, '/', sep=''), '', fn); 
  s<-rep(names(n), sapply(n, length));
  c<-unlist(lapply(n, names), use.names=FALSE); 
  n<-as.vector(unlist(n, use.names=FALSE)); 
  tbl<-data.frame(Source=s, Collection=c, N=n, stringsAsFactors = FALSE);
  tbl$N<- AddHref(tbl$N, fn);
  rownames(tbl)<-paste(tbl[[1]], tbl[[2]], sep='_'); 
  tbl
});
u<-sort(union(rownames(ora.tbl[[1]]), rownames(ora.tbl[[2]]))); 
ora.tbl<-lapply(ora.tbl, function(t) {
  t<-t[u, ];
  t[is.na(t[,3]), 3]<-0; 
  rownames(t)<-u; 
  t;
});
ora.ind<-cbind(ora.tbl[[1]], N2=ora.tbl[[2]]$N); 
ora.ind[is.na(ora.ind[,1]), 1]<-ora.tbl[[2]][is.na(ora.ind[,1]), 1]; 
ora.ind[is.na(ora.ind[,2]), 2]<-ora.tbl[[2]][is.na(ora.ind[,2]), 2]; 
names(ora.ind)<-c('Source', 'Collection', names(ora.wrapped)); 
CreateDatatable(ora.ind, paste(path.ora, 'index.html', sep='/'), rownames = FALSE, caption = "Click on number to see list"); 

# All tested gene sets
gs.table<-lapply(split(geneset[[1]][, 'Collection'], geneset[[1]][, 'Source']), table);
gs.table<-data.frame(stringsAsFactors = FALSE, Source=rep(names(gs.table), sapply(gs.table, length)), 
                     Collection=unlist(lapply(gs.table, names), use.names=FALSE), Geneset=as.numeric(unlist(gs.table)));
awsomics::CreateDatatable(gs.table, paste(path.ora, 'geneset.html', sep='/'), rownames = FALSE, caption = "Gene set collections");

# Formatted tables of ORA results
stat.slim<-data.frame(anno[rownames(stat), 1, drop=FALSE], awsomics::FormatNumeric(stat));
ora.formatted<-lapply(names(ora.table), function(nm) {
  if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/'));
  if (!file.exists(paste(path.ora, nm, 'term', sep='/'))) dir.create(paste(path.ora, nm, 'term', sep='/'));

  t<-ora.table[[nm]];
  an<-geneset[[1]][rownames(t), ];
  t$Term<-awsomics::AddHref(t$Term, an[, 'URL']);
  t$Total[1:min(250, nrow(t))]<-awsomics::AddHref(t$Total[1:min(250, nrow(t))], paste('./term/term_', 1:min(250, nrow(t)), '.html', sep=''));
  #t$Within<-awsomics::AddHref(t$Within, paste('./term/term_', rownames(t), '_within.html', sep=''));

  t<-awsomics::FormatNumeric(t);
  awsomics::CreateDatatable(t, fn=paste(path.ora, nm, 'term.html', sep='/'), rownames = FALSE);
  sapply(1:min(250, nrow(t)), function(i) {
    g<-geneset$list[rownames(t)[i]][[1]];
    t1<-stat.slim[rownames(stat.slim) %in% g, , drop=FALSE];
    t1<-t1[order(t1[, paste('Rank', nm, sep='_')]), , drop=FALSE];
    fn<-paste(path.ora, nm, 'term', paste('term_', i, '.html', sep=''), sep='/');
    GeneList2Datatable(t1, fn, genome = genome, title = rownames(t)[i]);
  });

  t;
}); 
names(ora.formatted)<-names(ora.table);

# Formatted table of biclustering
bic.formatted<-lapply(names(bic), function(nm) {
  if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/'));
  if (!file.exists(paste(path.ora, nm, 'bicluster', sep='/'))) dir.create(paste(path.ora, nm, 'bicluster', sep='/'));

  t<-awsomics::FormatNumeric(bic[[nm]]$summary);

  # write individual files of biclusters
  fns<-sapply(1:nrow(t), function(i) {
    fn<- paste(path.ora, nm, 'bicluster', paste(t[i, 1], c('_terms.html', '_genes.html', '_heatmap.pdf'), sep=''), sep='/');
    t1<-ora.formatted[[nm]][rownames(bic[[nm]][[2]][[i]]), ];
    CreateDatatable(t1, fn[1], FALSE, caption=paste('Terms of', t[i,1]));
    t2<-stat.slim[colnames(bic[[nm]][[2]][[i]]), , drop=FALSE];
    t2<-t2[order(t2[, paste('Rank', nm, sep='_')]), , drop=FALSE];
    GeneList2Datatable(t2, fn[2], genome = genome, title=paste('Genes of', t[i,1])); 
    t3<-bic[[nm]][[2]][[i]];
    rownames(t3)<-paste(ora.table[[nm]][rownames(t3), 2], ora.table[[nm]][rownames(t3), 3], sep=' - ');
    colnames(t3)<-paste(rownames(anno[colnames(t3), , drop=FALSE]), CleanHtmlTags(anno[colnames(t3), 1]), sep=' - ');
    if (min(t3)==1) col.min<-'#0000FF' else col.min<-'#999999';
    PlotHeatmap(t3, size.max=Inf, fn=sub('.pdf$', '', fn[3]), col.min=col.min, col.max='#0000FF');
    fn;
  });
  bic.fn<-t(fns);

  # write index table
  t$Num_Terms<-AddHref(t$Num_Terms, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 1]));
  t$Num_Genes<-AddHref(t$Num_Genes, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 2]));
  t$ID<-AddHref(t$ID, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 3]));
  CreateDatatable(t, fn=paste(path.ora, nm, 'bicluster.html', sep='/'), rownames = FALSE);

  t;
});

cat('Preparing GSEA analysis\n');

path.gsea<-paste(path, 'GSEA', sep='/');
if (file.exists(path.gsea)) unlink(path.gsea, recursive = TRUE); # Path must not exist

tmp<-as.character(as.integer(Sys.time())); 
if (file.exists(tmp)) unlist(tmp, recursive = TRUE); 
dir.create(tmp, recursive = TRUE); 

# loading template yaml
input.gsea<-yml$input$gsea;
gsea.yml<-input.gsea$template;
if (input.gsea$remote) gsea.yml<-yaml.load(RCurl::getURL(gsea.yml)) else gsea.yml<-yaml.load_file(gsea.yml)

# If Rank Product was used for DE, adjust the data for GSEA so the group mean difference reflects RP results
e1.2_gsea<-e1.2; 
if (deg.method[1] == 'DeRankP') {
  log.rp<-log10(de$results$stat[, 'RP2']/de$results$stat[, 'RP1']);
  m1<-rowMeans(e1.2[, g1.ind]); 
  m2<-rowMeans(e1.2[, g2.ind]); 
  m0<-m1+log.rp; 
  e1.2_gsea[, g2.ind]<-e1.2[, g2.ind] - (m2-m0); 
}; 

fn.gsea<-PrepareGSEA(e1.2_gsea, grps, paste(tmp, 'input', sep='/'), desc = anno[[1]]); 

gsea.yml$jar<-input.gsea$jar;
gsea.yml$preranked<-FALSE;
gsea.yml$thread<-input.gsea$thread; 
gsea.yml$name<-tmp;
gsea.yml$groups$control<-g1.name;
gsea.yml$groups$case<-g2.name;
gsea.yml$out<-path;
gsea.yml$input<-fn.gsea[1];
gsea.yml$class<-fn.gsea[2];
gsea.yml$chip<-c();
gsea.yml$gmt<-input.gsea$gmt;

cat('GSEA analysis\n');

gsea.cmmd<-GSEAviaJava(gsea.yml); 
file.rename(paste(gsea.yml$out, tmp, sep='/'), path.gsea); 
gsea.tbl<-readRDS(paste(path.gsea, 'full_table.rds', sep='/'));
for (i in 1:ncol(gsea.tbl)) gsea.tbl[[i]]<-as.vector(gsea.tbl[[i]]); 
saveRDS(gsea.tbl, file=paste(path.gsea, 'full_table.rds', sep='/')); 

# Load gene lists of gene sets from GMT files
fn.gmt<-gsea.yml$gmt
gmts<-lapply(fn.gmt, PGSEA::readGmt); 
gsea.lst<-lapply(gmts, function(gmt) {
  nm<-sub(' $', '', sapply(gmt, function(g) g@reference)); 
  lst<-lapply(gmt, function(g) g@ids); 
  names(lst)<-toupper(nm); 
  lst;
}); 

########################################################################################################
# Biclustering
gsea.tbl.sig<-gsea.tbl[gsea.tbl$PValue<=0.05, ]; 
gsea.tbl.sig<-gsea.tbl.sig[order(gsea.tbl.sig$PValue), , drop=FALSE]; 
gsea.lst.sig<-lapply(1:nrow(gsea.tbl.sig), function(i) gsea.lst[[gsea.tbl.sig[i, 1]]][[gsea.tbl.sig[i, 2]]]); 
gsea.lst.sig<-lapply(gsea.lst.sig, function(l) l[l %in% unlist(lapply(degs, rownames), use.names=FALSE)]); 
names(gsea.lst.sig)<-rownames(gsea.tbl.sig); 
gsea.lst.sig<-gsea.lst.sig[sapply(gsea.lst.sig, length)>1];
gsea.gs<-split(gsea.lst.sig, gsea.tbl.sig[names(gsea.lst.sig), ncol(gsea.tbl.sig)]); 
#gsea.gs<-rev(gsea.gs); 
gsea.gs<-lapply(rev(gsea.gs), function(x) x[!duplicated(x)]); 
names(gsea.gs)<-names(degs); 

gsea.bic<-lapply(gsea.gs, function(g) BiclusterFromList(g[1:min(250, length(g))])); 
gsea.bic2gs<-lapply(gsea.bic, function(bic) if (identical(NA, bic)) NA else lapply(bic[[2]], rownames));
gsea.gs2bic<-lapply(gsea.bic2gs, function(b) lapply(split(rep(1:length(b), sapply(b, length)), unlist(b, use.names=FALSE)), function(x) sort(unique(x))));
gsea.gs2bic<-lapply(gsea.gs2bic, function(b) sapply(b, function(b) paste(b, collapse=';')));

cat('Biclustering GSEA results\n');

# Formatted table of biclustering
gsea.bic.formatted<-lapply(names(gsea.bic), function(nm) {
  path.bic<-paste(path.gsea, 'bicluster', nm, sep='/'); 
  if (!file.exists(path.bic)) dir.create(path.bic, recursive = TRUE);

  if (identical(NA, gsea.bic[[nm]])) t<-data.frame(ID='No clusters found', stringsAsFactors = FALSE) else {
    t<-FormatNumeric(gsea.bic[[nm]]$summary);
    t$ID<-as.vector(t$ID); 

    # write individual files of biclusters
    fns<-sapply(1:nrow(t), function(i) {
      fn<-paste(path.bic, paste(t[i, 1], c('_terms.html', '_genes.html', '_heatmap.pdf'), sep=''), sep='/');
      t1<-gsea.tbl[rownames(gsea.bic[[nm]][[2]][[i]]), , drop=FALSE];
      CreateDatatable(FormatNumeric(t1[, 1:6]), fn[1], FALSE, caption=paste('Terms of', t[i,1]));
      t2<-stat.slim[colnames(gsea.bic[[nm]][[2]][[i]]), , drop=FALSE];
      t2<-t2[order(t2[, paste('Rank', nm, sep='_')]), , drop=FALSE];
      GeneList2Datatable(t2, fn[2], genome = genome, title=paste('Genes of', t[i,1])); 
      t3<-gsea.bic[[nm]][[2]][[i]];
      rownames(t3)<-paste(gsea.tbl[rownames(t3), 1], gsea.tbl[rownames(t3), 2], sep=' - ');
      colnames(t3)<-paste(rownames(anno[colnames(t3), , drop=FALSE]), CleanHtmlTags(anno[colnames(t3), 1]), sep=' - ');
      if (min(t3)==1) col.min<-'#0000FF' else col.min<-'#999999';
      PlotHeatmap(t3, size.max=Inf, fn=sub('.pdf$', '', fn[3]), col.min=col.min, col.max='#0000FF');
      fn;
    });
    gsea.bic.fn<-t(fns);

    # write index table
    t$Num_Terms<-AddHref(t$Num_Terms, paste(t$ID, '_terms.html', sep=''));
    t$Num_Genes<-AddHref(t$Num_Genes, paste(t$ID, '_genes.html', sep=''));
    t$ID<-AddHref(t$ID, paste(t$ID, '_heatmap.pdf', sep=''));
  }

  CreateDatatable(t, fn=paste(path.bic, 'index.html', sep='/'), rownames = FALSE);

  t;
});

gsea<-list(path=gsea.yml$out, geneset=gsea.lst, stat=gsea.tbl, bicluster=gsea.bic);

cat('Write out biclustering results\n');

t<-data.frame(matrix('table', nr=2, nc=2), stringsAsFactors = FALSE); 
t[1, ]<-AddHref(t[1, ], paste("../ORA", names(degs), 'bicluster.html', sep='/')); 
t[2, ]<-AddHref(t[2, ], paste("../GSEA/bicluster", names(degs), 'index.html', sep='/')); 
colnames(t)<-names(degs);
rownames(t)<-c('Over-representation analysis', 'Gene set enrichment analysis'); 
CreateDatatable(t, paste(path, 'DEG', 'bicluster_index.html', sep='/'));

cat('GSEA analysis of KEGG pathways\n');

path.kegg<-paste(path.gsea, 'kegg', sep='/'); 
if (file.exists(path.kegg)) unlink(path.kegg, recursive = TRUE); 
dir.create(path.kegg, recursive = TRUE); 

kegg.stat<-gsea.tbl[gsea.tbl$Collection=='KEGG_pathway' & gsea.tbl$PValue<=0.05, , drop=FALSE];

# Run pathview, generate figures in a temp directory and copy them to output directory
path.temp<-as.character(as.integer(Sys.time()));
if (!file.exists(path.temp)) dir.create(path.temp);
setwd(path.temp); 
if (nrow(kegg.stat) == 0) kegg.ind<-kegg.stat[, -1] else {
  kegg.id<-tolower(substr(kegg.stat$Gene_set, 1, 8)); 
  kegg.dff<-rowMeans(e1.2_gsea[, g2.ind])-rowMeans(e1.2_gsea[, g1.ind]); 

  fn.kegg<-sapply(kegg.id, function(id) {
    capture.output(kegg<-pathview(kegg.dff, pathway.id=id, species = GetGenomeAlias(genome, "code"), 
                                low = list(gene = "blue"), kegg.dir=yml$input$gsea$kegg))->x; 
    paste(id, '.pathview.png', sep=''); 
  }); 
  names(fn.kegg)<-kegg.id;
  fn.kegg<-fn.kegg[file.exists(fn.kegg)]; 
  if (length(fn.kegg)>0) file.rename(fn.kegg, paste(path.kegg, fn.kegg, sep='/')); 

  kegg.ind<-FormatNumeric(kegg.stat[kegg.id %in% names(fn.kegg), -1]);
  if (nrow(kegg.ind) > 0) kegg.ind$Gene_set<-AddHref(kegg.ind$Gene_set, fn.kegg); 
  kegg.ind<-kegg.ind[order(kegg.ind$NES), , drop=FALSE]; 
}
setwd('..');
unlink(path.temp, recursive = TRUE); 

CreateDatatable(kegg.ind, paste(path.kegg, 'index.html', sep='/'));

cat('Writing tables to Excel file\n');

# Excel spreadsheets
xls<-list(stat.table, stat.table[rownames(up), , drop=FALSE], stat.table[rownames(dn), , drop=FALSE],
          ora.table[[1]], ora.table[[2]], gsea.tbl);
xls<-lapply(xls, function(x) {
  for (i in 1:ncol(x)) x[[i]]<-CleanHtmlTags(x[[i]], FALSE); 
  x;
});
names(xls)<-c('Complete List', 
  paste('Top', nrow(up), ', higher in ', g2.name, sep=''), 
  paste('Top', nrow(dn), ', lower in ', g2.name, sep=''),
  paste('ORA, higher in ', g2.name, sep=''),
  paste('ORA, lower in ', g2.name, sep=''),
  "GSEA"
);
xls<-lapply(xls, awsomics::FormatNumeric);
tbl<-sapply(names(xls), function(nm) write.csv(xls[[nm]], paste(path.table, paste(nm, '.csv', sep=''), sep='/'))); 
fn.xls<-paste(path, paste(g1.name, g2.name, sep='-vs-'), sep='/');
WriteExcel(xls, fn.xls);

cat('Converting figures to png format\n');

fn.png<-lapply(fn.fig, function(f) ConvertPdf2Png(f, resize=80, density=150));
fn.png[[length(fn.png)+1]]<-sapply(names(bic), function(nm) {
  t<-bic[[nm]]$summary;
  n<-t[,2]*t[,3];
  f<-paste('bicluster/', t[n==max(n)[1], 1], '_heatmap.pdf', sep='');
  awsomics::ConvertPdf2Png(paste(path.ora, nm, f, sep='/'), resize=80, density=150); 
});
# fn.png[[length(fn.png)+1]]<-sapply(kegg$Formatted, function(t) {
#   ids<-as.vector(t$Pathway_ID);
#   fn<-paste(path, 'KEGG', 'Files', 'figures', 'colored_plots', paste(ids, '.pathview.png', sep=''), sep='/');
#   fn[file.exists(fn)][1];
# });

cat('Writing out all results\n'); 

res$de<-de;
res$ora<-ora;
res$gsea<-gsea;
saveRDS(res, file=paste(path, 'result.rds', sep='/'));

**_[Go back to project home](`r yml$project$Home`)_**

`r g1.name` vs. `r g2.name`

Results at a glance

Click links to go directly to results, or read through the rest of this page for details: - Differential gene expression ([All genes](DEG/all_genes.html)) - [`r nrow(degs[[1]])` genes](`r paste('DEG', names(degs)[1], 'index.html', sep='/')`) with higher expression in `r g2.name` - [`r nrow(degs[[2]])` genes](`r paste('DEG', names(degs)[2], 'index.html', sep='/')`) with lower expression in `r g2.name` - Analysis of differentially expressed genes (DEGs) - [ORA](ORA/index.html) (Over-representation analysis) of gene sets - [GSEA](GSEA/index.html) (Gene set enrichment analysis) of gene sets - [Top hits](GSEA/full_table.html) cheat sheet - [Colored maps](GSEA/kegg/index.html) of KEGG pathways - [Biclustering](DEG/bicluster_index.html) of significant genes and gene sets - Download - [Excel](`r paste(path0, '.xlsx', sep='')`) file with statistical tables - [Zip](`r paste(path0, '.zip', sep='')`) file with complete results.

**_[Go back to project home](`r yml$project$Home`)_**

Project

cat('Writing DE report\n');
cat('Writing project background\n'); 

lns<-lapply(names(yml$project), function(nm) {
  c(paste('##', nm), '\n', yml$project[[nm]], '\n'); 
});
lns<-paste(do.call('c', lns), collapse='\n');

r lns

**_[Go back to project home](`r yml$project$Home`)_**

Inputs

This document reports the analysis results from comparing the transcriptomes of two sample groups: r g1.name vs. r g2.name.

Required variables

The following inputs are required to generate this report. You can download an example of input specifications as a yaml file from https://raw.githubusercontent.com/zhezhangsh/DEGandMore/master/examples/DeReport/DeReport.yml

home: URL of project home page
output: Location of all output files
project: Project information, can have multiple sections
input: All inputs
- anno: File location to gene annotation information as a data.frame. Must be a .rds file.
  - the row names are unique Entrez gene IDs
  - the first column must be official gene symbol, can be bracketed by a hyperlink html tag
  - the last column is usually full gene name
- expr: File location to a data matrix includes processed gene expression data. Must be a .rds file
  - each row corresponds to a gene and each column corresponds to a sample
  - rows must exactly match the rows of anno, with the same IDs and the same number of rows
  - it is assumed that the data have been normalized and in linear scale (usually by log2-transformation)
- geneset: File location to a collection of predefined gene sets for over-representation analysis. Must be a .rds file
  - Examples can be found at: https://github.com/zhezhangsh/DEGandMore/tree/master/examples/geneset_collections
- genome: Name of the reference genome, in the form of "human", "hsa", or "hg38".
- paired: Boolean indicates if the samples in the 2 groups are paired. Default is FALSE. If TRUE, the samples will be paired by their order.
- penalty: Whether to give penalty to genes with high sample-sample variance within groups. No penalty if 0; highest possible penalty if 1.
- comparison: A list of 2 named character vectors
  - Vector names will be used as sample group names
  - The character vectors correspond to sample names and must match column names of the data matrix
  - The first vector has the control samples and the second has the case samples
- deg: Options of statistic test to identifiy differentially expressed genes (DEGs).
  - method: Statistical method to use. One of r paste(DeMethods(), collapse='; '). Default is Rank Product.
  - cutoff.l2r: Cutoff of group difference to choose DEGs
  - cutoff.p: Cutoff of p value to choose DEGs
  - cutoff.fdr: Cutoff of FDR to choose DEGs
  - num.top Number of genes to choose if not enough DEGs
  - nperm Number permutations for Rand Product
  - logged Whether the data has been log-transformed into linear scale
- gsea: Options of running gene set enrichment analysis (GSEA)
  - An example of full specifications in .yaml format: https://raw.githubusercontent.com/zhezhangsh/DEGandMore/master/examples/gsea/GSEA_example.yaml

**_[Go back to project home](`r yml$project$Home`)_**

Input summary

Total number of genes: r nrow(expr)
Total number of samples: r ncol(e1.2)
Comparison: r g1.name (n=r ncol(e1)) vs. r g2.name (n=r ncol(e2))
- r g1.name: r paste(colnames(e1), collapse='; ')
- r g2.name: r paste(colnames(e2), collapse='; ')
Differential expression method: r yml$input$deg$method
Output location: r path
Data file: r yml$input$expr
Gene annotation file: r yml$input$anno
ORA gene set file: r yml$input$geneset
GSEA gene set files: r paste(paste(' - **', names(yml$input$gsea$gmt), ':** ', unlist(yml$input$gsea$gmt), sep=''), collapse='\n')

**_[Go back to project home](`r yml$project$Home`)_**

Analysis and results

Sample analysis

This section analyzes the samples by summarizing their global expression patterns, through descriptive statistics, unsupervised clustering, and sample-sample correlation.

Data distribution of all genes

Distribution of average expression level and sample-sample variance.

Figure 1A: This figure is based on the average expression measurements of all genes. If the measurements have been log-transformed, they often have a bi-modal (two-peak) distribution: a high and narrow peak on the left corresponding to inactive genes and a relatively lower and wider peak on the right corresponding to actively expressed genes. If the base of log-transformation is 2, a difference of 1.0 on the x-axis corresponds to a 2-fold change of expression. Similarly, if the base is 10, the corresponding expression change is 10-fold per 1.0 difference on the x-axis. (note that microarray measurements, raw or processed, are not exactly proportional to the actual mRNA abundancy, so the term fold change should not be taken literally.)
Figure 1B: This figure is based on the between-sample standard deviations of all genes. The distribution usually has a single peak and a long tail on the right side. An enlarged tail usually indicates more genes are differentially expressed between samples. In a high quality data set, a gene not differentially expressed between samples should have small variance. So, if we assume that the majority of genes have no or little differential expression, the skewness of the peak towards the left side roughly represents data quality.
Figure 1C: This figure summarizes the relationship between gene expression measurements and their between-sample variance. Genes having smaller variance are more likely to get significant results from Student t test and similar methods. So, if there is a strong dependency of variance on expression measurements, the analysis of differential expression will be affected by the baseline expression level of genes. Variance stabilization procedure is commonly used during data normalization to minimize such dependency. In the figure, the red line is the result of LOWESS smoothing and the majority of this line should be approximately straight and horizontal if the data has been normalized properly. In practice, the LOWESS line is usually a curve higher in the middle.

Figure 1

![](`r fn.png[[1]]`)

**_[Go back to project home](`r yml$project$Home`)_**

Sample-sample correlation

All genes in this data set were used to calculate the Pearson's correlation between each pair of samples. The result is often used to identify potential outlier samples as the outliers tend to have relatively lower correlation coefficient to any of the other samples.

Figure 2: Each boxplot summarizes the distribution of expression measurements within each sample. If we can assume that all samples have approximately the same global gene expression profile, the boxes should have similar positions regardless of which group they belong to. This assumption is not always true though. For example, normal cells and cancer cells could have dramatically different global gene expression pattern. Within each boxplot, the notches indicate the median, the lower and upper sides of the box represent the first and third quantile, and the individual data points out of the whiskers are the outliers (more than 1.5 inter-quantile range from the median).
Figure 3: This is an unsupervised sample clustering used all genes in the data set. The location of their lowest common node of two samples on the clustering tree represents the similarity of their global gene expression patterns. Since the sample grouping information is not used by an unsupervised analysis, the separation of the groups into two subclusters will indicate that the two groups have distinctive gene expression patterns, or that gene expression pattern can be used to predict which group a given sample belongs to.
Figure 4: PCA, or Principal Components Analysis, is another type of unsupervised analysis that converts a large number of correlated variables (genes) into a smaller set of uncorrelated variables (principal components). Each principal component accounts for certain percentage of the variability of the whole data set and the one having the highest percentage is ordered first. This plot shows the top two principal components and their share of total variability (in parentheses). In general, samples close to each other on this plot have similar global expression patterns. PCA is often used to identify confounding factor(s) during analysis of microarray data while each principal component may or may not corresponds to a real life variable (age, treatment, disease state, etc.)

Figure 2

![](`r fn.png[[2]]`)

Figure 3

![](`r fn.png[[3]]`)

Figure 4

![](`r fn.png[[4]]`)

**_[Go back to project home](`r yml$project$Home`)_**

Gene-level differential expression

By default, Rank Product (RP), a non-paramatric statistical test based on ranks of fold changes, is used for the statistical analysis. The outputs of RP include:

RP-ranked differential expression towards both directions: The method works in a way that each gene gets two ranks for increased and decreased expression in a sample group.
Fold change, percentage change, ratio, or log2-ratio: These variables indicate the magnitude of the difference of group averages. They are the same values in different scale and can be easily converted to each other (see Appendix 1). Given the data has been log2-transformed, the difference of group averages is equivalent to the log2-ratio of unlogged data as log2(A/B)==log2(A)-log2(B). So, group difference will be referred as log2-ratio throughout this report.
p value, q value, or FDR: These variables indicates the statistical significance of group difference. P value refers to type I error of statistical test, or the chance to mistakenly conclude the differential expression of a gene while the gene is actually unchanged. RP estimates both p values and FDR based on a permutation procedure during which the ranked gene lists were shuffled to generate a background distribution of rank products.

**_[Go back to project home](`r yml$project$Home`)_**

Differential expression of all genes

The full table of single gene differential expression can be seen here.

We use the following visualization methods to get an overview of all r nrow(expr) genes:

Figure 5A: M-A Plot. This plot visualizes global pattern of group difference (as log2-ratio). M (Y axis) is the log2-ratio and A (X axis) is the average measurements of the two groups. Each dot represents a gene and the green line is generated by LOWESS smoothing. The location and shape of the LOWESS line indicate whether there is an overall skewness of differential expression or expression level-dependent differential expression.
Figure 5B: Volcano Plot. This plot puts together both types of variables (usually p value and log2-ratio) used to represent differential expression and their association. A common strategy is to only consider genes with both small p values and large log2-ratios as GOI (genes of interest). Each dot represents a gene and its size is proportional to its distance from the [0, 1] point. The horizontal line corresponds to a p value of 0.05 and the vertical lines correspond to 2.0 fold change on both sides.
Figure 5C: P Value Distribution. This plot shows the counts of genes whose Rank Product p values are located in each 0.01 interval. So, the leftmost bar indicates the number of genes having p values less than 0.01. If the data is completely random and there are enough samples, the p values will have a uniform distribution, with approximately the same number of genes in each interval. If there are indeed a number of genes differentially expressed between the compared groups and the microarray experiments are properly designed and executed, we expect that the p values have a distribution with the highest density on the left side. If the highest density of the distribution is in the middle or on the right side, we will suspect a confounding factor that has a bigger influence on the data than the factor distinguishing the two compared groups.
Figure 5D: FDR. This plots traces the number of genes corresponding to each FDR (false discovery rate) cutoff. The table below lists the number of genes corresponding to commonly used FDR cutoffs.

Figure 5

![](`r fn.png[[5]]`)

**_[Go back to project home](`r yml$project$Home`)_**

Top genes

Figure 6: Top-ranked genes on both sides (left: higher in r g2.name; right: lower in r g2.name)

Figure 6

![](`r fn.png[[6]]`)

**_[Go back to project home](`r yml$project$Home`)_**

DEGs (differentially expressed genes)

DEGs were selected using the following rules:

If there are more than r num.top genes with p values less than r yml$input$deg$cutoff.p and FDR less than r yml$input$deg$cutoff.fdr, select ALL of them for further analysis; apply the next rule otherwise;
Select all genes with p values less than r yml$input$deg$cutoff.fdr; if the number of selected genes is greater than r yml$input$deg$num.top, select only the top-ranked r yml$input$deg$num.top genes otherwise. If Rank Product was not used as the statistical test of differential expression, rank the genes by their fold change (LogFC).

Click link to get full list of DEGs:

Higher in r g2.name

Lower in r g2.name

Table: Gene counts by FDR cutoffs.

pander::pander(fdr.table);

Figure 7: A heatmap using all DEGs. Expression measurements of each gene are normalized across samples for plotting; redder means higher expression level. The color of column sidebar indicates which group each sample belongs to.

Figure 7

**_[Go back to project home](`r yml$project$Home`)_**

Gene-set analysis

In this section, the unit of analysis is predefined gene sets. It is based on the assumption that if genes of the same gene set, such as those in the same signaling pathway or coregulated group, are more likely to have differential expression, the gene set is important to the biological difference between the two sample groups.

Over-representation analysis (ORA)

ORA evaluates whether the genes of any predefined gene sets, such as a KEGG pathway or GO term, are over-represented in DEGs. We uses hypergeometric test to evaluate whether N/Nt is significantly greater than M0/Mt, where Mt is the total number of genes in the data set, M is the total number of genes in both of the data set and a gene set, Nt is the total number of DEGs previously selected, and N is the overlapping of M and Nt.

We have archived a large collection of gene sets from multiple sources. By default, a subset of these gene sets, each including at least r min(geneset[[1]][, 'Size']), are used to test over-representation of DEGs in them. Click here to see a summary of these genesets.

Click link to view full ORA results:

ORA results

cat('Writing example of ORA analysis\n');

pdf(paste(path, 'figure', 'ora.pdf', sep='/'), w=4.8, h=3.6); 
id1<-geneset[[2]][[rownames(ora[[1]][[1]])[1]]]
id2<-rownames(degs[[1]]);
PlotVenn(id1, id2, c(paste('Gene set:', rownames(ora[[1]][[1]])[1]), paste('Higher expression in', g2.name)), rownames(anno));
dev.off();
ex.ora<-ConvertPdf2Png(paste(path, 'figure', 'ora.pdf', sep='/'), resize=80, density=150)

Figure 8: Example of DEGs over-represented in a predefined gene set.

Figure 8

![](`r ex.ora`)

**_[Go back to project home](`r yml$project$Home`)_**

Gene set enrichment analysis (GSEA)

GSEA is a different category of analyses applied to gene sets. Instead of using selected top DEGs like ORA, GSEA takes into account all genes in the data set after they were ranked by a statistical test such as Rank Product test. The original GSEA was developed by Broad Institute that is usually run as a stand-alone program. We used a local installation of GSEA and our own collection of gene sets from different databases, such as BioSystems and KEGG. By default, the normalized rank products of all genes were used to adjust gene expression matrix, so the genes will be ranked by their test statistics for GSEA.

GSEA results

cat('Writing example of GSEA\n');

gsea.ex<-gsea$stat;
gsea.ex<-gsea.ex[abs(gsea.ex$NES)==max(abs(gsea.ex$NES)), ];
ex.gsea<-dir(paste(path.gsea, gsea.ex$Collection, sep='/')); 
ex.gsea<-ex.gsea[grep(gsea.ex$Gene_set[1], ex.gsea)];
ex.gsea<-paste(path.gsea, gsea.ex$Collection, ex.gsea[grep('^enplot', ex.gsea)][1], sep='/');

Figure 9: Example of GSEA enrichment plot. See here for intepretaion of such a plot.

Figure 9

![](`r ex.gsea`)

**_[Go back to project home](`r yml$project$Home`)_**

Color-coded KEGG pathways

KEGG pathways are mostly canonical metabolic pathways conserved across species. Each pathway can include both genes(proteins) and chemical compounds. Color-coded (red = hiher in r g2.name) map of KEGG pathway illustrates intuitively the overall change of genes in a pathway. In this analysis, GSEA was used to identify significant pathway first, and the test statistic from differential expression analysis, such as the normalized rank products, was used to plot the maps.

KEGG maps

cat('Writing example of KEGG pathway analysis\n'); 

ex.kegg<-kegg.stat[rownames(kegg.ind), ];
if (nrow(ex.kegg) == 0) ex.kegg<-'' else {
  ex.kegg<- ex.kegg[abs(ex.kegg$NES)==max(abs(ex.kegg$NES)), ];
  ex.id<-tolower(substr(ex.kegg[1,2], 1, 8));
  ex.kegg<-paste(path.kegg, paste(ex.id, '.pathview.png', sep=''), sep='/');
}

Figure 10: Example of color-coded KEGG pathway (red = higher in r g2.name). See here to see the original pathway map on KEGG web site.

Figure 10

![](`r ex.kegg`)

**_[Go back to project home](`r yml$project$Home`)_**

Gene-gene set biclustering

A potential problem complicates the result intepretation of gene set analysis is that many gene sets are redundant to each other by including common DEGs. Biclustering of significant genes and gene sets can be used to reduce redundant information by identifying a smaller number of "modules". Each module includes multiple genes that are likely to be found in the same gene sets and gene sets that are likely to include the same DEGs. The biclustering was performed by Iterative Signature Algorithm in this analysis.

Biclustering results

cat('Writing example of biclustering analysis\n');

ex.bic<-gsea$bicluster[[1]]$summary;
ex.bic<-ex.bic[ex.bic[,2]*ex.bic[,3]==max(ex.bic[,2]*ex.bic[,3]), ];
ex.bic<-paste(path.gsea, 'bicluster', names(degs)[1], paste(ex.bic[1,1], '_heatmap.pdf', sep=''), sep='/');
ex.bic<-ConvertPdf2Png(ex.bic, resize=80, density=150)

Figure 11: Example of a gene-gene set biclustering module. Blue means the gene is both a DEG and a member of the gene set.

Figure 11

![](`r ex.bic`)

**_[Go back to project home](`r yml$project$Home`)_**

Appendix

Fold change vs. log(fold change)

In this report, fold change refers the ratio of two group means in their unlogged form. So a fold change of 2.0 means the average of the second group is increased to twice of the average of the first group; similarly, a fold change of 0.5 means the average is reduced to half. Log(fold change) equals to the log2-transformation of the fold change. The table below gives a few examples of the conversion of these 2 variables.

c<-c(1.25, 1.5, 2, 4, 8);
fc<-c(1/rev(c), 1, c);
lg<-log2(fc);
pct<-100*(fc-1);
t<-round(cbind(fc, lg, pct), 3);
colnames(t)<-c('Fold change', 'Log2(fold change)', 'Percentage change (%)');
pander::pander(t);

R/Biocondcutor packages and functions

The table below lists the core R/Bioconductor packages and functions used to generate this report while in-house functions are used to customize their outputs.

t<-strsplit(
"Create this report\tDEGandMore\tDeReport
Hierarchical clustering\tstats\thclust 
PCA\tstats\tprcomp
Differential expression\tRankProd\tRP
Heatmap\tstats\theatmap
Biclustering analysis\tisa2\tisa
Over-representation analysis\tawsomics\tTestGSE
Gene set enrichment analysis\tDEGandMore\tGSEAviaJava
Plot color-coded KEGG pathways\tpathview\tpathview
Write Java HTML datatables\tawsomics\tCreateDatatable
Write data to Excel\txlsx\tcreateWorkbook",
'\n')[[1]]; 
t<-do.call('rbind', strsplit(t, '\t'));
colnames(t)<-c('Task', 'R package', 'R function');
pander::pander(t);

Examples:

References

R: R Development Core Team, 2011. R: A Language and Environment for Statistical Computing. ISBN 3-900051-07-0. Home page.
Bioconductor: Gentleman RC et al., 2004. Bioconductor: open software development for computational biology and bioinformatics. Genome Biology. Home page.
Rank Product: Rank products: a simple, yet powerful, new method to detect differentially regulated genes in replicated microarray experiments. FEBS Lett. 2004 Aug 27;573(1-3):83-92. Home page.
Biclustering: Ihmels J, Bergmann S, Barkai N, 2004 Defining transcription modules using large-scale gene expression data. Bioinformatics. Home page.
GSEA: Subramanian A et al. 2005 Gene set enrichment analysis: A knowledge-based approach for interpreting genome-wide expression profiles PNAS. Home page
Awsomics: Home page