cat("Setting up and loading data\n");

knitr::opts_chunk$set(eval=TRUE, dpi=300, fig.pos="H", fig.width=8, fig.height=6, echo=FALSE, warning=FALSE, message=FALSE);

library(MASS);
library(gplots);
library(knitr);
library(rmarkdown); 
library(yaml); 
library(awsomics);
library(rchive);
library(DEGandMore);
library(pathview); 

#fn.yaml<-"/nas/is1/zhangz/projects/falk/2015-01_Worm_Antioxidant_Drugs/R/pairwise/MS010_A-vs-dTPP_A.yml"; # comment it out if running through knitr::knit function

# Loading inputs from yaml file
# if (!exists('fn.yaml')) stop('Input file not found\n'); 
# writeLines(yaml::as.yaml(yml), fn.yaml);
#yml <- yaml.load_file(fn.yaml);  

inputs<-yml$input;
if (class(inputs$anno) == 'character') inputs$anno<-readRDS(inputs$anno); 
if (class(inputs$expr) == 'character') inputs$expr<-readRDS(inputs$expr); 
if (class(inputs$geneset) == 'character') inputs$geneset<-readRDS(inputs$geneset); 

# All input variables
anno<-inputs$anno;
expr<-as.matrix(inputs$expr);
geneset<-inputs$geneset;
genome<-inputs$genome;
paired<-inputs$paired;
penalty<-inputs$penalty;
grps<-inputs$comparison;
g1.ind<-grps[[1]];
g2.ind<-grps[[2]];
g1.name<-names(grps)[1]
g2.name<-names(grps)[2];
g1.name<-gsub('-', '_', g1.name);
g2.name<-gsub('-', '_', g2.name);

# parameters of differential expression
deg.method<-inputs$deg$method;
cutoff.l2r<-inputs$deg$cutoff.l2r;
cutoff.p<-inputs$deg$cutoff.p;
cutoff.fdr<-inputs$deg$cutoff.fdr;
num.top<-inputs$deg$num.top;
nperm<-inputs$deg$nperm;
logged<-inputs$deg$logged;

# Check validity of inputs
if (!identical(rownames(expr), rownames(anno))) stop('Row names of annotation and gene expression data are not identical');
if (is.null(paired)) paired<-FALSE; 
if (is.null(g1.ind)) stop('Error: Index of samples in group 1 unknown');
if (is.null(g2.ind)) stop('Error: Index of samples in group 2 unknown');
if (is.null(g1.name)) stop('Error: Name of group 1 unknown');
if (is.null(g2.name)) stop('Error: Name of group 2 unknown');
if (length(g1.ind)<2) stop('Error: Not enough samples in group ', g1.name, ', minimum=2');
if (length(g2.ind)<2) stop('Error: Not enough samples in group ', g2.name, ', minimum=2');
if (paired & length(g1.ind)!=length(g2.ind)) stop('Error: Unequal sample sizes for paired comparison, ', length(g1.ind), ' vs. ', length(g2.ind));

# path to output files 
path.output<-yml$output;
path0<-paste(g1.name, g2.name, sep='-vs-');
path<-paste(path.output, path0, sep='/');
path.figure<-paste(path, 'figure', sep='/');
path.table <-paste(path, 'table',  sep='/');
if (!file.exists(path.output)) dir.create(path.output, recursive = TRUE);
if (!file.exists(path)) dir.create(path, recursive = TRUE);
if (!file.exists(path.figure)) dir.create(path.figure, recursive = TRUE);
if (!file.exists(path.table )) dir.create(path.table , recursive = TRUE);

# Re-process gene expression matrix
e1<-expr[, g1.ind, drop=FALSE];
e2<-expr[, g2.ind, drop=FALSE];
e1.2<-cbind(e1, e2); 
pctl<-apply(e1.2, 2, function(e) 100*rank(e)/length(e)); # percentile

inputs$expr<-e1.2;
fn.fig<-c();  # Figure file name and index
res<-list(inputs=inputs);  # Result set
cat('Analyze data variance\n');

fn.fig['variance']<-paste(path.figure, 'variance.pdf', sep='/'); 
pdf(fn.fig['variance'], width=6.4, height=8);
par(mfrow=c(3,1), mar=c(4,5,2,2));

# Distribution of average expression level
m<-rowMeans(e1.2);
d<-density(m);
x<-d$x;
y<-d$y;
plot(d, type='n', yaxs='i', xaxs='i', xlim=c(min(x), max(x)), ylim=c(0, 1.1*max(y)), xlab='', ylab='Density', main='A. Distribution of Expression Measurement', cex.lab=2, cex.main=1.5);
title(xlab='Gene Expression Measurement (sample average)', line=2, cex=2);

x0<-as.vector(summary(m))[2:5];
y0<-sapply(x0, function(x0, x, y) y[which(abs(x-x0)==min(abs(x-x0)))], x=x, y=y);
col<-c('blue', 'red', 'orange', 'green');
segments(x0, 0, x0, y0, lty=1, col=col, lwd=1);
lines(d, col='darkgrey', lwd=4); 
text(x0, y0/2, srt=90, labels=round(x0, 3));
legend(min(x)+0.75*(max(x)-min(x)), max(y), legend=c('First quantile', 'Median', 'Mean', 'Third quantile'), lty=1, col=col, bty='n', lwd=2);

# Distribution of standard deviation across samples
sd<-apply(e1.2, 1, sd);
d<-density(sd); 
x<-d$x;
y<-d$y;
plot(d, type='n', yaxs='i', xaxs='i', xlim=c(min(x), max(x)), ylim=c(0, 1.1*max(y)), xlab='', ylab='Density', main='B. Distribution of Variance', cex.lab=2, cex.main=1.5);
title(xlab='Gene Standard Deviation (between samples)', line=2, cex=2);

x0<-as.vector(summary(sd))[2:5];
y0<-sapply(x0, function(x0, x, y) y[which(abs(x-x0)==min(abs(x-x0)))], x=x, y=y);
col<-c('blue', 'red', 'orange', 'green');
segments(x0, 0, x0, y0, lty=1, col=col, lwd=1);
lines(d, col='darkgrey', lwd=4);
text(x0, y0/2, srt=90, labels=round(x0, 3));
legend(min(x)+0.75*(max(x)-min(x)), max(y), legend=c('First quantile', 'Median', 'Mean', 'Third quantile'), lty=1, col=col, bty='n', lwd=2);

plot(m, sd, xlab='', ylab='Standard Deviation', main='C. Variance vs. Expression Measurement', cex.lab=2, cex.main=1.5, cex=0.5, col='darkgrey');
title(xlab='Expression Measurement', line=2, cex=2);
lines(lowess(m, sd), lwd=3, col=2);

dev.off();
cat('Plot variance boxplot\n');

fn.fig['boxplot']<-paste(path.figure, 'boxplot.pdf', sep='/');
pdf(fn.fig['boxplot'], width=6.4, height=4); 
par(mfrow=c(1,1), mai=c(0.7, 0.6, .1,.1))
plot(0, type='n', xlim=c(0.5, 0.5+ncol(e1.2)), ylim=c(min(e1.2), max(e1.2)), ylab='', xlab='', xaxt='n'); 
lines(1:ncol(e1.2), apply(e1.2, 2, median), col='green', lwd=1);
title(ylab='Expression Level', line=2);
cex<-max(strwidth(colnames(e1.2), unit='in'));
boxplot(data.frame(e1.2), las=3, notch=TRUE, boxwex=.6, col=c(rep('lightgrey', ncol(e1)), rep('lightblue', ncol(e2))), names=colnames(e1.2), cex.axis=min(1, 0.4/cex), yaxt='n', add=TRUE, pch=19, cex=0.4)->x;
dev.off();
cat('Unsupervised hierarchical clustering\n');

fn.fig['clustering']<-paste(path.figure, 'clustering.pdf', sep='/');
pdf(fn.fig['clustering'], width=6.4, height=4.8);
par(mfrow=c(1,1), mai=c(0.2, 0.8, .2 , .2));
plot(hclust(dist(t(e1.2))), main='', ylab='', xlab='', sub='', cex=min(1, 5/ncol(e1.2)/0.12), frame.plot=TRUE);
title(ylab='Distance', line=2.5, cex.lab=2);
dev.off();
cat('Plot PCA\n'); 

fn.fig['pca']<-paste(path.figure, 'pca.pdf', sep='/');
pdf(fn.fig['pca'], width=6.4, height=4.8);

pca<-prcomp(t(e1.2));

X<-pca$x[,1];
Y<-pca$x[,2];
per<-round(summary(pca)$importance[2, 1:2]*100, 2);
pca$importance<-summary(pca)$importance;
res$pca<-pca;

col<-rep(c('darkgreen', 'deeppink1'), c(length(g1.ind), length(g2.ind)));

layout(matrix(1:2, nrow=1), width=c(3,1));
par(mai=c(1,1,.25,.25));

cx<-max(1.25, min(4, 24/length(X)))
plot(X, Y, col=col, pch=19, cex=cx, xlim=c(min(X)*1.1, max(X)*1.1), ylim=c(min(Y)*1.1, max(Y)*1.1), 
    xlab=paste('PC1', ', ', per[1], '%', sep=''), ylab=paste('PC2', ', ', per[2], '%', sep=''), cex.lab=1.5);

text(X, Y, label=1:length(X), col='white', cex=.3*cx);

par(mai=c(1, 0, 0.25, 0));
plot(0, type='n', xlim=c(0, 100), ylim=c(1, 100), axes=FALSE, bty='n', xaxs='i', yaxs='i', xlab='', ylab='');

w<-strwidth(1:length(X), cex=1.2);
w0<-1.2*80/max(w, 80);
h0<-1.2*(100/length(X))/4.0;
cex<-min(1.2, w0, h0); 

points(rep(5, length(X)), 100-(1:length(X))*4.0*cex, col=col, pch=19, cex=1.5*cex);
text(5+cex*5, 100-(1:length(X))*4.0*cex, labels=colnames(e1.2), adj=0, col=col, pch=19, cex=cex);
text(rep(5, length(labels)), 100-(1:length(X))*4.0*cex, labels=1:length(X), col='white', cex=0.8*cex);

dev.off();
cat('DEG analysis\n'); 

pth.deg<-paste(path, '/DEG', sep='');
if (!file.exists(pth.deg)) dir.create(pth.deg);

means<-cbind(rowMeans(expr[, g1.ind]), rowMeans(expr[, g2.ind]));
colnames(means)<-c(g1.name, g2.name);
if (logged) l2r<-means[,2]-means[,1] else l2r<-log2(means[,2]/means[,1]);
l2r[is.na(l2r)]<-0;
fc<-exp(l2r*log(2));
mn<-cbind(means, l2r, fc);
colnames(mn)<-c(paste('Mean', names(grps), sep='_'), 'LogFC', 'FoldChange');

# Adjust expr matrix if a penalty is given for large between-sample variance
if(penalty>0) {
  if (penalty>1) penalty<-1;
  colnames(means)<-paste('Mean_', c(g1.name, g2.name), sep='');
  if (paired) sd<-apply(expr[, g2.ind]-expr[, g1.ind], 1, sd) else {
    v1<-apply(expr[, g1.ind], 1, var);
    v2<-apply(expr[, g2.ind], 1, var);
    df1<-length(g1.ind)-1;
    df2<-length(g2.ind)-1;
    sd<-sqrt((df1*v1+df2*v2)/(df1+df2));
    sd[sd==0]<-min(sd[sd>0])/2;
   }
  pnl<-quantile(sd, probs=seq(0, 1, 0.01))[100*round(1-penalty,2)+1];
  e<-expr; # save the original
  e[, g2.ind]<-apply(e[, g2.ind], 2, function(d) means[,1]+(d-means[,1])/(pnl+sd));
} else e<-expr;

############################################################################################
de<-DeWrapper(e, grps, deg.method, list(nperm=nperm, paired=paired, logged=logged));
############################################################################################

de$results$stat[, c(1,2,4)]<-mn[, 1:3];
de$results$stat[, 3]<-de$results$stat[, 2]-de$results$stat[, 1]; 
de$data<-expr; 
saveRDS(de, file=paste(pth.deg, 'deg.rds', sep='/')); 
if (deg.method == DeMethods()[3]) { # Rank Product methods
  s<-de$results$stat; 
  rp<-de$results$rp$summarized; 
  out<-lapply(rp, function(x) x[, c('Rank', 'Pvalue', 'FDR')]);
  out[[3]]<-cbind(out[[1]], out[[2]]);
  cnm<-rep(paste(c('Higher', 'Lower'), 'in', g2.name, sep='_'), each=3);
  colnames(out[[3]])<-paste(colnames(out[[3]]), cnm, sep='_')
  out<-lapply(out, function(out) cbind(mn[rownames(out), , drop=FALSE], out));
  names(out)<-c(paste(c('Higher', 'Lower'), '_in_', g2.name, sep=''), 'Single_rank');
  degs<-lapply(out[1:2], function(x) {
    deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'FDR']<=cutoff.fdr & abs(x[, 'LogFC'])>=cutoff.l2r, , drop=FALSE];
    if (nrow(deg) < num.top) deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'Rank']<=num.top, , drop=FALSE]; 
    deg;
  }); 
  degs<-lapply(degs, function(d) d[order(d[, 'Rank']), ]);
  stat<-out[[3]];
} else {
  out<-cbind(mn[rownames(de$results$stat), , drop=FALSE], de$results$stat[, 5:ncol(s)]);
  degs<-lapply(c(1, -1), function(di) {
    x<-out[sign(out[, 'LogFC'])==di, , drop=FALSE];
    deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'FDR']<=cutoff.fdr & abs(x[, 'LogFC'])>=cutoff.l2r, , drop=FALSE];
    if (nrow(deg) < num.top) deg<-x[x[, 'Pvalue']<=cutoff.p & abs(x[, 'LogFC'])>=sort(abs(x[, 'LogFC']), decreasing = TRUE)[min(num.top, nrow(x))], , drop=FALSE]; 
    deg;
  }); 
  degs<-lapply(degs, function(d) d[order(-1*abs(d[, 'LogFC'])), , drop=FALSE]);
  stat<-out;
}

de$DEG<-degs;
up<-degs[[1]];
dn<-degs[[2]];

stat.table<-cbind(anno[rownames(stat), ], stat);
stat.formatted<-awsomics::FormatNumeric(cbind(ID=rownames(stat.table), stat.table));
stat.formatted$ID<-awsomics::AddHref(stat.formatted$ID, awsomics::UrlEntrezGene(stat.formatted$ID));
CreateDatatable(stat.formatted, fn = paste(path, 'DEG', 'all_genes.html', sep='/'), rownames = FALSE, caption = "Differential expression of all genes");
cat('Summarizing DEG analysis\n'); 

fn.fig['deg']<-paste(path.figure, 'deg.pdf', sep='/');
pdf(fn.fig['deg'], width=6.4, height=4.8);
par(mai=c(0.4, 0.5, 0.3, 0.05), mfrow=c(2,2));

#if (class(out)=='list') d<-out[[length(out)]] else d<-out;
d<-cbind(mn, de$results$stat[, 5:6]);

# M-A Plot
plot(rowMeans(d[,1:2]), d[,'LogFC'], main='M-A Plot', pch=19, col='darkgrey',  cex=.25, xlab='', ylab='', cex.axis=.75);
abline(h=0, lwd=1, col=1);
lines(lowess(d[,'LogFC']~rowMeans(d[,1:2])), lwd=2, col=3);
ylab<-paste('Log2(', g2.name, '/', g1.name, ')', sep='');
xlab<-paste('Log2(', g1.name, ')/2 + Log2(', g2.name, ')/2', sep='');
title(xlab=xlab, ylab=ylab, line=1.6, cex.lab=.75);

#Volcano Plot
x<-d[,'LogFC'];
p<-d[,'Pvalue'];
p[p==0]<-min(p[p>0])/exp(abs(d[p==0,3])*log(2));
y<--1*log10(p);
z<-sqrt(abs(x*y));
cx<-z/(max(z));
plot(x, y, main='Volcano Plot', pch=19, col='darkgrey',  cex=0.8*cx, xlab='', ylab='', cex.axis=.75, ylim=c(0, 1.1*max(y)), yaxs='i', xlim=c(-1*max(abs(x)), max(abs(x))), yaxt='n');
xlab<-paste('Log2(', g2.name, '/', g1.name, ')', sep='');
title(xlab=xlab, ylab='P value', line=1.6, cex.lab=.75);
axis(2, at=0:ceiling(max(y)), labels=10^(-1*(0:ceiling(max(y)))));
abline(h=-1*log10(0.05), v=c(-1,1), col=4, lty=3);
box();

# P Value Distribution
hist(p[p<1], br=seq(0, 1, 0.01), cex.axis=.75, main='P Value Distribution');
title(xlab='P value', ylab='Count', line=1.6, cex.lab=.75);
box();

# FDR
q<-round(d[, 'FDR'], 2);
n<-sapply(seq(min(q), 1, 0.01), function(x) length(q[q<=x]));
plot(1, type='n', log='y', xlab='', ylab='', cex.axis=.75, xlim=0.01*c(1, 100), ylim=c(1, nrow(expr)), main='FDR');
abline(v=seq(0, 1, .05), lty=3, col=8);
lines(seq(min(q), 1, 0.01), n, lwd=2, col=2);
title(xlab='FDR cutoff', ylab='Count', line=1.6, cex.lab=.75);
box();

dev.off();

#FDR counts
c<-c(0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.25);
n.fdr<-sapply(c, function(c) sapply(c(1, -1), function(di) nrow(d[sign(d[, 'LogFC'])==di & d[, 'FDR']<=c, , drop=FALSE])));
fdr.table<-cbind(c, t(n.fdr), colSums(n.fdr)); 
colnames(fdr.table)<-c('FDR', paste(c('Higher', 'Lower'), 'in', g2.name, sep='_'), 'Total')

# Plot top genes
fn.fig['top']<-paste(path.figure, 'top.pdf', sep='/');
pdf(fn.fig['top'], width=6.4, height=2.4);
par(mfrow=c(1,2), mai=c(0.7, 0.35, 0.3, 0.05));
ind<-c(rownames(up)[1], rownames(dn)[1]);

cex<-min(0.75, min(0.5/max(strwidth(colnames(e1.2), unit='in'))));
col<-rep(c('grey', 'lightblue'), c(length(g1.ind), length(g2.ind)));
for (ii in 1:2) barplot(e1.2[ind[ii], ], las=3, cex.names=cex, main=CleanHtmlTags(anno[ind[ii], 1], FALSE), cex.main=.75, cex.axis=.75, col=col);
dev.off();

# Plot heatmap of top genes
fn.fig['heatmap']<-paste(path.figure, 'heatmap.pdf', sep='/');
pdf(fn.fig['heatmap'], width=6.4, height=6.4);
par(mfrow=c(1,1));
goi<-e1.2[c(rownames(up), rownames(dn)), ];
rownames(goi)<-as.vector(anno[rownames(goi), 1]);
DegHeatmap(goi,  col=rep(c('chartreuse1', 'deeppink1'), c(length(g1.ind), length(g2.ind))), plot.new=FALSE);
dev.off();

## Gene counts by FDR cutoffs
fdr<-c(0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.25);
if (deg.method == DeMethods()[3]) {
  c1<-sapply(fdr, function(fdr) nrow(stat[stat[, 3]>0 & stat[, 7]<=fdr, , drop=FALSE]));
  c2<-sapply(fdr, function(fdr) nrow(stat[stat[, 3]<0 & stat[, 10]<=fdr, , drop=FALSE]));
} else {
  c1<-sapply(fdr, function(fdr) nrow(stat[stat[, 'LogFC']>0 & stat[, 'FDR']<=fdr, , drop=FALSE]));
  c2<-sapply(fdr, function(fdr) nrow(stat[stat[, 'LogFC']<0 & stat[, 'FDR']<=fdr, , drop=FALSE]));
} 
fdr.table<-cbind(fdr, c1, c2, c1+c2);
colnames(fdr.table)<-c('FDR cutoff', names(degs), 'Total');

# pre-ranking of genes
rnk<-sign(d[, 'LogFC'])*sqrt(d[,'LogFC']^2+log10(p)^2);
cat('Write out DEG results\n');

pth.deg1<-paste(pth.deg, '/Higher_in_', g2.name, sep='');
pth.deg2<-paste(pth.deg, '/Lower_in_', g2.name, sep='');
pth.deg1.bars<-paste(pth.deg1, '/bars', sep='');
pth.deg2.bars<-paste(pth.deg2, '/bars', sep='');

if (!exists(pth.deg)) dir.create(pth.deg, showWarnings = FALSE);
if (!exists(pth.deg1.bars)) dir.create(pth.deg1.bars, showWarnings = FALSE, recursive = TRUE);
if (!exists(pth.deg2.bars)) dir.create(pth.deg2.bars, showWarnings = FALSE, recursive = TRUE);
ids<-c(up=rownames(up), dn=rownames(dn));
names(ids)<-paste(rep(c(pth.deg1.bars, pth.deg2.bars), c(nrow(up), nrow(dn))), '/', ids, '.pdf', sep='');

col<-rep(c('lightgrey', 'lightblue'), c(length(g1.ind), length(g2.ind)));
wid<-min(1.8, 1.2/(0.1*max(nchar(colnames(expr[, c(g1.ind, g2.ind)])))));
fn.barplot<-sapply(names(ids), function(nm) {
    pdf(nm, w=8, h=6); 
    par(mai=c(1.2, 1, 0.6, 0.2));
    barplot(expr[ids[nm], c(g1.ind, g2.ind)], las=3, col=col, ylab='Expression level', cex.lab=wid, cex.names=wid);
    title(main=paste(ids[nm], CleanHtmlTags(as.vector(anno[ids[nm], 1])), sep=' - '), cex.main=2);
    #plot.new();
    par(mai=c(1.2, 1, 0.6, 0.2));
    barplot(pctl[ids[nm], c(g1.ind, g2.ind)], las=3, col=col, ylab='Percentile (%)', ylim=c(0, 100), cex.lab=2, cex.names=wid);
    title(main=paste(ids[nm], CleanHtmlTags(as.vector(anno[ids[nm], 1])), sep=' - '), cex.main=2);

    dev.off();
    nm;
});

# Write index tables of DEGs
cnm<-c(colnames(anno), colnames(up), 'Samples');
up.tbl<-data.frame(anno[rownames(up), , drop=FALSE], up, rep('Figure', nrow(up)));
colnames(up.tbl)<-cnm;
up.tbl$Samples<-AddHref(up.tbl$Samples, paste('./bars/', rownames(up), '.pdf', sep='')); 
up.formatted<-GeneList2Datatable(FormatNumeric(up.tbl), paste(pth.deg1, 'index.html', sep='/'), col.symbol=1, genome=genome, title=paste('Genes with higher expression in', g2.name));

dn.tbl<-data.frame(anno[rownames(dn), , drop=FALSE], dn, rep('Figure', nrow(dn)));
colnames(dn.tbl)<-cnm;
dn.tbl$Samples<-AddHref(dn.tbl$Samples, paste('./bars/', rownames(dn), '.pdf', sep='')); 
dn.formatted<-GeneList2Datatable(FormatNumeric(dn.tbl), paste(pth.deg2, 'index.html', sep='/'), col.symbol=1, genome=genome, title=paste('Genes with lower expression in', g2.name));
cat('ORA analysis\n');

########################################################################################################
ora<-lapply(degs, function(gs) awsomics::TestGSE(rownames(gs), rownames(expr), geneset$list));
########################################################################################################

path.ora<-paste(path, 'ORA', sep='/');
if (!file.exists(path.ora)) dir.create(path.ora);
sapply(names(degs), function(nm) if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/'), recursive = TRUE))->x;
names(ora)<-names(degs);
saveRDS(ora, paste(path.ora, 'ora_full.rds', sep='/'));

ora.table<-lapply(ora, function(ora) {
  g<-ora$stat;
  lst<-ora$list[rownames(g)];
  rand<-sapply(lst, function(l) { 
    mtrx<-e1.2[rownames(e1.2) %in% l, , drop=FALSE];
    flexclust::randIndex(table(kmeans(t(mtrx), 2)[[1]], rep(1:2, sapply(grps, length))));
  })
  an<-geneset[[1]][rownames(g), ];
  data.frame(row.names = rownames(g), stringsAsFactors = TRUE, Source=an[,1], Collection=an[,2], Term=an[, 'Name'], 
                Total=rowSums(ora$size[rownames(g), 3:4]), Within=ora$size[rownames(g), 4], Enrichment=g[, 'Odds_Ratio'], 
                Rand=rand, Pvalue=g[, 'P_HyperGeo'], FDR=g[, 'FDR_BH']);
}); 

########################################################################################################
# Biclustering
bic<-lapply(ora, function(g) if (nrow(g[[1]])<5) NA else BiclusterFromList(g[[2]][rownames(g[[1]])[1:min(250, nrow(g[[1]]))]]));
ora$bicluster<-bic;

bic2gs<-lapply(bic, function(bic) lapply(bic[[2]], rownames));
gs2bic<-lapply(bic2gs, function(b) lapply(split(rep(1:length(b), sapply(b, length)), unlist(b, use.names=FALSE)), function(x) sort(unique(x))));
gs2bic<-lapply(gs2bic, function(b) sapply(b, function(b) paste(b, collapse=';')));

########################################################################################################
# Output tables
ora.table<-lapply(names(ora.table), function(nm) {
  b<-gs2bic[[nm]];
  b<-b[rownames(ora.table[[nm]])];
  b[is.na(b)]<-'';
  ora.table[[nm]]$Cluster<-b;
  ora.table[[nm]];
});
names(ora.table)<-names(bic);
cat('Write out ORA results\n');

path.ora<-paste(path, 'ORA', sep='/');
if (!file.exists(path.ora)) dir.create(path.ora, recursive = TRUE);

# Split results into tables by collections
ora.wrapped<-lapply(names(ora.table), function(nm) {
  pth<-paste(path.ora, nm, 'table', sep='/');
  if (!file.exists(pth)) dir.create(pth, recursive = TRUE); 
  WrapGSE(ora.table[[nm]][, -(1:3)], geneset$meta, pth, TRUE); 
});
names(ora.wrapped)<-names(ora.table);

# create index.html file
ora.tbl<-lapply(ora.wrapped, function(o) {
  n<-lapply(o$formatted, function(o) sapply(o, nrow)); 
  fn<-unlist(o$file, use.names=FALSE); 
  fn<-sub(paste(path.ora, '/', sep=''), '', fn); 
  s<-rep(names(n), sapply(n, length));
  c<-unlist(lapply(n, names), use.names=FALSE); 
  n<-as.vector(unlist(n, use.names=FALSE)); 
  tbl<-data.frame(Source=s, Collection=c, N=n, stringsAsFactors = FALSE);
  tbl$N<- AddHref(tbl$N, fn);
  rownames(tbl)<-paste(tbl[[1]], tbl[[2]], sep='_'); 
  tbl
});
u<-sort(union(rownames(ora.tbl[[1]]), rownames(ora.tbl[[2]]))); 
ora.tbl<-lapply(ora.tbl, function(t) {
  t<-t[u, ];
  t[is.na(t[,3]), 3]<-0; 
  rownames(t)<-u; 
  t;
});
ora.ind<-cbind(ora.tbl[[1]], N2=ora.tbl[[2]]$N); 
ora.ind[is.na(ora.ind[,1]), 1]<-ora.tbl[[2]][is.na(ora.ind[,1]), 1]; 
ora.ind[is.na(ora.ind[,2]), 2]<-ora.tbl[[2]][is.na(ora.ind[,2]), 2]; 
names(ora.ind)<-c('Source', 'Collection', names(ora.wrapped)); 
CreateDatatable(ora.ind, paste(path.ora, 'index.html', sep='/'), rownames = FALSE, caption = "Click on number to see list"); 

# All tested gene sets
gs.table<-lapply(split(geneset[[1]][, 'Collection'], geneset[[1]][, 'Source']), table);
gs.table<-data.frame(stringsAsFactors = FALSE, Source=rep(names(gs.table), sapply(gs.table, length)), 
                     Collection=unlist(lapply(gs.table, names), use.names=FALSE), Geneset=as.numeric(unlist(gs.table)));
awsomics::CreateDatatable(gs.table, paste(path.ora, 'geneset.html', sep='/'), rownames = FALSE, caption = "Gene set collections");

# Formatted tables of ORA results
stat.slim<-data.frame(anno[rownames(stat), 1, drop=FALSE], awsomics::FormatNumeric(stat));
ora.formatted<-lapply(names(ora.table), function(nm) {
  if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/'));
  if (!file.exists(paste(path.ora, nm, 'term', sep='/'))) dir.create(paste(path.ora, nm, 'term', sep='/'));

  t<-ora.table[[nm]];
  an<-geneset[[1]][rownames(t), ];
  t$Term<-awsomics::AddHref(t$Term, an[, 'URL']);
  t$Total[1:min(250, nrow(t))]<-awsomics::AddHref(t$Total[1:min(250, nrow(t))], paste('./term/term_', 1:min(250, nrow(t)), '.html', sep=''));
  #t$Within<-awsomics::AddHref(t$Within, paste('./term/term_', rownames(t), '_within.html', sep=''));

  t<-awsomics::FormatNumeric(t);
  awsomics::CreateDatatable(t, fn=paste(path.ora, nm, 'term.html', sep='/'), rownames = FALSE);
  sapply(1:min(250, nrow(t)), function(i) {
    g<-geneset$list[rownames(t)[i]][[1]];
    t1<-stat.slim[rownames(stat.slim) %in% g, , drop=FALSE];
    t1<-t1[order(t1[, paste('Rank', nm, sep='_')]), , drop=FALSE];
    fn<-paste(path.ora, nm, 'term', paste('term_', i, '.html', sep=''), sep='/');
    GeneList2Datatable(t1, fn, genome = genome, title = rownames(t)[i]);
  });

  t;
}); 
names(ora.formatted)<-names(ora.table);

# Formatted table of biclustering
bic.formatted<-lapply(names(bic), function(nm) {
  if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/'));
  if (!file.exists(paste(path.ora, nm, 'bicluster', sep='/'))) dir.create(paste(path.ora, nm, 'bicluster', sep='/'));

  t<-awsomics::FormatNumeric(bic[[nm]]$summary);

  # write individual files of biclusters
  fns<-sapply(1:nrow(t), function(i) {
    fn<- paste(path.ora, nm, 'bicluster', paste(t[i, 1], c('_terms.html', '_genes.html', '_heatmap.pdf'), sep=''), sep='/');
    t1<-ora.formatted[[nm]][rownames(bic[[nm]][[2]][[i]]), ];
    CreateDatatable(t1, fn[1], FALSE, caption=paste('Terms of', t[i,1]));
    t2<-stat.slim[colnames(bic[[nm]][[2]][[i]]), , drop=FALSE];
    t2<-t2[order(t2[, paste('Rank', nm, sep='_')]), , drop=FALSE];
    GeneList2Datatable(t2, fn[2], genome = genome, title=paste('Genes of', t[i,1])); 
    t3<-bic[[nm]][[2]][[i]];
    rownames(t3)<-paste(ora.table[[nm]][rownames(t3), 2], ora.table[[nm]][rownames(t3), 3], sep=' - ');
    colnames(t3)<-paste(rownames(anno[colnames(t3), , drop=FALSE]), CleanHtmlTags(anno[colnames(t3), 1]), sep=' - ');
    if (min(t3)==1) col.min<-'#0000FF' else col.min<-'#999999';
    PlotHeatmap(t3, size.max=Inf, fn=sub('.pdf$', '', fn[3]), col.min=col.min, col.max='#0000FF');
    fn;
  });
  bic.fn<-t(fns);

  # write index table
  t$Num_Terms<-AddHref(t$Num_Terms, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 1]));
  t$Num_Genes<-AddHref(t$Num_Genes, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 2]));
  t$ID<-AddHref(t$ID, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 3]));
  CreateDatatable(t, fn=paste(path.ora, nm, 'bicluster.html', sep='/'), rownames = FALSE);

  t;
});
cat('Preparing GSEA analysis\n');

path.gsea<-paste(path, 'GSEA', sep='/');
if (file.exists(path.gsea)) unlink(path.gsea, recursive = TRUE); # Path must not exist

tmp<-as.character(as.integer(Sys.time())); 
if (file.exists(tmp)) unlist(tmp, recursive = TRUE); 
dir.create(tmp, recursive = TRUE); 

# loading template yaml
input.gsea<-yml$input$gsea;
gsea.yml<-input.gsea$template;
if (input.gsea$remote) gsea.yml<-yaml.load(RCurl::getURL(gsea.yml)) else gsea.yml<-yaml.load_file(gsea.yml)

# If Rank Product was used for DE, adjust the data for GSEA so the group mean difference reflects RP results
e1.2_gsea<-e1.2; 
if (deg.method[1] == 'DeRankP') {
  log.rp<-log10(de$results$stat[, 'RP2']/de$results$stat[, 'RP1']);
  m1<-rowMeans(e1.2[, g1.ind]); 
  m2<-rowMeans(e1.2[, g2.ind]); 
  m0<-m1+log.rp; 
  e1.2_gsea[, g2.ind]<-e1.2[, g2.ind] - (m2-m0); 
}; 

fn.gsea<-PrepareGSEA(e1.2_gsea, grps, paste(tmp, 'input', sep='/'), desc = anno[[1]]); 

gsea.yml$jar<-input.gsea$jar;
gsea.yml$preranked<-FALSE;
gsea.yml$thread<-input.gsea$thread; 
gsea.yml$name<-tmp;
gsea.yml$groups$control<-g1.name;
gsea.yml$groups$case<-g2.name;
gsea.yml$out<-path;
gsea.yml$input<-fn.gsea[1];
gsea.yml$class<-fn.gsea[2];
gsea.yml$chip<-c();
gsea.yml$gmt<-input.gsea$gmt;
cat('GSEA analysis\n');

gsea.cmmd<-GSEAviaJava(gsea.yml); 
file.rename(paste(gsea.yml$out, tmp, sep='/'), path.gsea); 
gsea.tbl<-readRDS(paste(path.gsea, 'full_table.rds', sep='/'));
for (i in 1:ncol(gsea.tbl)) gsea.tbl[[i]]<-as.vector(gsea.tbl[[i]]); 
saveRDS(gsea.tbl, file=paste(path.gsea, 'full_table.rds', sep='/')); 

# Load gene lists of gene sets from GMT files
fn.gmt<-gsea.yml$gmt
gmts<-lapply(fn.gmt, PGSEA::readGmt); 
gsea.lst<-lapply(gmts, function(gmt) {
  nm<-sub(' $', '', sapply(gmt, function(g) g@reference)); 
  lst<-lapply(gmt, function(g) g@ids); 
  names(lst)<-toupper(nm); 
  lst;
}); 

########################################################################################################
# Biclustering
gsea.tbl.sig<-gsea.tbl[gsea.tbl$PValue<=0.05, ]; 
gsea.tbl.sig<-gsea.tbl.sig[order(gsea.tbl.sig$PValue), , drop=FALSE]; 
gsea.lst.sig<-lapply(1:nrow(gsea.tbl.sig), function(i) gsea.lst[[gsea.tbl.sig[i, 1]]][[gsea.tbl.sig[i, 2]]]); 
gsea.lst.sig<-lapply(gsea.lst.sig, function(l) l[l %in% unlist(lapply(degs, rownames), use.names=FALSE)]); 
names(gsea.lst.sig)<-rownames(gsea.tbl.sig); 
gsea.lst.sig<-gsea.lst.sig[sapply(gsea.lst.sig, length)>1];
gsea.gs<-split(gsea.lst.sig, gsea.tbl.sig[names(gsea.lst.sig), ncol(gsea.tbl.sig)]); 
#gsea.gs<-rev(gsea.gs); 
gsea.gs<-lapply(rev(gsea.gs), function(x) x[!duplicated(x)]); 
names(gsea.gs)<-names(degs); 

gsea.bic<-lapply(gsea.gs, function(g) BiclusterFromList(g[1:min(250, length(g))])); 
gsea.bic2gs<-lapply(gsea.bic, function(bic) if (identical(NA, bic)) NA else lapply(bic[[2]], rownames));
gsea.gs2bic<-lapply(gsea.bic2gs, function(b) lapply(split(rep(1:length(b), sapply(b, length)), unlist(b, use.names=FALSE)), function(x) sort(unique(x))));
gsea.gs2bic<-lapply(gsea.gs2bic, function(b) sapply(b, function(b) paste(b, collapse=';')));
cat('Biclustering GSEA results\n');

# Formatted table of biclustering
gsea.bic.formatted<-lapply(names(gsea.bic), function(nm) {
  path.bic<-paste(path.gsea, 'bicluster', nm, sep='/'); 
  if (!file.exists(path.bic)) dir.create(path.bic, recursive = TRUE);

  if (identical(NA, gsea.bic[[nm]])) t<-data.frame(ID='No clusters found', stringsAsFactors = FALSE) else {
    t<-FormatNumeric(gsea.bic[[nm]]$summary);
    t$ID<-as.vector(t$ID); 

    # write individual files of biclusters
    fns<-sapply(1:nrow(t), function(i) {
      fn<-paste(path.bic, paste(t[i, 1], c('_terms.html', '_genes.html', '_heatmap.pdf'), sep=''), sep='/');
      t1<-gsea.tbl[rownames(gsea.bic[[nm]][[2]][[i]]), , drop=FALSE];
      CreateDatatable(FormatNumeric(t1[, 1:6]), fn[1], FALSE, caption=paste('Terms of', t[i,1]));
      t2<-stat.slim[colnames(gsea.bic[[nm]][[2]][[i]]), , drop=FALSE];
      t2<-t2[order(t2[, paste('Rank', nm, sep='_')]), , drop=FALSE];
      GeneList2Datatable(t2, fn[2], genome = genome, title=paste('Genes of', t[i,1])); 
      t3<-gsea.bic[[nm]][[2]][[i]];
      rownames(t3)<-paste(gsea.tbl[rownames(t3), 1], gsea.tbl[rownames(t3), 2], sep=' - ');
      colnames(t3)<-paste(rownames(anno[colnames(t3), , drop=FALSE]), CleanHtmlTags(anno[colnames(t3), 1]), sep=' - ');
      if (min(t3)==1) col.min<-'#0000FF' else col.min<-'#999999';
      PlotHeatmap(t3, size.max=Inf, fn=sub('.pdf$', '', fn[3]), col.min=col.min, col.max='#0000FF');
      fn;
    });
    gsea.bic.fn<-t(fns);

    # write index table
    t$Num_Terms<-AddHref(t$Num_Terms, paste(t$ID, '_terms.html', sep=''));
    t$Num_Genes<-AddHref(t$Num_Genes, paste(t$ID, '_genes.html', sep=''));
    t$ID<-AddHref(t$ID, paste(t$ID, '_heatmap.pdf', sep=''));
  }

  CreateDatatable(t, fn=paste(path.bic, 'index.html', sep='/'), rownames = FALSE);

  t;
});

gsea<-list(path=gsea.yml$out, geneset=gsea.lst, stat=gsea.tbl, bicluster=gsea.bic); 
cat('Write out biclustering results\n');

t<-data.frame(matrix('table', nr=2, nc=2), stringsAsFactors = FALSE); 
t[1, ]<-AddHref(t[1, ], paste("../ORA", names(degs), 'bicluster.html', sep='/')); 
t[2, ]<-AddHref(t[2, ], paste("../GSEA/bicluster", names(degs), 'index.html', sep='/')); 
colnames(t)<-names(degs);
rownames(t)<-c('Over-representation analysis', 'Gene set enrichment analysis'); 
CreateDatatable(t, paste(path, 'DEG', 'bicluster_index.html', sep='/')); 
cat('GSEA analysis of KEGG pathways\n');

path.kegg<-paste(path.gsea, 'kegg', sep='/'); 
if (file.exists(path.kegg)) unlink(path.kegg, recursive = TRUE); 
dir.create(path.kegg, recursive = TRUE); 

kegg.stat<-gsea.tbl[gsea.tbl$Collection=='KEGG_pathway' & gsea.tbl$PValue<=0.05, , drop=FALSE];

# Run pathview, generate figures in a temp directory and copy them to output directory
path.temp<-as.character(as.integer(Sys.time()));
if (!file.exists(path.temp)) dir.create(path.temp);
setwd(path.temp); 
if (nrow(kegg.stat) == 0) kegg.ind<-kegg.stat[, -1] else {
  kegg.id<-tolower(substr(kegg.stat$Gene_set, 1, 8)); 
  kegg.dff<-rowMeans(e1.2_gsea[, g2.ind])-rowMeans(e1.2_gsea[, g1.ind]); 

  fn.kegg<-sapply(kegg.id, function(id) {
    capture.output(kegg<-pathview(kegg.dff, pathway.id=id, species = GetGenomeAlias(genome, "code"), 
                                low = list(gene = "blue"), kegg.dir=yml$input$gsea$kegg))->x; 
    paste(id, '.pathview.png', sep=''); 
  }); 
  names(fn.kegg)<-kegg.id;
  fn.kegg<-fn.kegg[file.exists(fn.kegg)]; 
  if (length(fn.kegg)>0) file.rename(fn.kegg, paste(path.kegg, fn.kegg, sep='/')); 

  kegg.ind<-FormatNumeric(kegg.stat[kegg.id %in% names(fn.kegg), -1]);
  if (nrow(kegg.ind) > 0) kegg.ind$Gene_set<-AddHref(kegg.ind$Gene_set, fn.kegg); 
  kegg.ind<-kegg.ind[order(kegg.ind$NES), , drop=FALSE]; 
}
setwd('..');
unlink(path.temp, recursive = TRUE); 

CreateDatatable(kegg.ind, paste(path.kegg, 'index.html', sep='/')); 
cat('Writing tables to Excel file\n');

# Excel spreadsheets
xls<-list(stat.table, stat.table[rownames(up), , drop=FALSE], stat.table[rownames(dn), , drop=FALSE],
          ora.table[[1]], ora.table[[2]], gsea.tbl);
xls<-lapply(xls, function(x) {
  for (i in 1:ncol(x)) x[[i]]<-CleanHtmlTags(x[[i]], FALSE); 
  x;
});
names(xls)<-c('Complete List', 
  paste('Top', nrow(up), ', higher in ', g2.name, sep=''), 
  paste('Top', nrow(dn), ', lower in ', g2.name, sep=''),
  paste('ORA, higher in ', g2.name, sep=''),
  paste('ORA, lower in ', g2.name, sep=''),
  "GSEA"
);
xls<-lapply(xls, awsomics::FormatNumeric);
tbl<-sapply(names(xls), function(nm) write.csv(xls[[nm]], paste(path.table, paste(nm, '.csv', sep=''), sep='/'))); 
fn.xls<-paste(path, paste(g1.name, g2.name, sep='-vs-'), sep='/');
WriteExcel(xls, fn.xls); 
cat('Converting figures to png format\n');

fn.png<-lapply(fn.fig, function(f) ConvertPdf2Png(f, resize=80, density=150));
fn.png[[length(fn.png)+1]]<-sapply(names(bic), function(nm) {
  t<-bic[[nm]]$summary;
  n<-t[,2]*t[,3];
  f<-paste('bicluster/', t[n==max(n)[1], 1], '_heatmap.pdf', sep='');
  awsomics::ConvertPdf2Png(paste(path.ora, nm, f, sep='/'), resize=80, density=150); 
});
# fn.png[[length(fn.png)+1]]<-sapply(kegg$Formatted, function(t) {
#   ids<-as.vector(t$Pathway_ID);
#   fn<-paste(path, 'KEGG', 'Files', 'figures', 'colored_plots', paste(ids, '.pathview.png', sep=''), sep='/');
#   fn[file.exists(fn)][1];
# });
cat('Writing out all results\n'); 

res$de<-de;
res$ora<-ora;
res$gsea<-gsea;
saveRDS(res, file=paste(path, 'result.rds', sep='/'));
**_[Go back to project home](`r yml$project$Home`)_**


`r g1.name` vs. `r g2.name`

Results at a glance

Click links to go directly to results, or read through the rest of this page for details: - Differential gene expression ([All genes](DEG/all_genes.html)) - [`r nrow(degs[[1]])` genes](`r paste('DEG', names(degs)[1], 'index.html', sep='/')`) with higher expression in `r g2.name` - [`r nrow(degs[[2]])` genes](`r paste('DEG', names(degs)[2], 'index.html', sep='/')`) with lower expression in `r g2.name` - Analysis of differentially expressed genes (DEGs) - [ORA](ORA/index.html) (Over-representation analysis) of gene sets - [GSEA](GSEA/index.html) (Gene set enrichment analysis) of gene sets - [Top hits](GSEA/full_table.html) cheat sheet - [Colored maps](GSEA/kegg/index.html) of KEGG pathways - [Biclustering](DEG/bicluster_index.html) of significant genes and gene sets - Download - [Excel](`r paste(path0, '.xlsx', sep='')`) file with statistical tables - [Zip](`r paste(path0, '.zip', sep='')`) file with complete results.
**_[Go back to project home](`r yml$project$Home`)_**

Project

cat('Writing DE report\n');
cat('Writing project background\n'); 

lns<-lapply(names(yml$project), function(nm) {
  c(paste('##', nm), '\n', yml$project[[nm]], '\n'); 
});
lns<-paste(do.call('c', lns), collapse='\n'); 

r lns

**_[Go back to project home](`r yml$project$Home`)_**


Inputs

This document reports the analysis results from comparing the transcriptomes of two sample groups: r g1.name vs. r g2.name.


Required variables

The following inputs are required to generate this report. You can download an example of input specifications as a yaml file from https://raw.githubusercontent.com/zhezhangsh/DEGandMore/master/examples/DeReport/DeReport.yml

**_[Go back to project home](`r yml$project$Home`)_**

Input summary

**_[Go back to project home](`r yml$project$Home`)_**


Analysis and results


Sample analysis

This section analyzes the samples by summarizing their global expression patterns, through descriptive statistics, unsupervised clustering, and sample-sample correlation.


Data distribution of all genes

Distribution of average expression level and sample-sample variance.

Figure 1
![](`r fn.png[[1]]`)
**_[Go back to project home](`r yml$project$Home`)_**

Sample-sample correlation

All genes in this data set were used to calculate the Pearson's correlation between each pair of samples. The result is often used to identify potential outlier samples as the outliers tend to have relatively lower correlation coefficient to any of the other samples.

Figure 2
![](`r fn.png[[2]]`)
Figure 3
![](`r fn.png[[3]]`)
Figure 4
![](`r fn.png[[4]]`)
**_[Go back to project home](`r yml$project$Home`)_**

Gene-level differential expression

By default, Rank Product (RP), a non-paramatric statistical test based on ranks of fold changes, is used for the statistical analysis. The outputs of RP include:

**_[Go back to project home](`r yml$project$Home`)_**

Differential expression of all genes

The full table of single gene differential expression can be seen here.

We use the following visualization methods to get an overview of all r nrow(expr) genes:

Figure 5
![](`r fn.png[[5]]`)
**_[Go back to project home](`r yml$project$Home`)_**

Top genes

Figure 6: Top-ranked genes on both sides (left: higher in r g2.name; right: lower in r g2.name)

Figure 6
![](`r fn.png[[6]]`)
**_[Go back to project home](`r yml$project$Home`)_**

DEGs (differentially expressed genes)

DEGs were selected using the following rules:

Click link to get full list of DEGs:

Higher in r g2.name

Lower in r g2.name

Table: Gene counts by FDR cutoffs.

pander::pander(fdr.table);

Figure 7: A heatmap using all DEGs. Expression measurements of each gene are normalized across samples for plotting; redder means higher expression level. The color of column sidebar indicates which group each sample belongs to.

Figure 7

**_[Go back to project home](`r yml$project$Home`)_**

Gene-set analysis

In this section, the unit of analysis is predefined gene sets. It is based on the assumption that if genes of the same gene set, such as those in the same signaling pathway or coregulated group, are more likely to have differential expression, the gene set is important to the biological difference between the two sample groups.

Over-representation analysis (ORA)

ORA evaluates whether the genes of any predefined gene sets, such as a KEGG pathway or GO term, are over-represented in DEGs. We uses hypergeometric test to evaluate whether N/Nt is significantly greater than M0/Mt, where Mt is the total number of genes in the data set, M is the total number of genes in both of the data set and a gene set, Nt is the total number of DEGs previously selected, and N is the overlapping of M and Nt.

We have archived a large collection of gene sets from multiple sources. By default, a subset of these gene sets, each including at least r min(geneset[[1]][, 'Size']), are used to test over-representation of DEGs in them. Click here to see a summary of these genesets.

Click link to view full ORA results:

ORA results

cat('Writing example of ORA analysis\n');

pdf(paste(path, 'figure', 'ora.pdf', sep='/'), w=4.8, h=3.6); 
id1<-geneset[[2]][[rownames(ora[[1]][[1]])[1]]]
id2<-rownames(degs[[1]]);
PlotVenn(id1, id2, c(paste('Gene set:', rownames(ora[[1]][[1]])[1]), paste('Higher expression in', g2.name)), rownames(anno));
dev.off();
ex.ora<-ConvertPdf2Png(paste(path, 'figure', 'ora.pdf', sep='/'), resize=80, density=150)

Figure 8: Example of DEGs over-represented in a predefined gene set.

Figure 8
![](`r ex.ora`)
**_[Go back to project home](`r yml$project$Home`)_**

Gene set enrichment analysis (GSEA)

GSEA is a different category of analyses applied to gene sets. Instead of using selected top DEGs like ORA, GSEA takes into account all genes in the data set after they were ranked by a statistical test such as Rank Product test. The original GSEA was developed by Broad Institute that is usually run as a stand-alone program. We used a local installation of GSEA and our own collection of gene sets from different databases, such as BioSystems and KEGG. By default, the normalized rank products of all genes were used to adjust gene expression matrix, so the genes will be ranked by their test statistics for GSEA.

GSEA results

cat('Writing example of GSEA\n');

gsea.ex<-gsea$stat;
gsea.ex<-gsea.ex[abs(gsea.ex$NES)==max(abs(gsea.ex$NES)), ];
ex.gsea<-dir(paste(path.gsea, gsea.ex$Collection, sep='/')); 
ex.gsea<-ex.gsea[grep(gsea.ex$Gene_set[1], ex.gsea)];
ex.gsea<-paste(path.gsea, gsea.ex$Collection, ex.gsea[grep('^enplot', ex.gsea)][1], sep='/');

Figure 9: Example of GSEA enrichment plot. See here for intepretaion of such a plot.

Figure 9
![](`r ex.gsea`)
**_[Go back to project home](`r yml$project$Home`)_**

Color-coded KEGG pathways

KEGG pathways are mostly canonical metabolic pathways conserved across species. Each pathway can include both genes(proteins) and chemical compounds. Color-coded (red = hiher in r g2.name) map of KEGG pathway illustrates intuitively the overall change of genes in a pathway. In this analysis, GSEA was used to identify significant pathway first, and the test statistic from differential expression analysis, such as the normalized rank products, was used to plot the maps.

KEGG maps

cat('Writing example of KEGG pathway analysis\n'); 

ex.kegg<-kegg.stat[rownames(kegg.ind), ];
if (nrow(ex.kegg) == 0) ex.kegg<-'' else {
  ex.kegg<- ex.kegg[abs(ex.kegg$NES)==max(abs(ex.kegg$NES)), ];
  ex.id<-tolower(substr(ex.kegg[1,2], 1, 8));
  ex.kegg<-paste(path.kegg, paste(ex.id, '.pathview.png', sep=''), sep='/');
}

Figure 10: Example of color-coded KEGG pathway (red = higher in r g2.name). See here to see the original pathway map on KEGG web site.

Figure 10
![](`r ex.kegg`)
**_[Go back to project home](`r yml$project$Home`)_**

Gene-gene set biclustering

A potential problem complicates the result intepretation of gene set analysis is that many gene sets are redundant to each other by including common DEGs. Biclustering of significant genes and gene sets can be used to reduce redundant information by identifying a smaller number of "modules". Each module includes multiple genes that are likely to be found in the same gene sets and gene sets that are likely to include the same DEGs. The biclustering was performed by Iterative Signature Algorithm in this analysis.

Biclustering results

cat('Writing example of biclustering analysis\n');

ex.bic<-gsea$bicluster[[1]]$summary;
ex.bic<-ex.bic[ex.bic[,2]*ex.bic[,3]==max(ex.bic[,2]*ex.bic[,3]), ];
ex.bic<-paste(path.gsea, 'bicluster', names(degs)[1], paste(ex.bic[1,1], '_heatmap.pdf', sep=''), sep='/');
ex.bic<-ConvertPdf2Png(ex.bic, resize=80, density=150)

Figure 11: Example of a gene-gene set biclustering module. Blue means the gene is both a DEG and a member of the gene set.

Figure 11
![](`r ex.bic`)
**_[Go back to project home](`r yml$project$Home`)_**


Appendix


Fold change vs. log(fold change)

In this report, fold change refers the ratio of two group means in their unlogged form. So a fold change of 2.0 means the average of the second group is increased to twice of the average of the first group; similarly, a fold change of 0.5 means the average is reduced to half. Log(fold change) equals to the log2-transformation of the fold change. The table below gives a few examples of the conversion of these 2 variables.

c<-c(1.25, 1.5, 2, 4, 8);
fc<-c(1/rev(c), 1, c);
lg<-log2(fc);
pct<-100*(fc-1);
t<-round(cbind(fc, lg, pct), 3);
colnames(t)<-c('Fold change', 'Log2(fold change)', 'Percentage change (%)');
pander::pander(t); 

R/Biocondcutor packages and functions

The table below lists the core R/Bioconductor packages and functions used to generate this report while in-house functions are used to customize their outputs.

t<-strsplit(
"Create this report\tDEGandMore\tDeReport
Hierarchical clustering\tstats\thclust 
PCA\tstats\tprcomp
Differential expression\tRankProd\tRP
Heatmap\tstats\theatmap
Biclustering analysis\tisa2\tisa
Over-representation analysis\tawsomics\tTestGSE
Gene set enrichment analysis\tDEGandMore\tGSEAviaJava
Plot color-coded KEGG pathways\tpathview\tpathview
Write Java HTML datatables\tawsomics\tCreateDatatable
Write data to Excel\txlsx\tcreateWorkbook",
'\n')[[1]]; 
t<-do.call('rbind', strsplit(t, '\t'));
colnames(t)<-c('Task', 'R package', 'R function');
pander::pander(t);

Examples:


References

**_[Go back to project home](`r yml$project$Home`)_**

END OF DOCUMENT



zhezhangsh/DEGandMore documentation built on Sept. 22, 2022, 9:55 a.m.