knitr::opts_chunk$set(eval=TRUE, eval=FALSE, dpi=300, fig.pos="H", fig.width=8, fig.height=6, echo=FALSE, warning=FALSE, message=FALSE);

library(awsomics);
library(gplots);
library(knitr);
library(rmarkdown); 
library(yaml); 

 fn.yaml<-"/nas/is1/zhangz/projects/falk/2015-01_Worm_Antioxidant_Drugs/results/pairwise/DeReport.yml"; # comment it out if running through knitr::knit function

# Loading inputs from yaml file
# if (!exists('fn.yaml')) stop('Input file not found\n'); 
# writeLines(yaml::as.yaml(yml), fn.yaml);
 yml <- yaml.load_file(fn.yaml);  

inputs<-yml$input;
if (class(inputs$anno) == 'character') inputs$anno<-readRDS(inputs$anno); 
if (class(inputs$expr) == 'character') inputs$expr<-readRDS(inputs$expr); 
if (class(inputs$geneset) == 'character') inputs$geneset<-readRDS(inputs$geneset); 

# All input variables
anno<-inputs$anno;
expr<-as.matrix(inputs$expr);
geneset<-inputs$geneset;
genome<-inputs$genome;
paired<-inputs$paired;
penalty<-inputs$penalty;
grps<-inputs$comparison;
g1.ind<-grps[[1]];
g2.ind<-grps[[2]];
g1.name<-names(grps)[1]
g2.name<-names(grps)[2];
g1.name<-gsub('-', '_', g1.name);
g2.name<-gsub('-', '_', g2.name);

# parameters of differential expression
deg.method<-inputs$deg$method;
cutoff.l2r<-inputs$deg$cutoff.l2r;
cutoff.p<-inputs$deg$cutoff.p;
cutoff.fdr<-inputs$deg$cutoff.fdr;
num.top<-inputs$deg$num.top;
nperm<-inputs$deg$nperm;
logged<-inputs$deg$logged;

# Check validity of inputs
if (!identical(rownames(expr), rownames(anno))) stop('Row names of annotation and gene expression data are not identical');
if (is.null(paired)) paired<-FALSE; 
if (is.null(g1.ind)) stop('Error: Index of samples in group 1 unknown');
if (is.null(g2.ind)) stop('Error: Index of samples in group 2 unknown');
if (is.null(g1.name)) stop('Error: Name of group 1 unknown');
if (is.null(g2.name)) stop('Error: Name of group 2 unknown');
if (length(g1.ind)<2) stop('Error: Not enough samples in group ', g1.name, ', minimum=2');
if (length(g2.ind)<2) stop('Error: Not enough samples in group ', g2.name, ', minimum=2');
if (paired & length(g1.ind)!=length(g2.ind)) stop('Error: Unequal sample sizes for paired comparison, ', length(g1.ind), ' vs. ', length(g2.ind));

# path to output files 
path.output<-yml$output;
path0<-paste(g1.name, g2.name, sep='-vs-');
path<-paste(path.output, path0, sep='/');
path.figure<-paste(path, 'figure', sep='/');
if (!file.exists(path.output)) dir.create(path.output, recursive = TRUE);
if (!file.exists(path)) dir.create(path, recursive = TRUE);
if (!file.exists(path.figure)) dir.create(path.figure, recursive = TRUE);

# Re-process gene expression matrix
e1<-expr[, g1.ind, drop=FALSE];
e2<-expr[, g2.ind, drop=FALSE];
e1.2<-cbind(e1, e2); 
pctl<-apply(e1.2, 2, function(e) 100*rank(e)/length(e)); # percentile

inputs$expr<-e1.2;
fn.fig<-c();  # Figure file name and index
res<-list(inputs=inputs);  # Result set
fn.fig['variance']<-paste(path.figure, 'variance.pdf', sep='/'); 
pdf(fn.fig['variance'], width=6.4, height=8);
par(mfrow=c(3,1), mar=c(4,5,2,2));

# Distribution of average expression level
m<-rowMeans(e1.2);
d<-density(m);
x<-d$x;
y<-d$y;
plot(d, type='n', yaxs='i', xaxs='i', xlim=c(min(x), max(x)), ylim=c(0, 1.1*max(y)), xlab='', ylab='Density', main='A. Distribution of Expression Measurement', cex.lab=2, cex.main=1.5);
title(xlab='Gene Expression Measurement (sample average)', line=2, cex=2);

x0<-as.vector(summary(m))[2:5];
y0<-sapply(x0, function(x0, x, y) y[which(abs(x-x0)==min(abs(x-x0)))], x=x, y=y);
col<-c('blue', 'red', 'orange', 'green');
segments(x0, 0, x0, y0, lty=1, col=col, lwd=1);
lines(d, col='darkgrey', lwd=4); 
text(x0, y0/2, srt=90, labels=round(x0, 3));
legend(min(x)+0.75*(max(x)-min(x)), max(y), legend=c('First quantile', 'Median', 'Mean', 'Third quantile'), lty=1, col=col, bty='n', lwd=2);

# Distribution of standard deviation across samples
sd<-apply(e1.2, 1, sd);
d<-density(sd); 
x<-d$x;
y<-d$y;
plot(d, type='n', yaxs='i', xaxs='i', xlim=c(min(x), max(x)), ylim=c(0, 1.1*max(y)), xlab='', ylab='Density', main='B. Distribution of Variance', cex.lab=2, cex.main=1.5);
title(xlab='Gene Standard Deviation (between samples)', line=2, cex=2);

x0<-as.vector(summary(sd))[2:5];
y0<-sapply(x0, function(x0, x, y) y[which(abs(x-x0)==min(abs(x-x0)))], x=x, y=y);
col<-c('blue', 'red', 'orange', 'green');
segments(x0, 0, x0, y0, lty=1, col=col, lwd=1);
lines(d, col='darkgrey', lwd=4);
text(x0, y0/2, srt=90, labels=round(x0, 3));
legend(min(x)+0.75*(max(x)-min(x)), max(y), legend=c('First quantile', 'Median', 'Mean', 'Third quantile'), lty=1, col=col, bty='n', lwd=2);

plot(m, sd, xlab='', ylab='Standard Deviation', main='C. Variance vs. Expression Measurement', cex.lab=2, cex.main=1.5, cex=0.5, col='darkgrey');
title(xlab='Expression Measurement', line=2, cex=2);
lines(lowess(m, sd), lwd=3, col=2);

dev.off();
fn.fig['boxplot']<-paste(path.figure, 'boxplot.pdf', sep='/');
pdf(fn.fig['boxplot'], width=6.4, height=4); 
par(mfrow=c(1,1), mai=c(0.7, 0.6, .1,.1))
plot(0, type='n', xlim=c(0.5, 0.5+ncol(e1.2)), ylim=c(min(e1.2), max(e1.2)), ylab='', xlab='', xaxt='n'); 
lines(1:ncol(e1.2), apply(e1.2, 2, median), col='green', lwd=1);
title(ylab='Expression Level', line=2);
cex<-max(strwidth(colnames(e1.2), unit='in'));
boxplot(data.frame(e1.2), las=3, notch=TRUE, boxwex=.6, col=c(rep('lightgrey', ncol(e1)), rep('lightblue', ncol(e2))), names=colnames(e1.2), cex.axis=min(1, 0.4/cex), yaxt='n', add=TRUE, pch=19, cex=0.4)->x;
dev.off();
fn.fig['clustering']<-paste(path.figure, 'clustering.pdf', sep='/');
pdf(fn.fig['clustering'], width=6.4, height=4.8);
par(mfrow=c(1,1), mai=c(0.2, 0.8, .2 , .2));
plot(hclust(dist(t(e1.2))), main='', ylab='', xlab='', sub='', cex=min(1, 5/ncol(e1.2)/0.12), frame.plot=TRUE);
title(ylab='Distance', line=2.5, cex.lab=2);
dev.off();
fn.fig['pca']<-paste(path.figure, 'pca.pdf', sep='/');
pdf(fn.fig['pca'], width=6.4, height=4.8);

pca<-prcomp(t(e1.2));

X<-pca$x[,1];
Y<-pca$x[,2];
per<-round(summary(pca)$importance[2, 1:2]*100, 2);
pca$importance<-summary(pca)$importance;
res$pca<-pca;

col<-rep(c('darkgreen', 'deeppink1'), c(length(g1.ind), length(g2.ind)));

layout(matrix(1:2, nrow=1), width=c(3,1));
par(mai=c(1,1,.25,.25));

cx<-max(1.25, min(4, 24/length(X)))
plot(X, Y, col=col, pch=19, cex=cx, xlim=c(min(X)*1.1, max(X)*1.1), ylim=c(min(Y)*1.1, max(Y)*1.1), 
    xlab=paste('PC1', ', ', per[1], '%', sep=''), ylab=paste('PC2', ', ', per[2], '%', sep=''), cex.lab=1.5);

text(X, Y, label=1:length(X), col='white', cex=.3*cx);

par(mai=c(1, 0, 0.25, 0));
plot(0, type='n', xlim=c(0, 100), ylim=c(1, 100), axes=FALSE, bty='n', xaxs='i', yaxs='i', xlab='', ylab='');

w<-strwidth(1:length(X), cex=1.2);
w0<-1.2*80/max(w, 80);
h0<-1.2*(100/length(X))/4.0;
cex<-min(1.2, w0, h0); 

points(rep(5, length(X)), 100-(1:length(X))*4.0*cex, col=col, pch=19, cex=1.5*cex);
text(5+cex*5, 100-(1:length(X))*4.0*cex, labels=colnames(e1.2), adj=0, col=col, pch=19, cex=cex);
text(rep(5, length(labels)), 100-(1:length(X))*4.0*cex, labels=1:length(X), col='white', cex=0.8*cex);

dev.off();
pth.deg<-paste(path, '/DEG', sep='');
if (!file.exists(pth.deg)) dir.create(pth.deg);

means<-cbind(rowMeans(expr[, g1.ind]), rowMeans(expr[, g2.ind]));
colnames(means)<-c(g1.name, g2.name);
if (logged) l2r<-means[,2]-means[,1] else  l2r<-log2(means[,2]/means[,1]);
l2r[is.na(l2r)]<-0;
fc<-exp(l2r*log(2));
mn<-cbind(means, l2r, fc);
colnames(mn)<-c(paste('Mean', names(grps), sep='_'), 'LogFC', 'FoldChange');

# Adjust expr matrix if a penalty is given for large between-sample variance
if(penalty>0) {
  if (penalty>1) penalty<-1;
  colnames(means)<-paste('Mean_', c(g1.name, g2.name), sep='');
  if (paired) sd<-apply(expr[, g2.ind]-expr[, g1.ind], 1, sd) else {
    v1<-apply(expr[, g1.ind], 1, var);
    v2<-apply(expr[, g2.ind], 1, var);
    df1<-length(g1.ind)-1;
    df2<-length(g2.ind)-1;
    sd<-sqrt((df1*v1+df2*v2)/(df1+df2));
    sd[sd==0]<-min(sd[sd>0])/2;
   }
  pnl<-quantile(sd, probs=seq(0, 1, 0.01))[100*round(1-penalty,2)+1];
  e<-expr; # save the original
  e[, g2.ind]<-apply(e[, g2.ind], 2, function(d) means[,1]+(d-means[,1])/(pnl+sd));
} else e<-expr;

############################################################################################
de<-DeWrapper(e, grps, deg.method, list(nperm=nperm, paired=paired, logged=logged));
############################################################################################

saveRDS(de, file=paste(pth.deg, 'deg.rds', sep='/')); 
if (deg.method == DeMethods()[3]) { # Rank Product methods
  s<-de$results$stat; 
  rp<-de$results$rp$summarized; 
  out<-lapply(rp, function(x) x[, c('Rank', 'Pvalue', 'FDR')]);
  out[[3]]<-cbind(out[[1]], out[[2]]);
  cnm<-rep(paste(c('Higher', 'Lower'), 'in', g2.name, sep='_'), each=3);
  colnames(out[[3]])<-paste(colnames(out[[3]]), cnm, sep='_')
  out<-lapply(out, function(out) cbind(mn[rownames(out), , drop=FALSE], out));
  names(out)<-c(paste(c('Higher', 'Lower'), '_in_', g2.name, sep=''), 'Single_rank');
  degs<-lapply(out[1:2], function(x) {
    deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'FDR']<=cutoff.fdr & abs(x[, 'LogFC'])>=cutoff.l2r, , drop=FALSE];
    if (nrow(deg) < num.top) deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'Rank']<=num.top, , drop=FALSE]; 
    deg;
  }); 
  degs<-lapply(degs, function(d) d[order(d[, 'Rank']), ]);
  stat<-out[[3]];
} else {
  out<-cbind(mn[rownames(de$results$stat), , drop=FALSE], de$results$stat[, 5:ncol(s)]);
  degs<-lapply(c(1, -1), function(di) {
    x<-out[sign(out[, 'LogFC'])==di, , drop=FALSE];
    deg<-x[x[, 'Pvalue']<=cutoff.p & x[, 'FDR']<=cutoff.fdr & abs(x[, 'LogFC'])>=cutoff.l2r, , drop=FALSE];
    if (nrow(deg) < num.top) deg<-x[x[, 'Pvalue']<=cutoff.p & abs(x[, 'LogFC'])>=sort(abs(x[, 'LogFC']), decreasing = TRUE)[min(num.top, nrow(x))], , drop=FALSE]; 
    deg;
  }); 
  degs<-lapply(degs, function(d) d[order(-1*abs(d[, 'LogFC'])), , drop=FALSE]);
  stat<-out;
}

up<-degs[[1]];
dn<-degs[[2]];

stat.table<-cbind(anno[rownames(stat), ], stat);
stat.formatted<-awsomics::FormatNumeric(cbind(ID=rownames(stat.table), stat.table));
stat.formatted$ID<-awsomics::AddHref(stat.formatted$ID, awsomics::UrlEntrezGene(stat.formatted$ID));
awsomics::CreateDatatable(stat.formatted, fn = paste(path, 'DEG', 'all_genes.html', sep='/'), rownames = FALSE, caption = "Differential expression of all genes");
fn.fig['deg']<-paste(path.figure, 'deg.pdf', sep='/');
pdf(fn.fig['deg'], width=6.4, height=4.8);
par(mai=c(0.4, 0.5, 0.3, 0.05), mfrow=c(2,2));

#if (class(out)=='list') d<-out[[length(out)]] else d<-out;
d<-cbind(mn, de$results$stat[, 5:6]);

# M-A Plot
plot(rowMeans(d[,1:2]), d[,'LogFC'], main='M-A Plot', pch=19, col='darkgrey',  cex=.25, xlab='', ylab='', cex.axis=.75);
abline(h=0, lwd=1, col=1);
lines(lowess(d[,'LogFC']~rowMeans(d[,1:2])), lwd=2, col=3);
ylab<-paste('Log2(', g2.name, '/', g1.name, ')', sep='');
xlab<-paste('Log2(', g1.name, ')/2 + Log2(', g2.name, ')/2', sep='');
title(xlab=xlab, ylab=ylab, line=1.6, cex.lab=.75);

#Volcano Plot
x<-d[,'LogFC'];
p<-d[,'Pvalue'];
p[p==0]<-min(p[p>0])/exp(abs(d[p==0,3])*log(2));
y<--1*log10(p);
z<-sqrt(abs(x*y));
cx<-z/(max(z));
plot(x, y, main='Volcano Plot', pch=19, col='darkgrey',  cex=0.8*cx, xlab='', ylab='', cex.axis=.75, ylim=c(0, 1.1*max(y)), yaxs='i', xlim=c(-1*max(abs(x)), max(abs(x))), yaxt='n');
xlab<-paste('Log2(', g2.name, '/', g1.name, ')', sep='');
title(xlab=xlab, ylab='P value', line=1.6, cex.lab=.75);
axis(2, at=0:ceiling(max(y)), labels=10^(-1*(0:ceiling(max(y)))));
abline(h=-1*log10(0.05), v=c(-1,1), col=4, lty=3);
box();

# P Value Distribution
hist(p[p<1], br=seq(0, 1, 0.01), cex.axis=.75, main='P Value Distribution');
title(xlab='P value', ylab='Count', line=1.6, cex.lab=.75);
box();

# FDR
q<-round(d[, 'FDR'], 2);
n<-sapply(seq(min(q), 1, 0.01), function(x) length(q[q<=x]));
plot(1, type='n', log='y', xlab='', ylab='', cex.axis=.75, xlim=0.01*c(1, 100), ylim=c(1, nrow(expr)), main='FDR');
abline(v=seq(0, 1, .05), lty=3, col=8);
lines(seq(min(q), 1, 0.01), n, lwd=2, col=2);
title(xlab='FDR cutoff', ylab='Count', line=1.6, cex.lab=.75);
box();

dev.off();

#FDR counts
c<-c(0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.25);
n.fdr<-sapply(c, function(c) sapply(c(1, -1), function(di) nrow(d[sign(d[, 'LogFC'])==di & d[, 'FDR']<=c, , drop=FALSE])));
fdr.table<-cbind(c, t(n.fdr), colSums(n.fdr)); 
colnames(fdr.table)<-c('FDR', paste(c('Higher', 'Lower'), 'in', g2.name, sep='_'), 'Total')

# Plot top genes
fn.fig['top']<-paste(path.figure, 'top.pdf', sep='/');
pdf(fn.fig['top'], width=6.4, height=2.4);
par(mfrow=c(1,2), mai=c(0.7, 0.35, 0.3, 0.05));
ind<-c(rownames(up)[1], rownames(dn)[1]);

cex<-min(0.75, min(0.5/max(strwidth(colnames(e1.2), unit='in'))));
col<-rep(c('grey', 'lightblue'), c(length(g1.ind), length(g2.ind)));
for (ii in 1:2) barplot(e1.2[ind[ii], ], las=3, cex.names=cex, main=as.vector(anno[ind[ii], 1]), cex.main=.75, cex.axis=.75, col=col);
dev.off();

# Plot heatmap of top genes
fn.fig['heatmap']<-paste(path.figure, 'heatmap.pdf', sep='/');
pdf(fn.fig['heatmap'], width=6.4, height=6.4);
par(mfrow=c(1,1));
goi<-e1.2[c(rownames(up), rownames(dn)), ];
rownames(goi)<-as.vector(anno[rownames(goi), 1]);
DegHeatmap(goi,  col=rep(c('chartreuse1', 'deeppink1'), c(length(g1.ind), length(g2.ind))), plot.new=FALSE);
dev.off();

## Gene counts by FDR cutoffs
fdr<-c(0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.25);
if (deg.method == DeMethods()[3]) {
  c1<-sapply(fdr, function(fdr) nrow(stat[stat[, 3]>0 & stat[, 7]<=fdr, , drop=FALSE]));
  c2<-sapply(fdr, function(fdr) nrow(stat[stat[, 3]<0 & stat[, 10]<=fdr, , drop=FALSE]));
} else {
  c1<-sapply(fdr, function(fdr) nrow(stat[stat[, 'LogFC']>0 & stat[, 'FDR']<=fdr, , drop=FALSE]));
  c2<-sapply(fdr, function(fdr) nrow(stat[stat[, 'LogFC']<0 & stat[, 'FDR']<=fdr, , drop=FALSE]));
} 
fdr.table<-cbind(fdr, c1, c2, c1+c2);
colnames(fdr.table)<-c('FDR cutoff', names(degs), 'Total');

# pre-ranking of genes
rnk<-sign(d[, 'LogFC'])*sqrt(d[,'LogFC']^2+log10(p)^2);
pth.deg1<-paste(pth.deg, '/Higher_in_', g2.name, sep='');
pth.deg2<-paste(pth.deg, '/Lower_in_', g2.name, sep='');
pth.deg1.bars<-paste(pth.deg1, '/bars', sep='');
pth.deg2.bars<-paste(pth.deg2, '/bars', sep='');

if (!exists(pth.deg)) dir.create(pth.deg, showWarnings = FALSE);
if (!exists(pth.deg1.bars)) dir.create(pth.deg1.bars, showWarnings = FALSE, recursive = TRUE);
if (!exists(pth.deg2.bars)) dir.create(pth.deg2.bars, showWarnings = FALSE, recursive = TRUE);
ids<-c(up=rownames(up), dn=rownames(dn));
names(ids)<-paste(rep(c(pth.deg1.bars, pth.deg2.bars), c(nrow(up), nrow(dn))), '/', ids, '.pdf', sep='');

col<-rep(c('lightgrey', 'lightblue'), c(length(g1.ind), length(g2.ind)));
wid<-min(1.8, 1.2/(0.1*max(nchar(colnames(expr[, c(g1.ind, g2.ind)])))));
fn.barplot<-sapply(names(ids), function(nm) {
    pdf(nm, w=8, h=6); 
    par(mai=c(1.2, 1, 0.6, 0.2));
    barplot(expr[ids[nm], c(g1.ind, g2.ind)], las=3, col=col, ylab='Expression level', cex.lab=wid, cex.names=wid);
    title(main=paste(ids[nm], as.vector(anno[ids[nm], 1]), sep=' - '), cex.main=2);
    #plot.new();
    par(mai=c(1.2, 1, 0.6, 0.2));
    barplot(pctl[ids[nm], c(g1.ind, g2.ind)], las=3, col=col, ylab='Percentile (%)', ylim=c(0, 100), cex.lab=2, cex.names=wid);
    title(main=paste(ids[nm], as.vector(anno[ids[nm], 1]), sep=' - '), cex.main=2);

    dev.off();
    nm;
});

# Write index tables of DEGs
cnm<-c(colnames(anno), colnames(up), 'Samples');
up.tbl<-data.frame(anno[rownames(up), , drop=FALSE], up, rep('Figure', nrow(up)));
colnames(up.tbl)<-cnm;
up.tbl$Samples<-awsomics::AddHref(up.tbl$Samples, paste('./bars/', rownames(up), '.pdf', sep='')); 
up.formatted<-DEGandMore::GeneList2Datatable(awsomics::FormatNumeric(up.tbl), paste(pth.deg1, 'index.html', sep='/'), col.symbol=1, genome=genome, 
                                             title=paste('Genes with higher expression in', g2.name));

dn.tbl<-data.frame(anno[rownames(dn), , drop=FALSE], dn, rep('Figure', nrow(dn)));
colnames(dn.tbl)<-cnm;
dn.tbl$Samples<-awsomics::AddHref(dn.tbl$Samples, paste('./bars/', rownames(dn), '.pdf', sep='')); 
dn.formatted<-DEGandMore::GeneList2Datatable(awsomics::FormatNumeric(dn.tbl), paste(pth.deg2, 'index.html', sep='/'), col.symbol=1, genome=genome, 
                                             title=paste('Genes with lower expression in', g2.name));
########################################################################################################
gse<-lapply(degs, function(gs) awsomics::TestGSE(rownames(gs), rownames(expr), geneset$list));
########################################################################################################

path.ora<-paste(path, 'ORA', sep='/');
if (!file.exists(path.ora)) dir.create(path.ora);
sapply(names(degs), function(nm) if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/'), recursive = TRUE))->x;
names(gse)<-names(degs);
saveRDS(gse, paste(path.ora, 'ora_full.rds', sep='/'));

ora.table<-lapply(gse, function(gse) {
  g<-gse$stat;
  lst<-gse$list[rownames(g)];
  rand<-sapply(lst, function(l) { 
    mtrx<-e1.2[rownames(e1.2) %in% l, , drop=FALSE];
    flexclust::randIndex(table(kmeans(t(mtrx), 2)[[1]], rep(1:2, sapply(grps, length))));
  })
  an<-geneset[[1]][rownames(g), ];
  data.frame(row.names = rownames(g), stringsAsFactors = TRUE, Source=an[,1], Collection=an[,2], Term=an[, 'Name'], 
                Total=rowSums(gse$size[rownames(g), 3:4]), Within=gse$size[rownames(g), 4], Enrichment=g[, 'Odds_Ratio'], 
                Rand=rand, Pvalue=g[, 'P_HyperGeo'], FDR=g[, 'FDR_BH']);
}); 


########################################################################################################
# Biclustering
bic<-lapply(gse, function(g) if (nrow(g[[1]])<5) NA else awsomics::BiclusterFromList(g[[2]][rownames(g[[1]])[1:min(250, nrow(g[[1]]))]]));

bic2gs<-lapply(bic, function(bic) lapply(bic[[2]], rownames));
gs2bic<-lapply(bic2gs, function(b) lapply(split(rep(1:length(b), sapply(b, length)), unlist(b, use.names=FALSE)), function(x) sort(unique(x))));
gs2bic<-lapply(gs2bic, function(b) sapply(b, function(b) paste(b, collapse=';')));

########################################################################################################
# Output tables
ora.table<-lapply(names(ora.table), function(nm) {
  b<-gs2bic[[nm]];
  b<-b[rownames(ora.table[[nm]])];
  b[is.na(b)]<-'';
  ora.table[[nm]]$Cluster<-b;
  ora.table[[nm]];
});
names(ora.table)<-names(bic);
path.ora<-paste(path, 'ORA', sep='/');
if (!file.exists(path.ora)) dir.create(path.ora, recursive = TRUE);

# Split results into tables by collections
ora.wrapped<-lapply(names(ora.table), function(nm) {
  pth<-paste(path.ora, nm, 'table', sep='/');
  if (!file.exists(pth)) dir.create(pth, recursive = TRUE); 
  WrapGSE(ora.table[[nm]][, -(1:3)], geneset$meta, TRUE, pth); 
});
names(ora.wrapped)<-names(ora.table);

# create index.html file
ora.tbl<-lapply(ora.wrapped, function(o) {
  n<-lapply(o$formatted, function(o) sapply(o, nrow)); 
  fn<-unlist(o$file, use.names=FALSE); 
  fn<-sub(paste(path.ora, '/', sep=''), '', fn); 
  s<-rep(names(n), sapply(n, length));
  c<-unlist(lapply(n, names), use.names=FALSE); 
  n<-as.vector(unlist(n, use.names=FALSE)); 
  tbl<-data.frame(Source=s, Collection=c, N=n, stringsAsFactors = FALSE);
  tbl$N<- AddHref(tbl$N, fn);
  rownames(tbl)<-paste(tbl[[1]], tbl[[2]], sep='_'); 
  tbl
});
u<-sort(union(rownames(ora.tbl[[1]]), rownames(ora.tbl[[2]]))); 
ora.tbl<-lapply(ora.tbl, function(t) {
  t<-t[u, ];
  t[is.na(t[,3]), 3]<-0; 
  rownames(t)<-u; 
  t;
});
ora.ind<-cbind(ora.tbl[[1]], N2=ora.tbl[[2]]$N); 
ora.ind[is.na(ora.ind[,1]), 1]<-ora.tbl[[2]][is.na(ora.ind[,1]), 1]; 
ora.ind[is.na(ora.ind[,2]), 2]<-ora.tbl[[2]][is.na(ora.ind[,2]), 2]; 
names(ora.ind)<-c('Source', 'Collection', names(ora.wrapped)); 
CreateDatatable(ora.ind, paste(path.ora, 'index.html', sep='/'), rownames = FALSE, caption = "Click on number to see list"); 

# All tested gene sets
gs.table<-lapply(split(geneset[[1]][, 'Collection'], geneset[[1]][, 'Source']), table);
gs.table<-data.frame(stringsAsFactors = FALSE, Source=rep(names(gs.table), sapply(gs.table, length)), 
                     Collection=unlist(lapply(gs.table, names), use.names=FALSE), Geneset=as.numeric(unlist(gs.table)));
awsomics::CreateDatatable(gs.table, paste(path.ora, 'geneset.html', sep='/'), rownames = FALSE, caption = "Gene set collections");

# Formatted tables of ORA results
stat.slim<-data.frame(anno[rownames(stat), 1, drop=FALSE], awsomics::FormatNumeric(stat));
ora.formatted<-lapply(names(ora.table), function(nm) {
  if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/'));
  if (!file.exists(paste(path.ora, nm, 'term', sep='/'))) dir.create(paste(path.ora, nm, 'term', sep='/'));

  t<-ora.table[[nm]];
  an<-geneset[[1]][rownames(t), ];
  t$Term<-awsomics::AddHref(t$Term, an[, 'URL']);
  t$Total[1:min(250, nrow(t))]<-awsomics::AddHref(t$Total[1:min(250, nrow(t))], paste('./term/term_', 1:min(250, nrow(t)), '.html', sep=''));
  #t$Within<-awsomics::AddHref(t$Within, paste('./term/term_', rownames(t), '_within.html', sep=''));

  t<-awsomics::FormatNumeric(t);
  awsomics::CreateDatatable(t, fn=paste(path.ora, nm, 'term.html', sep='/'), rownames = FALSE);
  sapply(1:min(250, nrow(t)), function(i) {
    g<-geneset$list[rownames(t)[i]][[1]];
    t1<-stat.slim[rownames(stat.slim) %in% g, , drop=FALSE];
    t1<-t1[order(t1[, paste('Rank', nm, sep='_')]), , drop=FALSE];
    fn<-paste(path.ora, nm, 'term', paste('term_', i, '.html', sep=''), sep='/');
    GeneList2Datatable(t1, fn, genome = genome, title = rownames(t)[i]);
  });

  t;
}); 
names(ora.formatted)<-names(ora.table);

# Formatted table of biclustering
bic.formatted<-lapply(names(bic), function(nm) {
  if (!file.exists(paste(path.ora, nm, sep='/'))) dir.create(paste(path.ora, nm, sep='/'));
  if (!file.exists(paste(path.ora, nm, 'bicluster', sep='/'))) dir.create(paste(path.ora, nm, 'bicluster', sep='/'));

  t<-awsomics::FormatNumeric(bic[[nm]]$summary);

  # write individual files of biclusters
  fns<-sapply(1:nrow(t), function(i) {
    fn<- paste(path.ora, nm, 'bicluster', paste(t[i, 1], c('_terms.html', '_genes.html', '_heatmap.pdf'), sep=''), sep='/');
    t1<-ora.formatted[[nm]][rownames(bic[[nm]][[2]][[i]]), ];
    awsomics::CreateDatatable(t1, fn[1], FALSE, caption=paste('Terms of', t[i,1]));
    t2<-stat.slim[colnames(bic[[nm]][[2]][[i]]), , drop=FALSE];
    t2<-t2[order(t2[, paste('Rank', nm, sep='_')]), , drop=FALSE];
    GeneList2Datatable(t2, fn[2], genome = genome, title=paste('Genes of', t[i,1])); 
    t3<-bic[[nm]][[2]][[i]];
    rownames(t3)<-paste(ora.table[[nm]][rownames(t3), 2], ora.table[[nm]][rownames(t3), 3], sep=' - ');
    colnames(t3)<-paste(rownames(anno[colnames(t3), , drop=FALSE]), anno[colnames(t3), 1], sep=' - ');
    if (min(t3)==1) col.min<-'#0000FF' else col.min<-'#999999';
    awsomics::PlotHeatmap(t3, size.max=Inf, fn=sub('.pdf$', '', fn[3]), col.min=col.min, col.max='#0000FF');
    fn;
  });
  bic.fn<-t(fns);

  # write index table
  t$Num_Terms<-awsomics::AddHref(t$Num_Terms, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 1]));
  t$Num_Genes<-awsomics::AddHref(t$Num_Genes, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 2]));
  t$ID<-awsomics::AddHref(t$ID, sub(paste(path.ora, nm, sep='/'), '\\.', bic.fn[, 3]));
  awsomics::CreateDatatable(t, fn=paste(path.ora, nm, 'bicluster.html', sep='/'), rownames = FALSE);

  t;
});
path.gsea<-paste(path, 'GSEA', sep='/');
if (file.exists(path.gsea)) unlink(path.gsea, recursive = TRUE); # Path must not exist

tmp<-as.character(as.integer(Sys.time())); 
if (file.exists(tmp)) unlist(tmp, recursive = TRUE); 
dir.create(tmp, recursive = TRUE); 

# loading template yaml
input.gsea<-yml$input$gsea;
gsea.yml<-input.gsea$template;
if (input.gsea$remote) gsea.yml<-yaml.load(RCurl::getURL(gsea.yml)) else gsea.yml<-yaml.load_file(gsea.yml)

fn.gsea<-PrepareGSEA(e1.2, grps, paste(tmp, 'input', sep='/'), desc = anno[[1]]); 

gsea.yml$jar<-input.gsea$jar;
gsea.yml$preranked<-FALSE;
gsea.yml$thread<-input.gsea$thread; 
gsea.yml$name<-tmp;
gsea.yml$groups$control<-g1.name;
gsea.yml$groups$case<-g2.name;
gsea.yml$out<-path;
gsea.yml$input<-fn.gsea[1];
gsea.yml$class<-fn.gsea[2];
gsea.yml$chip<-c();
gsea.yml$gmt<-input.gsea$gmt;
gsea.cmmd<-GSEAviaJava(gsea.yml); 
file.rename(paste(gsea.yml$out, tmp, sep='/'), path.gsea); 
gsea.tbl<-readRDS(paste(path.gsea, 'full_table.rds', sep='/')); 
#kegg<-GseaKegg(expr, g1.ind, g2.ind, c(g1.name, g2.name), paired, genome, 
#               path=paste(path, '/KEGG', sep=''), Inf, Inf, ranking.penalty=penalty, 
#               path.xml=paste(Sys.getenv('RCHIVE_HOME'), 'data/pathway/public/kegg/src', sep='/'));
# Excel spreadsheets
xls<-list(stat.table, stat.table[rownames(up), , drop=FALSE], stat.table[rownames(dn), , drop=FALSE],
          ora.table[[1]], ora.table[[2]], gsea.tbl);
names(xls)<-c('Complete List', 
  paste('Top', nrow(up), ', higher in ', g2.name, sep=''), 
  paste('Top', nrow(dn), ', lower in ', g2.name, sep=''),
  paste('ORA, higher in ', g2.name, sep=''),
  paste('ORA, lower in ', g2.name, sep=''),
  "GSEA"
);
xls<-lapply(xls, awsomics::FormatNumeric);
fn.xls<-paste(path, paste(g1.name, g2.name, sep='-vs-'), sep='/');
WriteExcel(xls, fn.xls, ); 
fn.png<-lapply(fn.fig, function(f) awsomics::ConvertPdf2Png(f, resize=80, density=150));
fn.png[[length(fn.png)+1]]<-sapply(names(bic), function(nm) {
  t<-bic[[nm]]$summary;
  n<-t[,2]*t[,3];
  f<-paste('bicluster/', t[n==max(n)[1], 1], '_heatmap.pdf', sep='');
  awsomics::ConvertPdf2Png(paste(path.ora, nm, f, sep='/'), resize=80, density=150); 
});
fn.png[[length(fn.png)+1]]<-sapply(kegg$Formatted, function(t) {
  ids<-as.vector(t$Pathway_ID);
  fn<-paste(path, 'KEGG', 'Files', 'figures', 'colored_plots', paste(ids, '.pathview.png', sep=''), sep='/');
  fn[file.exists(fn)][1];
});
res$de<-de;
res$ora<-gse;
res$gsea<-gsea.tbl;
saveRDS(res, file=paste(path, 'result.rds', sep='/'));

fn.zip<-sapply(strsplit(path, '/'), function(x) x[length(x)]);
fn.zip<-paste(fn.zip, '.zip', sep=''); 
if (file.exists(fn.zip)) file.remove(fn.zip); 
zip(fn.zip, path, flag='-r0X', zip='zip');
fn.z<-paste(path, fn.zip, sep='/');
if (file.exists(fn.z)) file.remove(fn.z); 
file.rename(fn.zip, fn.z);

Results at a glance

r g1.name vs. r g2.name

Click a number to see table, or read the rest of this page for details:

t<-cbind(c(nrow(up), nrow(dn)), sapply(ora.table, nrow),  sapply(bic, function(x) nrow(x[[1]])), sapply(kegg$Formatted, function(x) nrow(x[x[, 'P_Value']<=0.05, , drop=FALSE])));
rownames(t)<-paste(c('Higher', 'Lower'), 'in', g2.name);
colnames(t)<-c('DEG', 'ORA', 'Bicluster', 'KEGG_GSEA');
t[, 1]<-awsomics::AddHref(t[,1], paste("./DEG", names(degs), 'index.html', sep='/'));
t[, 2]<-awsomics::AddHref(t[,2], paste("./ORA", names(degs), 'term.html', sep='/'));
t[, 3]<-awsomics::AddHref(t[,3], paste("./ORA", names(degs), 'bicluster.html', sep='/'));
t[, 4]<-awsomics::AddHref(t[,4], paste("./KEGG", names(degs), 'index.html', sep='/'));
knitr::kable(t);              


Introduction

This document reports the analysis results from comparing the transcriptomes of two sample groups: r g1.name vs. r g2.name.


Required input variables

The following inputs are required to generate this report. You can download an example of input data file from here.


Comparison metadata



Sample analysis

This section analyzes the samples by summarizing their global expression patterns, through descriptive statistics, unsupervised clustering, and sample-sample correlation.


Global distribution

Distribution of average expression level and sample-sample variance.

Figure 1:


Sample-sample correlation

All genes in this data set were used to calculate the Pearson's correlation between each pair of samples. The result is often used to identify potential outlier samples as the outliers tend to have relatively lower correlation coefficient to any of the other samples.

Figure 2:

Figure 3:

Figure 4:



Differential gene expression

This section evaluates the differential expression of each each between two sample groups. A compiled version of differential expression results can be downloaded as an Excel file here.


Differential expression of single genes

By default, Rank Product (RP), a non-paramatric statistical test based on ranks of fold changes, is used for the statistical analysis. The outputs of RP include:

Summary

The full table of single gene differential expression can be seen here.

We use the following visualization methods to get an overview of all r nrow(expr) genes:

Figure 5:

Table: Gene counts by FDR cutoffs.

pander::pander(fdr.table);

Top genes

Figure 6: Top-ranked genes on both sides (left: higher in r g2.name; right: lower in r g2.name)

Figure 6:

DEGs (differentially expressed genes)

Click link to get full list of DEGs:

Higher in r g2.name

Lower in r g2.name

DEGs were selected using the following rules:

Figure 7: A heatmap using all DEGs. Expression measurements of each gene are normalized across samples for plotting; redder means higher expression level. The color of column sidebar indicates which group each sample belongs to.

Figure 7:


Differential expression of gene sets

In this section, the unit of analysis is predefined gene sets. It is based on the assumption that if genes of the same gene set, such as those in the same signaling pathway or coregulated group, are more likely to have differential expression, the gene set is important to the biological difference between the two sample groups.

Over-representation analysis (ORA)

ORA evaluates whether the genes of any predefined gene sets, such as a KEGG pathway or GO term, are over-represented in DEGs. We uses hypergeometric test to evaluate whether N/Nt is significantly greater than M0/Mt, where Mt is the total number of genes in the data set, M is the total number of genes in both of the data set and a gene set, Nt is the total number of DEGs previously selected, and N is the overlapping of M and Nt.

We have archived a large collection of gene sets from multiple sources. By default, a subset of these gene sets, each including at least r min(geneset[[1]][, 'Size']), are used to test over-representation of DEGs in them. Click here to see a summary of these genesets.

Click link to get full ORA results:

Higher in r g2.name

Lower in r g2.name

Gene-gene set biclustering

This analysis performs biclustering DEGs and up to top 250 gene sets identified from the ORA. Its goal is to reduce redundant information in the ORA results by identifying "modules" of genes and gene sets likely to be grouped together. The bicluster was done with Iterative Signature Algorithm. Figure 8A and 8B are two examples of biclusters.

Click link to get full biclustering results:

Higher in r g2.name

Lower in r g2.name

Figure 8A: A bicluster module based on DEGs with higher expression in r g2.name

Figure 8B: A bicluster module based on DEGs with lower expression in r g2.name

Gene set enrichment analysis (GSEA) of KEGG pathways

GSEA is a different category of analyses applied to gene sets. Instead of using selected top DEGs like ORA, GSEA takes into account all genes in the data set after they were ranked by a statistical test such as Rank Product test. The original GSEA was developed by Broad Institute that is usually run as a stand-alone program. We used a similar method called GAGE in this analysis. By default, the normalized rand products of all genes were used to make colored pathway plots.

KEGG pathways are mostly canonical metabolic pathways conserved across species. Each pathway can include both genes(proteins) and chemical compounds. The goal of this analysis is to identify pathways that were up- or down-regulated in r g2.name by examing the general pattern of gene differential expression. Each element in the pathways were colored to represent its differential expression (red: hiher in r g2.name). Figure 9A and 9B are two examples of colored KEGG pathways.

Click link to get full GSEA results of KEGG pathways:

Higher in r g2.name

Lower in r g2.name

Figure 9A: A KEGG pathway with higher expression in r g2.name

Figure 9A: A KEGG pathway with lower expression in r g2.name



Appendix

Supplementary information about this report


Fold change vs. log(fold change)

In this report, fold change refers the ratio of two group means in their unlogged form. So a fold change of 2.0 means the average of the second group is increased to twice of the average of the first group; similarly, a fold change of 0.5 means the average is reduced to half. Log(fold change) equals to the log2-transformation of the fold change. The table below gives a few examples of the conversion of these 2 variables.

c<-c(1.25, 1.5, 2, 4, 8);
fc<-c(1/rev(c), 1, c);
lg<-log2(fc);
pct<-100*(fc-1);
t<-round(cbind(fc, lg, pct), 3);
colnames(t)<-c('Fold change', 'Log2(fold change)', 'Percentage change (%)');
pander::pander(t); 

R/Biocondcutor packages and functions

The table below lists the core R/Bioconductor packages and functions used to generate this report while in-house functions are used to customize their outputs.

t<-strsplit("Create this report\tknitr\tknit2html
Hierarchical clustering\tstats\thclust 
PCA\tstats\tprcomp
Differential expression\tRankProd\tRP
Heatmap\tstats\theatmap
Gene ontology analysis\tRDAVIDWebService\tDAVIDWebService
Biclustering analysis\tisa2\tisa
Gene set enrichment analysis of KEGG pathways\tgage\tgage
Plot color-coded KEGG pathways\tpathview\tpathview
Write data to Excel\txlsx\tcreateWorkbook
Misc. custom functions\tawsomics\t ", '\n')[[1]]; 
t<-do.call('rbind', strsplit(t, '\t'));
colnames(t)<-c('Task', 'R package', 'R function');
pander::pander(t);

The .Rmd template file is within the DEGandMore package and can be found here.

An example of input data can be download from here


References


END OF DOCUMENT



zhezhangsh/DEGandMore documentation built on Sept. 22, 2022, 9:55 a.m.