In zhezhangsh/DEGandMore:

cat("Setting up and loading data\n");

knitr::opts_chunk$set(eval=TRUE, dpi=300, fig.pos="H", fig.width=8, fig.height=6, echo=FALSE, warning=FALSE, message=FALSE, fig.path='FIGURE/');

library(MASS);
library(gplots);
library(knitr);
library(rmarkdown); 
library(yaml); 
library(awsomics);
library(rchive);
library(DEGandMore);

#fn.yaml<-'Placenta_Liver_Uterus.yml';
#yml<-yaml.load_file(fn.yaml); 

path<-yml$output;
if (!file.exists(path)) dir.create(path, recursive = TRUE); 
comp<-names(yml$input$comparison); 

# Load results
cat("Loading results from pairwsie DEG analysis ...\n"); 
res<-lapply(paste(yml$input$comparison, 'result.rds', sep='/'), readRDS); 
grp<-lapply(res, function(res) names(res$input$comparison));
smp<-lapply(res, function(x) x$input$comparison);
names(res)<-names(grp)<-names(smp)<-comp;

gset<-readRDS(yml$input$geneset$source); 
smpl<-readRDS(yml$input$sample)[unlist(smp, use.names=FALSE), ]; 
grps<-do.call('c', lapply(res, function(res) res$input$comparison)); 

path.r<-paste(path, 'R', sep='/');
path.tbl<-paste(path, 'TABLE', sep='/');
if (!file.exists(path.r)) dir.create(path.r, recursive = TRUE); 
if (!file.exists(path.tbl)) dir.create(path.tbl, recursive = TRUE); 

de<-lapply(res, function(res) res$de); 
ora<-lapply(res, function(res) res$ora); 
gsea<-lapply(res, function(res) res$gsea); 

ind1<-c(1,1,2);
ind2<-c(2,3,3);

**_[Go back to project home](`r yml$home`)_**

Introduction: This analysis is based on the outputs of pairwise comparisons of differential gene expression generated by this template. It uses results from 3 pairwise comparisons of 3 sample groups vs. their corresponding control groups and compares how these 3 sample groups are different from each other in terms of their sample-control differences (delta-delta). An example of such analysis is the different responses of 3 cell types to the treatment of the same drug. This analysis is focused on the overlapping of differentially expression at both gene and gene set levels.

`r comp[1]` vs. `r comp[2]` vs. `r comp[3]`

**_[Go back to project home](`r yml$home`)_**

Project

cat('Writing project background\n'); 

lns<-lapply(names(yml$project), function(nm) {
  c(paste('##', nm), '\n', yml$project[[nm]], '\n'); 
});
lns<-paste(do.call('c', lns), collapse='\n');

r lns

Pairwise comparisons

This report compares the results of the following pairwise comparisons. Click links to view full results of individual comparisons:

**_[Go back to project home](`r yml$home`)_**

Gene-level comparison

stat<-lapply(de, function(de) de$result$stat); 
deg<-lapply(de, function(de) de$DEG); 
anno<-lapply(res, function(res) res$input$anno); 
names(stat)<-names(deg)<-names(anno)<-comp

# union set of gene id
if (yml$input$gene$union) gid<-Reduce('union', lapply(stat, rownames)) else 
  gid<-Reduce('intersect', lapply(stat, rownames)); 

# Gene annotation
x<-lapply(anno, rownames); 
y<-unlist(x, use.names=FALSE);
x<-sapply(anno, function(a) as.integer(gid %in% rownames(a)));
y<-max.col(x, 'first'); 
z<-lapply(1:length(anno), function(i) anno[[i]][gid[y==i], , drop=FALSE]);
anno<-do.call('rbind', z)[gid, ]; 

# DE statistics
l2r<-sapply(stat, function(stat) stat[, 'LogFC'][gid]); 
l2r[is.na(l2r)]<-0;
pvl<-sapply(stat, function(stat) stat[, 'Pvalue'][gid]); 
pvl[is.na(pvl)]<-1;
fdr<-sapply(stat, function(stat) stat[, 'FDR'][gid]); 
fdr[is.na(fdr)]<-1;
rnk<-apply(-sign(l2r)*log10(pvl), 2, rank);
corr<-round(l2r, 3);
rownames(l2r)<-rownames(pvl)<-rownames(fdr)<-rownames(rnk)<-gid; 

tbl<-lapply(stat, function(s) sapply(colnames(s)[c(1,2,4,5,6)], function(cnm) s[, cnm][gid])); 
tbl<-do.call('cbind', tbl);
colnames(tbl)[c(3:5, 8:10, 13:15)]<-paste(rep(colnames(tbl)[3:5], 3), rep(names(grp), each=3), sep='_'); 
tbl<-cbind(anno, FormatNumeric(tbl)); 
CreateDatatable(tbl, paste(path, 'gene_table.html', sep='/'), caption='Gene-level differential expression of all 3 pairwise comparisons')->fn; 

corr<-round(sapply(1:3, function(i) cor(l2r[, ind1[i]], l2r[, ind2[i]])), 3);
lns<-paste('  - Corr(', comp[ind1], ':', comp[ind2], ') = ', corr, sep='');
lns<-paste(lns, collapse='\n'); 

tbl0<-tbl;
tbl0[, 1]<-CleanHtmlTags(tbl0[, 1]); 
saveRDS(tbl, paste(path.r, 'gene_stat.rds', sep='/')); 
write.csv(tbl0, paste(path.tbl, 'gene_stat.csv', sep='/'));

Global delta-delta correlation

Both comparisons reported the log ratio of sample and control group means for each gene. The global agreement of log ratios of all genes indicates how much the results of these 3 comparisons are similar to or different from each other. Full table of gene-level statistics side-by-side is here.

Figure 1 This plot shows the global correlation (correlation coefficient = r corr) between the 3 pairwise comparisons: r comp[1], r comp[2], and r comp[3]. The same 3D plot was showed in 2 different angles. Genes obtained p values less than r yml$input$gene$pvalue from any 1, any 2, or all 3 comparisons were highlighted in yellow, orange, or red respectively. The correlatio coefficients between log-ratios of each pair of comparisons are:

r lns

wzxhzdk:3

**_[Go back to project home](`r yml$home`)_**

Differentially expression genes (DEGs)

Both comparisons identified DEGs from 2 compared groups. Check report of individual comparisons for how the DEGs were selected. Overlapped DEGs identified by all 3 comparisons are worthy of a closer look.

Table 1 Number of DEGs identified by each comparison:

deg.n<-t(sapply(deg, function(deg) sapply(deg, nrow))); 
colnames(deg.n)<-c('DEG, > control', 'DEG, < control');
n0<-sapply(stat, nrow);
n1<-apply(pvl, 2, function(x) length(x[!is.na(x) & x<= yml$input$gene$pvalue])); 
deg.n<-cbind(n0, n1, deg.n);
colnames(deg.n)[1:2]<-c('Total_gene', paste('P <', yml$input$gene$pvalue));
pander::pander(deg.n, align=c('l', 'c', 'c', 'c'), );

path.deg<-paste(path, 'DEG', sep='/');
if (!file.exists(path.deg)) dir.create(path.deg, recursive = TRUE); 

deg.up<-lapply(deg, function(x) rownames(x[[1]])); 
deg.dn<-lapply(deg, function(x) rownames(x[[2]])); 

deg.all<-list(up=deg.up, down=deg.dn);
fn.tbl<-lapply(names(deg.all), function(nm) {
  gs<-deg.all[[nm]]; 
  sapply(1:3, function(i) {
    c<-comp[c(ind1[i], ind2[i])]; 
    g<-intersect(gs[[ind1[i]]], gs[[ind2[i]]]); 
    t<-tbl[g, , drop=FALSE]; 
    f<-paste(paste(nm, c[1], c[2], sep='_'), '.html', sep=''); 
    CreateDatatable(t, paste(path.deg, f, sep='/'), caption=paste(nm, ', ', c[1], ' and ', c[2], sep='')); 
    names(f)<-paste(c, collapse=' and '); 
    f;
  })
}); 

deg.up3<-Reduce('intersect', deg.up);
deg.dn3<-Reduce('intersect', deg.dn);
CreateDatatable(tbl[deg.up3, , drop=FALSE], paste(path.deg, 'up_all3.html', sep='/'), caption='Genes up-regulated in all 3 comparisons')->f; 
CreateDatatable(tbl[deg.dn3, , drop=FALSE], paste(path.deg, 'down_all3.html', sep='/'), caption='Genes down-regulated in all 3 comparisons')->f;

lns<-sapply(fn.tbl, function(f) {
  l<-paste('  - [', names(f), '](', paste('DEG', f, sep='/'), ')', sep=''); 
  paste(l, collapse='\n'); 
})

lns.up<-paste(' - [All 3 comparisons](DEG/up_all3.html)', lns[[1]], sep='\n'); 
lns.dn<-paste(' - [All 3 comparisons](DEG/down_all3.html)', lns[[2]], sep='\n');

Figure 2A Overlapping of DEGs, higher expression comparing to control groups. Click links to view overlapping genes:

r lns.up

wzxhzdk:6

Figure 2B Overlapping of DEGs, lower expression comparing to control groups. Click links to view overlapping genes:

r lns.dn

wzxhzdk:7

**_[Go back to project home](`r yml$home`)_**

ANOVA

# Combined matrix
d<-lapply(res, function(res) {
  input<-res$input;
  s<-c(input$comparison[[1]], input$comparison[[2]]); 
  sapply(s, function(s) input$expr[, s]); 
}); 
g<-Reduce('intersect', lapply(d, rownames)); 
d<-do.call('cbind', lapply(d, function(d) d[g, ])); 

# Prepare for ANOVA
s<-smpl[colnames(d), ]
for (i in 1:ncol(s)) s[[i]]<-as.factor(s[[i]]);
s$d<-d[1, ];
d0<-lapply(1:nrow(d), function(i) d[i, ]);
f<-formula(yml$input$gene$anova$formula);

# Run ANOVA
paov<- parallel::mclapply(d0, function(x) {
  s$d<-x;
  smm<-summary(aov(f, data=s))[[1]];
  if (length(smm)==1) smm[[1]] else smm;
}, mc.cores=yml$input$gene$anova$core);

# sort column names
cnm<-rownames(paov[[1]]); 
cnm<-gsub(' ', '', cnm[-length(cnm)]); 
cnm0<-c(yml$input$gene$anova$f1, yml$input$gene$anova$f2, paste(yml$input$gene$anova$f1, yml$input$gene$anova$f2, sep=':'));

# p value table
paov<-t(sapply(paov, function(x) x[-nrow(x), 5])); 
dimnames(paov)<-list(rownames(d), cnm); 
paov<-paov[, c(cnm0, setdiff(cnm, cnm0))]; 
paov<-paov[order(paov[, 3]), ];
colnames(paov)<-paste('P', colnames(paov), sep='_'); 
sig.aov<-paov[paov[, 3]<=yml$input$gene$anova$p, , drop=FALSE]; 

# full table
m<-sapply(grps, function(g) rowMeans(d[, g, drop=FALSE])); 
aov.stat<-cbind(m[rownames(paov), ], paov); 
aov.tbl<-data.frame(anno[rownames(paov), ], FormatNumeric(aov.stat), stringsAsFactors = FALSE);
CreateDatatable(aov.tbl, paste(path, 'anova_table.html', sep='/'), caption='ANOVA results'); 

# Write out
saveRDS(d, paste(path.r, 'expr.rds', sep='/'));
saveRDS(anno[rownames(d), ], paste(path.r, 'anno.rds', sep='/'));
saveRDS(aov.stat, paste(path.r, 'anova_stat.rds', sep='/'));
write.csv(d, paste(path.tbl, 'expr.csv', sep='/'));
write.csv(anno[rownames(d), ], paste(path.tbl, 'anno.csv', sep='/'));
write.csv(aov.stat, paste(path.tbl, 'anova_stat.csv', sep='/'));

2-way ANOVA analysis was performed to identify genes responding to r yml$input$gene$anova$f2 differently in different r yml$input$gene$anova$f1. The analysis reported 3 p values, corresponding to the effect of r yml$input$gene$anova$f2, r yml$input$gene$anova$f1, and their interaction. The analysis identified r nrow(sig.aov) significant genes with interaction p values less than r yml$input$gene$anova$p. The full ANOVA results were summarized in a table here.

Figure 3 Examples: the top 4 genes having the most significant interactive p value (less than r sig.aov[4, 3]).

wzxhzdk:9

**_[Go back to project home](`r yml$home`)_**

Gene set-level comparison

Genes are often grouped into pre-defined gene sets according to their function, interaction, location, etc. Analysis then can be performed on genes in the same gene set as a unit instead of individual genes.

Gene set average

# Gene sets, filtering by size
lst<-gset$list;
gns<-unlist(lst, use.names=FALSE); 
gst<-rep(names(lst), sapply(lst, length));
lst<-split(gns[gns %in% gid], gst[gns %in% gid]);
sz<-sapply(lst, length);
lst<-lst[sz>=yml$input$geneset$min & sz<=yml$input$geneset$max];
sz<-sapply(lst, length); 

# split gene log-ratio into gene sets
gns<-unlist(lst, use.names=FALSE);
gst<-rep(names(lst), sapply(lst, length));
lr<-l2r[gns, ];
splt<-apply(lr, 2, function(lr) split(lr, gst)); 
mean.gs<-sapply(splt, function(x) sapply(x, mean));

mn<-apply(mean.gs, 1, min);
mx<-apply(mean.gs, 1, max);
gs.tbl<-gset[[1]][rownames(mean.gs), ];
gs.tbl$Name<-AddHref(gs.tbl$Name, gs.tbl$URL);
gs.tbl<-cbind(gs.tbl[, 1:3], FormatNumeric(cbind(mean.gs, Diff=mx-mn)));

corr<-round(sapply(1:3, function(i) cor(mean.gs[, ind1[i]], mean.gs[, ind2[i]])), 3);
lns<-paste('  - Corr(', comp[ind1], ':', comp[ind2], ') = ', corr, sep='');
lns<-paste(lns, collapse='\n'); 

CreateDatatable(gs.tbl, paste(path, 'geneset_table.html', sep='/'), rownames = FALSE, caption='Gene set average'); 
saveRDS(gs.tbl, paste(path.r, 'geneset_stat.rds', sep='/'));
write.csv(gs.tbl, paste(path.tbl, 'geneset_stat.csv', sep='/'));

Average differential expression of genes in the same gene set. The gene set-level mean of log-ratio were summarized in this table here.

Figure 4 Each dot represents a gene set and the average log-ratio of all genes in this gene set. The same 3D plot was showed in 2 different angles.

r lns

wzxhzdk:11

**_[Go back to project home](`r yml$home`)_**

path.ora<-paste(path, 'ORA', sep='/');
if (!file.exists(path.ora)) dir.create(path.ora, recursive = TRUE); 

ora.stat<-lapply(ora, function(x) lapply(x[1:2], function(x) x[[1]])); 
ora.sets<-lapply(ora.stat, function(x) lapply(x, rownames)); 
ora.univ<-lapply(ora, function(x) lapply(x[1:2], function(x) rownames(x[[3]]))); 
ora.univ<-Reduce('union', c(ora.univ[[1]], ora.univ[[2]])); 
ora.n<-lapply(ora, function(o) sapply(o[1:2], function(o) o[[3]][, 'n11'][ora.univ])); 
ora.n<-do.call('cbind', ora.n);
ora.n[is.na(ora.n)]<-0; 
rownames(ora.n)<-ora.univ;
colnames(ora.n)<-paste('N', colnames(ora.n), sep='_'); 

ora.or<-lapply(ora, function(o) sapply(o[1:2], function(o) {
  c<-o[[3]]; 
  c<-(c[, 1]/c[, 2])*(c[, 4]/c[, 3]); 
  c<-c[ora.univ];
  c;
})); 
ora.or<-do.call('cbind', ora.or);
ora.or[is.na(ora.or)]<-1; 
rownames(ora.or)<-ora.univ;
colnames(ora.or)<-paste('OR', colnames(ora.or), sep='_'); 
ora.or.hi<-ora.or[, seq(1, ncol(ora.or), 2)]; 
ora.or.lo<-ora.or[, seq(2, ncol(ora.or), 2)]; 
ora.or<-cbind(ora.or.hi, OR_High_Range=apply(ora.or.hi, 1, function(x) max(x)-min(x)),
              ora.or.lo, OR_Low_Range=apply(ora.or.lo, 1, function(x) max(x)-min(x))); 

ora.p<-lapply(ora.stat, function(o) sapply(o, function(o) o[, 'P_HyperGeo'][ora.univ]));
ora.p<-do.call('cbind', ora.p); 
rownames(ora.p)<-ora.univ;
colnames(ora.p)<-paste('P', colnames(ora.p), sep='_'); 
ora.p[is.na(ora.p)]<-0.5;
ora.p.hi<-ora.p[, seq(1, ncol(ora.p), 2)]; 
ora.p.lo<-ora.p[, seq(2, ncol(ora.p), 2)]; 
ora.p<-cbind(ora.p.hi, P_High_Range=apply(ora.p.hi, 1, function(x) min(x)/max(x)),
              ora.p.lo, P_Low_Range=apply(ora.p.lo, 1, function(x) min(x)/max(x))); 

ora.tbl<-FormatNumeric(cbind(ora.n, ora.or, ora.p)); 
ora.tbl<-cbind(gset[[1]][rownames(ora.tbl), ], ora.tbl); 
ora.tbl$Name<-AddHref(ora.tbl$Name, ora.tbl$URL); 
ora.tbl<-ora.tbl[, -5];
CreateDatatable(ora.tbl, paste(path, 'ora_table.html', sep='/'), caption='Combined table of ORA statistics'); 

saveRDS(ora.tbl, paste(path.r, 'ora_stat.rds', sep='/'));
write.csv(ora.tbl, paste(path.tbl, 'ora_stat.csv', sep='/')); 

# split significant gene sets into subgroups by sources
gset.src<-sort(unique(gset[[1]][, 'Source'])); 
ora.s<-do.call('c', ora.stat); 
fn.tbl<-lapply(names(ora.s), function(nm) {
  s<-ora.s[[nm]]; 
  lapply(gset.src, function(src) {
    a<-gset[[1]][rownames(s), ];
    a$Name<-AddHref(a$Name, a$URL)
    a<-a[, 1:3]; 
    t<-cbind(a[a$Source==src, , drop=FALSE], FormatNumeric(s[a$Source==src, , drop=FALSE]))[, -1]; 
    f<-paste('ORA/', src, '_', nm, '.html', sep=''); 
    CreateDatatable(t, paste(path, f, sep='/'), rownames = FALSE, caption=paste(src, nm, sep=': '));
    c<-nrow(t); 
    names(c)<-f;
    c;
  });
}); 
lnk<-sapply(fn.tbl, function(fn) {
  c<-as.vector(unlist(fn));
  f<-sapply(fn, names); 
  paste('[', c, '](', f, ')', sep='');
});
dimnames(lnk)<-list(gset.src, names(ora.s));

Gene set over-representation analysis (ORA)

Each 2-group comparison performs gene set over-representation analysis (ORA) that identifies gene sets over-represented with differentially expressed genes. The results of ORA of both 2-group comparisons are summarized and compared here. The ORA of each gene set reports an odds ratio and p value. These statistics from both comparisons were combined and listed side-by-side, as well as the difference of their odds ratios and ratio of their p values (p set to 0.5 when not available), in this table here

Table 2 Gene sets were broken down into subgroups by their sources. Click on the numbers of over-represented gene sets to see a full list.

r kable(lnk, align='c')

ora.up<-lapply(ora.sets, function(x) x[[1]]); 
ora.dn<-lapply(ora.sets, function(x) x[[2]]); 

ora.all<-list(up=ora.up, down=ora.dn); 
fn.tbl<-lapply(names(ora.all), function(nm) {
  gs<-ora.all[[nm]]; 
  x<-c('up'='Higher', 'down'='Lower'); 
  sapply(1:3, function(i) {
    c<-comp[c(ind1[i], ind2[i])]; 
    g<-intersect(gs[[ind1[i]]], gs[[ind2[i]]]); 
    t<-ora.tbl[g, c(1:4, grep(x[nm], colnames(ora.tbl))), drop=FALSE]; 
    f<-paste(paste(nm, c[1], c[2], sep='_'), '.html', sep=''); 
    CreateDatatable(t, paste(path.ora, f, sep='/'), caption=paste(nm, ', ', c[1], ' and ', c[2], sep='')); 
    names(f)<-paste(c, collapse=' and '); 
    f;
  })
}); 

ora.up3<-Reduce('intersect', ora.up);
ora.dn3<-Reduce('intersect', ora.dn);
CreateDatatable(ora.tbl[ora.up3, c(1:4, grep('Higher', colnames(ora.tbl))), drop=FALSE], paste(path.ora, 'up_all3.html', sep='/'), caption='Gene sets up-regulated in all 3 comparisons')->f; 
CreateDatatable(ora.tbl[ora.dn3, c(1:4, grep('Lower', colnames(ora.tbl))), drop=FALSE], paste(path.ora, 'down_all3.html', sep='/'), caption='Gene sets down-regulated in all 3 comparisons')->f;

lns<-sapply(fn.tbl, function(f) {
  l<-paste('  - [', names(f), '](', paste('ORA', f, sep='/'), ')', sep=''); 
  paste(l, collapse='\n'); 
})

lns.up<-paste(' - [All 3 comparisons](ORA/up_all3.html)', lns[[1]], sep='\n'); 
lns.dn<-paste(' - [All 3 comparisons](ORA/down_all3.html)', lns[[2]], sep='\n');

Figure 5A The overlapping of over-represented gene sets by up-regulated genes in all 3 comparisons. Click links to view overlapping gene sets:

r lns.up

wzxhzdk:14

Figure 5B The overlapping of over-represented gene sets by down-regulated genes in all 3 comparisons. Click links to view overlapping gene sets:

r lns.dn

wzxhzdk:15

Gene set enrichment analysis (GSEA)

path.gsea<-paste(path, 'GSEA', sep='/');
if (!file.exists(path.gsea)) dir.create(path.gsea, recursive = TRUE); 

gsea.stat<-lapply(gsea, function(x) x$stat); 
gsea.stat<-lapply(gsea.stat, function(x) {
  rownames(x)<-paste(x[[1]], x[[2]], sep=':'); 
  x;
})
gsea.univ<-Reduce('union', lapply(gsea.stat, rownames)); 
gsea.stat<-lapply(gsea.stat, function(x) x[gsea.univ, ]); 
for (i in 1:length(gsea.stat)) rownames(gsea.stat[[i]])<-gsea.univ; 

x<-sapply(1:length(gsea.stat), function(i) 1-as.numeric(is.na(gsea.stat[[i]][, 1])));
y<-max.col(x, 'first');
gsea.anno<-lapply(1:3, function(i) gsea.stat[[i]][y==i, 1:3, drop=FALSE]);
gsea.anno<-do.call('rbind', gsea.anno)[gsea.univ, ]; 

gsea.tbl<-FormatNumeric(do.call('cbind', lapply(gsea.stat, function(x) x[, 4:6]))); 
gsea.tbl<-cbind(gsea.anno, gsea.tbl); 
colnames(gsea.tbl)[4:ncol(gsea.tbl)]<-paste(unlist(lapply(gsea.stat, function(x) colnames(x)[4:6]), use.names=FALSE), rep(names(grp), each=3), sep='_'); 
CreateDatatable(gsea.tbl, paste(path, 'gsea_table.html', sep='/'), rownames = FALSE, caption='GSEA statistics'); 

saveRDS(gsea.tbl, paste(path.r, 'gsea_stat.rds', sep='/'));
write.csv(gsea.tbl, paste(path.tbl, 'gsea_stat.csv', sep='/')); 

gsea.sets<-lapply(gsea.stat, function(x) split(x[x[, 'PValue']<=0.05, -ncol(x)], x[x[, 'PValue']<=0.05, ncol(x)])[2:1]); 
gsea.s<-do.call('c', gsea.sets); 
names(gsea.s)<-names(ora.s); 
gset.cll<-sort(unique(unlist(lapply(gsea.s, function(x) x$Collection), use.names=FALSE)));
fn.tbl<-lapply(names(gsea.s), function(nm) {
  s<-gsea.s[[nm]]; 
  lapply(gset.cll, function(cll) {
    t<-s[s$Collection==cll, , drop=FALSE]; 
    t<-FormatNumeric(t); 
    f<-paste('GSEA/', cll, '_', nm, '.html', sep=''); 
    CreateDatatable(t, paste(path, f, sep='/'), rownames = FALSE, caption=paste(cll, nm, sep=': '));
    c<-nrow(t); 
    names(c)<-f;
    c;
  });
}); 
lnk<-sapply(fn.tbl, function(fn) {
  c<-as.vector(unlist(fn));
  f<-sapply(fn, names); 
  paste('[', c, '](', f, ')', sep='');
});
dimnames(lnk)<-list(gset.cll, names(ora.s)); 

gsea.nes<-sapply(gsea.stat, function(x) x[, 'NES']);
gsea.nes[is.na(gsea.nes)]<-0; 
gsea.p<-sapply(gsea.stat, function(x) x[, 'PValue']);
gsea.p[is.na(gsea.p)]<-1; 

corr<-round(sapply(1:3, function(i) cor(gsea.nes[, ind1[i]], gsea.nes[, ind2[i]])), 3);
lns<-paste('  - Corr(', comp[ind1], ':', comp[ind2], ') = ', corr, sep='');
lns<-paste(lns, collapse='\n');

Each 2-group comparison performs gene set enrichment analysis (GSEA) on genes ranked by their differential expression. The results of GSEA of both 2-group comparisons are summarized and compared here. The GSEA of each gene set reports an enrichment score and p value. These statistics from both comparisons were combined and listed side-by-side in this table here

Table 3 Gene sets were broken down into subgroups by collections. Click on the numbers of enriched gene sets to see a full list.

r kable(lnk, align='c')

Figure 6 This plot shows the global correlation (correlation coefficient = r corr) of nominal enrichment scores between the 3 pairwise comparisons: r comp[1], r comp[2], and r comp[3]. The same 3D plot was showed in 2 different angles. Gene sets obtained p values less than 0.01 from any 1, any 2, or all 3 comparisons were highlighted in yellow, orange, or red respectively. The correlatio coefficients between enrichment scores of each pair of comparisons are:

r lns

wzxhzdk:17

gsea.up<-lapply(gsea.sets, function(x) rownames(x[[1]])); 
gsea.dn<-lapply(gsea.sets, function(x) rownames(x[[2]])); 

gsea.all<-list(up=gsea.up, down=gsea.dn); 
fn.tbl<-lapply(names(gsea.all), function(nm) {
  gs<-gsea.all[[nm]]; 
  sapply(1:3, function(i) {
    c<-comp[c(ind1[i], ind2[i])]; 
    g<-intersect(gs[[ind1[i]]], gs[[ind2[i]]]); 
    t<-gsea.tbl[g, , drop=FALSE]; 
    f<-paste(paste(nm, c[1], c[2], sep='_'), '.html', sep=''); 
    CreateDatatable(t, paste(path.gsea, f, sep='/'), rownames = FALSE, caption=paste(nm, ', ', c[1], ' and ', c[2], sep='')); 
    names(f)<-paste(c, collapse=' and '); 
    f;
  })
}); 

gsea.up3<-Reduce('intersect', gsea.up);
gsea.dn3<-Reduce('intersect', gsea.dn);
CreateDatatable(gsea.tbl[gsea.up3, , drop=FALSE], paste(path.gsea, 'up_all3.html', sep='/'), rownames=FALSE, caption='Gene sets up-regulated in all 3 comparisons')->f; 
CreateDatatable(gsea.tbl[gsea.dn3, c(1:4, grep('Lower', colnames(gsea.tbl))), drop=FALSE], rownames=FALSE, paste(path.gsea, 'down_all3.html', sep='/'), caption='Gene sets down-regulated in all 3 comparisons')->f;

lns<-sapply(fn.tbl, function(f) {
  l<-paste('  - [', names(f), '](', paste('GSEA', f, sep='/'), ')', sep=''); 
  paste(l, collapse='\n'); 
})

lns.up<-paste(' - [All 3 comparisons](GSEA/up_all3.html)', lns[[1]], sep='\n'); 
lns.dn<-paste(' - [All 3 comparisons](GSEA/down_all3.html)', lns[[2]], sep='\n');

Figure 7A The overlapping of enriched gene sets by up-regulated genes in all 3 comparisons. Click links to view overlapping gene sets:

r lns.up

wzxhzdk:19

Figure 7B The overlapping of enriched gene sets by down-regulated genes in all 3 comparisons. Click links to view overlapping gene sets:

r lns.dn

wzxhzdk:20

**_[Go back to project home](`r yml$home`)_**

Gene clustering

path.cl<-paste(path, 'CLUSTER', sep='/');
if (!file.exists(path.cl)) dir.create(path.cl, recursive = TRUE);

# normalize data
d1<-lapply(res, function(x) {
  y<-x$input; 
  z<-y$expr[, unlist(y$comparison)];
  z-rowMeans(z[, y$comparison[[1]]])
});
gs<-Reduce('intersect', lapply(d1, rownames)); 
d1<-do.call('cbind', lapply(d1, function(x) x[gs, , drop=FALSE])); 
d1<-t(apply(d1, 1, function(x) x/sd(x))); 

# select seeds
x<-paov[order(paov[, 3]), ];
x<-x[x[,3]<=yml$input$geneset$cluster$panova, , drop=FALSE]; 
d0<-d1[rownames(d1) %in% rownames(x), ]; 
d0<-d0[1: min(nrow(d0), yml$input$geneset$cluster$top), , drop=FALSE];

# initiate clusters
hc<-hclust(as.dist(1-cor(t(d0)))); 
cl<-cutree(hc, k=yml$input$geneset$cluster$seed);
cl<-split(names(cl), cl); 

# merge similar clusters
flag<-TRUE;
while(flag) {
  cat('Number of clusters ', length(cl), '\n'); 
  ms<-sapply(cl, function(cl) colMeans(d0[cl, ])); 
  tr<-cutree(hclust(as.dist(1-cor(ms))), k=length(cl)-1); # find the 2 most similar clusters
  i<-tr[duplicated(tr)];
  c<-ms[, tr==i]; 
  r<-cor(c[, 1], c[, 2]); 
  p<-cor.test(c[, 1], c[, 2])$p.value[[1]]; 
  if (r>yml$input$geneset$cluster$merge$corr & p<yml$input$geneset$cluster$merge$p) {
    cl[tr==i][[1]]<-as.vector(unlist(cl[tr==i]));
    cl<-cl[names(cl)!=names(i)]; 
  } else flag<-FALSE;
}

# Sort clusters
m<-sapply(cl, function(cl) colMeans(d1[cl, ])); 
ind<-apply(m, 2, function(x) which(x==max(x)));  
cl<-cl[order(ind)]; 

# re-cluster genes
reCl<-function(d, cl, r, dif, nmax) {
  md<-sapply(cl, function(cl) apply(d[cl[cl %in% rownames(d)], , drop=FALSE], 2, median));
  corr<-cor(t(d), md); 
  c<-lapply(1:ncol(corr), function(i) {
    corr<-corr[rev(order(corr[, i])), ]; 
    mx<-apply(corr[, -i, drop=FALSE], 1, max);
    id<-rownames(corr)[corr[, i]>=r & (corr[, i]-mx)>dif]; 
    id[1:min(length(id), nmax)]; 
  });
  c;
}
for (i in 1:yml$input$geneset$cluster$recluster$time) 
  cl<-reCl(d0, cl, yml$input$geneset$cluster$recluster$corr, yml$input$geneset$cluster$recluster$diff, yml$input$geneset$cluster$recluster$nmax); 
cls<-reCl(d1, cl, yml$input$geneset$cluster$recluster$corr, yml$input$geneset$cluster$recluster$diff, yml$input$geneset$cluster$recluster$nmax); 
ms<-t(sapply(cls, function(x) colMeans(d1[x, ]))); 
rownames(ms)<-names(cls)<-names(cl)<-paste('Cluster', 1:length(cls), sep='_'); 

fn.htmp<-sapply(names(cls), function(nm) {
  x<-d1[cls[[nm]], ];
  f<-paste(path.cl, '/Heatmap_', nm, '.pdf', sep=''); 
  sz<-CalculateColoredBlockSize(x);
  pdf(f, width = max(sz[2]/3, sz[1]), height = sz[2]);
  PlotColoredBlock(x, num.breaks = 31, key = 'Normalized expression', groups = grps); 
  dev.off();
  f;
});

saveRDS(list(cluster=cls, initial=cl, seed=d0, data=d1), file=paste(path.r, 'cluster.rds', sep='/'));

The top r nrow(d0) genes with significant ANOVA p values (p <= 'r yml$input$geneset$cluster$panova') were used as seeds to perform a gene-gene clustering analysis and r length(cls) clusters were identified. ORA was performed on the clusters to identify their functional association (see table below);

fn.tbl<-lapply(names(cls), function(nm) {
  cat(nm, '\n');
  cl<-cls[[nm]]; 
  s<-TestGSE(cl, gid, gset[[2]])[[1]]; 
  t<-gset[[1]][rownames(s), ];
  t$Name<-AddHref(t$Name, t$URL); 
  t<-cbind(t[, 1:3], FormatNumeric(s)); 
  f<-paste('CLUSTER/ORA_', nm, '.html', sep='');
  CreateDatatable(t, paste(path, f, sep='/'), rownames=FALSE, caption=paste('Over-represented gene set in', nm));
  c<-nrow(t); 
  names(c)<-f;
  t<-do.call('cbind', lapply(stat, function(s) s[cl, c(1, 2, 4, 5, 6)])); 
  t<-FormatNumeric(t);
  colnames(t)[-grep('^Mean', colnames(t))]<-paste(colnames(t)[-grep('^Mean', colnames(t))], rep(names(grp), each=3), sep='_'); 
  t<-data.frame(anno[rownames(t), ], t, stringsAsFactors = FALSE);
  CreateDatatable(t, paste(path.cl, '/', nm, '.html', sep=''), caption=nm);
  c;
}); 
mm<-round(sapply(grps, function(x) rowMeans(ms[, x, drop=FALSE])), 4);
n<-as.vector(unlist(fn.tbl)); 
sz<-sapply(cls, length);
lnk<-paste('[', n, '](', sapply(fn.tbl, names), ')', sep='');
cl.tbl<-data.frame(Size=sz, mm, Gene_set=lnk, stringsAsFactors = FALSE);
cl.tbl[, 1]<-paste('[', sz, '](CLUSTER/', names(cls), '.html)', sep=''); 
cl.tbl<-cbind(ID=rownames(cl.tbl), cl.tbl); 
cl.tbl[, 1]<-paste('[', cl.tbl[, 1], '](CLUSTER/Heatmap_', names(cls), '.pdf)', sep='');

Table 4 This table lists the number of genes in each cluster (click the numbers to see gene lists), the average expression of all genes in a cluster of all sample groups (normalized so the mean of the control groups equals to 0 and the mean is the number of standard deviations), and then the gene sets over-represented in each cluster (click the numbers to see gene set lists).

r kable(cl.tbl, row.names=FALSE, align='c')

Figure 8 This plot shows below the average expression levels of each cluster. Data was normalized before the analysis, so the mean of the control groups was zero and the standard deviation of all samples of each gene was 1.0. Values indicate number of standard deviation from mean of relative control group.

wzxhzdk:23

**_[Go back to project home](`r yml$home`)_**

Figure 9 This plot summarizes the group means and standard errors of all clusters.

wzxhzdk:24

**_[Go back to project home](`r yml$home`)_**

END OF DOCUMENT

zhezhangsh/DEGandMore documentation built on Sept. 22, 2022, 9:55 a.m.

zhezhangsh/DEGandMore

In zhezhangsh/DEGandMore:

Project

Pairwise comparisons

Gene-level comparison

Global delta-delta correlation

Differentially expression genes (DEGs)

ANOVA

Gene set-level comparison

Gene set average

Gene set over-representation analysis (ORA)

Gene set enrichment analysis (GSEA)

Gene clustering

R Package Documentation

Browse R Packages

We want your feedback!