#' Execute GSEA Analysis
#'
#' Analyzes genetic expression data and determines whether defined gene sets show statistically significant differences with respect to two phenotypes.
#' @param input.ds.name Name of the gct expression file.
#' @param input.cls.name Name of the cls phenotype file.
#' @param gene.set.input Name of the geneset file.
#' @param doc.string Name of the output folder for analysis and for naming output files.
#' @param nperm number of permutations.
#' @param bar_percent proportional height of tick mark to window.
#' @param gap_percent proportional height between minimum enrichment score and top of tick mark to the window.
#' @param under_percent proportional height of white space under tick marks to the window size.
#' @param upper_percent proportional height of white space over enrichment graph to the window size.
#' @param color_line color of enrichment score line in plot pdf.
#' @param color_tick color to tick marks on plot.
#' @param abs.val Default is false. Determines whether genes are ranked according to signal to noise or absolute value of signal to noise (when abs.val=T).
#'
#' @details GSEA analysis is computed using the Broad Institute's R source code. Genes are ranked according to signal to noise ratio (difference in means/sum of standard deviations for the two phenotypes).
#' @return pp - A list pp which includes : plots, gene.set.reference.matrix, gene.set.leading, report1, report2, ES.
#' @return plots - A list of ggplot objects, this can act as an input to the function plot.ES() to output a pdf.
#' @return gene.set.reference.matrix - A list of each gene set within a gene set database and the gene symbols corresponding to each set.
#' @return gene.set.leading - a similar structure to gene.set.reference.matrix but only contains the gene symbols within each gene set that are part of the leading edge set.
#' @return report1 - Summary of GSEA analysis data for the first phenotype.
#' @return report2 - Summary of GSEA analysis data for the second phenotype.
#' @return ES - This object contains the enrichment scores and enrichment tags used to create the plots described earlier. The user can use this information to customize plots as they wish.
#' @examples
#' pp = GSEAplots(input.ds.name=expr.input, input.cls.name=pheno.input,
#' gene.set.input=gene.set.input, doc.string="GSEA_plots", nperm=1000,
#' abs.val=F, bar_percent=0.1, gap_percent=0.1, under_percent=0.1,
#' upper_percent=0.1, color_line="black", color_tick="black")
#'
#' custom_results= GSEAplots(input.ds.name=expr.input, input.cls.name=pheno.input,
#' gene.set.input=gene.set.input, doc.string="custom_results", nperm=1000,
#' bar_percent=0.1, gap_percent=0.1, under_percent=0.1, upper_percent=0.1,
#' color_line="black", color_tick="black", abs.val=F)
#' @export
#' @references Subramanian, Tamayo, et al. (2005), PNAS 102, 15545-15550, http://www.broad.mit.edu/gsea/
GSEAplots= function(input.ds.name="",
input.cls.name="",
gene.set.input="",
doc.string="",
nperm=1000,
bar_percent=0.1,
gap_percent=0.1,
under_percent=0.1,
upper_percent=0.1,
color_line="black",
color_tick="black",
abs.val=F#,
)
{
# GSEA 1.0 -- Gene Set Enrichment Analysis / Broad Institute
# Inputs:
# program_location: the outermost directory. The one that contains the source code and code to run the file
# input.ds.name: Input gene expression Affymetrix dataset file in GCT format "name.gct"
# input.cls.name: Input class vector (phenotype) file in CLS format "name.cls"
# gene.set.input: Gene set name for hallmark it is "h.all.v6.1.symbols.gmt"
# doc.string: Dataset_geneset
# nperm: number of permutations, default is 1000
# nom.p.val.threshold Significance threshold for nominal p-vals for gene sets (default: -1, no thres)
wd_new=getwd()
if (file.exists(paste0(wd_new,"/", doc.string))==FALSE){
dir.create(doc.string)
}
print("PRIOR INPUT DS")
print(input.ds.name)
results_new=GSEA( # Input/Output Files :-------------------------------------------
input.ds=input.ds.name, # Input gene expression Affy dataset file in RES or GCT format
input.cls=input.cls.name,
#input.ds=paste(wd_new,datasets.folder,input.ds.name, sep="",collapse=NULL), # Input gene expression Affy dataset file in RES or GCT format
#input.cls=paste(wd_new,datasets.folder,input.cls.name,sep="",collapse=NULL),
#gs.db = paste(genesets.folder,gene.set.input,sep="",collapse=NULL), # Gene set database in GMT format
gs.db = gene.set.input,
output.directory = paste0(wd_new,"/", doc.string,"/"),
output.directory2 =paste0(wd_new,"/"),
# Directory where to store output and results (default: "")
# Program parameters :-------------------------------------------------------------------------------------------------------------------------
doc.string = doc.string, # Documentation string used as a prefix to name result files (default: "GSEA.analysis")
non.interactive.run = T, # Run in interactive (i.e. R GUI) or batch (R command line) mode (default: F)
reshuffling.type = "sample.labels", # Type of permutation reshuffling: "sample.labels" or "gene.labels" (default: "sample.labels"
nperm = nperm, # Number of random permutations (default: 1000)
weighted.score.type = 1, # Enrichment correlation-based weighting: 0=no weight (KS), 1= weigthed, 2 = over-weigthed (default: 1)
nom.p.val.threshold = -1, # Significance threshold for nominal p-vals for gene sets (default: -1, no thres)
fwer.p.val.threshold = -1, # Significance threshold for FWER p-vals for gene sets (default: -1, no thres)
topgs = 20, # Besides those passing test, number of top scoring gene sets used for detailed reports (default: 10)
adjust.FDR.q.val = F, # Adjust the FDR q-vals (default: F)
reverse.sign = F, # Reverse direction of gene list (pos. enrichment becomes negative, etc.) (default: F)
preproc.type = 0, # Preproc.normalization: 0=none, 1=col(z-score)., 2=col(rank) and row(z-score)., 3=col(rank). (def: 0)
random.seed = 3338, # Random number generator seed. (default: 123456)
perm.type = 0, # For experts only. Permutation type: 0 = unbalanced, 1 = balanced (default: 0)
fraction = 1.0, # For experts only. Subsampling fraction. Set to 1.0 (no resampling) (default: 1.0)
replace = F, # For experts only, Resampling mode (replacement or not replacement) (default: F)
save.intermediate.results = F, # For experts only, save intermediate results (e.g. matrix of random perm. scores) (default: F)
OLD.GSEA = F, # Use original (old) version of GSEA (default: F)
use.fast.enrichment.routine = T, # Use faster routine to compute enrichment for random permutations (default: T)
abs.val=abs.val #rank by absolute value of signal to noise ratio
)
#-----------------------------------------------------------------------------------------------------------------------------------------------
if (length(which(sapply(results_new$out5,is.null))) == 0){
ES.tags.files <- results_new$out5
ES.data.files <- results_new$out6
ES.report.files <- results_new$out7
gene.set.numbers <- results_new$out4
} else{
#this step removes the gene sets which did not generate ES.tags, ES.data, or ES.report files
ES.tags.files <- results_new$out5[-which(sapply(results_new$out5,is.null))]
ES.data.files <- results_new$out6[-which(sapply(results_new$out6,is.null))]
ES.report.files <- results_new$out7[-which(sapply(results_new$out7,is.null))]
gene.set.numbers <- results_new$out4[-which(sapply(results_new$out5,is.null))]
}
gene.set.reference.matrix <- results_new$gene.set.reference.matrix
gene.set.leading <- rep(list("null"),length(gene.set.numbers))
ES <- rep(list("null"),length(gene.set.numbers))
enrichind <- rep(list("null"),length(gene.set.numbers))
report1 <- results_new$report1
report2 <- results_new$report2
#if the phrase hallmark does not exist in gene.set.numbers does nothing. Else, the word hallmark is removed
# 1=Exist@start -1=DNE other=existsButNotFirst
# if (regexpr(pattern="HALLMARK_", gene.set.numbers[1]) == -1) {
# #nothing
# } else {
# for (i in 1: length(gene.set.numbers)){
# g <- strsplit(gene.set.numbers[[i]],split="HALLMARK_")
# h <- g[[1]][2]
# gene.set.numbers[[i]] <- h
# }
# }
# for (i in 1: length(gene.set.numbers)){
# if (regexpr(pattern="HALLMARK_", gene.set.numbers[i]) == 1) {
# #g <- vector(length = length(gene.set.numbers), mode = "character")
# g[i] <- strsplit(gene.set.numbers[[i]],split="HALLMARK_")
# h <- g[[2]][i]
# gene.set.numbers[[i]] <- h
# }
# }
#plotting and finding leading edge set
plots <- vector(mode="list",length=length(ES.data.files))
for (i in 1:length(ES.tags.files)){
dat1_name=paste(wd_new,"/",doc.string,"/",doc.string,".",ES.data.files[[i]],sep="",collapse=NULL)
dat2_name=paste(wd_new,"/",doc.string,"/",doc.string,".",ES.tags.files[[i]],sep="",collapse=NULL)
report_name=paste(wd_new,"/",doc.string,"/",doc.string,".",ES.report.files[[i]],sep="",collapse=NULL)
dat1 = read.table(dat1_name,header=T,sep="\t")
dat2 = read.table(dat2_name,header=T,sep="\t")
report=read.table(report_name,sep="\t")
datcomb <- cbind(dat1,dat2)
datcomb= datcomb[,-5]
ES[[i]]=datcomb
enrich_ind=which(dat2$EStag==1)
height=max(dat1$RES)-min(dat1$RES)
bar_height=bar_percent*height
gap_height=gap_percent*height
under_height=under_percent*height
upper_height=upper_percent*height
window_height=height+bar_height+gap_height+under_height+upper_height
y_lower=min(dat1$RES)-gap_height-bar_height
window_low=y_lower-under_height
window_high=max(dat1$RES)+upper_height
d=data.frame(x=enrich_ind, y=matrix(y_lower,length(enrich_ind),1), vx=matrix(0,length(enrich_ind),1), vy=matrix(bar_height,length(enrich_ind),1))
p <- ggplot(datcomb, aes(index,RES))+geom_line(col=color_line)
p <- p+geom_segment(data=d, mapping=aes(x=x, y=y, xend=x+vx, yend=y+vy),col=color_tick)
p <- p+theme_classic()
p <- p+ylim(window_low,window_high)
p <- p+ggtitle(gene.set.numbers[[i]])
p <- p+theme(plot.title = element_text(size = 12))
plots[[i]] <- p
#location of largest RES absolute value
leading_index <- which.max(abs(datcomb$RES))
#depending on which phenotype the gene set is more related to the leading edge set is on different sides
if (datcomb$RES[leading_index] > 0){
#takes the report of all enriched genes and selects the ones before the max
leading.edge.names <- report$V2[which(report$V5 < leading_index)]
leading.edge.RES <- report$V7[which(report$V5 < leading_index)]
}
else {
#takes the report of all enriched genes and selects the ones after the max
leading.edge.names <- report$V2[which(report$V5 > leading_index)]
leading.edge.RES <- report$V7[which(report$V5 > leading_index)]
}
leading.edge.set <- data.frame(leading.edge.names,leading.edge.RES)
gene.set.leading[[i]] <- leading.edge.names
}
names(gene.set.leading) <- gene.set.numbers
names(ES) <- gene.set.numbers
gene.set.leading[] <- lapply(gene.set.leading, as.character)
return(list(plots=plots,gene.set.reference.matrix=gene.set.reference.matrix,gene.set.leading=gene.set.leading,report1=report1,report2=report2,ES=ES))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.