#' @export
#'
#'
#'
ssGSEA2_ce <- function (
input.ds, ## input data file in gct format, first column (Name) must contain gene symbols
output.prefix, ## prefix used for output tables
gene.set.databases=gsea.dbs, ## list of genesets (in gmt format) to evaluate enrichment on
sample.norm.type=c("rank", "log", "log.rank", "none"), ## sample normalization
weight= 0, ## when weight==0, all genes have the same weight; if weight>0,
## actual values matter, and can change the resulting score
statistic = c("area.under.RES", "Kolmogorov-Smirnov"), ## test statistic
output.score.type = c("NES", "ES"),
nperm = 1000, ## number of random permutations for NES case
combine.mode = c("combine.off", "combine.replace", "combine.add"),
## "combine.off" do not combine *_UP and *_DN versions in a single score.
## "combine.replace" combine *_UP and *_DN versions in a single score.
## "combine.add" combine *_UP and *_DN versions in a single score and
## add it but keeping the individual *_UP and *_DN versions.
min.overlap = 10,
correl.type = c("rank", "z.score", "symm.rank"), ## correlation type: "rank", "z.score", "symm.rank"
#fdr.pvalue = TRUE, ## output adjusted (FDR) p-values
global.fdr = FALSE, ## if TRUE calculate global FDR; else calculate FDR sample-by-sample
extended.output = TRUE, ## if TRUE the result GCT files will contain statistics about gene coverage etc.
par=F,
spare.cores=1,
export.signat.gct=T, ## if TRUE gct files with expression values for each signature will be generated
param.file=T,
log.file='run.log') {
## #######################################################################
## single sample GSEA
## for results similar to the Java version, use: weight=0;
## when weight==0, sample.norm.type and correl.type do not matter;
## when weight > 0, the combination of sample.norm.type and correl.type
## dictate how the gene expression values in input.ds are transformed
## to obtain the score -- use this setting with care (the transformations
## can skew scores towards +ve or -ve values)
## . sample.norm.type=='none' uses actual expression values; combined
## with correl.type=='rank', genes are weighted by actual values
## . sample.norm.type=='rank' weights genes proportional to rank
## . sample.norm.type=='log' can be used for log-transforming input data
## . correl.type=='z.score' standardizes the (normalized) input values
## before using them to calculate scores.
## ###################################################
## initialize log-file
cat('##', format(Sys.time()), '\n', file=log.file)
## miximal file path lenght; Windows OS support max. 259 characters
max.nchar.file.path <- 259
## ###################################################
## arguments
sample.norm.type <- match.arg( sample.norm.type)
statistic <- match.arg(statistic)
output.score.type <- match.arg(output.score.type)
combine.mode <- match.arg(combine.mode)
correl.type <- match.arg(correl.type)
## ###################################################
## parameter file
if(param.file){
## save parameters used for ssGSEA
param.str = c(
paste('##', Sys.time()),
paste('input gct:', "input.ds", sep='\t'),
paste('gene.set.database:', "gene.set.databases", sep='\t'),
paste('sample.norm.type:', sample.norm.type, sep='\t'),
paste('weight:', weight, sep='\t'),
paste('statistic:', statistic, sep='\t'),
paste('output.score.type', output.score.type, sep='\t'),
paste('nperm:', nperm, sep='\t'),
paste('global.fdr:', global.fdr, sep='\t'),
paste('min.overlap:', min.overlap, sep='\t'),
paste('correl.type:', correl.type, sep='\t'),
paste('export.signat.gct:', export.signat.gct, sep='\t'),
paste('run.parallel:', par, sep='\t')
)
#writeLines(param.str, con=paste(output.prefix, 'parameters.txt', sep='_'))
}
## ####################################################
## import dataset
## ####################################################
gct.unique <- NULL
dataset <- input.ds
if(class(dataset) != 'try-error' ){
m <- dataset@mat
gene.names <- dataset@rid
gene.descs <- dataset@rdesc
sample.names <- dataset@cid
sample.descs <- dataset@cdesc
} else {
## - cmapR functions stop if ids are not unique
## - import gct using readLines and make ids unique
if(length(grep('rid must be unique', dataset) ) > 0) {
gct.tmp <- readLines(input.ds)
#first column
rid <- gct.tmp %>% sub('\t.*','', .)
#gct version
ver <- rid[1]
#data and meta data columns
meta <- strsplit(gct.tmp[2], '\t') %>% unlist() %>% as.numeric()
if(ver=='#1.3')
rid.idx <- (meta[4]+3) : length(rid)
else
rid.idx <- 4:length(rid)
#check whether ids are unique
if(length(rid[rid.idx]) > length(unique(rid[rid.idx]))){
warning('rids not unique! Making ids unique and exporting new GCT file...\n\n')
#make unique
rid[rid.idx] <- make.unique(rid[rid.idx], sep='_')
#other columns
rest <- gct.tmp %>% sub('.*?\t','', .)
rest[1] <- ''
gct.tmp2 <- paste(rid, rest, sep='\t')
gct.tmp2[1] <- sub('\t.*','',gct.tmp2[1])
#export
gct.unique <- sub('\\.gct', '_unique.gct', input.ds)
writeLines(gct.tmp2, con=gct.unique)
#import using cmapR functions
dataset <- parse.gctx(fname = gct.unique)
## extract data
m <- dataset@mat
gene.names <- sub('_[0-9]{1,5}$', '', dataset@rid)
gene.descs <- dataset@rdesc
sample.names <- dataset@cid
sample.descs <- dataset@cdesc
}
} else { #end if 'rid not unique'
########################################################
## display a more detailed error message if the import
## failed due to other reasons than redundant 'rid'
stop("\n\nError importing GCT file using 'cmapR::parse.gctx()'. Possible reasons:\n\n1) Please check whether you have the latest version of the 'cmapR' installed. Due to submission to Bioconductor the cmap team changed some naming conventions, e.g 'parse.gctx()' has been renamed to 'parse.gctx()'.\n2) The GCT file doesn't seem to be in the correct format! Please see take a look at https://clue.io/connectopedia/gct_format for details about GCT format.
\nError message returned by 'cmapR::parse.gctx()':\n\n", dataset, '\n\n')
}
} #end if try-error
m.org <- m
gct.version <- dataset@version
gct.src <- dataset@src
## number of sample columns
Ns <- ncol(m)
## number of genes/ptm sites
Ng <- nrow(m)
## Extract input file name
input.file.prefix <- sub('.*/(.*)\\.gct$', '\\1', "input")
## ###################################################
## import gene set databases
## ###################################################
GSDB <- vector('list', 2)
names(GSDB) <- c("stk_pamchip_87102_onlyChipPeps_dbs",
"ptk_pamchip_86402_onlyChipPeps_dbs")
#for (gsdb in gene.set.databases)
GSDB[["stk_pamchip_87102_onlyChipPeps_dbs"]] <- Read.GeneSets.db2(ptm_sea_iptmnet_mapping_stk, thres.min = min.overlap, thres.max = 2000)
GSDB[["ptk_pamchip_86402_onlyChipPeps_dbs"]] <- Read.GeneSets.db2(ptm_sea_iptmnet_mapping_ptk, thres.min = min.overlap, thres.max = 2000)
for(i in 1:length(GSDB)){
if(i == 1){
gs <- GSDB[[i]]$gs
N.gs <- GSDB[[i]]$N.gs
gs.names <- GSDB[[i]]$gs.names
gs.descs <- GSDB[[i]]$gs.desc
size.G <- GSDB[[i]]$size.G
} else {
gs <- append(gs, GSDB[[i]]$gs)
gs.names <- append(gs.names, GSDB[[i]]$gs.names)
gs.descs <- append(gs.descs, GSDB[[i]]$gs.desc)
size.G <- append(size.G, GSDB[[i]]$size.G)
N.gs <- N.gs + GSDB[[i]]$N.gs
}
}
## ####################################################
## gene set redundancy score
## ####################################################
#gs.redundancy.mat <-
## ####################################################
## Sample normalization
## -take care of missing values already here
## ####################################################
if (sample.norm.type == "rank") {
m <- apply(m, 2, function(x){
x.rank=rep(NA, length(x))
keep.idx=which(!is.na(x))
x.rank[keep.idx ]=rank(x[keep.idx], ties.method="average")
return(x.rank)
})
m <- 10000*m/Ng
} else if (sample.norm.type == "log.rank") {
m <- apply(m, 2, function(x){
x.rank=rep(NA, length(x))
keep.idx=which(!is.na(x))
x.rank[keep.idx ]=rank(x[keep.idx], ties.method="average")
return(x.rank)
})
m <- log(10000*m/Ng + exp(1))
} else if (sample.norm.type == "log") {
m[m < 1] <- 1
m <- log(m + exp(1))
} ##else if (sample.norm.type == "none") {
## keep original value -- do not transform
##}
tt <- Sys.time()
## ####################################################
## calculate overlap between rownames of the dataset and
## the each gene set
## - in case of PTM signatures extract
## the directionality information
## ####################################################
if(length(grep(';u|;d', gs[[1]])) > 0 ){ ## PTMsigDB
gs <- lapply(gs, function(x) x[ sub(';u$|;d$','', x) %in% gene.names ])
gs.direction <- lapply(gs, function(x) sub('^.*;(d|u)$', '\\1', x))
gs <- lapply(gs, function(x) sub(';u$|;d$','', x))
} else { ## MSigDB
gs.direction <- NULL
gene.set.direction <- NULL
gs <- lapply(gs, function(x) intersect(x, gene.names))
}
## ###########################################
## calculate the overlap
size.ol.G <- sapply(gs, length) ## original: require at least 'min.overlap' unique GENE SET members
cat('MSigDB import: ', file=log.file, append=T)
cat(Sys.time()-tt, '\n', file=log.file, append=T)
## ###########################################
## remove gene sets with unsufficient overlap
## index of gene sets to test
keep.idx <- which(size.ol.G >= min.overlap)
## ###########################################
## stop if no overlap has been found
if(length(keep.idx) == 0)
stop('No overlap to any gene sets found in your data set!\nPossible reasons: 1) organism other than human; 2) wrong format of gene names/site ids!\n')
## #####################################################
## update all data
gs.names <- gs.names[keep.idx]
gs <- gs[keep.idx]
#gs.size <- gs.size[keep.idx]
gs.descs <- gs.descs[keep.idx]
size.G <- size.G[keep.idx]
size.ol.G <- size.ol.G[keep.idx]
if(!is.null(gs.direction))
gs.direction <- gs.direction[keep.idx]
## final number of gene sets to test
N.gs <- length(keep.idx)
cat('Testing', length(keep.idx), 'gene sets:\n', file=log.file, append=T)
## #########################################
## check for redundant signature sets
gene.set.selection <- unique(gs.names)
locs <- match(gene.set.selection, gs.names)
if(length(locs) < N.gs){
gs <- gs[locs]
gs.names <- gs.names[locs]
gs.descs <- gs.descs[locs]
size.G <- size.G[locs]
}
tt <- Sys.time()
## ####################################################
##
## function executed per gene set
##
## ####################################################
project.geneset <- function (data.array,
gene.names,
n.cols,
n.rows,
weight = 0,
statistic = "Kolmogorov-Smirnov", ## alternatives: "Kolmogorov-Smirnov", "area.under.RES"
gene.set,
nperm = 200,
correl.type = "rank", ## "rank", "z.score", "symm.rank"
gene.set.direction=NULL, ## direction of regulation; has to be in same order than 'gene.set'
min.overlap,
size.G.current
) {
## #############################################################################
##
## function to calculate GSEA enrichment score
## - apply correlation scheme and weighting
## - calculate ES
## ############################################################################
gsea.score <- function (ordered.gene.list, gene.set2, weight, n.rows, correl.type, gene.set.direction, data.expr) {
##################################################################
## function to calculate ES score
score <- function(max.ES, min.ES, RES, gaps, valleys, statistic){
## KM
if( statistic == "Kolmogorov-Smirnov" ){
if( max.ES > -min.ES ){
ES <- signif(max.ES, digits=5)
arg.ES <- which.max(RES)
} else{
ES <- signif(min.ES, digits=5)
arg.ES <- which.min(RES)
}
}
## AUC
if( statistic == "area.under.RES"){
if( max.ES > -min.ES ){
arg.ES <- which.max(RES)
} else{
arg.ES <- which.min(RES)
}
gaps = gaps+1
RES = c(valleys,0) * (gaps) + 0.5*( c(0,RES) - c(valleys,0) ) * (gaps)
ES = sum(RES)
}
return(list(RES=RES, ES=ES, arg.ES=arg.ES))
} ## end function score
## #######################################
## weighting
## #######################################
if (weight == 0) {
correl.vector <- rep(1, n.rows)
} else if (weight > 0) {
## if weighting is used (weight > 0), bring
## 'correl.vector' into the same order
## as the ordered gene list
if (correl.type == "rank") {
##correl.vector <- data.array[ordered.gene.list, sample.index]
correl.vector <- data.expr[ordered.gene.list]
} else if (correl.type == "symm.rank") {
##correl.vector <- data.array[ordered.gene.list, sample.index]
correl.vector <- data.expr[ordered.gene.list]
correl.vector <- ifelse(correl.vector > correl.vector[ceiling(n.rows/2)],
correl.vector,
correl.vector + correl.vector - correl.vector[ceiling(n.rows/2)])
} else if (correl.type == "z.score") {
##x <- data.array[ordered.gene.list, sample.index]
x <- data.expr[ordered.gene.list]
correl.vector <- (x - mean(x))/sd(x)
}
}
## length of gene list - equals number of rows in input matrix
N = length(ordered.gene.list)
## #####################################
## directionality of the gene set
if(!is.null(gene.set.direction)){
## number of 'd' features
d.idx <- which(gene.set.direction=='d')
Nh.d <- length(d.idx)
Nm.d <- N - Nh.d
## number of 'u' features
u.idx <- which(gene.set.direction=='u')
Nh.u <- length(u.idx)
Nm.u <- N - Nh.u
########################################
## up-regulated part
########################################
if(Nh.u > 1){
## locations of 'up' features
tag.u <- sign( match(ordered.gene.list, gene.set2[ u.idx ], nomatch=0) )
ind.u = which(tag.u == 1)
## extract and apply weighting
correl.vector.u <- correl.vector[ind.u]
correl.vector.u <- abs(correl.vector.u)^weight ## weighting
sum.correl.u <- sum(correl.vector.u)
up.u <- correl.vector.u/sum.correl.u ## steps up in th random walk
gaps.u <- (c(ind.u-1, N) - c(0, ind.u)) ## gaps between hits
down.u <- gaps.u/Nm.u ## steps down in the random walk
RES.u <- cumsum(c(up.u,up.u[Nh.u])-down.u)
valleys.u = RES.u[1:Nh.u]-up.u
max.ES.u = max(RES.u)
min.ES.u = min(valleys.u)
## calculate final score
score.res <- score(max.ES.u, min.ES.u, RES.u[1:Nh.u], gaps.u, valleys.u, statistic)
ES.u <- score.res$ES
arg.ES.u <- score.res$arg.ES
RES.u <- score.res$RES
} else {
correl.vector.u <- rep(0, N)
ES.u=0
RES.u=0
ind.u=NULL
arg.ES.u=NA
up.u=0
down.u=0
}
## ######################################
## down-regulated part
## ######################################
if(Nh.d > 1){
## locations of 'd' features
tag.d <- sign( match(ordered.gene.list, gene.set2[ d.idx ], nomatch=0) )
ind.d = which(tag.d == 1)
## extract and apply weighting
correl.vector.d <- correl.vector[ind.d]
correl.vector.d <- abs(correl.vector.d)^weight ## weighting
sum.correl.d <- sum(correl.vector.d)
up.d <- correl.vector.d/sum.correl.d
gaps.d <- (c(ind.d-1, N) - c(0, ind.d))
down.d <- gaps.d/Nm.d
RES.d <- cumsum(c(up.d,up.d[Nh.d])-down.d)
valleys.d = RES.d[1:Nh.d]-up.d
max.ES.d = max(RES.d)
min.ES.d = min(valleys.d)
## calculate final score
score.res <- score(max.ES.d, min.ES.d, RES.d[1:Nh.d], gaps.d, valleys.d, statistic)
ES.d <- score.res$ES
arg.ES.d <- score.res$arg.ES
RES.d <- score.res$RES
} else {
correl.vector.d <- rep(0, N)
ES.d=0
RES.d=0
ind.d=NULL
arg.ES.d=NA
up.d=0
down.d=0
}
## ############################
## make sure to meet the min.overlap
## threshold
if(Nh.d == 1 & Nh.u < min.overlap | Nh.u == 1 & Nh.d < min.overlap){
ES.u <- ES.d <- RES.u <- RES.d <- 0
arg.ES <- arg.ES <- NA
ind.u <- ind.d <- NULL
}
## ###########################
## combine the results
ES <- ES.u - ES.d
RES <- list(u=RES.u, d=RES.d)
arg.ES <- c(arg.ES.u, arg.ES.d)
##tag.indicator <- rep(0, N)
correl.vector = list(u=correl.vector.u, d=correl.vector.d)
##if(!is.null(ind.u))tag.indicator[ind.u] <- 1
##if(!is.null(ind.d))tag.indicator[ind.d] <- -1
ind <- list(u=ind.u, d=ind.d)
step.up <- list(u=up.u, d=up.d )
## step.down <- list(u=down.u, d=down.d )
step.down <- list(u=1/Nm.u, d=1/Nm.d)
gsea.results = list(ES = ES, ES.all = list(u=ES.u, d=ES.d), arg.ES = arg.ES, RES = RES, indicator = ind, correl.vector = correl.vector, step.up=step.up, step.down=step.down)
## ##############################################################
##
## original ssGSEA code without directionality
##
## ##############################################################
} else { ## end if(!is.null(gene.set.direction))
Nh <- length(gene.set2)
Nm <- N - Nh
## #####################################
## match gene set to data
tag.indicator <- sign(match(ordered.gene.list, gene.set2, nomatch=0)) # notice that the sign is 0 (no tag) or 1 (tag)
## positions of gene set in ordered gene list
ind = which(tag.indicator==1)
## 'correl.vector' is now the size of 'gene.set2'
correl.vector <- abs(correl.vector[ind])^weight
## sum of weights
sum.correl = sum(correl.vector)
#########################################
## determine peaks and valleys
## divide correl vector by sum of weights
up = correl.vector/sum.correl # "up" represents the peaks in the mountain plot
gaps = (c(ind-1, N) - c(0, ind)) # gaps between ranked pathway genes
down = gaps/Nm
RES = cumsum(c(up,up[Nh])-down)
valleys = RES[1:Nh]-up
max.ES = max(RES)
min.ES = min(valleys)
## calculate final score
score.res <- score(max.ES, min.ES, RES[1:Nh], gaps, valleys, statistic)
ES <- score.res$ES
arg.ES <- score.res$arg.ES
RES <- score.res$RES
gsea.results = list(ES = ES, arg.ES = arg.ES, RES = RES, indicator = ind, correl.vector = correl.vector, step.up=up, step.down=1/Nm)
} ## end else
return (gsea.results)
}
## end function 'gsea.score'
## #######################################################################################################################################################
##debug(gsea.score)
## fast implementation of GSEA)
ES.vector <- NES.vector <- p.val.vector <- rep(NA, n.cols)
correl.vector <- rep(NA, n.rows)
## vectors to return number of overlapping genes/sites
OL.vector <- OL.numb.vector <- OL.perc.vector <- rep(NA, n.cols)
## Compute ES score for signatures in each sample
phi <- array(NA, c(n.cols, nperm))
## list to store random walk accross samples
random.walk <- vector('list', n.cols)
## locations of gene set in input data (before ranking/ordering)
## 'gene.names' is in the same order as the input data
gene.names.all <- gene.names
gene.set.all <- gene.set
gene.set.size <- length(gene.set)
gene.set.direction.all <- gene.set.direction
n.rows.all <- n.rows
## loop over columns/samples
for (sample.index in 1:n.cols) {
## ######################################################
## handle missing values appropriatly
## ######################################################
## reset values
gene.names <- gene.names.all
gene.set <- gene.set.all
gene.set.direction <- gene.set.direction.all
n.rows <- n.rows.all
## extract expression values of current sample
data.expr <- data.array[ , sample.index]
## index of missing values
na.idx <- which( is.na(data.expr) | is.infinite(data.expr) )
## if there are missing values ...
if( length(na.idx) > 0){
## update input data + gene names
## those are in the same order
data.expr <- data.expr[-na.idx]
gene.names <- gene.names[-na.idx]
## update numbers
n.rows=length(data.expr)
## extract gene set members present in data
gene.set.no.na <- which(gene.set %in% gene.names)
## update gene sets + gene set directions
## those are in the same order
gene.set <- gene.set[ gene.set.no.na ]
gene.set.direction <- gene.set.direction[ gene.set.no.na ]
##cat('gene set overlap:', length(gene.set), '\n', file=log.file, append=T)
#cat('gene set overlap:', sum(gene.names %in% gene.set), '\n', file=log.file, append=T)
} ## end if missing values are present
## ###############################################
## if there is NOT sufficient overlap...
if(sum(gene.names %in% gene.set) < min.overlap){
random.walk[[sample.index]] <- NA
OL.numb.vector[sample.index] <- sum(gene.names %in% gene.set)
## if there was any overlap, report the part of the signature
if(OL.numb.vector[sample.index] > 0){
if(is.null(gene.set.direction)){
OL.vector[ sample.index ] <- paste( intersect(gene.names, gene.set), collapse='|')
} else {
##OL.vector[ sample.index ] <- paste( paste(gene.set, gene.set.direction, sep=';')[ na.omit(match(gene.names, sub(';u$|;d$', '', gene.set) )) ], collapse='|')
OL.vector[ sample.index ] <- paste( paste(gene.set, gene.set.direction, sep=';')[ na.omit(match(gene.names, gene.set )) ], collapse='|')
}
}
OL.perc.vector[sample.index] <- round( 100*(OL.numb.vector[sample.index] / size.G.current), 1)
} else {
## locations of gene set in input data (before ranking/ordering)
## 'gene.names' is in the same order as the input data
## gene.set2 <- match(gene.set, gene.names)
## gene.set2 <- which( gene.names %in% gene.set ) ## to work with redundant gene names, e.g. phospho-data
## this takes care about redundant gene lists, the approach above does not return the locations of 'gene.set' in 'gene.names' but the indices of 'gene.names' present in 'gene.set'
gene.set2 <- as.numeric( unlist(sapply( gene.set, function(x) which(gene.names == x) )))
# save(gene.set2, gene.set, gene.names, file='tmp.RData')
## order of ranks, list is now ordered, elements are locations of the ranks in
## original data,
gene.list <- order(data.expr, decreasing=T)
## ##############################################
## calculate enrichment score
GSEA.results <- gsea.score (gene.list, gene.set2, weight, n.rows, correl.type, gene.set.direction, data.expr)
ES.vector[sample.index] <- GSEA.results$ES
## overlap between gene set and data
if(is.null(gene.set.direction)){
OL.vector[ sample.index ] <- paste( unique( gene.names[gene.list][GSEA.results$indicator ]) , collapse='|')
OL.numb.vector[sample.index ] <- length( unique( gene.names[gene.list][GSEA.results$indicator ]) )
OL.perc.vector[sample.index ] <- round(100*( OL.numb.vector[ sample.index ]/size.G.current ),1)
} else { # separately for up/down
ol.tmp <- c()
if( length(GSEA.results$indicator$u) > 0)
ol.tmp <- c(ol.tmp, paste( unique( gene.names[gene.list][GSEA.results$indicator$u ]), 'u', sep=';'))
if(length(GSEA.results$indicator$d) > 0)
ol.tmp <- c(ol.tmp, paste( unique( gene.names[gene.list][GSEA.results$indicator$d ]), 'd', sep=';'))
OL.vector[ sample.index ] <- paste( ol.tmp, collapse='|')
## absolute numbers
OL.numb.vector[sample.index ] <- length( ol.tmp )
## percent
OL.perc.vector[sample.index ] <- round(100*( OL.numb.vector[ sample.index ]/size.G.current ),1)
}
#OL.vector[sample.index] <- paste( gene.names[ GSEA.results$indicator ], collapse='/' )
#OL.vector[sample.index] <- paste( gene.set[gene.set2] , collapse='/' )
## store the gsea results to
random.walk[[sample.index]] <- GSEA.results
## ##############################################
## no permutations: - ES and NES are the same
## - all p-values are 1
if (nperm == 0) {
NES.vector[sample.index] <- ES.vector[sample.index]
p.val.vector[sample.index] <- 1
## ##############################################
## do permutations
} else {
ES.tmp = sapply(1:nperm, function(x) gsea.score(sample(1:n.rows), gene.set2, weight, n.rows, correl.type, gene.set.direction, data.expr)$ES)
phi[sample.index, ] <- unlist(ES.tmp)
## #######################################################
## calculate NES and p-values
if (ES.vector[sample.index] >= 0) {
pos.phi <- phi[sample.index, phi[sample.index, ] >= 0]
if (length(pos.phi) == 0) pos.phi <- 0.5
pos.m <- mean(pos.phi)
NES.vector[sample.index] <- ES.vector[sample.index]/pos.m
s <- sum(pos.phi >= ES.vector[sample.index])/length(pos.phi)
p.val.vector[sample.index] <- ifelse(s == 0, 1/nperm, s)
} else {
neg.phi <- phi[sample.index, phi[sample.index, ] < 0]
if (length(neg.phi) == 0) neg.phi <- 0.5
neg.m <- mean(neg.phi)
NES.vector[sample.index] <- ES.vector[sample.index]/abs(neg.m)
s <- sum(neg.phi <= ES.vector[sample.index])/length(neg.phi)
p.val.vector[sample.index] <- ifelse(s == 0, 1/nperm, s)
}
} ## end do permutations
} ## end minimal overlap required
}
return(list(ES.vector = ES.vector, NES.vector = NES.vector, p.val.vector = p.val.vector, OL.vector=OL.vector, OL.numb.vector=OL.numb.vector, OL.perc.vector=OL.perc.vector, gene.set.size=gene.set.size, random.walk = random.walk))
} ## end function 'project.geneset'
## ######################################################################################################
## #########################################################
##
## multicore version
##
if(par){
## register cores
###################################
## serial loop
} else { ## end if 'par'
tt <- Sys.time()
tmp <- lapply(1:N.gs, function(gs.i){
#cat( gs.names[gs.i], ' ' )
#cat( (gs.i / N.gs)*100, '%\n')
cat(names(gs)[gs.i], '\n' , file=log.file, append=T)
gene.overlap <- gs[[gs.i]]
if(!is.null(gs.direction))
gene.set.direction <- gs.direction[[gs.i]]
if (output.score.type == "ES") {
OPAM <- project.geneset (data.array = m, gene.names = gene.names, n.cols = Ns, n.rows= Ng, weight = weight, statistic = statistic, gene.set = gene.overlap, nperm = 1, correl.type = correl.type, gene.set.direction = gene.set.direction, min.overlap = min.overlap, size.G.current = size.G[gs.i])
} else if (output.score.type == "NES") {
OPAM <- project.geneset (data.array = m, gene.names = gene.names, n.cols = Ns, n.rows= Ng, weight = weight, statistic = statistic, gene.set = gene.overlap, nperm = nperm, correl.type = correl.type, gene.set.direction = gene.set.direction, min.overlap = min.overlap, size.G.current = size.G[gs.i])
}
OPAM
})
names(tmp) <- gs.names
} ## end else
## #######################################
## extract scores and pvalues
## and generate matrices
tmp.pval <- lapply(tmp, function(x)x$p.val.vector)
pval.matrix <- matrix(unlist(tmp.pval), byrow=T, nrow=N.gs)
if (output.score.type == "ES"){
tmp.es <- lapply(tmp, function(x)x$ES.vector)
score.matrix <- matrix(unlist(tmp.es), byrow=T, nrow=N.gs)
}
if (output.score.type == "NES"){
tmp.nes <- lapply(tmp, function(x)x$NES.vector)
score.matrix <- matrix(unlist(tmp.nes), byrow=T, nrow=N.gs)
}
## #############################
## overlapping genes/sites
tmp.ol <- lapply(tmp, function(x)x$OL.vector)
ol.matrix <- matrix(unlist(tmp.ol), byrow=T, nrow=N.gs)
## overlap vector: absolute
tmp.ol.numb <- lapply(tmp, function(x)x$OL.numb.vector)
ol.numb.matrix <- matrix(unlist(tmp.ol.numb), byrow=T, nrow=N.gs)
## overlpa vector: percent
tmp.ol.perc <- lapply(tmp, function(x)x$OL.perc.vector)
ol.perc.matrix <- matrix(unlist(tmp.ol.perc), byrow=T, nrow=N.gs)
## gene site size
gs.size <- size.G
## store random walk
random.walk <- lapply(tmp, function(x) x$random.walk)
#cat('main loop: ')
#cat(Sys.time()-tt, '\n')
initial.up.entries <- 0
final.up.entries <- 0
initial.dn.entries <- 0
final.dn.entries <- 0
combined.entries <- 0
other.entries <- 0
## #####################################
## don't combine
if (combine.mode == "combine.off") {
score.matrix.2 <- score.matrix
pval.matrix.2 <- pval.matrix
ol.matrix.2 <- ol.matrix
ol.numb.matrix.2 <- ol.numb.matrix
ol.perc.matrix.2 <- ol.perc.matrix
gs.names.2 <- gs.names
gs.descs.2 <- gs.descs
gs.size.2 <- gs.size
## ####################################
## combine replace
} else if ((combine.mode == "combine.replace") || (combine.mode == "combine.add")) {
fisher.pval <- function(p) {
Xsq <- -2*sum(log(p))
p.val <- pchisq(Xsq, df = 2*length(p), lower.tail = FALSE)
return(p.val)
}
score.matrix.2 <- NULL
pval.matrix.2 <- NULL
gs.names.2 <- NULL
gs.descs.2 <- NULL
add.entry.2 <- function (s, p, n, d) {
score.matrix.2 <<- rbind (score.matrix.2, s)
pval.matrix.2 <<- rbind (pval.matrix.2, p)
gs.names.2 <<- c (gs.names.2, n)
gs.descs.2 <<- c (gs.descs.2, d)
}
k <- 1
for (i in 1:N.gs) {
temp <- strsplit(gs.names[i], split="_")
body <- paste(temp[[1]][seq(1, length(temp[[1]]) -1)], collapse="_")
suffix <- tail(temp[[1]], 1)
#print(paste("i:", i, "gene set:", gs.names[i], "body:", body, "suffix:", suffix))
if (suffix == "UP") { # This is an "UP" gene set
initial.up.entries <- initial.up.entries + 1
target <- paste(body, "DN", sep="_")
loc <- match(target, gs.names)
if (!is.na(loc)) { # found corresponding "DN" gene set: create combined entry
score <- score.matrix[i,] - score.matrix[loc,]
pval <- sapply (1:Ns, function (k) fisher.pval (c (pval.matrix[i,k], pval.matrix[loc,k]))) # combine UP and DN p-values
add.entry.2 (score, pval, body, paste(gs.descs[i], "combined UP & DN"))
combined.entries <- combined.entries + 1
if (combine.mode == "combine.add") { # also add the "UP entry
add.entry.2 (score.matrix[i,], pval.matrix[i,], gs.names[i], gs.descs[i])
final.up.entries <- final.up.entries + 1
}
} else { # did not find corresponding "DN" gene set: create "UP" entry
add.entry.2 (score.matrix[i,], pval.matrix[i,], gs.names[i], gs.descs[i])
final.up.entries <- final.up.entries + 1
}
} else if (suffix == "DN") { # This is a "DN" gene set
initial.dn.entries <- initial.dn.entries + 1
target <- paste(body, "UP", sep="_")
loc <- match(target, gs.names)
if (is.na(loc)) { # did not find corresponding "UP" gene set: create "DN" entry
add.entry.2 (score.matrix[i,], pval.matrix[i,], gs.names[i], gs.descs[i])
final.dn.entries <- final.dn.entries + 1
} else { # it found corresponding "UP" gene set
if (combine.mode == "combine.add") { # create "DN" entry
add.entry.2 (score.matrix[i,], pval.matrix[i,], gs.names[i], gs.descs[i])
final.dn.entries <- final.dn.entries + 1
}
}
} else { # This is neither "UP nor "DN" gene set: create individual entry
add.entry.2 (score.matrix[i,], pval.matrix[i,], gs.names[i], gs.descs[i])
other.entries <- other.entries + 1
}
} # end for loop over gene sets
#print(paste("initial.up.entries:", initial.up.entries))
#print(paste("final.up.entries:", final.up.entries))
#print(paste("initial.dn.entries:", initial.dn.entries))
#print(paste("final.dn.entries:", final.dn.entries))
#print(paste("other.entries:", other.entries))
#print(paste("combined.entries:", combined.entries))
#print(paste("total entries:", length(score.matrix.2[,1])))
} ## end combine results
## ####################################################################
## Make sure there are no duplicated gene set names after adding entries
unique.gene.sets <- unique(gs.names.2)
locs <- match(unique.gene.sets, gs.names.2)
score.matrix.2 <- data.frame( matrix( score.matrix.2[locs, ], nrow=length(locs) ), stringsAsFactors=F )
pval.matrix.2 <- data.frame( matrix( pval.matrix.2[locs, ], nrow=length(locs) ), stringsAsFactors=F )
ol.matrix.2 <- data.frame( matrix(ol.matrix.2[locs, ], nrow=length(locs) ), stringsAsFactors=F )
ol.numb.matrix.2 <- data.frame( matrix(ol.numb.matrix.2[locs, ], nrow=length(locs) ), stringsAsFactors=F )
ol.perc.matrix.2 <- data.frame( matrix(ol.perc.matrix.2[locs, ], nrow=length(locs)), stringsAsFactors=F )
gs.names.2 <- gs.names.2[locs]
gs.descs.2 <- gs.descs.2[locs]
gs.size.2 <- gs.size.2[locs]
## #######################################
## FDR p-values
fdr.matrix.2 <- pval.matrix.2
if (global.fdr)
fdr.matrix.2 <- matrix ( p.adjust(unlist (fdr.matrix.2), method='fdr'),
ncol=ncol(fdr.matrix.2))
else
for (i in 1:ncol(fdr.matrix.2))
fdr.matrix.2[,i] <- p.adjust (fdr.matrix.2[, i], method='fdr')
#################################################
## number of valid columns
No.columns.scored <- apply(score.matrix.2, 1, function(x) sum(!is.na(x)))
## overlaps
Signature.set.overlap <- ol.matrix.2
colnames(Signature.set.overlap) <- paste( 'Signature.set.overlap', sample.names, sep='.')
Signature.set.overlap.size <- ol.numb.matrix.2
colnames(Signature.set.overlap.size) <- paste( 'Signature.set.overlap.size', sample.names, sep='.')
Signature.set.overlap.percent <- ol.perc.matrix.2
colnames(Signature.set.overlap.percent) <- paste( 'Signature.set.overlap.percent', sample.names, sep='.')
# ###############################################
# prepare for new gct export (R CMAP functions)
if(extended.output){
gs.descs.2 <- data.frame(Signature.set.description=gs.descs.2,
Signature.set.size=gs.size.2,
Signature.set.overlap.percent,
Signature.set.overlap,
No.columns.scored,
stringsAsFactors = F)
} else {
gs.descs.2 <- data.frame(Signature.set.description=gs.descs.2,
Signature.set.size=gs.size.2,
stringsAsFactors = F)
}
## #################################################
## remove emtpy rows (gene set did not achieve sufficient
## overlap in any sample column)
## #################################################
locs <- which( No.columns.scored > 0)
score.matrix.2 <- data.frame( score.matrix.2[locs, ], stringsAsFactors = F)
pval.matrix.2 <- data.frame( pval.matrix.2[locs, ], stringsAsFactors = F)
fdr.matrix.2 <- data.frame( fdr.matrix.2[locs, ], stringsAsFactors = F)
gs.names.2 <- gs.names.2[locs]
gs.descs.2 <- data.frame( gs.descs.2[locs, ], stringsAsFactors = F)
Signature.set.overlap <- data.frame( Signature.set.overlap[locs, ], stringsAsFactors = F)
rownames(Signature.set.overlap) <- gs.names.2
## ##############################################
## export signature GCT files
if(export.signat.gct){
dir.create('signature_gct')
#sapply(rownames(Signature.set.overlap), function(sig.name) {
for( sig.name in rownames(Signature.set.overlap)) {
## extract siganture members
signat <- Signature.set.overlap[sig.name, ]
gene.names.tmp <- as.character(signat) %>% strsplit(. , '\\|') %>% unlist %>% unique # %>% sub(';u$|;d$', '', .)
if(!is.null(gs.direction)){
gene.names.tmp.direction <- sub('^.*;(u|d)$', '\\1', gene.names.tmp)
gene.names.tmp <- sub(';u$|;d$', '', gene.names.tmp)
names(gene.names.tmp.direction) <- gene.names.tmp
}
# map to input dataset. code below handles redundant ids and return all occurences in the data
gene.names.tmp.idx <- lapply(gene.names.tmp, function(x) which(gene.names %in% gene.names.tmp)) %>% unlist %>% unique
## add score, pval and fdr to column annotations
if(nrow(sample.descs) > 0){
sample.descs.tmp <- data.frame(sample.descs,
signature.score=as.numeric(score.matrix.2[which( gs.names.2 == sig.name), ]),
signature.pvalue=as.numeric(pval.matrix.2[which( gs.names.2 == sig.name), ]),
signature.fdr.pvalue=as.numeric(fdr.matrix.2[which( gs.names.2 == sig.name), ]),
stringsAsFactors=F
)
} else {
sample.descs.tmp <- data.frame(signature.score=as.numeric(score.matrix.2[which( gs.names.2 == sig.name), ]),
signature.pvalue=as.numeric(pval.matrix.2[which( gs.names.2 == sig.name), ]),
signature.fdr.pvalue=as.numeric(fdr.matrix.2[which( gs.names.2 == sig.name), ]),
stringsAsFactors=F
)
}
## add names
rownames(sample.descs.tmp) <- sample.names
# row annotations
if(nrow(gene.descs) > 0){
gene.descs.tmp <- data.frame( gene.descs[gene.names.tmp.idx,] )
if(!is.null(gs.direction)){
signature.direction <- gene.names.tmp.direction[ gene.names[gene.names.tmp.idx] ]
gene.descs.tmp <- data.frame(signature.direction, gene.descs.tmp, stringsAsFactors = F)
}
} else {
if(!is.null(gs.direction)){
signature.direction <- gene.names.tmp.direction[ gene.names[gene.names.tmp.idx] ]
gene.descs.tmp <- data.frame(signature.direction, stringsAsFactors = F)
} else {
gene.descs.tmp <- NULL
}
}
## export gene set
gct.tmp <- new('GCT')
gct.tmp@mat <- data.matrix( m.org[gene.names.tmp.idx, ] )
gct.tmp@rid <- make.unique( gene.names[ gene.names.tmp.idx ], sep='_' )
gct.tmp@cid <- sample.names
gct.tmp@cdesc <- sample.descs.tmp
if(!is.null(gene.descs.tmp))
gct.tmp@rdesc <- gene.descs.tmp
gct.tmp@src <- gct.src
#####################################
## check length of filepath
## Windows OS supports maximum of 259 characters in a file path
fn <- paste(getwd(),'/signature_gct/', make.names(sig.name), sep='')
if((nchar(fn)+15) > max.nchar.file.path){
fn <- paste( unlist(strsplit(fn,''))[1:(max.nchar.file.path-15)], collapse='')
cat(nchar(fn), ' ', fn ,'\n')
}
#write.gct(gct.tmp, ofile=sub('.*/', 'signature_gct/', fn), appenddim = T)
}
}
## #################################################
## Final count
#print(paste("Total gene sets:", length(gs.names.2)))
#print(paste("Unique gene sets:", length(unique(gs.names.2))))
#################################################
## score matrix
V.GCT <- new('GCT')
V.GCT@mat <- data.matrix(score.matrix.2)
V.GCT@rid <- gs.names.2
V.GCT@cid <- sample.names
V.GCT@rdesc <- gs.descs.2
if(nrow(sample.descs) > 0){
cdesc.final <- data.frame(sample.descs, stringsAsFactors = F)
rownames(cdesc.final) <- sample.names
#V.GCT@cdesc <- data.frame(sample.descs, stringsAsFactors = F)
V.GCT@cdesc <- cdesc.final
}
V.GCT@src <- gct.src
fn <- paste (output.prefix, '-scores.gct', sep='')
#V.GCT@fname <- fn
#write.gct(V.GCT, ofile = fn, appenddim = F)
#################################################
## p-value matrix
P.GCT <- new('GCT')
P.GCT@mat <- data.matrix(pval.matrix.2)
P.GCT@rid <- gs.names.2
P.GCT@cid <- sample.names
P.GCT@rdesc <- gs.descs.2
if(nrow(sample.descs) > 0)
P.GCT@cdesc <- cdesc.final
P.GCT@src <- gct.src
fn <- paste (output.prefix, '-pvalues.gct', sep='')
#P.GCT@fname <- fn
#write.gct(P.GCT, ofile = fn, appenddim = F)
## ##############################################
## FDR matrix
F.GCT <- new('GCT')
F.GCT@mat <- data.matrix(fdr.matrix.2) #F.GCT.mat
F.GCT@rid <- gs.names.2
F.GCT@cid <- sample.names
F.GCT@rdesc <- gs.descs.2
if(nrow(sample.descs) > 0)
F.GCT@cdesc <- cdesc.final
F.GCT@src <- gct.src
fn <- paste (output.prefix, '-fdr-pvalues.gct', sep='')
#F.GCT@fname <- fn
#write.gct(F.GCT, ofile = fn, appenddim = F)
## ######################################################################
## generate a single CGT containing scores as data matrix and
## p-values, fdr-p-values as row annotation matrices
## add p-values and fdr p-values to row description
fdr.tmp <- F.GCT@mat
colnames(fdr.tmp) <- paste('fdr-pvalue', sample.names, sep='.')
pval.tmp <- P.GCT@mat
colnames(pval.tmp) <- paste('pvalue', sample.names, sep='.')
gs.descs.2 <- data.frame(gs.descs.2, pval.tmp, fdr.tmp, stringsAsFactors = F)
## generate GCT
ALL.GCT <- new('GCT')
ALL.GCT@mat <- V.GCT@mat
ALL.GCT@rid <- gs.names.2
ALL.GCT@cid <- sample.names
ALL.GCT@rdesc <- gs.descs.2
if(nrow(sample.descs) > 0)
ALL.GCT@cdesc <- cdesc.final
ALL.GCT@src <- gct.src
ALL.GCT@version <- gct.version
fn <- paste (output.prefix, '-combined.gct', sep='')
#ALL.GCT@fname <- fn
#write.gct(ALL.GCT, ofile = fn, appenddim = F)
return(random.walk)
}
##########################################################################################
## Support Functions
##
##
##########################################################################################
#############################################################
##
## import gene sets into R workspace
##
## 20161013 modified by kk
#############################################################
Read.GeneSets.db2 <- function (gs.db, thres.min = 2, thres.max = 2000) {
## read gmt files
temp <- gs.db
temp <- strsplit(temp, '\t')
temp.size.G <- sapply(temp, function(x) length(x)-2)
## filter gene sets according to size
rm.idx <- which(temp.size.G < thres.min | temp.size.G > thres.max)
if(length(rm.idx) > 0){
temp <- temp[-rm.idx]
temp.size.G <- temp.size.G[-rm.idx]
}
max.Ng <- length(temp) ## number of signature sets
temp.size.G <- sapply(temp, function(x) length(x)-2)
max.size.G <- max(temp.size.G) ## maximal size
gs <- lapply(temp, function(x)x[3:length(x)])
gs.names <- sapply(temp, function(x)x[1])
gs.desc <- sapply(temp, function(x)x[2])
## check whether gene sets are unique
gs.unique <- lapply(gs, unique)
gs.unique.size.G <- sapply(gs.unique, length)
gs.not.unique.idx <- which(gs.unique.size.G < temp.size.G)
if( length(gs.not.unique.idx) > 0 ){
warning("\n\nDuplicated gene set members detected. Removing redundant members from:\n\n", paste(gs.names[gs.not.unique.idx], collapse='\n'))
gs <- gs.unique
temp.size.G <- gs.unique.size.G
}
size.G <- temp.size.G
names(gs) <- names(gs.names) <- names(gs.desc) <- names(size.G) <- gs.names
return(list(N.gs = max.Ng, gs = gs, gs.names = gs.names, gs.desc = gs.desc,
size.G = size.G, max.N.gs = max.Ng))
}
#################################################
## Given a string and a number of characters
## the function chops the string to the
## specified number of characters and adds
## '...' to the end.
## parameter
## string - character
## nChar - numeric
## value
## string of 'nChar' characters followed
## by '...'
##################################################
chopString <- function(string, nChar=10, add.dots=T)
{
string.trim <- strtrim(string, nChar)
if(add.dots)
string.trim[ which(nchar(string) > nChar) ] <- paste(string.trim[which(nchar(string) > nChar) ], '...')
if(!add.dots)
string.trim[ which(nchar(string) > nChar) ] <- paste(string.trim[which(nchar(string) > nChar) ])
return(string.trim)
}
##########################################################################################################
## translate a color name into rgb space
##
## changelog: 20100929 implementation
##########################################################################################################
my.col2rgb <- function(color, alpha=80, maxColorValue=255){
out <- vector( "character", length(color) )
for(col in 1:length(color)){
col.rgb <- col2rgb(color[col])
out[col] <- rgb(col.rgb[1], col.rgb[2], col.rgb[3], alpha=alpha, maxColorValue=maxColorValue)
}
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.