#' #GSEA-squared (last mod. 07/2019 ksheu)
#'
#'
#' Input: excel file paths for GSEA output, pos and neg enrichment
#' Input: keyword groups to search
#' Output: graph and pvals
#'
#' @param file1,file2 Paths to GSEA excel file outputs
#' @param keywords Keyword groups to search for (eg. c("CELL_CYLE", "INFLAMMATORY|IMMUNE"))
#' @param keyword.labels Labels for keyword groups, in the same order (eg. c("cycle", "immune"))
#' @param get.keywords perform ks tests on individual words to get candidate keywords
#' @param dir set output directory
#'
#'
#'
#' @export
#
gsea_squared = function(file1, file2, keywords, keyword.labels = keywords, get.keywords = F, dir = getwd()){
# file1 = "./Mseries_melanoma33_X2.xscores_c5.GseaPreranked.1567983643452/gsea_report_for_na_pos_1567983643452.xls";
# file2 = "./Mseries_melanoma33_X2.xscores_c5.GseaPreranked.1567983643452/gsea_report_for_na_neg_1567983643452.xls";
# keywords = c("MHC","PIGMENT|MELANIN|MELANOCYTE","DIFFERENTIATION|DEVELOPMENT","LIPID|STEROL")
# keyword.labels = c("MHC","melanin",'differentiation', "lipid")
# get.keywords = F; dir = "./Mseries_melanoma33_X2.xscores_c5.GseaPreranked.1567983643452/"
# file1 = "./Mseries_melanoma33_X1diff_c5.GseaPreranked.1567979833887/gsea_report_for_na_pos_1567979833887.xls"
# file2 = "./Mseries_melanoma33_X1diff_c5.GseaPreranked.1567979833887/gsea_report_for_na_neg_1567979833887.xls"
# keywords = c("MHC","PIGMENT|MELANIN|MELANOCYTE","DIFFERENTIATION|DEVELOPMENT","LIPID|STEROL")
# keyword.labels = c("MHC","melanin",'differentiation', "lipid")
setwd(dir)
require(ksheu.library1); require(dplyr); require(ggplot2);
require(pheatmap);require(ggpubr);require(Matching);require(RColorBrewer)
# source("C:/Users/msheu/Desktop/Katherine/signed-ks-test.R") #from Github ks.test2
dfm = read.delim(file1)
dfm2 = read.delim(file2)
dfm = rbind(dfm, dfm2)
dfm = dfm[order(dfm$NES),]
dfm$rnk = seq(1:nrow(dfm))
if (get.keywords==T){
#figure out important keywords
words = data.frame(table(unlist(strsplit(tolower(dfm$NAME), "_")))) #split words
words = words[order(words$Freq, decreasing = T),]
words$Var1 = toupper(words$Var1)
str(words)
words.upper = words[which(words$Freq>5&words$Freq<500),] #frequency filter
set.seed(1);kspvals = data.frame(word = as.character(), pval = as.numeric(), ES = as.numeric())
for (i in seq(1:nrow(words.upper))){
tryCatch({
print(i)
print(words.upper$Var1[i])
test<-as.numeric(dfm$rnk[(grepl(words.upper$Var1[i], dfm$NAME))] )
background<-as.numeric(dfm$rnk[!(grepl(words.upper$Var1[i], dfm$NAME))] )
# pval = data.frame(word = words.upper$Var1[i], pval= ks.boot(test, background, n=100)$ks.boot$p.value)
pval = data.frame(word = words.upper$Var1[i], pval= ks.test.2(test, background)$p, ES= ks.test.2(test, background)$ES)
kspvals = rbind(kspvals, pval)
},error=function(e){})
}
kspvals$freq = words.upper$Freq[match(kspvals$word, words.upper$Var1)]
kspvals = kspvals[order(kspvals$pval),]
write.table(kspvals, "./word_freq_ks.test.2.txt", quote =F, sep = "\t", row.names = F)
}
# keywords = c("NEURO", "IMMUNE",...)
# keyword.labels = c("neuro", "immune", ...)
keywords = strsplit((keywords), ",")
keyword.labels = strsplit((keyword.labels), ",")
dfm$type = "other"
for (i in seq(1:length(keywords))){
dfm$type = ifelse(grepl(keywords[i], dfm$NAME), keyword.labels[i], dfm$type)
}
dfm$type = unlist(dfm$type)
dfm$logFDR = log10(dfm$FDR.q.val)
p<-ggplot(dfm, aes(type, rnk))+geom_point(aes(color = type), position = "jitter")+
theme_classic(base_size = 11)+
theme(text = element_text(size=20), #axis.text.y = element_text(angle = 0, vjust = 1, hjust=1.2),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1.2),legend.position = "none") +
# scale_x_discrete(limits = keyword.labels)+
ylab("rank")
print(p)
# plot(abs(dfm$NES), dfm$FDR.q.val)
dfm$sig = as.factor(ifelse((dfm$FDR.q.val<0.05),1,0))
table(dfm$type, dfm$sig)
#plot with diff shape for FDR cutoff
p<-ggplot(dfm, aes(type, rnk))+geom_point(aes(color = type, shape = sig), position = "jitter")+
theme_classic(base_size = 11)+
theme(text = element_text(size=20), #axis.text.y = element_text(angle = 0, vjust = 1, hjust=1.2),
axis.text.x = element_text(angle = 45, vjust = 1, hjust=1.2),legend.position = "none") +
# scale_x_discrete(limits = keyword.labels)+
ylab("rank")
print(p)
#get pvals
frame = data.frame(pval = as.numeric())
for (i in seq(1:length(keyword.labels))){
test<-as.numeric(dfm$rnk[(dfm$type==keyword.labels[i])] )
background<-as.numeric(dfm$rnk[!(dfm$type==keyword.labels[i])] )
frame = rbind(frame, -log10(ks.test.2(test, background)$p))
}
rownames(frame) = keyword.labels
return(frame)
}
#from https://github.com/franapoli/signed-ks-test/blob/master/signed-ks-test.R
ks.test.2 <- function (x, y, ..., alternative = c("two.sided", "less", "greater"), exact = NULL, maxCombSize=10000)
{
alternative <- match.arg(alternative)
DNAME <- deparse(substitute(x))
x <- x[!is.na(x)]
n <- length(x)
if (n < 1L)
stop("not enough 'x' data")
PVAL <- NULL
if (is.numeric(y)) {
DNAME <- paste(DNAME, "and", deparse(substitute(y)))
y <- y[!is.na(y)]
n.x <- as.double(n)
n.y <- length(y)
if (n.y < 1L)
stop("not enough 'y' data")
if (is.null(exact)) {
exact <- (n.x * n.y < maxCombSize)
if(!exact)
warning(paste("P-value not computed exactly because",
"of combined sample size"))
}
METHOD <- "Two-sample Kolmogorov-Smirnov test"
TIES <- FALSE
n <- n.x * n.y/(n.x + n.y)
w <- c(x, y)
z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))
if (length(unique(w)) < (n.x + n.y)) {
if (exact) {
warning("cannot compute exact p-value with ties")
exact <- FALSE
}
else warning("p-value will be approximate in the presence of ties")
z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
TIES <- TRUE
}
STATISTIC <- switch(alternative, two.sided = max(abs(z)),
greater = max(z), less = -min(z))
edge <- which.max(abs(z))
ES <- z[edge]
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below that of y", greater = "the CDF of x lies above that of y")
if (exact && (alternative == "two.sided") && !TIES)
PVAL <- 1 - .Call(stats:::C_pSmirnov2x, STATISTIC, n.x, n.y)
}
else {
if (is.character(y))
y <- get(y, mode = "function", envir = parent.frame())
if (!is.function(y))
stop("'y' must be numeric or a function or a string naming a valid function")
METHOD <- "One-sample Kolmogorov-Smirnov test"
TIES <- FALSE
if (length(unique(x)) < n) {
warning("ties should not be present for the Kolmogorov-Smirnov test")
TIES <- TRUE
}
if (is.null(exact))
exact <- (n < 100) && !TIES
x <- y(sort(x), ...) - (0:(n - 1))/n
STATISTIC <- switch(alternative, two.sided = max(c(x,
1/n - x)), greater = max(1/n - x), less = max(x))
if (exact) {
PVAL <- 1 - if (alternative == "two.sided")
result = tryCatch({
.C(C_pkolmogorov2x, p = as.double(STATISTIC),
as.integer(n), PACKAGE = "stats")$p
}, warning = function(w) {
warning(w)
}, error = function(e) {
.Call(C_pKolmogorov2x, STATISTIC, n)
}, finally = {
})
else {
pkolmogorov1x <- function(x, n) {
if (x <= 0)
return(0)
if (x >= 1)
return(1)
j <- seq.int(from = 0, to = floor(n * (1 -
x)))
1 - x * sum(exp(lchoose(n, j) + (n - j) * log(1 -
x - j/n) + (j - 1) * log(x + j/n)))
}
pkolmogorov1x(STATISTIC, n)
}
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
names(STATISTIC) <- switch(alternative, two.sided = "D",
greater = "D^+", less = "D^-")
if (is.null(PVAL)) {
pkstwo <- function(x, tol = 1e-06) {
if (is.numeric(x))
x <- as.double(x)
else stop("argument 'x' must be numeric")
p <- rep(0, length(x))
p[is.na(x)] <- NA
IND <- which(!is.na(x) & (x > 0))
if (length(IND))
p[IND] <- tryCatch({
tryRes <- .C(stats:::C_pkstwo, length(x[IND]), p = x[IND],
as.double(tol), PACKAGE = "stats")$p
}, warning = function(w) {
warning(w)
}, error = function(e) {
tryRes <- .Call(stats:::C_pKS2, p = x[IND], tol)
}, finally = {
})
p
}
PVAL <- ifelse(alternative == "two.sided", 1 - pkstwo(sqrt(n) *
STATISTIC), exp(-2 * n * STATISTIC^2))
}
PVAL <- min(1, max(0, PVAL))
RVAL <- list(statistic = STATISTIC, p.value = PVAL, alternative = nm_alternative,
method = METHOD, data.name = DNAME, ES = ES, edge = edge)
class(RVAL) <- "htest"
return(RVAL)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.