#' enrichment_in_groups
#'
#' Calculate the enrichment in pathways using Fisher's exact or Kolgmorov-Smirnov test
#' # access through leapr wrapper
#'
#' @export
#'
enrichment_in_groups <- function(geneset, targets=NULL, background=NULL, method="fishers", minsize=5,
mapping_column=NULL, abundance_column=NULL, randomize=F,
silence_try_errors=T) {
resultp = c()
resultf = c()
results = data.frame(row.names = geneset$names,
ingroup_n=rep(NA_real_, length(geneset$names)), ingroupnames=rep(NA_character_, length(geneset$names)),
ingroup_mean=rep(NA_real_, length(geneset$names)), outgroup_n=rep(NA_real_, length(geneset$names)),
outgroup_mean=rep(NA_real_, length(geneset$names)), score=rep(NA_real_, length(geneset$names)), oddsratio=rep(NA_real_, length(geneset$names)),
pvalue=rep(NA_real_, length(geneset$names)), BH_pvalue=rep(NA_real_, length(geneset$names)),
SignedBH_pvalue=rep(NA_real_, length(geneset$names)), background_n=rep(NA_real_, length(geneset$names)),
background_mean=rep(NA_real_, length(geneset$names)), stringsAsFactors = F)
for (i in 1:length(geneset$names)) {
thisname = geneset$names[i]
thissize = geneset$size[i]
thisdesc = geneset$desc[i]
grouplist = geneset$matrix[i,1:thissize]
if (randomize) {
# choose a random set of genes as this grouplist
# A disadvantage is that we resample for each functional group rather than
# running one set of analyses on a fully scrambled set of functions.
# I don't think this should be a huge problem though.
grouplist = sample(unlist(geneset$matrix), length(grouplist))
}
in_back = length(background)
if (method == "fishers") {
enr = enrichment_by_fishers(targets, background, grouplist)
p = enr$fisher$p.value
f = enr$foldx
mat = enr$mat
names = enr$in_path_names
results[thisname,"ingroup_n"] = mat[1,1]
results[thisname,"ingroupnames"] = names
results[thisname,"outgroup_n"] = mat[1,2]
results[thisname,"pvalue"] = p
results[thisname,"oddsratio"] = f
results[thisname,"background_n"] = mat[2,2]
# putting this here is a bit of a kludge (since it's not actually a mean, it's a count)
results[thisname,"background_mean"] = mat[2,1]
}
else if (method == "ks") { #Kolmogorov-Smirnov test
# in this case "background" must be the continuous variable from which grouplist can be drawn
backlist = background
if (is.null(mapping_column)) {
in_group = background[grouplist[which(grouplist %in% rownames(background))],abundance_column]
in_group_name = paste(intersect(grouplist, rownames(background)), collapse = ", ")
backlist = background[,abundance_column]
}
else {
# mapping_column adds the ability to use phospho-type data where the gene name (non-unique) is in the
# first column and the rownames are peptide ids
# unfortunately this means that "background" has to be the whole matrix and abundance_column
# has to be specified, which is a bit ugly
in_group = background[which(background[,mapping_column] %in% grouplist),abundance_column]
in_group_name = paste(intersect(background[,mapping_column], grouplist), collapse = ", ")
backlist = background[,abundance_column]
}
in_path = length(in_group)
if ((in_path > minsize)&(any(!is.na(in_path)))) {
in_back = length(backlist)
# enr = try(ks.test(in_group, backlist), silent=silence_try_errors)
# if (class(enr) == "try-error") {
# enr = NA
# p.value = NA
# #browser()
# }
# else {
# p.value = enr$p.value
# }
# The above block of code was replaced by the tryCatch block below to handle errors and warnings more elegantly.
# The if(class(enr)) statement causes an error which doesn't let the rest of the code run.
# Proposed change by Harkirat Sohi:
enr<-NA
enr<-tryCatch(
{
ks.test(in_group, backlist)
},
error=function(e) {
message('An Error Occurred')
return(NA)
}
)
if(length(enr)>1)
p.value<-enr$p.value
else
p.value<-NA
# this expression of foldx might be subject to some weird pathological conditions
# e.g. one sample has a background that is always negative, another that's positive
# may pertain to zscore too (although not sure it should)
#foldx = mean(in_group, na.rm=T)/mean(background, na.rm=T)
# rank from largest to smallest
# NOTE: By default, the function 'rank' outputs the position in the list from smallest to largest.
# that is, rank(backlist) has the most negative values at the top, with the most positive at the bottom.
# To get the positive entries at the top, negative at the bottom, we use rank(-backlist) instead.
if (is.null(mapping_column)) in_rank = rank(-backlist)[which(rownames(background) %in% grouplist)]
else in_rank = rank(-backlist)[which(background[,mapping_column] %in% grouplist)]
# this will give not a fold enrichment - but a score that ranges from 1 (most in top)
# to -1 (most in bottom).
foldx = 1-((mean(in_rank, na.rm=T)/length(backlist))/0.5)
zscore = (mean(in_group, na.rm=T)-mean(backlist, na.rm=T))/sd(in_group, na.rm=T)
#browser()
results[thisname,"ingroup_n"] = in_path
results[thisname,"ingroupnames"] = in_group_name
results[thisname,"ingroup_mean"] = mean(in_group, na.rm=T)
results[thisname,"outgroup_n"] = in_back
results[thisname,"outgroup_mean"] = mean(backlist, na.rm=T)
results[thisname,"zscore"] = zscore
results[thisname,"pvalue"] = p.value
results[thisname,"oddsratio"] = foldx
}
}
}
results[,"BH_pvalue"] = p.adjust(results[,"pvalue"], method="BH")
results[,"SignedBH_pvalue"] = results[,"BH_pvalue"]*sign(results[,"ingroup_mean"])
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.