#!/usr/bin/env/R
####### ACSN enrichment analysis #######
####### Single sample analysis / single cohort analysis #######
#' Gene set enrichment analysis
#'
#' Compute and represent gene set enrichment from your data based on pre-saved maps from ACSN or user imported maps.
#' The gene set enrichment can be run with hypergeometric test or Fisher exact test,
#' and can use multiple corrections. Visualization of data can be done either by barplots or heatmaps.
#'
#' @param Genes Character vector of genes that should be tested for enrichment
#' @param maps list of maps generated by format_from_gmt. Names of element of list will be used to track modules.
#' Default: tests on the master map.
#' @param correction_multitest either FALSE, "bonferroni", "holm", "hochberg", "hommel", "BH", "fdr" (identical to BH), or "BY"
#' @param statistical_test one of "fisher", "hypergeom"
#' @param min_module_size will remove from the analysis all modules which are (strictly) smaller than threshold
#' @param universe Universe on which the statistical analysis should be performed.
#' Can be either "HUGO","ACSN","map_defined", or a character vector of genes.
#' @param Remove_from_universe Default is NULL. A list of genes that should not be considered for enrichment
#' (will be removed from input, maps, and universe). The size of universe and map will be updated after removal.
#' @param threshold maximal p-value (corrected if correction is enabled) that will be displayed
#' @param alternative One of "greater", "less", "both" or "two.sided"
#' Greater will check for enrichment, less will check for depletion, and both will look for
#' both and will keep track of the side,
#' while two-sided (only for fisher test) checks if there is a difference.
#' @return Output is a dataframe with the following columns:\describe{
#' \item{module}{The name of the map or the module preceded by the map}
#' \item{module_size}{The number of genes in the module after taking into account universe reduction}
#' \item{nb_genes_in_module}{The number of genes from input list in the module}
#' \item{genes_in_module}{Names of the genes from input list in the module, space separated}
#' \item{universe_size}{size of the input universe}
#' \item{nb_genes_in_universe}{number of genes from the input list that are found in the universe}
#' \item{test}{the kind of test that was looked for. "greater" when enrichment is tested, "less" when depletion is tested, or "two.sided"}
#' }
#' @examples enrichment(genes_test,min_module_size = 10,
#' threshold = 0.05,
#' maps = list(cellcycle = ACSNMineR::ACSN_maps$CellCycle),
#' universe = "ACSN")
#' @export
#' @importFrom stats fisher.test p.adjust phyper
#' @importFrom utils read.csv
enrichment<-function(Genes=NULL,
maps = c("Apoptosis",
"CellCycle",
"DNA_repair",
"EMT_motility",
"Survival" ),
correction_multitest = "BH",
statistical_test = "fisher",
min_module_size = 5,
universe = "map_defined",
Remove_from_universe = NULL,
threshold = 0.05,
alternative = "greater"){
if(statistical_test=="hypergeom" & (alternative != "greater")){
warning("Depletion with hypergeometric test can be unreliable. Prefer fisher test.")
}
### Checking maps
if(is.data.frame(maps) | is.matrix(maps)){
maps<-list(maps)
}
else if(!is.list(maps)){
if(is.character(maps)){
map_names<-names(ACSNMineR::ACSN_maps)
w<-numeric()
for(map_index in maps){
w<-c(w,which(grepl(pattern = map_index, x = map_names, ignore.case = TRUE)))
}
w<- unique(w)
#print(w)
if(length(w)>1){
maps<-ACSNMineR::ACSN_maps[w]
mapnames<-names(ACSNMineR::ACSN_maps)[w]
names(maps)<-mapnames
}
else if(length(w)){
maps<-list(ACSNMineR::ACSN_maps[[w]])
names(maps)<-map_names[w]
}
else{
stop("The name of the maps you entered are unknown")
}
}
else{
stop("maps should be a dataframe or a list of dataframes. Exiting")
}
}
if(universe[1] == "ACSN"){
universe_was_ACSN<-TRUE
}
else{
universe_was_ACSN<-FALSE
}
### Checking that gene list is unique
Genes<-unique(Genes)
if(!is.null(Remove_from_universe)){
Remove_from_universe<-unique(Remove_from_universe)
test<-Genes %in% Remove_from_universe
if(sum(test)){
warning(paste("The following genes were found in input and asked to be removed from universe:\n",
paste(Genes[test],collapse = " "))
)
Genes<-Genes[!test]
}
}
Genes_size<-length(Genes)
if(!(alternative %in% c("greater","less","both","two.sided"))){
stop('enrichment variable should be one of:"greater","less","both","two.sided" ')
}
### Checking that parameters are correct
if(sum(statistical_test %in% c("fisher", "hypergeom"))==0){
stop("statistical_test should be one of 'fisher' or 'hypergeom'")
}
if(alternative == "two.sided" & statistical_test != "fisher"){
warning("two-sided test only relevant for fisher test. \n Will perform both enrichment and under-representation tests.")
alternative<-"both"
}
result<-data.frame()
### If universe is ACSN: extract genes from ACSN and define it as universe
if(length(universe) == 1){
if(universe == "ACSN"){
if(is.null(Remove_from_universe)){
genesACSN<-unique(unlist(lapply(X = ACSNMineR::ACSN_maps,FUN = function(z){
return(as.character(unique(z[,-(1:2)])))
})))
universe<-genesACSN[genesACSN!=""]
size<-length(genesACSN)
}
else{ # Filter out genes to be removed from analysis
genesACSN<- genesACSN<-unique(unlist(lapply(X = ACSNMineR::ACSN_maps,FUN = function(z){
return(as.character(unique(z[,-(1:2)])))
})))
universe<-genesACSN[!(genesACSN %in% c("",Remove_from_universe))]
size<-length(genesACSN)
}
}
else if(universe == "HUGO"){
###Total size of approved symbols, from http://www.genenames.org/cgi-bin/statistics, as of October 8th 2015
if(is.null(Remove_from_universe)){
size = 39480
}
else{ ## Filter out Remove_from_universe
size = 39480 - length(Remove_from_universe)
if(is.list(maps)){
maps<-lapply(maps,FUN = function(df){
df[df %in% Remove_from_universe]<-""
})
}
else{
maps[maps %in% Remove_from_universe]<-""
}
}
}
else if( universe == "map_defined"){
genesmap<-character()
iterator<-0
if(is.list(maps)){
genesmap<-unique(as.character(unlist(lapply(X = maps,FUN = function(z){
return(as.character(z[,-(1:2)]))
}))))
}
else{
genesmaps<-unique(as.character(maps[,-(1:2)]))
}
genesmap<-as.character(genesmap[genesmap!=""])
if(!is.null(Remove_from_universe)){
genesmap<-genesmap[!(genesmap %in% Remove_from_universe)]
}
is_in_ACSN<-Genes %in% genesmap
S<-sum(!is_in_ACSN)
if(S > 0){ ### removing genes from list, that are not from ACSN
warning(paste(S,"genes are not in ACSN modules and will be excluded from analysis"))
}
Genes<-Genes[is_in_ACSN]
size <- length(genesmap)
universe<-genesmap
}
else{
stop("Invalid universe input: must be 'HUGO','ACSN','map_defined', or a list of genes as character vector")
}
### Length of universe can be changed if "ACSN"
}
if(length(universe)> 1){
### Change maps so that they only have gene names from universe and remove modules which are too small
i<-0
genesACSN<-character()
iterator<-0
for(lt in maps){
iterator<-iterator+1
### extract modules of size >= module_size and get size of ACSN reduced by universe and module size
validmap<-data.frame()
if(dim(lt)[1]>1){ ### map is a dataframe
for(k in 1:dim(lt)[1]){
spare_genes<-as.character(lt[k,-c(1,2)])
spare_genes[!(spare_genes %in% universe)]<- ""
test<-spare_genes != ""
S<-sum(test)
if( S>= min_module_size){
genesACSN<-unique(c(genesACSN,unique(spare_genes[test])))
validmap<-rbind(validmap,cbind(lt[k,1],S,t(spare_genes)))
}
}
}
else{ #map is a single line
if(length(lt)>2){
spare_genes<-as.character(sapply(3:length(lt),FUN = function(z) as.character(lt[[z]])))
test<-spare_genes != ""
S<-sum(test)
if( S>= min_module_size){
genesACSN<-unique(c(genesACSN,unique(spare_genes[test])))
validmap<-rbind(validmap,cbind(lt[1],S,t(spare_genes)))
validmap[,2]<-as.integer(as.character(validmap[,2]))
}
}
else{
stop("Empty map, exiting")
}
}
if(!universe_was_ACSN){
size <- length(universe)
}
maps[[iterator]]<-validmap
}
is_in_ACSN<-Genes %in% genesACSN
S<-sum(!is_in_ACSN)
if(S > 0){ ### removing genes from list, that are not from ACSN
warning(paste(S,"genes are not in universe and will be excluded from analysis"))
}
Genes<-Genes[is_in_ACSN]
}
if(length(Genes)==0){
stop("No genes in universe.")
result<-data.frame()
}
##################################
#### Begin calculation of p-values
##################################
### get from list how many are in each sub-compartment
result<-data.frame()
### what would be the expected?
tracker<-0
map_names<-names(maps)
if(is.null(map_names)){
if(length(maps)>1){
message("Maps should be defined in this way: maps<-list(map1 = ... , map2 = ...)")
message("In the absence of names, will use letters")
}
map_names<-paste("map",1:length(maps),sep = "_")
}
for(map in maps){
tracker <- tracker + 1
if(dim(map)[1]==0){
message(paste("Map",map_names[tracker],"has no modules left from restriction to universe"))
}
else{
modules<-map[,1:2]
### Calculate p-value for map as a whole if map is not ACSN_master
###########################
### SHOULD WE COMPUTE P-VAL FOR WHOLE MAP?
###########################
if(universe_was_ACSN & map_names[tracker]!="ACSN_master"){
compute_module_p.value<-TRUE
}
else if(!is.data.frame(maps) & length(maps)>1 & map_names[tracker]!="ACSN_master"){
compute_module_p.value<-TRUE
}
else{
compute_module_p.value<-FALSE
}
########################
# BEGIN MAP TESTING
########################
if( compute_module_p.value ){ ###
mapgenes<-unique(as.character(unlist(map[,-c(1:2)])))
mapsize<-length(mapgenes)
genes_in_mapgenes<-(Genes %in% mapgenes)
Ngenes_in_mapgenes<-sum(genes_in_mapgenes)
########################
# BEGIN FISHER TESTING
########################
if(statistical_test == "fisher"){
if(alternative != "both"){
p.values<-c(p.val.calc(Ngenes_in_mapgenes,
mapsize-Ngenes_in_mapgenes,
length(Genes),
size - length(Genes),
"fisher",
alternative
),
alternative)
}
else{ ### if both computations
p.values<-cbind(c(p.val.calc(Ngenes_in_mapgenes,
mapsize-Ngenes_in_mapgenes,
length(Genes),
size - length(Genes),
"fisher",
"greater"
),
"enrichment"),
c(p.val.calc(Ngenes_in_mapgenes,
mapsize-Ngenes_in_mapgenes,
Genes_size,
size - length(Genes),
"fisher",
"less"
),
"depletion")
)
}
}
########################
# BEGIN HYPERGEOM TESTING
########################
else{ ### Test is != from fisher
### Ngcalc: Ngenes corrected or not for P[X<=k] (default for phyper, needs to be P[X<k] for p-value)
if(alternative == "less"){
Ngcalc<-Ngenes_in_mapgenes
}
else{
if(Ngenes_in_mapgenes>0){
Ngcalc<-Ngenes_in_mapgenes-1
}
else{
Ngcalc<-0
}
}
if(alternative == "greater" & Ngenes_in_mapgenes==0){ ### You cannot have enrichment if you have 0 genes in module
p.values<-c(1,alternative)
}
else if(alternative != "both"){ # alternative is (less) or (greater & Ngenes>0)
p.values<-c(p.val.calc( Ngcalc,
length(Genes),
size - length(Genes),
mapsize,
"hypergeom",
alternative
),
alternative
)
}
else{ ### Testing both hypotheses
if(Ngenes_in_mapgenes>0){
p.values<-cbind(c(p.val.calc( Ngcalc,
length(Genes),
size - length(Genes),
mapsize,
"hypergeom",
'greater'
),
'enrichment'),
c(p.val.calc( Ngenes_in_mapgenes, ### for not using Ngcalc see def of Ngcalc
length(Genes),
size - length(Genes),
mapsize,
"hypergeom",
'less'
),
'depletion'
)
)
print(p.values)
}
else{ ### Both alternatives and Ngenes is 0
p.values<-cbind(c(1,'greater'),
c(p.val.calc( Ngenes_in_mapgenes, ### for not using Ngcalc see def of Ngcalc
length(Genes),
size - length(Genes),
mapsize,
"hypergeom",
'less'
),
'less'
)
)
print(p.values)
}
}
}
spare<-cbind(map_names[tracker],
mapsize,
paste(Genes[genes_in_mapgenes],collapse = " "),
Ngenes_in_mapgenes,
t(p.values))
colnames(spare)<-c("module","module_size","genes_in_module",
"nb_genes_in_module","p.value","test")
result<-rbind(result,spare)
result$module_size<-as.integer(as.character(result$module_size))
} #### End of module p-value
if(length(maps)>1){
modules[,1]<-paste(map_names[tracker],modules[,1],sep=":")
map[,1]<-modules[,1]
}
##########################
#### BEGIN MODULE TESTING
#########################
### BEGIN FISHER TEST###
#########################
if(statistical_test == "fisher"){
p.values<-apply(map,MARGIN = 1, FUN = function(z){
short_z<-z[z!=""][-c(1,2)] ### remove empty slots, module name and length
Module_size<-cnum(z[2])
test<-Genes %in% short_z
Genes_in_module<-sum(test)
Gene_set<-paste(Genes[test],collapse = " ")
if(alternative != "both"){
return(t(c(Gene_set,Genes_in_module,p.val.calc(x = Genes_in_module,
y = Module_size-Genes_in_module,
z = length(Genes),
a = size - length(Genes),
stat_test = "fisher",
alt = alternative),alternative))
)
}
else{ ### test for both greater and less
return(cbind(t(c(z[1],z[2],Gene_set,Genes_in_module,p.val.calc(x = Genes_in_module,
y = Module_size-Genes_in_module,
z = length(Genes),
a = size - length(Genes),
stat_test = "fisher",
alt = "greater"),"enrichment")),
t(c(z[1],z[2],Gene_set,Genes_in_module,p.val.calc(Genes_in_module,
Module_size-Genes_in_module,
length(Genes),
size - length(Genes),
stat_test = "fisher",
"less"),"depletion"))))
}
}
)
#########################
### BEGIN HYPERGEOM TEST#
#########################
}else if(statistical_test == "hypergeom"){
p.values<-apply(map,MARGIN = 1, FUN = function(z){
short_z<-z[z!=""][-c(1,2)] ### remove empty slots, module name and length
Module_size<-cnum(z[2])
test<-Genes %in% short_z
Genes_in_module<-sum(test)
Gene_set<-paste(Genes[test],collapse = " ")
if(alternative=="greater"){
if(Genes_in_module > 0){ ### Correction for depletion: phyper tests for P[X<=x]
Genes_in_module_calc<-Genes_in_module-1
}
else{ ### Return p-value of 1 (you cannot have enrichment with 0 genes!)
return(c(Gene_set,Genes_in_module,1,alternative))
}
}
else{
Genes_in_module_calc<-Genes_in_module
}
if(alternative != "both"){ #### Either return less or greater&Ngenes>0
return(c(Gene_set,Genes_in_module,p.val.calc(x = Genes_in_module_calc,
y = length(Genes),
z =size - length(Genes),
a = Module_size,
"hypergeom",
alternative),
alternative))
}
else{ #### p-value for both lower and higher tail
if(Genes_in_module > 0){ ### Correction for depletion: phyper tests for P[X<=x]
return(cbind(t(c(z[1],z[2],Gene_set,Genes_in_module,p.val.calc(x = Genes_in_module - 1,
y = length(Genes),
z = size - length(Genes),
a = Module_size,
"hypergeom",
"greater"),
"enrichment")),
t(c(z[1],z[2],Gene_set,Genes_in_module,p.val.calc(x= Genes_in_module,
y=length(Genes),
z=size - length(Genes),
a=Module_size,
"hypergeom",
"less"),
"depletion")))
)
}
else{
return(cbind(t(c(z[1],z[2],Gene_set,Genes_in_module,1,"enrichment")),
t(c(z[1],z[2],Gene_set,Genes_in_module,p.val.calc(x= Genes_in_module,
y=length(Genes),
z=size - length(Genes),
a=Module_size,
"hypergeom",
"less"),
"depletion"))))
}
}
}) ### end of phyper apply function
} ### end of statistical test
}
##################
#END HYPERGEOM ###
#BEGIN Data formating#
##################
if(alternative!="both"){
spare<-cbind(modules,t(p.values))
}
else{
spare<-rbind(t(p.values)[,1:6],t(p.values)[,7:12])
}
colnames(spare)<-c("module","module_size","genes_in_module",
"nb_genes_in_module","p.value","test")
result<-rbind(result,spare)
} ### end of maps
result$p.value<-cnum(result$p.value)
result$universe_size<-size
result$genes_in_universe<-Genes_size
if(is.logical(correction_multitest)){
result<-result[cnum(result$p.value) <= threshold,]
if(correction_multitest){
warning("If multiple correction is wanted, please use one of the listed parameters.")
}
}
else{
if(correction_multitest %in% c("bonferroni", "holm", "hochberg", "hommel", "BH", "fdr", "BY")){
result$p.value.corrected<-p.adjust(cnum(result$p.value),method = correction_multitest)
result<-result[result$p.value.corrected <= threshold,]
}
else{
result<-result[result$p.value <= threshold,]
warning('correction multitest should be one of the following: "bonferroni", "holm", "hochberg", "hommel", "BH", "fdr", "BY"')
}
}
###adding missing info
if(dim(result)[1]==0){
warning("No modules under threshold")
return(NA)
}
result$universe_size<-size
result$nb_genes_in_universe<-length(Genes)
p.val.names<-colnames(result)[grepl(pattern = "p.value",x = colnames(result))]
### re-ordering
result<-result[,c("module","module_size","nb_genes_in_module",
"genes_in_module","universe_size",
"nb_genes_in_universe",
p.val.names,
"test")]
return(result)
}
####### Multiplesample analysis / multiple cohorts analysis #######
#' Automated gene set analysis for multiple sets
#'
#'
#' @param Genes_by_sample List of character vectors. Each list element name should be a sample name,
#' and each character vector the set of genes to test for the sample.
#' @param maps list of maps generated by format_from_gmt. Default: tests on all acsn maps
#' @param correction_multitest either FALSE, "bonferroni", "holm", "hochberg", "hommel", "BH", "fdr" (identical to BH), or "BY"
#' @param statistical_test one of "fisher", "hypergeom"
#' @param min_module_size will remove from the analysis all modules which are (strictly) smaller than threshold
#' @param universe Universe on which the statistical analysis should be performed. Can be either "HUGO","ACSN"
#' ,"map_defined", or a character vector of genes.
#' @param Remove_from_universe Default is NULL. A list of genes that should not be considered for enrichment
#' (will be removed from input, maps, and universe). The size of universe and map will be updated after removal.
#' @param threshold maximal p-value (corrected if correction is enabled) that will be displayed
#' @param cohort_threshold if TRUE modules will be kept in all samples if at least one sample
#' has p-value lower than threshold, otherwise the threshold is applied for each sample independently.
#' @param alternative One of "greater", "less", "both", or "two.sided" (only for fisher test).
#' Greater will check for enrichment, less will check for depletion, and both will look for both.
#' @return Output is a list of dataframes with names the names given in `Genes_by_sample` with the following columns:\describe{
#' \item{module}{The name of the map or the module preceded by the map}
#' \item{module_size}{The number of genes in the module after taking into account universe reduction}
#' \item{nb_genes_in_module}{The number of genes from input list in the module}
#' \item{genes_in_module}{Names of the genes from input list in the module, space separated}
#' \item{universe_size}{size of the input universe}
#' \item{nb_genes_in_universe}{number of genes from the input list that are found in the universe}
#' \item{test}{the kind of test that was looked for. "greater" when enrichment is tested, "less" when depletion is tested, or "two.sided"}
#' }
#' @examples multisample_enrichment(Genes_by_sample = list(set1 = genes_test,set2=c(genes_test,"PTPRD")),
#' maps = list(cellcycle = ACSNMineR::ACSN_maps$CellCycle),
#' min_module_size = 10,
#' universe = "ACSN",cohort_threshold = FALSE)
#' @export
multisample_enrichment<-function(Genes_by_sample=NULL,
maps = c("Apoptosis",
"CellCycle",
"DNA_repair",
"EMT_motility",
"Survival" ),
correction_multitest = "BH",
statistical_test = "fisher",
min_module_size = 5,
universe = "map_defined",
Remove_from_universe = NULL,
threshold = 0.05,
cohort_threshold = TRUE,
alternative = "greater"){
### TO-DO : restrict genes to universe to gain computation time
if(cohort_threshold){
result<-lapply(X = Genes_by_sample ,
FUN = function(z){
enrichment(Genes = z,
maps = maps,
correction_multitest = correction_multitest,
statistical_test = statistical_test,
min_module_size = min_module_size,
universe = universe,
Remove_from_universe =Remove_from_universe,
threshold = 1)
})
kept_modules<-character()
for(sample in result){
if(is.logical(correction_multitest)){
kept_modules<-unique(c(kept_modules,sample[sample$p.value<=threshold,1]))
}
else{
kept_modules<-unique(c(kept_modules,sample[sample$p.value.corrected<=threshold,1]))
}
}
for(i in 1:length(result)){
result[[i]]<-result[[i]][result[[i]][,1] %in% kept_modules,]
}
}
else{
result<-lapply(X = Genes_by_sample ,
FUN = function(z){
enrichment(Genes = z,
maps = maps,
correction_multitest = correction_multitest,
statistical_test = statistical_test,
min_module_size = min_module_size,
universe = universe,
Remove_from_universe = Remove_from_universe,
threshold = threshold)
})
names(result)<-names(Genes_by_sample)
}
return(result)
}
#' Calculate p-value
#'
#' @param x : first value
#' @param y : second value
#' @param z : third value
#' @param a : fourth value
#' @param stat_test : statistical test to be used
#' @param alt : alternative: one of two-sided, greater, less or both
#'
p.val.calc<-function(x,y,z,a,stat_test,alt){
if(stat_test=="fisher"){
M<-matrix(c(x,y,z,a),nrow = 2)
FT<-fisher.test(M, alternative =alt)
return(FT$p.value)
}
else{
if(alt =="greater"){
return(phyper(x, y,z,a,lower.tail = FALSE))
}
else{
return(phyper(x, y,z,a,lower.tail = TRUE))
}
}
}
#' Convert to numeric
#'
#' @param x A vector of numbers which is not in numeric format
cnum<-function(x){
return(as.numeric(as.character(x)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.