R/cAMARETTO_Results.R

Defines functions cAMARETTO_Results GmtFromModules readGMT HyperGTestGeneEnrichment ExtractGenesInfo Extract_Genes_Modules_All

Documented in cAMARETTO_Results ExtractGenesInfo Extract_Genes_Modules_All GmtFromModules HyperGTestGeneEnrichment readGMT

#' @title cAMARETTO_Results
#' 
#' To initiate the community AMARETTO this results functions performs hyper geometric tests between all modules and gene sets.
#'
#' @param AMARETTOresults_all A list of multiple AMARETTO_Run outputs. The names are run names.
#' @param NrCores Nr of Cores that can be used to calculated the results.
#' @param output_dir A directory that stores gmt files for the runs
#' @param gmt_filelist A list with gmt files that are added into the communities. The names of the list are used in the networks. NULL if no list is added.
#' @param drivers Boolean that defines if only targets or drivers and targets are used to calculate the HGT.
#'
#' @return a list with cAMARETTOresults data objects from multiple runs
#' @importFrom gtools combinations
#' @importFrom dplyr arrange group_by left_join mutate select summarise  rename  filter everything pull distinct case_when
#' @importFrom stats p.adjust phyper
#' @examples 
#' 
#' Cibersortgmt <- "ciberSort.gmt"
#' cAMARETTOresults <- cAMARETTO_Results(AMARETTOresults_all, gmt_filelist=list(ImmuneSignature = Cibersortgmt), NrCores = 4 , output_dir = "./")
#' 
#' @export
cAMARETTO_Results <- function(AMARETTOresults_all, NrCores=1, output_dir="./", gmt_filelist=NULL, drivers = FALSE){
  
  #test if names are matching
  
  runnames <- names(AMARETTOresults_all)
  if (!length(unique(runnames)) == length(runnames)){
    stop("The run names are not unique. Give unique names.")
  }

  
  dir.create(file.path(output_dir, "gmt_files"), recursive = FALSE, showWarnings = FALSE)
  
  # for each file a gmt for the modules
  create_gmt_filelist<-c()

  for (run in runnames){
    gmt_file <- file.path(output_dir,"gmt_files",paste0(run, "_modules.gmt"))
    GmtFromModules(AMARETTOresults_all[[run]], gmt_file, run, Drivers = drivers)
    create_gmt_filelist <- c(create_gmt_filelist,gmt_file)
  }
  names(create_gmt_filelist)<-runnames

  # add extra gmt files to compare with
  given_gmt_filelist <- c()
  if (!is.null(gmt_filelist)){
    for (gmt_file in gmt_filelist){
      gmt_file_path <- file.path(gmt_file)
      given_gmt_filelist <- c(given_gmt_filelist,gmt_file_path)
    }
  }
  names(given_gmt_filelist)<-names(gmt_filelist)
  all_gmt_files_list <- c(create_gmt_filelist,given_gmt_filelist)

  if (! length(unique(names(all_gmt_files_list))) == length(names(all_gmt_files_list))){
    stop("There is overlap between the gmt file names and run names")
  }
  
  if (length(all_gmt_files_list) < 2){
    stop("There are none or only one group given, community AMARETTO needs at least two groups.")
  }
  
  # compare gmts pairwise between runs
  all_run_combinations <- as.data.frame(gtools::combinations(n=length(all_gmt_files_list), r=2, v=names(all_gmt_files_list), repeats.allowed=F))
  
  output_hgt_allcombinations <- apply(all_run_combinations, 1, function(x) {
    gmt_run1 <- all_gmt_files_list[x["V1"]]
    gmt_run2 <- all_gmt_files_list[x["V2"]]
    output_hgt_combination <- HyperGTestGeneEnrichment(gmt_run1, gmt_run2,runname1=as.character(x["V1"]),runname2=as.character(x["V2"]),NrCores=NrCores)
    return(output_hgt_combination)
  })
  
  genelists<-lapply(all_gmt_files_list, function(x){
    genelist<-readGMT(x)
    names(genelist)<-gsub(" ","_",names(genelist))
    return(genelist)
  })
  
  output_hgt_allcombinations <- do.call(rbind, output_hgt_allcombinations)
  output_hgt_allcombinations$padj <- stats::p.adjust(output_hgt_allcombinations$p_value, method="BH")
  output_hgt_allcombinations <- output_hgt_allcombinations %>% 
    dplyr::mutate(p_value=dplyr::case_when(Geneset1 == Geneset2~NA_real_, TRUE~p_value))
  output_hgt_allcombinations <- output_hgt_allcombinations %>% dplyr::mutate(Geneset1=ifelse(RunName1%in%names(given_gmt_filelist),paste0(RunName1,"|",gsub(" ","_",Geneset1)),Geneset1),Geneset2=ifelse(RunName2%in%names(given_gmt_filelist),paste0(RunName2,"|",gsub(" ","_",Geneset2)),Geneset2))
  
  # Extract relationship between genes and modules for all AMARETTo files and the given gmt files.
  all_genes_modules_df<-Extract_Genes_Modules_All(AMARETTOresults_all,gmt_filelist)
  
  return(list(runnames=runnames,gmtnames=names(given_gmt_filelist),hgt_modules=output_hgt_allcombinations, genelists = genelists, all_genes_modules_df=all_genes_modules_df, NrCores=NrCores))
}

#' @title GmtFromModules
#'
#' @param AMARETTOresults A AMARETTO_Run output.
#' @param gmt_file A gmtfilename
#' @param run A runname
#' @param Drivers Add drivers to the gmt-file
#'
#' @return Creates a gmt file for a AMARETTO run
#' @importFrom utils write.table
#' @importFrom dplyr select arrange
#' @export
GmtFromModules <- function(AMARETTOresults,gmt_file,run,Drivers=FALSE){
  ModuleMembership<-ExtractGenesInfo(AMARETTOresults,run)
  if (Drivers == FALSE){
    ModuleMembership<-dplyr::filter(ModuleMembership,Type=="Target")
  }
  ModuleMembership<-ModuleMembership%>%dplyr::select(GeneNames,ModuleNr)%>%dplyr::arrange(GeneNames)
  ModuleMembers_list<-split(ModuleMembership$GeneNames,ModuleMembership$ModuleNr)
  names(ModuleMembers_list)<-paste0(run,"|Module_",names(ModuleMembers_list))
  utils::write.table(sapply(names(ModuleMembers_list),function(x) paste(x,paste(ModuleMembers_list[[x]],collapse="\t"),sep="\t")),gmt_file,quote = FALSE,row.names = TRUE,col.names = FALSE,sep='\t')
}

#' @title readGMT
#'
#' @param filename A gmtfilename
#' @examples
#' readGMT("file.gmt")
#' @return Reads a gmt file
#' @export
readGMT<-function(filename){
  gmtLines<-strsplit(readLines(filename),"\t")
  gmtLines_genes <- lapply(gmtLines, tail, -2)
  names(gmtLines_genes) <- sapply(gmtLines, head, 1)
  
  return(gmtLines_genes)
}

#' @title GmtFromModules
#'
#' @param gmtfile1 A gmtfilename that you want to compare
#' @param gmtfile2 A second gmtfile to compare with.
#' @param runname1 name of the first dataset.
#' @param runname2 name of the second dataset.
#' @param NrCores  Number of cores for parallel computing. 
#' @param ref.numb.genes The reference number of genes.
#' @importFrom doParallel registerDoParallel 
#' @importFrom parallel makeCluster stopCluster
#' @importFrom foreach foreach %dopar% %do%
#' @importFrom stats p.adjust phyper
#' 
#' @return Creates resultfile with p-values and padj when comparing two gmt files with a hyper geometric test.
#' 
#' @export
HyperGTestGeneEnrichment<-function(gmtfile1, gmtfile2, runname1, runname2, NrCores, ref.numb.genes=45956){
  
  gmtfile1<-readGMT(gmtfile1) # our gmt_file_output_from Amaretto
  gmtfile2<-readGMT(gmtfile2)  # the hallmarks_and_co2...
  
  ###########################  Parallelizing :
  cluster <- parallel::makeCluster(c(rep("localhost", NrCores)), type = "SOCK")
  doParallel::registerDoParallel(cluster,cores=NrCores)
  
  resultloop<-foreach::foreach(j=1:length(gmtfile2), .combine='rbind') %do% {
    #print(j)
    foreach::foreach(i=1:length(gmtfile1),.combine='rbind') %dopar% {
      #print(i)
      l<-length(gmtfile1[[i]])
      k<-sum(gmtfile1[[i]] %in% gmtfile2[[j]])
      m<-ref.numb.genes
      n<-length(gmtfile2[[j]])
      p1<-stats::phyper(k-1,l,m-l,n,lower.tail=FALSE)
      
      overlapping.genes<-gmtfile1[[i]][gmtfile1[[i]] %in% gmtfile2[[j]]]
      overlapping.genes<-paste(overlapping.genes,collapse = ', ')
      c(RunName1=runname1, RunName2=runname2,Geneset1=names(gmtfile1[i]),Geneset2=names(gmtfile2[j]),p_value=p1,n_Overlapping=k,Overlapping_genes=overlapping.genes)
    }
  }
  
  parallel::stopCluster(cluster)
  resultloop<-as.data.frame(resultloop,stringsAsFactors=FALSE)
  resultloop$p_value<-as.numeric(resultloop$p_value)
  resultloop$n_Overlapping<-as.numeric((resultloop$n_Overlapping))
  resultloop[,"padj"]<-stats::p.adjust(resultloop[,"p_value"],method='BH')
  return(resultloop)
}

#' Title  ExtractGenesInfo 
#'
#' @param AMARETTOresults AMARETTOreults object from an AMARETTO run.
#' @param run the run-name string , for example :"TCGA_LIHC", associated with the AMARETTOreults run. 
#'
#' @return returns a dataframe with columns of runname, module-number, genename, gene-types, and weights of the driver genes.
#' @export
#'
#' @examples ExtractGenesInfo(AMARETTOresults,"TCGA-LIHC")
ExtractGenesInfo<-function(AMARETTOresults,run){
  ModuleMembership<-NULL
  for (ModuleNr in 1:AMARETTOresults$NrModules){
    Targets<- names(AMARETTOresults$ModuleMembership[which(AMARETTOresults$ModuleMembership==ModuleNr),1])
    Target_df<-data.frame(Run_Names=run,ModuleNr=ModuleNr,GeneNames=Targets,Type="Target",Weights=0,stringsAsFactors = FALSE) 
    Drivers <- names(which(AMARETTOresults$RegulatoryPrograms[ModuleNr,] != 0))
    Weight_Driver<-AMARETTOresults$RegulatoryPrograms[ModuleNr,Drivers]
    Drivers_df<-data.frame(Run_Names=run,ModuleNr=ModuleNr,GeneNames=Drivers,Type="Driver",Weights=Weight_Driver,stringsAsFactors = FALSE) 
    ModuleMembership <- rbind(ModuleMembership,Target_df,Drivers_df)
  }
  return(ModuleMembership)
}

#' Title Extract_Genes_Modules_All
#'
#' @param AMARETTOresults_all 
#' @param gmt_filelist 
#'
#' @importFrom utils stack
#' @importFrom dplyr mutate rename select
#' @return a dataframe for all AMARETTO files and given gmt file with the following structure :  runname, ModuleNr, genename, gene-types, and weights of the driver genes.
#' @export
#' @examples Extract_Genes_Modules_All(AMARETTOresults_all,gmt_filelist)
Extract_Genes_Modules_All<-function(AMARETTOresults_all,gmt_filelist){
  all_genes_modules_df<-NULL
  for (run in names(AMARETTOresults_all)){
    all_genes_modules_df<-rbind(all_genes_modules_df,ExtractGenesInfo(AMARETTOresults_all[[run]],run))
  }
  all_genes_modules_df<-all_genes_modules_df%>%dplyr::mutate(ModuleNr=paste0("Module_",ModuleNr))%>%dplyr::mutate(AMARETTOres=1)
  
  for (run in names(gmt_filelist)){
  gmt_genes_df<-utils::stack(readGMT(gmt_filelist[[run]]))%>%dplyr::mutate(Run_Names=run)%>%dplyr::rename(ModuleNr=ind)%>%dplyr::rename(GeneNames=values)%>%
    dplyr::mutate(Type="Target")%>%dplyr::mutate(Weights=0)%>%dplyr::mutate(AMARETTOres=0)%>%dplyr::select(Run_Names,ModuleNr,GeneNames,Type,Weights,AMARETTOres)
  all_genes_modules_df<-rbind(all_genes_modules_df,gmt_genes_df)
  }
  return(all_genes_modules_df)
}
broadinstitute/ImagingAMARETTO documentation built on Dec. 3, 2019, 6:38 p.m.