R/ggenrichData.r

Defines functions ggenrichData

Documented in ggenrichData

#' Preprocess enrichment-analysis result files
#'
#' @description
#' `ggenrichData()` reads one or more enrichment result files and reshapes
#' them according to `format` and `type`. Only base R (and `utils`) is used.
#'
#' Supported combinations:
#' \itemize{
#'   \item `format = "JAVA_GSEA"`: `type = "pathways"`.
#'   \item `format = "R_ClusterProfiler"`: `type = "pathways"`,
#'     `"groups_pathway"`, `"cluster"` or `"groups_cluster"`.
#'   \item `format = "DAVID"`: `type = "cluster"` or `"groups_cluster"`.
#' }
#' Any other combination raises an error.
#'
#' @param fnames Input file or files. `format = "R_ClusterProfiler"` with
#'   `type = "pathways"` takes exactly one file; every other mode takes one
#'   or more file paths.
#' @param format The way the enrichment results were produced: one of
#'   `"JAVA_GSEA"`, `"R_ClusterProfiler"`, `"DAVID"`.
#' @param type Analysis type: one of `"pathways"`, `"groups_pathway"`,
#'   `"cluster"`, `"groups_cluster"`. Default `"pathways"`.
#' @param GENE_MAX Exclusive upper bound on the background gene count of a
#'   term (the number before the `/` in `BgRatio`). Default 300. Only used by
#'   the `"R_ClusterProfiler"` cluster types.
#' @param GENE_MIN Exclusive lower bound on the background gene count of a
#'   term. Default 100. Only used by the `"R_ClusterProfiler"` cluster types.
#' @param PADJ_CUTOFF A term is kept when its `p.adjust` is strictly below
#'   this value in at least one input file. Default 0.001. Only used by the
#'   `"R_ClusterProfiler"` cluster types.
#'
#' @return A data frame whose layout depends on the mode:
#' \itemize{
#'   \item JAVA_GSEA / pathways: all files (read with `read.delim`) stacked,
#'     with a leading `.id` column holding the file name and a `pathway`
#'     column holding the file name without its trailing `.xls`.
#'   \item R_ClusterProfiler / pathways: the file as read by `read.csv`.
#'   \item R_ClusterProfiler / groups_pathway: all files stacked, with a
#'     leading `.id` column holding the file name.
#'   \item R_ClusterProfiler / cluster: the unique `ID`, `Description`,
#'     `BgRatio` rows that pass the `PADJ_CUTOFF`, `GENE_MIN` and `GENE_MAX`
#'     filters, plus a numeric `Bg` column.
#'   \item R_ClusterProfiler / groups_cluster: one row per retained term (row
#'     names are the descriptions) and one column per file, named `G1`,
#'     `G2`, ..., holding that file's `p.adjust` for the term (`NA` when the
#'     term is absent from the file).
#'   \item DAVID / cluster: all files stacked, with a leading `.id` column and
#'     a `group` column holding the file name without its trailing `.csv`.
#'   \item DAVID / groups_cluster: the files outer-joined on
#'     `Annotation_information` (used as row names), keeping the columns whose
#'     name contains `adj_pval`.
#' }
#'
#' @note For the R_ClusterProfiler cluster types, columns 2, 3, 5 and 7 of
#'   each file are used and must be named `ID`, `Description`, `BgRatio` and
#'   `p.adjust` (presumably the layout of an `enrichGO` result written with
#'   `write.csv` -- confirm against the files you use).
#'
#'   For the DAVID format, the first column of the input file is the
#'   annotation information (term), the second column is the gene ID (or
#'   ENTREZ ID, gene symbol), and the third column is the adj_pval. The
#'   `"groups_cluster"` type requires the term column to be named
#'   `Annotation_information`.
#'
#'   When files are outer-joined, columns follow the order of `fnames`. With
#'   up to three files the column names are those `merge()` produces
#'   (`.x`, `.y`, then unsuffixed); with more files a numeric suffix keeps
#'   them unique.
#'
#' @examples
#' \dontrun{
#' gsea <- system.file(
#'   "extdata",
#'   c("HALLMARK_COAGULATION.xls", "HALLMARK_HYPOXIA.xls",
#'     "HALLMARK_PI3K_AKT_MTOR_SIGNALING.xls"),
#'   package = "ggenrich", mustWork = TRUE
#' )
#' result <- ggenrichData(gsea, format = "JAVA_GSEA")
#'
#' david <- system.file("extdata", "easy_input.csv",
#'                      package = "ggenrich", mustWork = TRUE)
#' result <- ggenrichData(david, format = "DAVID", type = "cluster")
#'
#' ego <- system.file(
#'   "extdata", c("enrichGO1.csv", "enrichGO2.csv", "enrichGO3.csv"),
#'   package = "ggenrich", mustWork = TRUE
#' )
#' result <- ggenrichData(ego, format = "R_ClusterProfiler",
#'                        type = "groups_cluster",
#'                        GENE_MAX = 400, GENE_MIN = 200)
#' result <- ggenrichData(ego, format = "R_ClusterProfiler",
#'                        type = "cluster", GENE_MAX = 400, GENE_MIN = 200)
#'
#' single <- system.file("extdata", "gsea_new_output.csv",
#'                       package = "ggenrich", mustWork = TRUE)
#' result <- ggenrichData(single, format = "R_ClusterProfiler")
#'
#' groups <- system.file("extdata", c("gsea_group1.csv", "gsea_group2.csv"),
#'                       package = "ggenrich", mustWork = TRUE)
#' result <- ggenrichData(groups, format = "R_ClusterProfiler",
#'                        type = "groups_pathway")
#' }
#' @export
ggenrichData <- function(fnames, format, type = "pathways",
                         GENE_MAX = 300, GENE_MIN = 100,
                         PADJ_CUTOFF = 0.001) {
  supported <- list(
    JAVA_GSEA = "pathways",
    R_ClusterProfiler = c("pathways", "groups_pathway",
                          "cluster", "groups_cluster"),
    DAVID = c("cluster", "groups_cluster")
  )

  # Fail loudly instead of falling through every branch and returning NULL.
  if (!is.character(format) || length(format) != 1L || is.na(format) ||
      !format %in% names(supported)) {
    stop("`format` must be one of: ",
         paste(names(supported), collapse = ", "), ".", call. = FALSE)
  }
  if (!is.character(type) || length(type) != 1L || is.na(type) ||
      !type %in% supported[[format]]) {
    stop("`type` for format \"", format, "\" must be one of: ",
         paste(supported[[format]], collapse = ", "), ".", call. = FALSE)
  }
  if (length(fnames) == 0L) {
    stop("`fnames` must name at least one input file.", call. = FALSE)
  }

  if (format == "JAVA_GSEA") {
    result <- .ggenrich_bind_files(
      .ggenrich_read_files(fnames, utils::read.delim)
    )
    # Strip only a literal trailing ".xls"; the directory part is kept.
    result$pathway <- sub("\\.xls$", "", result$.id)
    return(result)
  }

  if (format == "R_ClusterProfiler") {
    if (type == "pathways") {
      if (length(fnames) != 1L) {
        stop("format \"R_ClusterProfiler\" with type \"pathways\" takes ",
             "exactly one file.", call. = FALSE)
      }
      return(utils::read.csv(fnames, header = TRUE, sep = ","))
    }
    if (type == "groups_pathway") {
      return(.ggenrich_bind_files(
        .ggenrich_read_files(fnames, utils::read.csv)
      ))
    }

    # "cluster" and "groups_cluster" share the term-selection step.
    tables <- .ggenrich_read_files(
      fnames,
      function(path) utils::read.csv(path)[, c(2, 3, 5, 7)]
    )
    terms <- .ggenrich_filter_terms(tables, GENE_MAX, GENE_MIN, PADJ_CUTOFF)
    if (type == "cluster") {
      return(terms)
    }

    # groups_cluster: unfiltered per-file p.adjust for every retained term.
    pvals <- .ggenrich_outer_join(
      lapply(tables, function(tbl) tbl[, c("ID", "p.adjust"), drop = FALSE]),
      key = "ID"
    )
    wide <- merge(terms[, 1:2], pvals, by = "ID", all.x = TRUE)
    rownames(wide) <- wide$Description
    wide$ID <- NULL
    wide$Description <- NULL
    colnames(wide) <- paste0("G", seq_along(fnames))
    return(wide)
  }

  # format == "DAVID"
  tables <- .ggenrich_read_files(fnames, utils::read.csv)
  if (type == "cluster") {
    stacked <- .ggenrich_bind_files(tables)
    # Strip only a literal trailing ".csv"; the directory part is kept.
    stacked$group <- sub("\\.csv$", "", stacked$.id)
    return(stacked)
  }

  # DAVID groups_cluster
  key <- "Annotation_information"
  trimmed <- lapply(tables, function(tbl) {
    # Case-insensitive literal match on "adj_pval", plus the join key.
    keep <- names(tbl) == key |
      grepl("adj_pval", tolower(names(tbl)), fixed = TRUE)
    # A term may appear once per gene; collapse those repeats before joining.
    unique(tbl[, keep, drop = FALSE])
  })
  wide <- unique(.ggenrich_outer_join(trimmed, key = key))
  rownames(wide) <- wide[[key]]
  wide[[key]] <- NULL
  wide
}

# Read every file in `fnames` with `reader`; the list is named by file name.
.ggenrich_read_files <- function(fnames, reader) {
  tables <- lapply(fnames, reader)
  names(tables) <- fnames
  tables
}

# Stack a named list of data frames row-wise. A leading `.id` column records
# the list name each row came from; columns missing from a table are NA-filled.
.ggenrich_bind_files <- function(tables) {
  columns <- unique(unlist(lapply(tables, names), use.names = FALSE))
  padded <- Map(function(tbl, id) {
    tbl <- as.data.frame(tbl)
    for (col in setdiff(columns, names(tbl))) {
      tbl[[col]] <- rep(NA, nrow(tbl))
    }
    data.frame(.id = rep(id, nrow(tbl)), tbl[, columns, drop = FALSE],
               stringsAsFactors = FALSE, check.names = FALSE)
  }, tables, names(tables))
  bound <- do.call(rbind, unname(padded))
  rownames(bound) <- NULL
  bound
}

# Select the unique (ID, Description, BgRatio) rows whose p.adjust is below
# `padj_cutoff` in at least one table and whose background gene count lies
# strictly between `gene_min` and `gene_max`.
.ggenrich_filter_terms <- function(tables, gene_max, gene_min, padj_cutoff) {
  pooled <- .ggenrich_bind_files(tables)
  # which() drops NA comparisons instead of producing all-NA rows.
  pooled <- pooled[which(pooled$p.adjust < padj_cutoff), , drop = FALSE]
  # Columns 2:4 follow the leading `.id`: ID, Description, BgRatio.
  terms <- unique(pooled[, 2:4])
  # Background gene count is the part of BgRatio before the first "/".
  terms$Bg <- as.numeric(sub("/.*$", "", terms$BgRatio))
  terms[which(terms$Bg < gene_max & terms$Bg > gene_min), , drop = FALSE]
}

# Full outer join of a list of data frames on `key`, in list order.
# Clashing column names get merge()'s ".x"/".y" suffixes; names that would
# still collide (four or more tables) are made unique before each join.
.ggenrich_outer_join <- function(tables, key) {
  has_key <- vapply(tables, function(tbl) key %in% names(tbl), logical(1))
  if (!all(has_key)) {
    stop("Every input file must contain a column named \"", key, "\".",
         call. = FALSE)
  }
  joined <- tables[[1L]]
  for (tbl in tables[-1L]) {
    left <- names(joined)
    right <- names(tbl)
    clash <- setdiff(intersect(left, right), key)
    left[left %in% clash] <- paste0(left[left %in% clash], ".x")
    right[right %in% clash] <- paste0(right[right %in% clash], ".y")
    incoming <- which(right != key)
    pool <- make.unique(c(left, right[incoming]))
    names(joined) <- pool[seq_along(left)]
    right[incoming] <- pool[length(left) + seq_along(incoming)]
    names(tbl) <- right
    joined <- merge(joined, tbl, by = key, all = TRUE)
  }
  joined
}
ying-ge/ggEnrich documentation built on Nov. 24, 2019, 12:34 p.m.