R/customSets.R

Defines functions setInfo setCheck setShared setLoad setSave setOldies setMake setRead

Documented in setCheck setLoad setMake setRead setSave setShared

# Install Package: Ctrl + Shift + B
# devtools::document()

#' Get identifiers from a differential expression table
#'
#' Imports:
#' dplyr,
#' rlang
#'
#' @param direction string, either "up" or "dn" (down), indicating that only rows with a positive or negative log2FC should be kept
#' @param set string, indicating the separator used in the file
#' @param col_ID string, the name of the column that should be used as an identifier
#' @param col_log2FC string, the name of the column that should be used as the log2FC column
#' @param col_padj string, the name of the column that should be used as the adjusted-p-value/FDR/qvalue column
#' @param path string, full path to the file, including the file name
#' @param cutP number, p value cutoff. Rows with a number in col_padj higher than this number will be filtered out
#' @param cutFC number, log2FC cutoff. Rows with a number in col_log2FC lower than this number wil be filtered out
#' @return a vector of strings (IDs)
#' @examples
#' IDs <- setRead("up", path=myPath)
setRead <- function(direction, path, path2="", sep="\t", col_ID=Symbol, col_log2FC=log2FC, col_padj=padj, cutP=0.05, cutFC=log2(1.5)){
  col_ID <- rlang::enquo(col_ID)
  col_padj <- rlang::enquo(col_padj)
  col_log2FC <- rlang::enquo(col_log2FC)

  df <- data.table::fread(paste0(path2,path), sep=sep, header=T, stringsAsFactors=F)
  df <- dplyr::rename(df, "ID"=quo_name(col_ID), "log2FC"=quo_name(col_log2FC), "padj"=quo_name(col_padj))
  df <- switch(direction,
               "up"  = subset(df, padj <= cutP & log2FC >  cutFC),
               "dn"  = subset(df, padj <= cutP & log2FC < -cutFC),
               "down"= subset(df, padj <= cutP & log2FC < -cutFC)
  )
  df <- unique( na.omit( as.character( df$ID ) ) )
  df <- gsub("\\.[1-9]*$","",df)
  df <- gsub(" ", "", df)
  df <- unique(df)
  print(paste0("number of entries before translation: ",length(df)))
  df
}

#' Create a gene set
#'
#' Imports:
#' GSEABase,
#' AnnotationDbi,
#' limma
#'
#' @param name string, give the set a short descriptive name
#' @param IDs vector of identifiers (strings or integers)
#' @param species string, either "Homo sapiens" or "Mus musculus"
#' @param translate string, translate IDs to EntrezIDs. Must be avlid for mapIds (e.g. "SYMBOL" or "ENSEMBL")
#' @param pmids string, Pubmed ID
#' @param desc1 string, e.g. GSE-ID, Material, Comparison
#' @param desc2 string, e.g. Experiment (RNA-Seq, Microarray etc.), Acquisition (was the set given in the supplement or calculated by yourself, what parameters were used?)
#' @return Vector of human Entrez IDs (or, if matrix=T, a matrix of mouse and human Entrez IDs)
#' @examples
#' IDs <- setRead("up", path=myPath)
#' set <- setMake("mySet", IDs, "Homo sapiens")
setMake <- function(name, IDs, species, translate=NA, pmids="", desc1="", desc2=""){
  IDs <- as.character(na.omit(as.vector(unique(IDs))))
  if(!is.na(translate)){
    #== RECOVER OLD SYMBOL NAMES ==#
    if(translate %in% "SYMBOL"){# also possibble: limma::alias2Symbol(IDs, species="Hs")
      IDs2 <- as.vector(switch(species,
                               "Homo sapiens"=AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, keys=IDs, column="ENTREZID", keytype="ALIAS", multiVals="first"),
                               "Mus musculus"=AnnotationDbi::mapIds(org.Mm.eg.db::org.Mm.eg.db, keys=IDs, column="ENTREZID", keytype="ALIAS", multiVals="first")))
      IDs2 <- na.omit(IDs2)
    }else{IDs2 <- NA}

    IDs <- c(IDs2, as.vector(switch(species,
                            "Homo sapiens"=AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,keys=IDs,column="ENTREZID",keytype=translate, multiVals="first"),
                            "Mus musculus"=AnnotationDbi::mapIds(org.Mm.eg.db::org.Mm.eg.db,keys=IDs,column="ENTREZID",keytype=translate, multiVals="first")))
  )}
  IDs <- na.omit(unlist(unique(na.omit(IDs))))
  print(paste0("number of entries after translation: ",length(IDs)))
  GSEABase::GeneSet(setName=name, geneIds=unique(IDs), organism=species,
                    creationDate=date(),
                    shortDescription=desc1,
                    longDescription=desc2,
                    pubMedIds=pmids,
                    geneIdType=GSEABase::EntrezIdentifier())
}

setOldies <- function(Symbols){
  df <- gsub("\\.[1-9]*$","",Symbols)
  df <- gsub(" ", "", df)
}

#' Create a gene set
#'
#' Imports:
#' GSEABase,
#' AnnotationDbi,
#' limma
#'
#' @param Sets list of genesets
#' @param Folder string, complete folder path
#' @param namePrefix string, some prefix that should make up the first part of the name
#' @return Vector of human Entrez IDs (or, if matrix=T, a matrix of mouse and human Entrez IDs)
#' @examples
#' IDs1 <- setRead("up", path=myPath1)
#' IDs2 <- setRead("up", path=myPath2)
#' set1 <- setMake("mySet", IDs1, "Homo sapiens")
#' set2 <- setMake("mySet", IDs2, "Homo sapiens")
#' setSave(list(set1, set2), myFolder)
setSave <- function(Sets, Folder, namePrefix="geneset"){
  # create a parallel list with booleans (human=T, mouse=F) that has the same structure
  isHuman <- lapply(Sets, function(x) GSEABase::setName(x))
  isHuman <- unlist(isHuman)
  isHuman <- ifelse(grepl("\\[H\\]|\\(H\\)\\]", isHuman),T,F)

  # create two lists with only EntrezIds converted to human and mouse
  allHuman <- Sets
  allMouse <- Sets
  for(i in seq(Sets)){
    if(isHuman[[i]]){
      GSEABase::geneIds(allMouse[[i]]) <- homologene::human2mouse(GSEABase::geneIds(Sets[[i]]))$mouseID
      GSEABase::geneIds(allMouse[[i]]) <- as.character( unique( GSEABase::geneIds( allMouse[[i]] ) ) )
    }else{
      GSEABase::geneIds(allHuman[[i]]) <- homologene::mouse2human(GSEABase::geneIds(Sets[[i]]))$humanID
      GSEABase::geneIds(allHuman[[i]]) <- as.character( unique( GSEABase::geneIds( allHuman[[i]] ) ) )
    }}

  # convert the lists to GeneSetCollections: the mixedID, humanID and mouseID lists
  coll <- GSEABase::GeneSetCollection(Sets)
  coll_hs <- GSEABase::GeneSetCollection(allHuman)
  coll_mm <- GSEABase::GeneSetCollection(allMouse)

  # save GeneSetCollections as gmt files
  GSEABase::toGmt(coll, file.path(Folder, paste0(namePrefix, "_mixedIDs.gmt")))
  GSEABase::toGmt(coll, file.path(Folder, paste0(namePrefix, "_humanIDs.gmt")))
  GSEABase::toGmt(coll, file.path(Folder, paste0(namePrefix, "_mouseIDs.gmt")))
}

#' Load sets from a .gmt file
#'
#' #' Imports:
#' qusage
#'
#' @param File string, path to the .gmt file that contains the sets; full file path, including folders
#' @return a list where each item is a vector of identifiers that make up the set, and the item name is the set name
#' @examples
#' File <- string, full file path, including folders
setLoad <- function(File="~/bioinformatics/gene_sets/NH_literature_allHuman.gmt"){
  # read in the file as a list of sets
  sets <- qusage::read.gmt(File)

  # give each list item its proper name
  set_names <- sapply(sets, function(x) deparse(substitute(x)))
  names(sets) <- names(set_names)
  sets
}

#' Get the overlap between sets
#'
#' Imports:
#' data.table
#' purrr
#' ggplotify
#' reshape2
#' AnnotationDbi
#' org.Hs.eg.db
#' org.Mm.eg.db
#' plotly
#'
#' @param sets list, each item (i.e. a set) is a vector of strings
#' @param minOverlap integer, indicating the minimum number of sets in which genes should show up
#' @param Species string, either "Homo sapiens" or "Mus musculus", indicating the species the set items refer to
#' @param IDfrom string, anything valid in mapIds (e.g. 'ENTREZID' or 'SYMBOL'), indicating the input ID type of set items
#' @param IDto  string, anything valid in mapIds (e.g. 'ENTREZID' or 'SYMBOL'), indicating the output ID type of set items
#' @param HMstats boolean, should overlap percentage be shown in the tiles?
#' @param HMlabels boolean, should x and y axis have text, i.e. sample names
#' @param plotly boolean, should a plotly graph be shown instead of ggplot?
#' @return a list: two ggplot objects (venn & heatmap), a vector of overlapping genes and a boolean table with all genes
#' @examples
#' IDs1 <- setRead("up", path=myPath1)
#' IDs2 <- setRead("up", path=myPath2)
#' set1 <- setMake("mySet", IDs1, "Homo sapiens")
#' set2 <- setMake("mySet", IDs2, "Homo sapiens")
#' setSave(list(set1, set2), myFolder)
setShared <- function(sets, minOverlap=2, Species=NA, IDfrom="SYMBOL", IDto="SYMBOL", HMstats=T, HMlabels=24, plotly=F, mylabels=NA, venntextsize1=10, venntextsize2=10){

  #==create venn compatible boolean table==#
  #if not present, set setnames
  if(is.null(names(sets))) names(sets) <- paste0("set",seq(length(sets)))
  #every list item is converted from vector to data.frame
  sets2 <- mapply(function(x,y) `colnames<-`(data.frame(x, T), c("genes",y)), x=sets, y=names(sets), SIMPLIFY=F)
  #combine all data.frames
  sets2 <- purrr::reduce(sets2, dplyr::full_join, by="genes")
  geneIDs <- as.character(sets2$genes)

  #==optionally, translate gene IDs==#
  if(!IDfrom %in% IDto){
    if(is.na(Species)) stop("No species defined. Set parameter 'Species' as either 'human' or 'mouse'.")
    geneIDs <- switch(Species,
                      "Homo sapiens" = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,keys=geneIDs,column=IDto,keytype=IDfrom,multiVals="first"),
                      "Mus musculus" = AnnotationDbi::mapIds(org.Mm.eg.db::org.Mm.eg.db,keys=geneIDs,column=IDto,keytype=IDfrom,multiVals="first")
    )
  }

  #==complete table==#
  rownames(sets2) <- geneIDs
  sets2 <- sets2[-1]
  #convert NAs to FALSE (not in set)
  sets2 <- apply(sets2, 2, function(x) ifelse(is.na(x),F,T))

  #==Venn==#
  eulerr::eulerr_options(pointsize=list(cex=venntextsize1))
  eulerr::eulerr_options(labels=list(cex=venntextsize2))
  if(is.na(mylabels)) mylabels <- names(sets)
  if(any(ncol(sets2)==3:5)){
    venn <- ggplotify:: as.ggplot(plot(eulerr::venn(sets2), quantities = TRUE, labels=mylabels))
  }else if(ncol(sets2)==2){
    venn <- ggplotify:: as.ggplot(plot(eulerr::euler(sets2), quantities = TRUE, labels=mylabels))
  }else{venn <- NA}

  #==heatmap==#
  hm <- apply( sets2, 2, function(x) apply( sets2[x==T,], 2, sum)/sum(x) )
  hm <- reshape2::melt(hm)
  hm$size <- rep(colSums(sets2), each=ncol(sets2))
  hm$hits <- hm$size*hm$value

  if(plotly){
    heatmap <- plotly::plot_ly(data=hm, x=~as.character(Var1), y=~as.character(Var2), z=~value, type="heatmap")
    heatmap <- plotly::add_trace(heatmap, text=~paste0("size: ", size,", hits: ", hits), showlegend = F)
  }else{
    heatmap <- ggplot2::ggplot(hm, ggplot2::aes(x=Var1, y=Var2, fill=value)) +
      ggplot2::geom_tile() +
      ggplot2::scale_fill_gradientn(limits=c(0,1), breaks=c(0,0.25,0.5,0.75,1), colors=c("white","yellow","orange","red","black")) +
      ggplot2::scale_x_discrete(expand=c(0,0)) +
      ggplot2::scale_y_discrete(expand=c(0,0)) +
      ggplot2::theme(axis.title=ggplot2::element_blank(), axis.text=ggplot2::element_text(size=HMlabels), axis.text.x=element_text(angle=45, hjust=1))
    if(HMstats) heatmap <- heatmap + ggplot2::geom_text(ggplot2::aes(label=paste0(round(value,2),"\n(",hits,"/",size,")")), fontface="bold", color="blue")

  }

  #==overlap==#
  overlap <- as.character(rownames(subset(sets2, rowSums(sets2)>=minOverlap)))

  #==output==#
  print(overlap)
  print(heatmap)
  print(venn)
  list(heatmap=heatmap, venn=venn, overlap=overlap, overlapTable=sets2)
}


# GeneOverlap::newGeneOverlap(sets[["MLL-AF9 CB my, down [H](Horton2013)"]], int) %>% GeneOverlap::testGeneOverlap()

#' Get the overlap between a list of genes and a list of gene sets
#'
#' @param genes list or vector of strings, each is a gene identifier
#' @param sets list of vectors (gene sets) with EntrezIDs as gene identifiers
#' @param IDtype type of ID that is used in the input parameter 'genes' (anything used in mapIds, e.g. "ENTREZID" or "SYMBOL")
#' @param dropRows boolean, indicating whether rows without any matches should be filtered out
#' @return a vector of strings. Each string is present in at least x sets, where x is minOverlap
#' @examples
#' IDs1 <- setRead("up", path=myPath1)
#' IDs2 <- setRead("up", path=myPath2)
#' set1 <- setMake("mySet", IDs1, "Homo sapiens")
#' set2 <- setMake("mySet", IDs2, "Homo sapiens")
#' setSave(list(set1, set2), myFolder)
setCheck <- function(genes, sets, IDtype="SYMBOL", dropRows=T){
  genes <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,keys=genes,column="ENTREZID",keytype=IDtype, multiVals="first")
  df <- lapply(genes, function(x) lapply(sets, function(y) x %in% y))
  df <- lapply(df, unlist)
  df <- do.call(cbind, df)
  if(dropRows){
    df <- subset(df, rowSums(df)>0)
  }
  df
}

setInfo <- function(){
  cat("
  Organizing own datasets:
  setRead   |||takes: table        |||returns: identifiers       |
  setMake   |||takes: identifiers  |||returns: geneset           |
  setSave   |||takes: genesets     |||returns: a .gmt file       |
  setGet    |||takes: a .gmt file  |||returns: list              |
  setShared |||takes: list         |||returns: overlaping genes  |
  setCheck  |||takes: list & genes |||returns: gene presence info|
  ---
  Access the Cancer Genomics Data Server
  setCGbase    |||takes: nothing      |||returns: cgds object    |
  setCGstudy   |||takes: cgds object  |||returns: study ID       |
  setCGclinic  |||takes: cgds & study |||returns: clininc data   |
  setCGsurvival|||takes: clininc data |||returns: survival data  |
  ---
  Access the depmap database
  deepFilter |||takes: attribute specifics|||gives: cell line table      |
  deepResult |||takes: deepFilter output  |||gives: data about cell lines|
  deepPCA    |||takes: deepResults output |||gives: PCA                  |
  ---
  Vizualize in networks
  setNet     |||takes: genes & sets   |||returns: gene network    |
  setNetStyle|||takes: setNet output  |||returns: enhanced network|
  setNetGSEA |||takes: seekGSEA output|||returns: GSEA network    |
      ")
}
Solatar/setR documentation built on Dec. 5, 2020, 10:50 p.m.