# Install Package: Ctrl + Shift + B
# devtools::document()
#' Get identifiers from a differential expression table
#'
#' Imports:
#' dplyr,
#' rlang
#'
#' @param direction string, either "up" or "dn" (down), indicating that only rows with a positive or negative log2FC should be kept
#' @param set string, indicating the separator used in the file
#' @param col_ID string, the name of the column that should be used as an identifier
#' @param col_log2FC string, the name of the column that should be used as the log2FC column
#' @param col_padj string, the name of the column that should be used as the adjusted-p-value/FDR/qvalue column
#' @param path string, full path to the file, including the file name
#' @param cutP number, p value cutoff. Rows with a number in col_padj higher than this number will be filtered out
#' @param cutFC number, log2FC cutoff. Rows with a number in col_log2FC lower than this number wil be filtered out
#' @return a vector of strings (IDs)
#' @examples
#' IDs <- setRead("up", path=myPath)
setRead <- function(direction, path, path2="", sep="\t", col_ID=Symbol, col_log2FC=log2FC, col_padj=padj, cutP=0.05, cutFC=log2(1.5)){
col_ID <- rlang::enquo(col_ID)
col_padj <- rlang::enquo(col_padj)
col_log2FC <- rlang::enquo(col_log2FC)
df <- data.table::fread(paste0(path2,path), sep=sep, header=T, stringsAsFactors=F)
df <- dplyr::rename(df, "ID"=quo_name(col_ID), "log2FC"=quo_name(col_log2FC), "padj"=quo_name(col_padj))
df <- switch(direction,
"up" = subset(df, padj <= cutP & log2FC > cutFC),
"dn" = subset(df, padj <= cutP & log2FC < -cutFC),
"down"= subset(df, padj <= cutP & log2FC < -cutFC)
)
df <- unique( na.omit( as.character( df$ID ) ) )
df <- gsub("\\.[1-9]*$","",df)
df <- gsub(" ", "", df)
df <- unique(df)
print(paste0("number of entries before translation: ",length(df)))
df
}
#' Create a gene set
#'
#' Imports:
#' GSEABase,
#' AnnotationDbi,
#' limma
#'
#' @param name string, give the set a short descriptive name
#' @param IDs vector of identifiers (strings or integers)
#' @param species string, either "Homo sapiens" or "Mus musculus"
#' @param translate string, translate IDs to EntrezIDs. Must be avlid for mapIds (e.g. "SYMBOL" or "ENSEMBL")
#' @param pmids string, Pubmed ID
#' @param desc1 string, e.g. GSE-ID, Material, Comparison
#' @param desc2 string, e.g. Experiment (RNA-Seq, Microarray etc.), Acquisition (was the set given in the supplement or calculated by yourself, what parameters were used?)
#' @return Vector of human Entrez IDs (or, if matrix=T, a matrix of mouse and human Entrez IDs)
#' @examples
#' IDs <- setRead("up", path=myPath)
#' set <- setMake("mySet", IDs, "Homo sapiens")
setMake <- function(name, IDs, species, translate=NA, pmids="", desc1="", desc2=""){
IDs <- as.character(na.omit(as.vector(unique(IDs))))
if(!is.na(translate)){
#== RECOVER OLD SYMBOL NAMES ==#
if(translate %in% "SYMBOL"){# also possibble: limma::alias2Symbol(IDs, species="Hs")
IDs2 <- as.vector(switch(species,
"Homo sapiens"=AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, keys=IDs, column="ENTREZID", keytype="ALIAS", multiVals="first"),
"Mus musculus"=AnnotationDbi::mapIds(org.Mm.eg.db::org.Mm.eg.db, keys=IDs, column="ENTREZID", keytype="ALIAS", multiVals="first")))
IDs2 <- na.omit(IDs2)
}else{IDs2 <- NA}
IDs <- c(IDs2, as.vector(switch(species,
"Homo sapiens"=AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,keys=IDs,column="ENTREZID",keytype=translate, multiVals="first"),
"Mus musculus"=AnnotationDbi::mapIds(org.Mm.eg.db::org.Mm.eg.db,keys=IDs,column="ENTREZID",keytype=translate, multiVals="first")))
)}
IDs <- na.omit(unlist(unique(na.omit(IDs))))
print(paste0("number of entries after translation: ",length(IDs)))
GSEABase::GeneSet(setName=name, geneIds=unique(IDs), organism=species,
creationDate=date(),
shortDescription=desc1,
longDescription=desc2,
pubMedIds=pmids,
geneIdType=GSEABase::EntrezIdentifier())
}
setOldies <- function(Symbols){
df <- gsub("\\.[1-9]*$","",Symbols)
df <- gsub(" ", "", df)
}
#' Create a gene set
#'
#' Imports:
#' GSEABase,
#' AnnotationDbi,
#' limma
#'
#' @param Sets list of genesets
#' @param Folder string, complete folder path
#' @param namePrefix string, some prefix that should make up the first part of the name
#' @return Vector of human Entrez IDs (or, if matrix=T, a matrix of mouse and human Entrez IDs)
#' @examples
#' IDs1 <- setRead("up", path=myPath1)
#' IDs2 <- setRead("up", path=myPath2)
#' set1 <- setMake("mySet", IDs1, "Homo sapiens")
#' set2 <- setMake("mySet", IDs2, "Homo sapiens")
#' setSave(list(set1, set2), myFolder)
setSave <- function(Sets, Folder, namePrefix="geneset"){
# create a parallel list with booleans (human=T, mouse=F) that has the same structure
isHuman <- lapply(Sets, function(x) GSEABase::setName(x))
isHuman <- unlist(isHuman)
isHuman <- ifelse(grepl("\\[H\\]|\\(H\\)\\]", isHuman),T,F)
# create two lists with only EntrezIds converted to human and mouse
allHuman <- Sets
allMouse <- Sets
for(i in seq(Sets)){
if(isHuman[[i]]){
GSEABase::geneIds(allMouse[[i]]) <- homologene::human2mouse(GSEABase::geneIds(Sets[[i]]))$mouseID
GSEABase::geneIds(allMouse[[i]]) <- as.character( unique( GSEABase::geneIds( allMouse[[i]] ) ) )
}else{
GSEABase::geneIds(allHuman[[i]]) <- homologene::mouse2human(GSEABase::geneIds(Sets[[i]]))$humanID
GSEABase::geneIds(allHuman[[i]]) <- as.character( unique( GSEABase::geneIds( allHuman[[i]] ) ) )
}}
# convert the lists to GeneSetCollections: the mixedID, humanID and mouseID lists
coll <- GSEABase::GeneSetCollection(Sets)
coll_hs <- GSEABase::GeneSetCollection(allHuman)
coll_mm <- GSEABase::GeneSetCollection(allMouse)
# save GeneSetCollections as gmt files
GSEABase::toGmt(coll, file.path(Folder, paste0(namePrefix, "_mixedIDs.gmt")))
GSEABase::toGmt(coll, file.path(Folder, paste0(namePrefix, "_humanIDs.gmt")))
GSEABase::toGmt(coll, file.path(Folder, paste0(namePrefix, "_mouseIDs.gmt")))
}
#' Load sets from a .gmt file
#'
#' #' Imports:
#' qusage
#'
#' @param File string, path to the .gmt file that contains the sets; full file path, including folders
#' @return a list where each item is a vector of identifiers that make up the set, and the item name is the set name
#' @examples
#' File <- string, full file path, including folders
setLoad <- function(File="~/bioinformatics/gene_sets/NH_literature_allHuman.gmt"){
# read in the file as a list of sets
sets <- qusage::read.gmt(File)
# give each list item its proper name
set_names <- sapply(sets, function(x) deparse(substitute(x)))
names(sets) <- names(set_names)
sets
}
#' Get the overlap between sets
#'
#' Imports:
#' data.table
#' purrr
#' ggplotify
#' reshape2
#' AnnotationDbi
#' org.Hs.eg.db
#' org.Mm.eg.db
#' plotly
#'
#' @param sets list, each item (i.e. a set) is a vector of strings
#' @param minOverlap integer, indicating the minimum number of sets in which genes should show up
#' @param Species string, either "Homo sapiens" or "Mus musculus", indicating the species the set items refer to
#' @param IDfrom string, anything valid in mapIds (e.g. 'ENTREZID' or 'SYMBOL'), indicating the input ID type of set items
#' @param IDto string, anything valid in mapIds (e.g. 'ENTREZID' or 'SYMBOL'), indicating the output ID type of set items
#' @param HMstats boolean, should overlap percentage be shown in the tiles?
#' @param HMlabels boolean, should x and y axis have text, i.e. sample names
#' @param plotly boolean, should a plotly graph be shown instead of ggplot?
#' @return a list: two ggplot objects (venn & heatmap), a vector of overlapping genes and a boolean table with all genes
#' @examples
#' IDs1 <- setRead("up", path=myPath1)
#' IDs2 <- setRead("up", path=myPath2)
#' set1 <- setMake("mySet", IDs1, "Homo sapiens")
#' set2 <- setMake("mySet", IDs2, "Homo sapiens")
#' setSave(list(set1, set2), myFolder)
setShared <- function(sets, minOverlap=2, Species=NA, IDfrom="SYMBOL", IDto="SYMBOL", HMstats=T, HMlabels=24, plotly=F, mylabels=NA, venntextsize1=10, venntextsize2=10){
#==create venn compatible boolean table==#
#if not present, set setnames
if(is.null(names(sets))) names(sets) <- paste0("set",seq(length(sets)))
#every list item is converted from vector to data.frame
sets2 <- mapply(function(x,y) `colnames<-`(data.frame(x, T), c("genes",y)), x=sets, y=names(sets), SIMPLIFY=F)
#combine all data.frames
sets2 <- purrr::reduce(sets2, dplyr::full_join, by="genes")
geneIDs <- as.character(sets2$genes)
#==optionally, translate gene IDs==#
if(!IDfrom %in% IDto){
if(is.na(Species)) stop("No species defined. Set parameter 'Species' as either 'human' or 'mouse'.")
geneIDs <- switch(Species,
"Homo sapiens" = AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,keys=geneIDs,column=IDto,keytype=IDfrom,multiVals="first"),
"Mus musculus" = AnnotationDbi::mapIds(org.Mm.eg.db::org.Mm.eg.db,keys=geneIDs,column=IDto,keytype=IDfrom,multiVals="first")
)
}
#==complete table==#
rownames(sets2) <- geneIDs
sets2 <- sets2[-1]
#convert NAs to FALSE (not in set)
sets2 <- apply(sets2, 2, function(x) ifelse(is.na(x),F,T))
#==Venn==#
eulerr::eulerr_options(pointsize=list(cex=venntextsize1))
eulerr::eulerr_options(labels=list(cex=venntextsize2))
if(is.na(mylabels)) mylabels <- names(sets)
if(any(ncol(sets2)==3:5)){
venn <- ggplotify:: as.ggplot(plot(eulerr::venn(sets2), quantities = TRUE, labels=mylabels))
}else if(ncol(sets2)==2){
venn <- ggplotify:: as.ggplot(plot(eulerr::euler(sets2), quantities = TRUE, labels=mylabels))
}else{venn <- NA}
#==heatmap==#
hm <- apply( sets2, 2, function(x) apply( sets2[x==T,], 2, sum)/sum(x) )
hm <- reshape2::melt(hm)
hm$size <- rep(colSums(sets2), each=ncol(sets2))
hm$hits <- hm$size*hm$value
if(plotly){
heatmap <- plotly::plot_ly(data=hm, x=~as.character(Var1), y=~as.character(Var2), z=~value, type="heatmap")
heatmap <- plotly::add_trace(heatmap, text=~paste0("size: ", size,", hits: ", hits), showlegend = F)
}else{
heatmap <- ggplot2::ggplot(hm, ggplot2::aes(x=Var1, y=Var2, fill=value)) +
ggplot2::geom_tile() +
ggplot2::scale_fill_gradientn(limits=c(0,1), breaks=c(0,0.25,0.5,0.75,1), colors=c("white","yellow","orange","red","black")) +
ggplot2::scale_x_discrete(expand=c(0,0)) +
ggplot2::scale_y_discrete(expand=c(0,0)) +
ggplot2::theme(axis.title=ggplot2::element_blank(), axis.text=ggplot2::element_text(size=HMlabels), axis.text.x=element_text(angle=45, hjust=1))
if(HMstats) heatmap <- heatmap + ggplot2::geom_text(ggplot2::aes(label=paste0(round(value,2),"\n(",hits,"/",size,")")), fontface="bold", color="blue")
}
#==overlap==#
overlap <- as.character(rownames(subset(sets2, rowSums(sets2)>=minOverlap)))
#==output==#
print(overlap)
print(heatmap)
print(venn)
list(heatmap=heatmap, venn=venn, overlap=overlap, overlapTable=sets2)
}
# GeneOverlap::newGeneOverlap(sets[["MLL-AF9 CB my, down [H](Horton2013)"]], int) %>% GeneOverlap::testGeneOverlap()
#' Get the overlap between a list of genes and a list of gene sets
#'
#' @param genes list or vector of strings, each is a gene identifier
#' @param sets list of vectors (gene sets) with EntrezIDs as gene identifiers
#' @param IDtype type of ID that is used in the input parameter 'genes' (anything used in mapIds, e.g. "ENTREZID" or "SYMBOL")
#' @param dropRows boolean, indicating whether rows without any matches should be filtered out
#' @return a vector of strings. Each string is present in at least x sets, where x is minOverlap
#' @examples
#' IDs1 <- setRead("up", path=myPath1)
#' IDs2 <- setRead("up", path=myPath2)
#' set1 <- setMake("mySet", IDs1, "Homo sapiens")
#' set2 <- setMake("mySet", IDs2, "Homo sapiens")
#' setSave(list(set1, set2), myFolder)
setCheck <- function(genes, sets, IDtype="SYMBOL", dropRows=T){
genes <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,keys=genes,column="ENTREZID",keytype=IDtype, multiVals="first")
df <- lapply(genes, function(x) lapply(sets, function(y) x %in% y))
df <- lapply(df, unlist)
df <- do.call(cbind, df)
if(dropRows){
df <- subset(df, rowSums(df)>0)
}
df
}
setInfo <- function(){
cat("
Organizing own datasets:
setRead |||takes: table |||returns: identifiers |
setMake |||takes: identifiers |||returns: geneset |
setSave |||takes: genesets |||returns: a .gmt file |
setGet |||takes: a .gmt file |||returns: list |
setShared |||takes: list |||returns: overlaping genes |
setCheck |||takes: list & genes |||returns: gene presence info|
---
Access the Cancer Genomics Data Server
setCGbase |||takes: nothing |||returns: cgds object |
setCGstudy |||takes: cgds object |||returns: study ID |
setCGclinic |||takes: cgds & study |||returns: clininc data |
setCGsurvival|||takes: clininc data |||returns: survival data |
---
Access the depmap database
deepFilter |||takes: attribute specifics|||gives: cell line table |
deepResult |||takes: deepFilter output |||gives: data about cell lines|
deepPCA |||takes: deepResults output |||gives: PCA |
---
Vizualize in networks
setNet |||takes: genes & sets |||returns: gene network |
setNetStyle|||takes: setNet output |||returns: enhanced network|
setNetGSEA |||takes: seekGSEA output|||returns: GSEA network |
")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.