Nothing
#' makeGeneTable
#'
#' This function obtains genewise p-values,
#' by representing each gene with the smallest p-value among its features,
#' and then determines genes status as significant or not.
#'
#' @param feature.table An \code{featureBasedData} object.
#' @param sig.threshold Significance threshold used to determine whether the gene is significant
#' or not
#' @param stat The statistic used to select significant genes. Options are 'pvalue' or 'fdr'
#'
#' @return Returns a genewised table with several variables (columns)
#' \item{geneID}{Gene identifiers in ensembl gene IDs}
#' \item{geneWisePvalue}{each gene is represented by the smallest p-value among its features}
#' \item{numFeature}{number of gene features within the gene}
#' \item{fdr}{false discovery rate for genewisePvalue}
#' \item{sig.gene}{a gene is significant (1) or not (0)}
#'
#' @examples
#' data(featureBasedData)
#' gene.based.table <- makeGeneTable(featureBasedData)
#' @export
#'
#'
makeGeneTable <- function(feature.table, sig.threshold = 0.05, stat = "pvalue")
{
min.pval <- aggregate(pvalue ~ geneID, data = feature.table, FUN = min)
n.feature <- as.data.frame(table(feature.table$geneID))
both <- merge(x = min.pval, y = n.feature, by.x = "geneID", by.y = "Var1")
both$fdr <- p.adjust(both$pvalue, method = "fdr")
if (stat == "pvalue")
{
both$sig.gene <- ifelse(both$pvalue < sig.threshold, 1, 0)
}
if (stat == "fdr")
{
both$sig.gene <- ifelse(both$fdr < sig.threshold, 1, 0)
}
names(both) <- sub("pvalue", "geneWisePvalue", names(both))
names(both) <- sub("Freq", "numFeature", names(both))
both <- splitGeneCluster(both)
return(both)
}
#' lrTestBias
#'
#' This function tests presence of selection bias using logistic regression, and produces a boxplot
#' that compares distributions of bias factors (e.g. number of exons) for significant genes and non-significant genes.
#'
#' @param genewise.table A dataframe with genewise p-value for each gene, returned from \code{makeGeneTable()}
#' @param boxplot.width width of boxplot
#'
#' @details To determine presentce of selection bias, we fit the following logistic regression model:
#'
#' \code{Pr(a gene is significant) ~ number of features within the gene}
#'
#' Here features refer to exon bins or splicing junction bins, depending on
#' how genewise pvalues were obtained in the \code{genewise.table}
#'
#'
#' @return Nothing to be returned
#'
#' @examples
#' gene.based.table <- makeGeneTable(featureBasedData)
#' lrTestBias(gene.based.table)
#' @export
#'
lrTestBias <- function(genewise.table, boxplot.width = 0.1)
{
mydata <- genewise.table
DE <- ifelse(mydata$sig.gene == 1, 1, 0)
mydata.2 <- cbind(mydata, DE)
if (var(mydata.2$numFeature) != 0)
{
mylogit.2 <- glm(formula = DE ~ as.numeric(numFeature), data = mydata.2, family = binomial(link = "logit"))
pvalue <- coef(summary(mylogit.2))[2,4]
p.value <- formatC(pvalue, format = "e", digits = 2)
print (paste0("P-value from logistic regression is ", p.value))
a <- boxplot(mydata.2$numFeature ~ mydata.2$DE, boxwex = boxplot.width, ylab = "Number of features",
col = "lightgray", ylim = c(min(mydata.2$numFeature), max(mydata.2$numFeature)),
names = c("non-significant genes", "significant genes"))
text(length(a$names)-1, max(mydata.2$numFeature)-5 , paste("P Value =",p.value,sep=" "))
} else
{
cat("There are no variations on the number of features\n")
}
}
#' runPathwaySplice
#'
#' This function identifies pathways that are enriched with signficant genes, while accounting for
#' different number of gene features (e.g. exons) associated with each gene
#'
#' @param genewise.table data frame returned from function \code{makeGeneTable}
#' @param genome Genome to be used, options are 'hg19' or 'mm10'
#' @param id GeneID, options are 'entrezgene' or 'ensembl_gene_id'
#' @param gene2cat Get sets to be tested, these are defined by users, can be obtained from \code{gmtGene2Cat} function
#' @param test.cats Default gene ontology gene sets to be tested if \code{gene2cat} is not defined
#' @param go.size.limit Size limit of the gene sets to be tested
#' @param method the method used to calculate pathway enrichment p value.
#' Options are 'Wallenius', 'Sampling', and 'Hypergeometric'
#' @param repcnt Number of random samples to be calculated when 'Sampling' is used, this argument
#' ignored unless \code{method='Sampling'}
#' @param use.genes.without.cat Whether genes not mapped to any gene_set tested are included in the analysis.
#' Default is set to FALSE, where genes not mapped to any tested categories are ignored in analysis.
#' Set this option to TRUE if it's desired that all genes in \code{genewise.table} to be counted towards
#' the total number of genes outside the category.
#' @param binsize The number of genes in each gene bin in the bias plot
#' @param output.file File name for the analysis result in .csv format.
#'
#' @details This function implements the methodology described in Young et al. (2011) to adjust for
#' different number of gene features (column \code{numFeature} in \code{gene.based.table}).
#' For example, gene features can be non-overlapping exon counting bins associated with each gene (Fig 1 in Anders et al. 2012).
#' In the bias plot, the genes are grouped by \code{numFeature} in \code{genewise.table} into gene bins,
#' the proportions of signficant genes are then plotted against the gene bins.
#'
#' @return runPathwaySplice returns a \href{https://cran.r-project.org/package=dplyr}{tibble} with the following information:
#' \item{gene_set}{Name of the gene set. Note in this document we used the terms gene_set, category,
#' and pathway interchangeably}
#' \item{over_represented_pvalue}{P-vaue for the associated gene_set being over-represented among significant genes}
#' \item{under_represented_pvalue}{P-vaue for the associated gene_set being under-represented among significant genes}
#' \item{numSIGInCat}{The number of significant genes in the gene_set}
#' \item{numInCat}{The total number of genes in the gene_set}
#' \item{description}{Description of the gene gene_set}
#' \item{ontology}{The domain of the gene ontology terms if GO categories were tested.
#' Go categories can be classified into three domains: cellular component, biological process, molecular function.}
#' \item{SIGgene_ensembl}{Ensembl gene ID of significant genes in the gene_set}
#' \item{SIGgene_symbol}{Gene symbols of signficant genes in the gene_set}
#' \item{Ave_value_all_gene}{The average value for \code{numFeature} for all the genes in the gene_set,
#' note that \code{numFeature} is the bias factor adjusted by PathwaySplice}
#'
#'These information are also saved in the file \code{output.file}
#'
#' @references Young MD, Wakefield MJ, Smyth GK, Oshlack A (2011) \emph{Gene ontology analysis for RNA-seq:
#' accounting for selection bias}. Genome Biology 11:R14
#'
#' Anders S, Reyes A, Huber W (2012) \emph{Dececting differential usage of exons from RNA-seq data.}
#' Genome Research 22(10): 2008-2017
#'
#' @examples
#' gene.based.table <- makeGeneTable(featureBasedData)
#'
#' res <- runPathwaySplice(gene.based.table,genome='hg19',id='ensGene',
#' test.cats=c('GO:BP'),
#' go.size.limit=c(5,30),
#' method='Wallenius',binsize=20)
#' \dontrun{
#'
#' # demonstrate how output file can be specified
#' res <- runPathwaySplice(gene.based.table,genome='hg19',id='ensGene',
#' test.cats=c('GO:BP'),
#' go.size.limit=c(5,30),
#' method='Wallenius',binsize=800,
#' output.file=tempfile())
#'
#'# demonstrate using customized gene sets
#' dir.name <- system.file('extdata', package='PathwaySplice')
#' hallmark.local.pathways <- file.path(dir.name,'h.all.v6.0.symbols.gmt.txt')
#' hlp <- gmtGene2Cat(hallmark.local.pathways, genomeID='hg19')
#'
#' res <- runPathwaySplice(gene.based.table,genome='hg19',id='ensGene',
#' gene2cat=hlp,
#' go.size.limit=c(5,200),
#' method='Wallenius',binsize=20,
#' output.file=tempfile())
#'
#' }
#'
#' @export
runPathwaySplice <- function(genewise.table, genome, id, gene2cat = NULL, test.cats = c("GO:CC",
"GO:BP", "GO:MF"), go.size.limit = c(10, 200), method = "Wallenius", repcnt = 2000,
use.genes.without.cat = FALSE, binsize = "auto", output.file = tempfile())
{
x <- genewise.table$sig.gene
names(x) <- genewise.table$geneID
pwf <- nullpSplice(x, genome, id, bias.data = genewise.table$numFeature, plot.fit = TRUE,
binsize)
CatDE <- pathwaysplice(pwf, genome = genome, id = id, gene2cat = gene2cat, test.cats = test.cats,
go.size.limit = go.size.limit, method = method, repcnt = repcnt, use.genes.without.cat = use.genes.without.cat)
res1 <- getStaisitcs4Go(CatDE, genewise.table)
res2 <- reformatPathwayOut(res1)
res3 <- within(res2, rm("Ave_value_DE"))
writeTibble(res3, output.file)
res3
}
#' enrichmentMap
#'
#' This function draws an enrichment map based on the overlap of gene sets
#' as measured by the Jaccard Coefficient(JC)
#'
#' @param pathway.res Pathway analysis results, an object returned by \code{runPathwaySplice}
#' @param n The top \emph{n} most significant gene sets are shown on enrichment map
#' @param fixed If set to FALSE, will invoke tkplot (an interactive graphing facility in R) that allows one
#' to draw an interactive enrichment map. Users can then manually adjust the layout of the enrichment map.
#' Note: on OS X system, users need to have \href{https://www.xquartz.org/index.html}{XQuartz} installed
#' to run this function . tcltk R package is also required, but in most distributions of R tcltk is already included
#' @param node.label.font Font size of node label
#' @param similarity.threshold Gene sets with Jaccard Coefficient > \code{similarity.threshold}
#' will be connected on the enrichment map
#' @param scaling.factor Scaling factor that users can use to adjust the edge thickness of the network,
#' which is based on value of sqrt(JC coefficient * 5) * scaling.factor
#' @param output.file.dir Output files directory, see \code{Details} section below.
#'
#' @param label.node.by.index Options for labeling nodes on network.
#'
#' FALSE indicates to label nodes by gene set names
#'
#' TRUE indicates to label nodes by the index of gene sets
#' @param add.numSIGInCat Option for users to decide whether to add number of signficant genes of each gene set to the nodes in enrichment map or not
#'
#' @param ... Additional parameter
#'
#' @details
#'
#' In the enrichment map,
#' \itemize{
#' \item the \emph{node colors} are controlled by gene set p-values,
#' where smaller p-values correspond to dark red color.
#' \item \emph{node sizes} are controlled by the number of significant genes in gene set.
#' \item \emph{thickness of the edges} correspond to Jaccard similarity coefficient between two gene sets.
#' \item the numbers after ':' indicates the nubmer of significant genes in the gene set.
#' }
#'
#' The Jaccard similarity coefficient ranges from 0 to 1. JC=0 indicates
#' there are no overlapping genes between two gene sets,
#' JC=1 indicates two gene sets are identical.
#'
#' The output directory will include the following files:
#'
#' (1) a network file (in GML format) that can be used as an input for \href{http://www.cytoscape.org/}{Cytoscape} software
#' (2) when label.node.by.index=TRUE, also a gene set information file that includes full names of the gene sets
#' and the gene set indices shown on the network.
#'
#' @return A list with edge and node information used to plot enrichment map
#'
#' @author Aimin created this funciton based on enrichMap function in G Yu's DOSE R package
#'
#' @examples
#'
#' gene.based.table <- makeGeneTable(featureBasedData)
#'
#' res <- runPathwaySplice(gene.based.table,genome='hg19',
#' id='ensGene',test.cats=c('GO:BP'),
#' go.size.limit=c(5,30),method='Wallenius')
#'
#' # labeling each node by gene set name
#' enmap <- enrichmentMap(res,n=10,similarity.threshold=0.3,
#' label.node.by.index = FALSE)
#'
#' # labeling each node by gene set index
#' enmap <- enrichmentMap(res,n=10,similarity.threshold=0.3,
#' label.node.by.index = TRUE)
#'
#' \dontrun{
#' # illustrates specification of output file directory
#' # Enable interactive map and label each node by gene set index
#' enmap <- enrichmentMap(res,n=10,fixed=FALSE, similarity.threshold=0.3,
#' label.node.by.index = TRUE, output.file.dir=tempdir())
#'
#' enmap <- enrichmentMap(res,n=10,similarity.threshold=0.3,
#' label.node.by.index = FALSE, output.file.dir=tempdir())}
#' @export
enrichmentMap <- function(pathway.res, n = 50, fixed = TRUE, node.label.font = 1,
similarity.threshold, scaling.factor = 1, output.file.dir = tempdir(), label.node.by.index = FALSE,add.numSIGInCat=FALSE,
...)
{
if (!dir.exists(output.file.dir))
{
dir.create(output.file.dir, recursive = TRUE)
}
pathway.res <- as.data.frame(pathway.res)
GO.name <- pathway.res$gene_set
temp <- pathway.res$SIGgene_ensembl
names(temp) <- GO.name
x <- pathway.res
geneSets <- temp
y <- as.data.frame(x)
if (any(grep("^GO:", y$gene_set)))
{
if(add.numSIGInCat==TRUE){
nodename <- paste0(y$description, ":", y$numSIGInCat)
}else
{
nodename <- y$description
}
if (label.node.by.index == TRUE)
{
nodename.index <- seq(1, length(nodename))
output.text <- as.data.frame(cbind(nodename.index, nodename))[1:n, ]
nodename <- nodename.index
colnames(output.text) <- c("index", "name")
write.table(output.text, file = file.path(output.file.dir, "enrichmap_GO.xls"),
quote = FALSE, col.names = TRUE, row.names = FALSE, sep = "\t")
}
} else
{
if(add.numSIGInCat==TRUE){
nodename <- paste0(y$description, ":", y$numSIGInCat)
}else
{
nodename <- y$description
}
if (label.node.by.index == TRUE)
{
nodename.index <- seq(1, length(nodename))
output.text <- as.data.frame(cbind(nodename.index, nodename))[1:n, ]
nodename <- nodename.index
colnames(output.text) <- c("index", "name")
write.table(output.text, file = file.path(output.file.dir, "enrichmap_pathway.xls"),
quote = FALSE, col.names = TRUE, row.names = FALSE, sep = "\t")
}
}
if (nrow(y) < n)
{
n <- nrow(y)
}
y <- y[1:n, ]
if (n == 0)
{
stop("no enriched term found...")
} else if (n == 1)
{
g <- graph.empty(0, directed = FALSE)
g <- add_vertices(g, nv = 1)
igraph::V(g)$name <- nodename
igraph::V(g)$color <- "red"
} else
{
pvalue <- as.numeric(y$over_represented_pvalue)
id <- y[, 1]
geneSets <- geneSets[id]
n <- nrow(y)
w <- matrix(NA, nrow = n, ncol = n)
colnames(w) <- rownames(w) <- nodename[1:n]
for (i in 1:n)
{
for (j in i:n)
{
w[i, j] <- overlap_ratio(geneSets[id[i]], geneSets[id[j]])
}
}
wd <- melt(w)
wd <- wd[wd[, 1] != wd[, 2], ]
wd <- wd[!is.na(wd[, 3]), ]
g <- graph.data.frame(wd[, -3], directed = FALSE)
igraph::E(g)$width <- sqrt(wd[, 3] * 5) * scaling.factor
g <- delete.edges(g, igraph::E(g)[wd[, 3] < similarity.threshold])
idx <- unlist(sapply(igraph::V(g)$name, function(x) which(x == nodename[1:n])))
cols <- color_scale("red", "#E5C494")
igraph::V(g)$color <- cols[sapply(pvalue, getIdx, min(pvalue), max(pvalue))]
Edata <- as.data.frame(get.edgelist(g))
Edata$edgewidth <- igraph::E(g)$width
Vdata <- data.frame(pathway = igraph::V(g)$name, color = igraph::V(g)$color)
map_data <- list(edge_data = Edata, node_data = Vdata)
cnt <- as.integer(y$numSIGInCat)
names(cnt) <- nodename[1:n]
cnt2 <- cnt[igraph::V(g)$name]
igraph::V(g)$size <- cnt2/sum(cnt2) * 100
}
netplot(g, node.label.font = node.label.font, node.label.color = "black", fixed = fixed,
...)
write.graph(g, file.path(output.file.dir, "network.layout.for.cytoscape.gml"),
format = "gml")
invisible(g)
re2 <- map_data
return(re2)
}
#' gmtGene2Cat
#'
#' Obtains all pathways associated with a set of genes
#'
#' @param pathway.file Input file for the gene sets in GMT format, must be in \emph{gene symbols}
#' @param gene.anno.file Gene annotation file that facilitate gene id conversions when
#' gene ids in RNA-Seq data and \code{pathway.file} differ. If not specified, \code{gmtGene2cat}
#' relies on gene annotations provided by R package \code{AnnotationHub}.
#'
#'
#' @param genomeID Genome to be used. Options are 'mm10','hg19' or 'hg38'.
#'
#' @details This function reads a gene set file in \url{https://software.broadinstitute.org/cancer/
#' software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29},
#' and returns a list with its name being a gene id, and each element of
#' the list being the pathways associated with the gene. When gene ids in RNA-Seq data differ from those in pathway database,
#' \code{gene.anno.file} facilitate gene id conversions. Users can prepare this file based on the format of the example gene annotation file at
#' \url{https://raw.githubusercontent.com/aiminy/GOSJ/master/data/gene_annotation.txt}
#'
#' @return A list where each entry is named by a gene and contains a vector of all
#' the pathways associated with the gene
#'
#' @examples
#' #using local file for pathways database
#' dir.name <- system.file('extdata', package='PathwaySplice')
#' hallmark.local.pathways <- file.path(dir.name,'h.all.v6.0.symbols.gmt.txt')
#' hlp <- gmtGene2Cat(hallmark.local.pathways, genomeID='hg19')
#'
#' \dontrun{
#' #using url for pathways database linked to a website
#' hallmark.url.pathways <- paste0('https://raw.githubusercontent.com/SCCC-BBC',
#' '/PathwaySplice/development/inst/extdata',
#' '/h.all.v6.0.symbols.gmt.txt')
#' hup <- gmtGene2Cat(hallmark.url.pathways, genomeID='hg19')}
#'
#' @export
gmtGene2Cat <- function(pathway.file, gene.anno.file = NULL, genomeID = c("mm10",
"hg19", "hg38"))
{
gmt_input_file <- pathway.file
gene.2.cat.gmt <- gene2cat2(gmt_input_file)
names.gene.gmt <- as.data.frame(names(gene.2.cat.gmt))
colnames(names.gene.gmt) <- "gene_id"
if (!is.null(gene.anno.file))
{
gene_anno_file <- gene.anno.file
gene.id.conversion <- read.csv(gene_anno_file)
} else
{
gene.id.conversion <- match.arg(genomeID)
}
xxx <- match2Genome(gene.id.conversion)
names.gene.gmt.2 <- match(names.gene.gmt$gene_id, xxx[, 1])
gene.id.conversion.2 <- xxx[names.gene.gmt.2, ]
gene.2.cat.gmt.2 <- gene.2.cat.gmt
names(gene.2.cat.gmt.2) <- gene.id.conversion.2[, 2]
gene.2.cat.gmt.2
}
#' .onAttach
#'
#' @param libname
#' @param pkgname
#'
#' @return Nothing to be returned
#'
#' @examples
#'
#' @export
.onAttach <- function(libname, pkgname)
{
if (.Platform$OS.type == "windows" && .Platform$GUI == "Rgui")
{
winMenuAddItem("Vignettes", "PathwaySplice", "shell.exec(system.file(\"doc\",\"PathwaySplice.pdf\",package=\"PathwaySplice\"))")
}
}
.ORG_PACKAGES = paste("org.", c("Ag.eg", "At.tair", "Bt.eg", "Ce.eg", "Cf.eg", "Dm.eg",
"Dr.eg", "EcK12.eg", "EcSakai.eg", "Gg.eg", "Hs.eg", "Mm.eg", "Mmu.eg", "Pf.plasmo",
"Pt.eg", "Rn.eg", "Sc.sgd", "Ss.eg", "Xl.eg"), sep = "")
names(.ORG_PACKAGES) = c("anoGam", "Arabidopsis", "bosTau", "ce", "canFam", "dm",
"danRer", "E. coli K12", "E. coli Sakai", "galGal", "hg", "mm", "rheMac", "Malaria",
"panTro", "rn", "sacCer", "susScr", "xenTro")
.ID_MAP = c("eg", "eg", "ENSEMBL", "SYMBOL", "sgd", "plasmo", "tair")
names(.ID_MAP) = c("knownGene", "refGene", "ensGene", "geneSymbol", "sgd", "plasmo",
"tair")
.ORG_GOMAP_FUNCTION = c("GO2ALLEGS", "GO2ALLTAIRS", "GO2ALLORFS", "GO2ALLORFS")
names(.ORG_GOMAP_FUNCTION) = c("default", "org.At.tair", "org.Pf.plasmo", "org.Sc.sgd")
.TXDB_ORGS = c("ce6", "dm3", "hg18", "hg19", "hg38", "mm10", "mm9", "rn4", "rn5",
"sacCer2", "sacCer3")
#' compareResults
#'
#' This function helps with visualizing the effects of bias adjustment in pathway analysis,
#' by comparing the distributions of bias factors (e.g. number of exon bins)
#' in genes associated with the most significant gene sets, before and after adjusting for bias factors
#' in splicing pathway analysis.
#'
#'
#' @param n.go Distributions of bias factor in genes associated with the most significant
#' \code{n.go} gene sets will be compared
#' @param adjusted An object returned by \code{runPathwaySplice}, should correspond to
#' gene set anlaysis results adjusting for biases in splicing analysis
#' @param unadjusted An object returned by \code{runPathwaySplice}, should correspond to
#' gene set analysis results NOT adjusting for biases
#' @param gene.based.table An object returned by \code{makeGeneTable}, should correspond to
#' a table with one p-value for each gene
#' @param output.dir Directory for output files
#' @param type.boxplot Options are 'All' and 'Only3', corresponding to drawing 5 boxplots or 3 boxplots.
#'
#' 5 boxplots: \code{all genesets}, \code{sig.adjusted} (sig gene sets in adjusted analysis),
#' \code{sig.unadjusted} (sig gene sets in unadjusted analysis),
#' \code{sig.adjusted.only} (sig gene sets in adjusted analysis only),
#' \code{sig.unadjusted.only} (sig gene sets in unadjusted analysis only)
#'
#' 3 boxplots: \code{all genesets}, \code{adjusted.sig}, \code{unadjusted.sig}
#'
#'
#' @return The output include 3 files in \code{output.dir}:
#' (1) a venn diagram comparing significant gene sets before and after adjusting for bias factors
#' (2) a .csv file with gene set names belonging to different sections of the venn diagram
#' (3) a box plot showing the distributions of number of features within all genes
#' in significant gene sets, with or without adjusting for bias factors
#'
#' @examples
#'
#' dir.name <- system.file('extdata', package='PathwaySplice')
#' hallmark.pathway.file <- file.path(dir.name,'h.all.v6.0.symbols.gmt.txt')
#'
#' hallmark <- gmtGene2Cat(hallmark.pathway.file,genomeID='hg19')
#'
#' gene.based.table <- makeGeneTable(featureBasedData)
#'
#' res.adj <- runPathwaySplice(gene.based.table,genome='hg19',
#' id='ensGene',gene2cat=hallmark,
#' go.size.limit = c(5, 200),
#' method='Wallenius')
#'
#' res.unadj <- runPathwaySplice(gene.based.table,genome='hg19',
#' id='ensGene',gene2cat=hallmark,go.size.limit = c(5, 200),
#' method='Hypergeometric')
#'
#' compareResults(20, res.adj, res.unadj, gene.based.table, type.boxplot='Only3')
#'
#' \dontrun{
#' # illustrate specification of output directory on windows systems
#' compareResults(20, res.adj, res.unadj, gene.based.table, type.boxplot='Only3',output.dir=tempdir())
#'
#' output.dir <- '~/OutputTestPathwaySplice' #linux system
#' compareResults(20,res.adj, res.unadj,gene.based.table, output.dir, type.boxplot='Only3')
#'}
#'
#' @export
#'
compareResults <- function(n.go, adjusted, unadjusted, gene.based.table, output.dir = tempdir(),
type.boxplot = c("All", "Only3"))
{
if (!dir.exists(output.dir))
{
dir.create(output.dir, recursive = TRUE)
}
n <- n.go
example.go.adjusted.by.exon <- as.data.frame(adjusted)
example.go.unadjusted <- as.data.frame(unadjusted)
if (is.na(example.go.adjusted.by.exon) || is.na(example.go.unadjusted))
{
cat("One of results is empty\n\n")
return()
} else
{
if (dim(example.go.adjusted.by.exon)[1] >= n && dim(example.go.unadjusted)[1] >=
n)
{
adjusted <- example.go.adjusted.by.exon[1:n, 1]
unadjusted <- example.go.unadjusted[1:n, 1]
re <- list(adjusted = adjusted, unadjusted = unadjusted)
vp <- venn.diagram(re, fill = c("red", "blue"), cat.col = c("red", "blue"),
alpha = 0.3, filename = file.path(output.dir, paste0(names(re)[1],
"_", names(re)[2], "_overlap_venn.tiff")))
common <- intersect(unadjusted, adjusted)
In.unadjusted.not.in.adjusted <- setdiff(unadjusted, common)
In.adjusted.not.in.unadjusted <- setdiff(adjusted, common)
if (length(In.unadjusted.not.in.adjusted) != 0 && length(In.adjusted.not.in.unadjusted) !=
0)
{
index1 <- match(In.adjusted.not.in.unadjusted, example.go.adjusted.by.exon$gene_set)
index1.name <- unique(unlist(example.go.adjusted.by.exon[index1,
]$All_gene_ensembl))
In.ad.not.un <- gene.based.table[match(index1.name, gene.based.table$geneID),
]$numFeature
yy <- cbind(example.go.unadjusted[index1, ]$rank.value.by.over_represented_pvalue,
example.go.adjusted.by.exon[index1, ]$rank.value.by.over_represented_pvalue)
index2 <- match(In.unadjusted.not.in.adjusted, example.go.unadjusted$gene_set)
index2.name <- unique(unlist(example.go.unadjusted[index2, ]$All_gene_ensembl))
In.un.not.ad <- gene.based.table[match(index2.name, gene.based.table$geneID),
]$numFeature
yyy <- cbind(example.go.unadjusted[index2, ]$rank.value.by.over_represented_pvalue,
example.go.adjusted.by.exon[index2, ]$rank.value.by.over_represented_pvalue)
cp.topN.adjusted.name <- unique(unlist(example.go.adjusted.by.exon[1:n,
]$All_gene_ensembl))
cp.topN.adjusted <- gene.based.table[match(cp.topN.adjusted.name,
gene.based.table$geneID), ]$numFeature
cp.topN.unadjusted.name <- unique(unlist(example.go.unadjusted[1:n,
]$All_gene_ensembl))
cp.topN.unadjusted <- gene.based.table[match(cp.topN.unadjusted.name,
gene.based.table$geneID), ]$numFeature
adjusted.name <- unique(unlist(example.go.adjusted.by.exon$All_gene_ensembl))
cp.all.adjusted <- gene.based.table[match(adjusted.name, gene.based.table$geneID),
]$numFeature
unadjusted.name <- unique(unlist(example.go.unadjusted$All_gene_ensembl))
cp.all.unadjusted <- gene.based.table[match(unadjusted.name, gene.based.table$geneID),
]$numFeature
type.boxplot <- match.arg(type.boxplot)
switch(type.boxplot, Only3 = {
yy <- rbind(cbind(cp.topN.adjusted, rep(paste0("Adjusted_", n.go),
length(cp.topN.adjusted))), cbind(cp.topN.unadjusted, rep(paste0("Unadjusted_",
n.go), length(cp.topN.unadjusted))), cbind(cp.all.unadjusted,
rep("All", length(cp.all.unadjusted))))
colnames(yy) <- c("y", "grp")
yy <- as.data.frame(yy)
yy$grp <- factor(yy$grp)
yy$grp <- factor(yy$grp, levels = levels(yy$grp)[c(2, 1, 3)])
yy$y <- as.numeric(as.character(yy$y))
colnames(yy) <- c("numFeature", "category")
m = list(l = 200, r = 5, b = 5, t = 5, pad = 4)
p <- plot_ly(yy, x = ~numFeature, color = ~category, type = "box") %>%
layout(margin = m)
htmlwidgets::saveWidget(p, file.path(output.dir, "boxplot.html"))
}, {
yy <- rbind(cbind(In.ad.not.un, rep("adjusted.only", length(In.ad.not.un))),
cbind(In.un.not.ad, rep("unadjusted.only", length(In.un.not.ad))),
cbind(cp.topN.adjusted, rep(paste0("top.adjusted.", n.go), length(cp.topN.adjusted))),
cbind(cp.topN.unadjusted, rep(paste0("top.unadjusted.", n.go),
length(cp.topN.unadjusted))), cbind(cp.all.adjusted, rep("all.adjusted",
length(cp.all.adjusted))), cbind(cp.all.unadjusted, rep("all.unadjusted",
length(cp.all.unadjusted))))
colnames(yy) <- c("y", "grp")
yy <- as.data.frame(yy)
yy$y <- as.numeric(as.character(yy$y))
colnames(yy) <- c("numFeature", "category")
m = list(l = 200, r = 5, b = 5, t = 5, pad = 4)
p <- plot_ly(yy, x = ~numFeature, color = ~category, type = "box") %>%
layout(margin = m)
print (p)
htmlwidgets::saveWidget(p, file.path(output.dir, "boxplot.html"))
})
na.pad <- function(x, len)
{
x[1:len]
}
makePaddedDataFrame <- function(l, ...)
{
maxlen <- max(sapply(l, length))
data.frame(lapply(l, na.pad, len = maxlen), ...)
}
x <- In.adjusted.not.in.unadjusted
y <- In.unadjusted.not.in.adjusted
z <- common
venn.res <- makePaddedDataFrame(list(x = x, y = y, z = z))
colnames(venn.res) <- c("adjusted.only", "unadjusted.only", "common")
write.table(venn.res, file = file.path(output.dir, "results4venn.csv"),
quote = FALSE, sep = ",", eol = "\n", na = " ", dec = ".", row.names = FALSE,
col.names = TRUE)
temp1 <- dplyr::as_data_frame(example.go.adjusted.by.exon[index1,
])
writeTibble(temp1, file.path(output.dir, "adjustedOnly.csv"))
temp2 <- dplyr::as_data_frame(example.go.unadjusted[index2, ])
writeTibble(temp2, file.path(output.dir, "unadjustedOnly.csv"))
} else
{
if (length(In.unadjusted.not.in.adjusted) == 0)
{
cat("there is no significant gene sets in unadjusted results only\n")
}
cat("\n")
if (length(In.adjusted.not.in.unadjusted) == 0)
{
cat("there is no significant gene sets in adjusted results only\n")
}
cat("\n")
}
} else
{
cat("The enriched gene sets is less than", n, "\n")
}
}
}
# Utility functions for PathwaySplice
gene2cat <- function(gene.name, re)
{
z <- re$genesets
res <- lapply(z, function(ch) grep(gene.name, ch))
res2 <- sapply(res, function(x) length(x) > 0)
gene2cat <- list(re$geneset.names[res2])
gene2cat
}
gsa.read.gmt <- function(filename)
{
a <- scan(filename, what = list("", ""), sep = "\t", quote = NULL, fill = TRUE,
flush = TRUE, multi.line = FALSE)
geneset.names <- a[1][[1]]
geneset.descriptions <- a[2][[1]]
dd <- scan(filename, what = "", sep = "\t", quote = NULL)
nn <- length(geneset.names)
n <- length(dd)
ox <- rep(NA, nn)
ii <- 1
for (i in 1:nn)
{
while ((dd[ii] != geneset.names[i]) | (dd[ii + 1] != geneset.descriptions[i]))
{
ii <- ii + 1
}
ox[i] <- ii
ii <- ii + 1
}
genesets <- vector("list", nn)
for (i in 1:(nn - 1))
{
i1 <- ox[i] + 2
i2 <- ox[i + 1] - 1
geneset.descriptions[i] <- dd[ox[i] + 1]
genesets[[i]] <- dd[i1:i2]
}
geneset.descriptions[nn] <- dd[ox[nn] + 1]
genesets[[nn]] <- dd[(ox[nn] + 2):n]
out <- list(genesets = genesets, geneset.names = geneset.names, geneset.descriptions = geneset.descriptions)
class(out) <- "GSA.genesets"
return(out)
}
gene2cat2 <- function(gmt.input.file)
{
re <- gsa.read.gmt(gmt.input.file)
gene.name <- unique(do.call(c, re$genesets))
gene.2.cat <- sapply(gene.name, gene2cat, re)
names(gene.2.cat) <- gene.name
gene.2.cat
}
list_to_df <- function(list_for_df)
{
list_for_df <- as.list(list_for_df)
nm <- names(list_for_df)
if (is.null(nm))
nm <- seq_along(list_for_df)
df <- data.frame(name = nm, stringsAsFactors = FALSE)
df$value <- unname(list_for_df)
df
}
reversemapping <- function(map)
{
tmp <- unlist(map, use.names = FALSE)
names(tmp) <- rep(names(map), times = as.numeric(summary(map)[, 1]))
return(split(names(tmp), as.vector(tmp)))
}
reformatdata <- function(re.gene.based)
{
re <- re.gene.based
no.re.testable.index <- which(re$mostSigDeFeature == "character(0)")
if (length(no.re.testable.index) > 0)
{
re2 <- re[-no.re.testable.index, ]
} else
{
re2 <- re
}
All.gene.id.based.on.sub_feature <- unique(unlist(strsplit(re2$geneID, "\\+")))
All.gene.id.index <- rep(0, length(All.gene.id.based.on.sub_feature))
names(All.gene.id.index) <- All.gene.id.based.on.sub_feature
re3 <- lapply(All.gene.id.based.on.sub_feature, function(u, re2)
{
x <- as.data.frame(re2[grep(u, re2$geneID), ], stringsAsFactors = FALSE)
}, re2)
re4 <- do.call(rbind.data.frame, c(re3, stringsAsFactors = FALSE))
index.geneID <- which(colnames(re4) %in% c("geneID"))
re5 <- cbind.data.frame(All.gene.id.based.on.sub_feature, re4[, -c(index.geneID)],
stringsAsFactors = FALSE)
colnames(re5)[1] <- "geneID"
return(re5)
}
writegototable <- function(GO_re, Output_file)
{
dataset2 <- GO_re
dataset2[sapply(dataset2, is.list)] <- sapply(dataset2[sapply(dataset2, is.list)],
function(x) sapply(x, function(y) paste(unlist(y), collapse = ", ")))
write.table(dataset2, file = Output_file, row.names = FALSE, quote = FALSE, sep = "\t")
}
pathwaysplice <- function(pwf, genome, id, gene2cat, test.cats, go.size.limit, method,
repcnt, use.genes.without.cat)
{
if (any(!test.cats %in% c("GO:CC", "GO:BP", "GO:MF", "KEGG")))
{
stop("Invalid gene_set specified. Valid categories are GO:CC, GO:BP, GO:MF or KEGG")
}
if ((missing(genome) | missing(id)))
{
if (is.null(gene2cat))
{
stop("You must specify the genome and gene ID format when automatically fetching gene to GO gene_set mappings.")
}
genome <- "dummy"
id <- "dummy"
}
if (!any(method %in% c("Wallenius", "Sampling", "Hypergeometric")))
{
stop("Invalid calculation method selected. Valid options are Wallenius, Sampling & Hypergeometric.")
}
if (!is.null(gene2cat) && (!is.data.frame(gene2cat) & !is.list(gene2cat)))
{
stop("Was expecting a dataframe or a list mapping categories to genes. Check gene2cat input and try again.")
}
pwf <- unfactor(pwf)
gene2cat <- unfactor(gene2cat)
if (is.null(gene2cat))
{
message("Fetching GO annotations...")
gene2cat <- getGeneSet(rownames(pwf), genome, id, fetch.cats = test.cats)
cat2gene <- reversemapping(gene2cat)
gene2cat <- reversemapping(cat2gene)
} else
{
message("Using manually entered categories.")
if (class(gene2cat) != "list")
{
genecol_sum <- as.numeric(apply(gene2cat, 2, function(u)
{
sum(u %in% rownames(pwf))
}))
genecol <- which(genecol_sum != 0)
if (length(genecol) > 1)
{
genecol <- genecol[order(-genecol_sum)[1]]
warning(paste("More than one possible gene column found in gene2cat, using the one headed",
colnames(gene2cat)[genecol]))
}
if (length(genecol) == 0)
{
genecol <- 1
warning(paste("Gene column could not be identified in gene2cat conclusively, using the one headed",
colnames(gene2cat)[genecol]))
}
othercol <- 1
if (genecol == 1)
{
othercol <- 2
}
gene2cat <- split(gene2cat[, othercol], gene2cat[, genecol])
cat2gene <- reversemapping(gene2cat)
gene2cat <- reversemapping(cat2gene)
}
gene2cat <- gene2cat[-which(is.na(names(gene2cat)))]
if (sum(unique(unlist(gene2cat, use.names = FALSE)) %in% rownames(pwf)) >
sum(unique(names(gene2cat)) %in% rownames(pwf)))
{
gene2cat <- reversemapping(gene2cat)
}
gene2cat <- gene2cat[names(gene2cat) %in% rownames(pwf)]
if (length(gene2cat) > 0)
{
cat2gene <- reversemapping(gene2cat)
gene2cat <- reversemapping(cat2gene)
cat2gene <- lapply(cat2gene, function(x)
{
unique(x)
})
gene2cat <- lapply(gene2cat, function(x)
{
unique(x)
})
} else
{
cat("There is no match between gene names of gene2pathway input and gene names of the data set under analysis,please change gene2pathway input\n\n")
return(NA)
}
}
cat2gene <- getGeneSetBySize(cat2gene, go.size.limit)
if (length(gene2cat) == 0)
{
stop("No gene set is satisfied by the selected size. Change gene set or choose new size.")
}
gene2cat <- reversemapping(cat2gene)
cat2gene <- reversemapping(gene2cat)
nafrac <- (sum(is.na(pwf$pwf))/nrow(pwf)) * 100
if (nafrac > 50)
{
warning(paste("Missing length data for ", round(nafrac), "% of genes. Accuarcy of GO test will be reduced.",
sep = ""))
}
pwf$pwf[is.na(pwf$pwf)] <- pwf$pwf[match(sort(pwf$bias.data[!is.na(pwf$bias.data)])[ceiling(sum(!is.na(pwf$bias.data))/2)],
pwf$bias.data)]
unknown_go_terms = nrow(pwf) - length(gene2cat)
if ((!use.genes.without.cat) && unknown_go_terms > 0)
{
message(paste("For", unknown_go_terms, "genes, we could not find any categories. These genes will be excluded."))
message("To force their use, please run with use_genes_without_cat=TRUE (see documentation).")
message("This was the default behavior for version 1.15.1 and earlier.")
pwf = pwf[rownames(pwf) %in% names(gene2cat), ]
}
cats <- names(cat2gene)
DE <- rownames(pwf)[pwf$DEgenes == 1]
num_de <- length(DE)
num_genes <- nrow(pwf)
pvals <- data.frame(gene_set = cats, over_represented_pvalue = NA, under_represented_pvalue = NA,
stringsAsFactors = FALSE, numSIGInCat = NA, numInCat = NA)
if (method == "Sampling")
{
num_DE_mask <- rep(0, length(cats))
a <- table(unlist(gene2cat[DE], FALSE, FALSE))
num_DE_mask[match(names(a), cats)] <- as.numeric(a)
num_DE_mask <- as.integer(num_DE_mask)
gene2cat <- gene2cat[rownames(pwf)]
names(gene2cat) <- rownames(pwf)
message("Running the simulation...")
lookup <- matrix(0, nrow = repcnt, ncol = length(cats))
for (i in 1:repcnt)
{
a <- table(as.character(unlist(gene2cat[order(runif(num_genes)^(1/pwf$pwf),
decreasing = TRUE)[1:num_de]], FALSE, FALSE)))
lookup[i, match(names(a), cats)] <- a
pp(repcnt)
}
message("Calculating the p-values...")
pvals[, 2] <- (colSums(lookup >= outer(rep(1, repcnt), num_DE_mask)) + 1)/(repcnt +
1)
pvals[, 3] <- (colSums(lookup <= outer(rep(1, repcnt), num_DE_mask)) + 1)/(repcnt +
1)
}
if (method == "Wallenius")
{
message("Calculating the p-values...")
degenesnum <- which(pwf$DEgenes == 1)
cat2genenum <- relist(match(unlist(cat2gene), rownames(pwf)), cat2gene)
alpha <- sum(pwf$pwf)
pvals[, 2:3] <- t(sapply(cat2genenum, function(u)
{
num_de_incat <- sum(degenesnum %in% u)
num_incat <- length(u)
avg_weight <- mean(pwf$pwf[u])
weight <- (avg_weight * (num_genes - num_incat))/(alpha - num_incat *
avg_weight)
if (num_incat == num_genes)
{
weight <- 1
}
c(dWNCHypergeo(num_de_incat, num_incat, num_genes - num_incat, num_de,
weight) + pWNCHypergeo(num_de_incat, num_incat, num_genes - num_incat,
num_de, weight, lower.tail = FALSE), pWNCHypergeo(num_de_incat, num_incat,
num_genes - num_incat, num_de, weight))
}))
}
if (method == "Hypergeometric")
{
message("Calculating the p-values...")
degenesnum <- which(pwf$DEgenes == 1)
cat2genenum <- relist(match(unlist(cat2gene), rownames(pwf)), cat2gene)
pvals[, 2:3] <- t(sapply(cat2genenum, function(u)
{
num_de_incat <- sum(degenesnum %in% u)
num_incat <- length(u)
c(dhyper(num_de_incat, num_incat, num_genes - num_incat, num_de) + phyper(num_de_incat,
num_incat, num_genes - num_incat, num_de, lower.tail = FALSE), phyper(num_de_incat,
num_incat, num_genes - num_incat, num_de))
}))
}
degenesnum <- which(pwf$DEgenes == 1)
cat2genenum <- relist(match(unlist(cat2gene), rownames(pwf)), cat2gene)
pvals[, 4:5] <- t(sapply(cat2genenum, function(u)
{
c(sum(degenesnum %in% u), length(u))
}))
DE_pwf <- rownames(pwf[degenesnum, ])
pvals.6 <- sapply(cat2gene, function(u, DE_pwf)
{
x <- u[which(u %in% DE_pwf)]
x
}, DE_pwf)
xxx <- match2Genome(genome)
pvals.6.gene.symbol <- sapply(pvals.6, function(u, xxx)
{
y <- xxx[match(u, as.character(xxx[, 2])), 1]
y
}, xxx)
gene_pwf <- rownames(pwf)
pvals.7 <- sapply(cat2gene, function(u, gene_pwf)
{
x <- u[which(u %in% gene_pwf)]
x
}, gene_pwf)
pvals.7.gene.symbol <- sapply(pvals.7, function(u, xxx)
{
y <- xxx[match(u, as.character(xxx[, 2])), 1]
y
}, xxx)
pvals.6.df <- list_to_df(pvals.6)
pvals.6.gene.symbol.df <- list_to_df(pvals.6.gene.symbol)
pvals.7.df <- list_to_df(pvals.7)
pvals.7.gene.symbol.df <- list_to_df(pvals.7.gene.symbol)
dataset2 <- pvals.6.gene.symbol.df
dataset2[sapply(dataset2, is.list)] <- sapply(dataset2[sapply(dataset2, is.list)],
function(x) sapply(x, function(y) paste(unlist(y), collapse = ", ")))
temp.gene.name <- unique(apply(dataset2[, 2], 1, c))
temp.gene.name.2 <- unique(gdata::trim(unlist(strsplit(temp.gene.name, split = ","))))
DE_from_GO <- temp.gene.name.2
colnames(pvals.6.df) <- c("gene_set", "SIGgene_ensembl")
colnames(pvals.6.gene.symbol.df) <- c("gene_set", "SIGgene_symbol")
colnames(pvals.7.df) <- c("gene_set", "All_gene_ensembl")
colnames(pvals.7.gene.symbol.df) <- c("gene_set", "All_gene_symbol")
pvals <- pvals[order(pvals$over_represented_pvalue), ]
if (any(grep("^GO:", pvals$gene_set)))
{
GOnames <- select(GO.db, keys = pvals$gene_set, columns = c("TERM", "ONTOLOGY"))[,
2:3]
colnames(GOnames) <- tolower(colnames(GOnames))
colnames(GOnames)[colnames(GOnames) == "term"] <- "description"
pvals <- cbind(pvals, GOnames)
}
re.2 <- merge(pvals, pvals.6.df, by = "gene_set", sort = FALSE)
re.3 <- merge(re.2, pvals.6.gene.symbol.df, by = "gene_set", sort = FALSE)
re.4 <- merge(re.3, pvals.7.df, by = "gene_set", sort = FALSE)
re.5 <- merge(re.4, pvals.7.gene.symbol.df, by = "gene_set", sort = FALSE)
re.6 <- list(GO = re.5, DE_GO = DE_from_GO, cat2gene = cat2gene)
return(re.6)
}
getGeneSet <- function(genes, genome, id, fetch.cats = c("GO:CC", "GO:BP", "GO:MF"))
{
if (any(!fetch.cats %in% c("GO:CC", "GO:BP", "GO:MF", "KEGG")))
{
stop("Invaled gene_set specified. Categories can only be GO:CC, GO:BP, GO:MF or KEGG")
}
orgstring <- as.character(.ORG_PACKAGES[match(gsub("[0-9]+", "", genome), names(.ORG_PACKAGES))])
if (length(orgstring) != 1)
{
stop("Couldn't grab GO categories automatically. Please manually specify.")
}
library(paste(orgstring, "db", sep = "."), character.only = TRUE)
coreid <- strsplit(orgstring, "\\.")[[1]][3]
userid <- as.character(.ID_MAP[match(id, names(.ID_MAP))])
if (is.na(userid) | (length(userid) != 1))
{
stop("Couldn't grab GO categories automatically. Please manually specify.")
}
core2cat <- NULL
if (length(grep("^GO", fetch.cats)) != 0)
{
gomapFunction <- .ORG_GOMAP_FUNCTION[orgstring]
if (is.na(gomapFunction))
gomapFunction <- .ORG_GOMAP_FUNCTION["default"]
x <- toTable(get(paste(orgstring, gomapFunction, sep = "")))
x[!x$Ontology %in% gsub("^GO:", "", fetch.cats), 2] <- "Other"
core2cat <- x[, 1:2]
colnames(core2cat) <- c("gene_id", "gene_set")
}
if (length(grep("^KEGG", fetch.cats)) != 0)
{
x <- toTable(get(paste(orgstring, "PATH", sep = "")))
colnames(x) <- c("gene_id", "gene_set")
if (!is.null(core2cat))
{
core2cat <- rbind(core2cat, x)
} else
{
core2cat <- x
}
}
if (coreid != userid)
{
user2core <- toTable(get(paste(orgstring, userid, sep = "")))
user2core <- user2core[user2core[, 1] %in% core2cat[, 1], ]
list_core2cat <- split(core2cat[, 2], core2cat[, 1])
list_core2cat <- list_core2cat[match(user2core[, 1], names(list_core2cat))]
user2cat <- split(unlist(list_core2cat, FALSE, FALSE), rep(user2core[, 2],
sapply(list_core2cat, length)))
user2cat <- sapply(user2cat, unique)
} else
{
user2cat <- split(core2cat[, 2], core2cat[, 1])
user2cat <- sapply(user2cat, unique)
}
user2cat <- lapply(user2cat, function(x)
{
if (length(x) > 1)
x = x[x != "Other"]
x
})
names(user2cat) <- toupper(names(user2cat))
gene2go <- user2cat[toupper(genes)]
return(gene2go)
}
pp <- function(total, count, i = i)
{
if (missing(count))
{
count <- evalq(i, envir = parent.frame())
}
if (missing(total))
{
total <- evalq(stop, envir = parent.frame())
}
cat(round(100 * (count/total)), "% \r")
}
outputGoBasedSelection <- function(Re.Go.adjusted.by.exon.SJ)
{
index.select <- which(Re.Go.adjusted.by.exon.SJ[[1]]$numInCat >= 10 & Re.Go.adjusted.by.exon.SJ[[1]]$numInCat <=
300 & Re.Go.adjusted.by.exon.SJ[[1]]$ontology == "BP")
Re.Go.adjusted.by.exon.SJ.select <- Re.Go.adjusted.by.exon.SJ[[1]][index.select,
]
Re.Go.adjusted.by.exon.SJ.select <- Re.Go.adjusted.by.exon.SJ.select[, -3]
temp <- format(Re.Go.adjusted.by.exon.SJ.select$over_represented_pvalue, scientific = TRUE,
digits = 2)
Re.Go.adjusted.by.exon.SJ.select$over_represented_pvalue <- temp
rank.value.by.over_represented_pvalue <- rank(as.numeric(Re.Go.adjusted.by.exon.SJ.select$over_represented_pvalue),
ties.method = "min")
Re.Go.adjusted.by.exon.SJ.select <- cbind(Re.Go.adjusted.by.exon.SJ.select, rank.value.by.over_represented_pvalue)
return(Re.Go.adjusted.by.exon.SJ.select)
}
outputCatBasedSelection <- function(Re.Go.adjusted.by.exon.SJ)
{
index.select <- which(Re.Go.adjusted.by.exon.SJ$numInCat >= 10 & Re.Go.adjusted.by.exon.SJ$numInCat <=
300)
Re.Go.adjusted.by.exon.SJ.select <- Re.Go.adjusted.by.exon.SJ[index.select,
]
Re.Go.adjusted.by.exon.SJ.select <- Re.Go.adjusted.by.exon.SJ.select[, -3]
temp <- format(Re.Go.adjusted.by.exon.SJ.select$over_represented_pvalue, scientific = TRUE,
digits = 2)
Re.Go.adjusted.by.exon.SJ.select$over_represented_pvalue <- temp
rank.value.by.over_represented_pvalue <- rank(as.numeric(Re.Go.adjusted.by.exon.SJ.select$over_represented_pvalue),
ties.method = "min")
Re.Go.adjusted.by.exon.SJ.select <- as_tibble(cbind(Re.Go.adjusted.by.exon.SJ.select, rank.value.by.over_represented_pvalue))
return(Re.Go.adjusted.by.exon.SJ.select)
}
getStaisitcs4Go <- function(GO.wall.DE_interest, genewise.table)
{
GO.data <- GO.wall.DE_interest[[1]]
y <- as.list(GO.data$SIGgene_ensembl)
re <- lapply(1:length(y), function(u, y, genewise.table)
{
yy <- y[[u]]
y.id <- trim(c(unlist(strsplit(y[[u]], split = ","))))
if (length(y.id) != 0)
{
yyy <- mean(as.numeric(unlist(genewise.table[match(y.id, genewise.table$geneID),
]$numFeature)))
} else
{
yyy <- 0
}
yyy
}, y, genewise.table)
re2 <- list_to_df(re)
GO.data.1 <- cbind(GO.data, re2)
GO.data.2 <- GO.data.1[, -(dim(GO.data.1)[2] - 1)]
colnames(GO.data.2)[dim(GO.data.2)[2]] <- "Ave_value_DE"
cat2gene <- GO.wall.DE_interest[[3]]
rre <- lapply(1:length(cat2gene), function(u, cat2gene, genewise.table)
{
yy <- cat2gene[[u]]
y.id <- yy
if (length(y.id) != 0)
{
yyy <- mean(as.numeric(unlist(genewise.table[match(y.id, genewise.table$geneID),
]$numFeature)), na.rm = TRUE)
} else
{
yyy <- 0
}
yyy
}, cat2gene, genewise.table)
names(rre) <- names(cat2gene)
rre2 <- list_to_df(rre)
colnames(rre2) <- c("gene_set", "Ave_value_all_gene")
GO.data.3 <- merge(GO.data.2, rre2, by = "gene_set", sort = FALSE)
GO.data.3$Ave_value_DE <- unlist(GO.data.3$Ave_value_DE)
GO.data.3$Ave_value_all_gene <- unlist(GO.data.3$Ave_value_all_gene)
re3 <- list(GO.wall.DE_interest = GO.data.3, pwf.DE_interest = GO.wall.DE_interest[[2]])
return(re3)
}
overlap_ratio <- function(x, y)
{
x <- unlist(x)
y <- unlist(y)
length(intersect(x, y))/length(unique(c(x, y)))
}
color_scale <- function(c1 = "grey", c2 = "red")
{
pal <- colorRampPalette(c(c1, c2))
colors <- pal(100)
return(colors)
}
getIdx <- function(v, MIN, MAX)
{
if (MIN == MAX)
{
return(100)
}
intervals <- seq(MIN, MAX, length.out = 100)
max(which(intervals <= v))
}
match2Genome <- function(genome_id)
{
ah <- AnnotationHub()
switch(genome_id, hg38 = {
hs <- query(ah, c("Ensembl", "GRCh38", "Homo sapiens"))
res <- hs[["AH53211"]]
res <- genes(res, columns = c("gene_name"))
xxx <- mcols(res)
yyy <- xxx
}, hg19 = {
edb <- org.Hs.eg.db
entrezid <- keys(edb, keytype = "ENTREZID")
suppressMessages(xxx <- select(edb, keys = entrezid, columns = c("ENSEMBL",
"SYMBOL")))
yyy <- xxx[, c(3, 2, 1)]
}, mm10 = {
edb <- org.Mm.eg.db
entrezid <- keys(edb, keytype = "ENTREZID")
suppressMessages(xxx <- select(edb, keys = entrezid, columns = c("ENSEMBL",
"SYMBOL")))
yyy <- xxx[, c(3, 2, 1)]
}, {
hs <- query(ah, c("Ensembl", "GRCm38", "Mus Musculus"))
res <- hs[["AH53222"]]
res <- genes(res, columns = c("gene_name"))
xxx <- S4Vectors::mcols(res)
yyy <- xxx
})
return(yyy)
}
reformatpath <- function(dir.name)
{
CheckOPS <- Sys.info()[["sysname"]]
if (CheckOPS == "Darwin")
{
temp <- unlist(strsplit(dir.name, split = "\\/"))
if (!is.na(temp[3] == "H_driver"))
{
if (temp[3] == "H_driver")
{
temp[2] <- "Volumes"
temp[3] <- "Bioinformatics$"
dir.name <- do.call("file.path", as.list(temp))
}
}
}
return(dir.name)
}
getGeneSetBySize <- function(user2cat, go.size.limit)
{
gene2go <- user2cat
gene2go.select <- lapply(gene2go, function(x)
{
x = x[x != "Other"]
x
})
gene2go.select.1 <- gene2go.select[lapply(gene2go.select, length) > 0]
lower.size <- go.size.limit[1]
upper.size <- go.size.limit[2]
if (is.finite(lower.size) & is.finite(upper.size))
{
gene2go.select.2 <- gene2go.select.1[lapply(gene2go.select.1, length) > lower.size &
lapply(gene2go.select.1, length) <= upper.size]
} else
{
gene2go.select.2 <- gene2go.select.1
}
return(gene2go.select.2)
}
makeFeatureTable <- function(jscs, use.multigene.aggregates = TRUE)
{
temp <- fData(jscs)
temp2 <- temp[which(temp$testable == TRUE), ]
index.1 <- which(colnames(temp2) %in% c("geneID"))
index.2 <- which(colnames(temp2) %in% c("countbinID"))
index.3 <- which(colnames(temp2) %in% c("pvalue"))
temp3 <- temp2[, c(index.1, index.2, index.3)]
temp3 <- rapply(temp3, as.character, classes = "factor", how = "replace")
if (use.multigene.aggregates == FALSE)
{
temp3 <- temp3[-grep("\\+", temp3$geneID), ]
}
row.names(temp3) = seq(1, dim(temp3)[1], 1)
return(temp3)
}
nullpSplice = function(DEgenes, genome, id, bias.data = NULL, plot.fit = TRUE, binsize = "auto")
{
if (!is.null(bias.data) & length(bias.data) != length(DEgenes))
{
stop("bias.data vector must have the same length as DEgenes vector!")
}
bias.data = unfactor(bias.data)
DEgenes = unfactor(DEgenes)
if (is.null(bias.data))
{
bias.data = getlength(names(DEgenes), genome, id)
}
pwf = rep(NA, length(DEgenes))
w = !is.na(bias.data)
pwf[w] = makespline(bias.data[w], DEgenes[w])
out = data.frame(DEgenes = DEgenes, bias.data = bias.data, pwf = pwf, stringsAsFactors = FALSE)
rownames(out) = names(DEgenes)
if (plot.fit)
{
plotPwfSplice(out, binsize)
}
out
}
plotPwfSplice = function(pwf, binsize, pwf_col = 3, pwf_lwd = 2, xlab = "Biased Data in <binsize> gene bins.",
ylab = "Proportion of significant genes", ...)
{
w = !is.na(pwf$bias.data)
o = order(pwf$bias.data[w])
rang = max(pwf$pwf, na.rm = TRUE) - min(pwf$pwf, na.rm = TRUE)
if (rang == 0 & binsize == "auto")
binsize = 1000
if (binsize == "auto")
{
binsize = max(1, min(100, floor(sum(w) * 0.08)))
resid = rang
oldwarn = options()$warn
options(warn = -1)
while (binsize <= floor(sum(w) * 0.1) & resid/rang > 0.001)
{
binsize = binsize + 100
splitter = ceiling(1:length(pwf$DEgenes[w][o])/binsize)
de = sapply(split(pwf$DEgenes[w][o], splitter), mean)
binlen = sapply(split(as.numeric(pwf$bias.data[w][o]), splitter), mean)
resid = sum((de - approx(pwf$bias.data[w][o], pwf$pwf[w][o], binlen)$y)^2)/length(binlen)
}
options(warn = oldwarn)
} else
{
splitter = ceiling(1:length(pwf$DEgenes[w][o])/binsize)
de = sapply(split(pwf$DEgenes[w][o], splitter), mean)
binlen = sapply(split(as.numeric(pwf$bias.data[w][o]), splitter), mean)
}
xlab = gsub("<binsize>", as.character(binsize), xlab)
if ("xlab" %in% names(list(...)))
{
if ("ylab" %in% names(list(...)))
{
plot(binlen, de, ...)
} else
{
plot(binlen, de, ylab = ylab, ...)
}
} else if ("ylab" %in% names(list(...)))
{
plot(binlen, de, xlab = xlab, ...)
} else
{
plot(binlen, de, xlab = xlab, ylab = ylab, ...)
}
lines(pwf$bias.data[w][o], pwf$pwf[w][o], col = pwf_col, lwd = pwf_lwd)
}
reformatPathwayOut <- function(pathway.in)
{
res <- dplyr::as_data_frame(pathway.in$GO)
res
}
writeTibble <- function(tibble.input, output.file.name = tempfile())
{
if (!dir.exists(dirname(output.file.name)))
{
dir.create(dirname(output.file.name), recursive = TRUE)
}
flatten_list = function(x)
{
if (typeof(x) != "list")
{
return(x)
}
sapply(x, function(y) paste(y, collapse = " | "))
}
tibble.input %>% mutate_all(funs(flatten_list)) %>% write.csv(output.file.name)
}
splitGeneCluster <- function(re.gene.based)
{
re2 <- re.gene.based
All.gene.id.based.on.sub_feature <- unique(unlist(strsplit(re2$geneID, "\\+")))
All.gene.id.index <- rep(0, length(All.gene.id.based.on.sub_feature))
names(All.gene.id.index) <- All.gene.id.based.on.sub_feature
re3 <- lapply(All.gene.id.based.on.sub_feature, function(u, re2)
{
x <- as.data.frame(re2[grep(u, re2$geneID), ], stringsAsFactors = FALSE)
}, re2)
re4 <- do.call(rbind.data.frame, c(re3, stringsAsFactors = FALSE))
index.geneID <- which(colnames(re4) %in% c("geneID"))
re5 <- cbind.data.frame(All.gene.id.based.on.sub_feature, re4[, -c(index.geneID)],
stringsAsFactors = FALSE)
colnames(re5)[1] <- "geneID"
return(re5)
}
# res1 <- generateRankFeatureCount(res.unadj,res.adj)
#
# res.go.unadj <- runPathwaySplice(gene.based.table,genome='hg19',id='ensGene',
# test.cats=c('GO:BP'),
# go.size.limit=c(5,30),
# method='Hypergeometric',binsize=20)
#
# res.go.adj <- runPathwaySplice(gene.based.table,genome='hg19',id='ensGene',
# test.cats=c('GO:BP'),
# go.size.limit=c(5,30),
# method='Wallenius',binsize=20)
#
# res2 <- generateRankFeatureCount(res.go.unadj,res.go.adj)
generateRankFeatureCount <- function(res.unadj,res.adj)
{
res.unadj.rank <- outputCatBasedSelection(res.unadj)
res.adj.rank <- outputCatBasedSelection(res.adj)
res.unadj.rank.reorder <- res.unadj.rank[match(res.adj.rank$gene_set,res.unadj.rank$gene_set),]
res <- as_tibble(cbind.data.frame(res.adj.rank$gene_set,res.unadj.rank.reorder$gene_set,
res.adj.rank$Ave_value_all_gene,res.unadj.rank.reorder$Ave_value_all_gene,res.adj.rank$rank.value.by.over_represented_pvalue,
res.unadj.rank.reorder$rank.value.by.over_represented_pvalue,stringsAsFactors = FALSE))
colnames(res) <- c("gene_set_adj","gene_set_unadj","Ave_value_all_gene_adj","Ave_value_all_gene_unadj","rank_adj","rank_unadj")
plot(log10(res$Ave_value_all_gene_adj),res$rank_unadj-res$rank_adj,ylab="Increase in rank in PathwaySplice",xlab="log10(Average gene set feature number)")
abline(h=0,col="red",lty=3)
res
}
# res.cor <- tuningRepcnt(gene.based.table,n=25,repcnt.start = 1000,repcnt.end=50000)
tuningRepcnt <- function(gene.based.table,n=25,repcnt.start = 1000,repcnt.end = 30000)
{
set.seed(1000)
res.start <- runPathwaySplice(gene.based.table,genome='hg19',id='ensGene',
test.cats=c('GO:BP'),
go.size.limit=c(5,30),
method='Sampling',repcnt=repcnt.start, binsize=800)
g.start = res.start[1:n,]$gene_set
vp.start = res.start$over_represented_pvalue
temp <- data.frame(repcnt=1000,cor.coeff=0,J.index=0)
J.index = 0
repcnt = 5000
while(repcnt <= repcnt.end){
set.seed(1000)
res.end <- runPathwaySplice(gene.based.table,genome='hg19',id='ensGene',
test.cats=c('GO:BP'),
go.size.limit=c(5,30),
method='Sampling',repcnt= repcnt, binsize=800)
g.end = res.end[1:n,]$gene_set
J.index <- overlap_ratio(g.start,g.end)
vp.end <- res.end[match(res.start$gene_set,res.end$gene_set),]$over_represented_pvalue
cor.coeff = cor(vp.start , vp.end , method = "spearman")
temp <- rbind.data.frame(temp,cbind.data.frame(repcnt,cor.coeff,J.index))
res.start = res.end
vp.start = vp.end
g.start = g.end
repcnt = repcnt + 5000
}
par(mfrow=c(1,2))
plot(temp[-1,1],temp[-1,2],type="b",xlab="Number of random samples",
ylab="Rank correlation ")
plot(temp[-1,1],temp[-1,3],type="b",xlab="Number of random samples",
ylab="Jaccard index")
temp
}
gmtlist2file <- function(gmtlist, filename)
{
gsname=names(gmtlist)
for (i in seq_along(gmtlist) ){
cat(gsname[i], length(gmtlist[[i]]), gmtlist[[i]], file=filename, append=TRUE, sep = "\t")
cat("\n", append=TRUE, file=filename)
}
}
#' outKegg2gmt
#'
#' This function obtains a .gmt file for KEGG pathways.
#'
#' @param organism.id an identifier for the organism being studied, for example, "hsa" for "Homo sapiens"
#' @param out.gmt.file name of the output .gmt file
#'
#' @return Returns a .gmt file for KEGG pathways
#'
#' @details The function calls the \code{get.kegg.genesets} function in \code{EnrichmentBrowser} R package
#' and modifies the resulting output into a .gmt file.
#'
#' @examples \dontrun{
#'
#' data.dir <- tempdir()
#' outKegg2Gmt ("hsa",file.path(data.dir,"kegg.gmt.txt"))
#'
#' kegg.pathways <- gmtGene2Cat(file.path(data.dir, "kegg.gmt.txt"),genomeID = "hg19")
#'
#' result.kegg <- runPathwaySplice(genewise.table = gene.based.table.fdr,
#' genome = "hg19",
#' id = "ensGene",
#' gene2cat = kegg.pathways,
#' go.size.limit = c(5, 100),
#' method = "Wallenius",
#' use.genes.without.cat = TRUE)
#' }
#'
#' @export
outKegg2Gmt <- function(organism.id, out.gmt.file)
{
gs <- get.kegg.genesets(organism.id)
switch(organism.id, hsa = {
meta.data <- 'org.Hs.eg'
}, mmu = {
meta.data <- 'org.Mm.eg'
})
gs.n <- lapply(gs,function(u){
x <- as.character(getSYMBOL(na.omit(u), data = meta.data))
}
)
gmtlist2file(gs.n,out.gmt.file)
}
# compareResults2(result.hyper,result.Wall,result.Sampling,result.Sampling.200k)
#
compareResults2 <- function(result.hyper,result.Wall,result.Sampling,result.Sampling.200k){
res.Wall.reorder <- result.Wall[match(result.hyper$gene_set,result.Wall$gene_set),]
res.Sampling.reorder <- result.Sampling[match(result.hyper$gene_set,result.Sampling$gene_set),]
result.Sampling.200k.reorder <- result.Sampling.200k[match(result.hyper$gene_set,result.Sampling.200k$gene_set),]
res.hyper.Wall.Sampling <- as_tibble(cbind.data.frame(result.hyper$gene_set, result.hyper$over_represented_pvalue,res.Wall.reorder$over_represented_pvalue,res.Sampling.reorder$over_represented_pvalue,result.Sampling.200k.reorder$over_represented_pvalue,stringsAsFactors = FALSE))
colnames(res.hyper.Wall.Sampling) = c("gene_set","hyper","wall","sampling","sampling_200k")
cor(res.hyper.Wall.Sampling[,2:5])
save(res.hyper.Wall.Sampling,file="vignettes/res.hyper.Wall.Sampling.RData")
temp.1 <- cbind.data.frame(res.hyper.Wall.Sampling$sampling_200k,
rep("Sampling_200k",length(res.hyper.Wall.Sampling$sampling_200k)),res.hyper.Wall.Sampling$sampling_200k)
colnames(temp.1) <- c("PValue","PathwaySplice","random_sampling_200k")
temp.2 <- cbind.data.frame(res.hyper.Wall.Sampling$sampling,
rep("Sampling_30k",length(res.hyper.Wall.Sampling$sampling)),
res.hyper.Wall.Sampling$sampling_200k)
colnames(temp.2) <- c("PValue","PathwaySplice","random_sampling_200k")
temp.3 <- cbind.data.frame(res.hyper.Wall.Sampling$wall,
rep("Wall",length(res.hyper.Wall.Sampling$wall)),
res.hyper.Wall.Sampling$sampling_200k)
colnames(temp.3) <- c("PValue","PathwaySplice","random_sampling_200k")
temp.4 <- cbind.data.frame(res.hyper.Wall.Sampling$hyper,
rep("Hyper",length(res.hyper.Wall.Sampling$hyper)),
res.hyper.Wall.Sampling$sampling_200k)
colnames(temp.4) <- c("PValue","PathwaySplice","random_sampling_200k")
temp <- as_tibble(rbind.data.frame(temp.1,temp.2,temp.3,temp.4))
ggplot(temp, aes(x=random_sampling_200k, y=PValue, colour=PathwaySplice, shape=PathwaySplice)) +
geom_point(size=1, alpha=0.6) + geom_smooth(data=subset(temp,PathwaySplice=="Sampling_200k"),
aes(x=random_sampling_200k, y=PValue),
method=lm,se=FALSE,colour="red") +
labs(color= "Methods", x="P value of random sampling(200k)",y="P value of alternative methods")+
scale_colour_manual(name = "Alternative methods",
labels = c("Sampling_200k", "Sampling_30k", "Wall", "Hyper"),
values = c("red", "blue", "green", "black")) +
scale_shape_manual(name = "Alternative methods",
labels = c("Sampling_200k", "Sampling_30k", "Wall", "Hyper"),
values = c(2, 3, 4, 20))
}
netplot <- function(g,
vertex.label.font=2,
vertex.label.color='#666666',
vertex.label.cex=1.5,
layout=layout.fruchterman.reingold,
foldChange=NULL,
fixed=TRUE,
col.bin=10,
legend.x=1,
legend.y=1, ...) {
if (fixed){
plot.igraph(g,
vertex.label.font=vertex.label.font,
vertex.label.color=vertex.label.color,
vertex.label.cex=vertex.label.cex,
vertex.frame.color=V(g)$color,
layout=layout, ...)
## add legend
if (!is.null(foldChange)) {
## gn <- V(g)$name
## fc <- foldChange[gn]
## fc <- fc[!is.na(fc)]
fc <- foldChange
lbs <- hist(fc, breaks=col.bin-1, plot=FALSE)$breaks
col.legend <- get.col.scale(lbs)
x <- seq(from=legend.x, by=0.03, length.out=length(col.legend))
y <- rep(legend.y, length(col.legend))
points(x, y, pch=15, col=col.legend, cex=2)
idx <- c(1, seq(4, length(col.legend)-1, by=3), length(col.legend))
text(x=x[idx],
y=rep(legend.y-0.05, length(idx)),
label=lbs[idx],
cex = 0.8)
text(x=mean(x),
y=legend.y+0.05,
labels="Fold Change",
cex=0.8, font=2)
}
} else {
tkplot(g,
vertex.label.font=vertex.label.font,
vertex.label.color=vertex.label.color,
vertex.label.cex=vertex.label.cex,
vertex.frame.color=V(g)$color,
layout=layout)
}
invisible(g)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.