# Author: Simona Cristea; scristea@jimmy.harvard.edu
###############################################################################
#' @title Finds all mutually exclusive pairs in a dataset
#'
#' @description \code{analyzePairs} performs step 1 of the TiMEx procedure:
#' tests all gene pairs for mutual exclusivity. It returns a complex list,
#' including parameter estimates, pairwise p-values and intensities of mutual
#' exclusivity, log likelihoods, and others.
#'
#' @param mat binary alteration matrix, with rows representing patients and
#' columns representing genes
#'
#' @details In the first step, the TiMEx procedure for identifying mutually
#' exclusive groups of alterations tests all pairs for mutual exclusivity. The
#' data corresponding to each pair of genes is therefore fitted to both the
#' Null (Conditional Independence) and the Mutual Exclusivity models.
#' Parameters under the two models are estimated, and, since they are nested,
#' a likelihood ratio test is performed between the corresponding log
#' likelihoods, in order to test whether mu (the intensity of mutual
#' exclusivity) is different from 0. For more details on the TiMEx procedure,
#' as well as on the underlying mathematical model, see Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#'
#' The output list contains exhaustive information on the pairwise testing of
#' genes, including parameter estimates, pairwise p-values and intensities of
#' mutual exclusivity, log likelihoods, and others.
#'
#' This function displays progress messages indicating the gene that is
#' currently being tested against the remaining genes.
#'
#' @return List consisting of a set of matrices, all of dimension \code{n x n}
#' (\code{n} being the number of genes). Element \code{(i,j)} of each matrix
#' corresponds to the pairwise test between genes \emph{i} and \emph{j}:
#' \itemize{
#' \item{\code{lamiEstNull}} {matrix with rate estimates (lambda) of the
#' waiting time of one gene (gene \code{i}) under the Null model.}
#' \item{\code{lamjEstNull}} {matrix with rate estimates (lambda) of the
#' waiting time of the other gene (gene \code{j}) under the Null model.}
#' \item{\code{lamiEstME}} {matrix with rate estimates (lambda) of the waiting
#' time of one gene (gene \code{i}) under the Mutual Exclusivity model.}
#' \item{\code{lamjEstME}} {matrix with rate estimates (lambda) of the
#' waiting time of the other gene (gene \code{j}) under the Mutual Exclusivity
#' model.}
#' \item{\code{likeNull}} {matrix with log likelihoods under the Null model.}
#' \item{\code{likeAlt}} {matrix with log likelihoods under the Mutual
#' Exclusivity model.}
#' \item{\code{LRT}} {matrix with log likelihood ratio test (LRT) statistics.}
#' \item{\code{pvalueLRTCorrectSym}} {list of the following symmetric matrices:
#' \itemize{
#' \item{\code{bonferroni}} {bonferroni corrected p-values corresponding to
#' the LRT statistic.}
#' \item{\code{bonferroni}} {fdr corrected p-values corresponding to the
#' LRT statistic.}
#' \item{\code{uncorrected}} {uncorrected p-values corresponding to the
#' LRT statistic.}
#' }
#' }
#' \item{\code{muEstSym}} {symmetric matrix with estimated pairwise mu
#' (intensity of mutual exclusivity)}
#'}
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso \code{\link{doMaxCliques}} for step 2 of the TiMEx procedure;
#' \code{\link{findSignifCliques}} for step 3 of the TiMEx procedure;
#' the wrapper function \code{\link{TiMEx}} for combining these three steps,
#' and identifying mutually exclusive groups in a binary dataset with the
#' TiMEx model.
#'
#' @examples
#' # Test all pairs from the ovarian dataset for mutual exclusivity (takes
#' # approximately 5 minutes)
#' data(ovarian)
#' \donttest{ovarianPairs<-analyzePairs(ovarian)}
#'
#' @import stats
#'
#' @aliases analyzePairs
#'
#' @export
analyzePairs <- function(mat) {
# check inputs
if (missing(mat))
stop("need to specify a binary matrix as input")
if (is.null(dim(mat)))
stop("input needs to be a binary matrix")
if ((all.equal(c(0, 1), sort(unique(as.vector(mat)))) != 1))
stop("input needs to be a binary matrix")
noGenes <- dim(mat)[2]
if (is.null(colnames(mat)))
colnames(mat) <- paste("gene", c(1:noGenes), sep = "")
muEst <- matrix(NA, nrow = noGenes, ncol = noGenes)
colnames(muEst) <- colnames(mat)
rownames(muEst) <- colnames(mat)
lamiEstME <- matrix(NA, nrow = noGenes, ncol = noGenes)
colnames(lamiEstME) <- colnames(mat)
rownames(lamiEstME) <- colnames(mat)
lamjEstME <- matrix(NA, nrow = noGenes, ncol = noGenes)
colnames(lamjEstME) <- colnames(mat)
rownames(lamjEstME) <- colnames(mat)
lamiEstNull <- matrix(NA, nrow = noGenes, ncol = noGenes)
colnames(lamiEstNull) <- colnames(mat)
rownames(lamiEstNull) <- colnames(mat)
lamjEstNull <- matrix(NA, nrow = noGenes, ncol = noGenes)
colnames(lamjEstNull) <- colnames(mat)
rownames(lamjEstNull) <- colnames(mat)
likeNull <- matrix(NA, nrow = noGenes, ncol = noGenes)
colnames(likeNull) <- colnames(mat)
rownames(likeNull) <- colnames(mat)
likeAlt <- matrix(NA, nrow = noGenes, ncol = noGenes)
colnames(likeAlt) <- colnames(mat)
rownames(likeAlt) <- colnames(mat)
LRT <- matrix(NA, nrow = noGenes, ncol = noGenes)
colnames(LRT) <- colnames(mat)
rownames(LRT) <- colnames(mat)
pvalueLRT <- matrix(NA, nrow = noGenes, ncol = noGenes)
colnames(pvalueLRT) <- colnames(mat)
rownames(pvalueLRT) <- colnames(mat)
for (gene1 in 1:(noGenes - 1)) {
print(paste("gene", gene1))
for (gene2 in (gene1 + 1):noGenes) {
genes <- mat[, c(gene1, gene2)] #the two genes to be tested
ns <- computeContTable(genes) #their contingency table
nsVect <- c(ns$n00, ns$n01, ns$n10, ns$n11)
# estimate parameters under ME try(opMu<-optim(par=c(0.01,0.01,0.01),fn=computeParmsMu,gr=NULL,
# n=nsVect,method='L-BFGS-B',lower=c(1e-15,1e-15,0),
# upper=c(Inf,Inf,1-(1e-15)),control=list(fnscale=-20,maxit=10000))) while (!exists('opMu')) {
# try(opMu<-optim(par=c(0.01+abs(rnorm(1)),0.01+abs(rnorm(1)),
# min(0.0001+abs(rnorm(1)),0.999)),fn=computeParmsMu,gr=NULL,
# n=nsVect,method='L-BFGS-B',lower=c(1e-15,1e-15,0),
# upper=c(Inf,Inf,1-(1e-15)),control=list(fnscale=-20,maxit=10000), silent=TRUE)) }
opMu <- optim(par = c(0.01, 0.01, 0.01), fn = computeParmsMu, gr = NULL, n = nsVect, method = "L-BFGS-B",
lower = c(1e-15, 1e-15, 0), upper = c(Inf, Inf, 1 - (1e-15)), control = list(fnscale = -20, maxit = 10000))
# estimate parameters under Null try(opNull<-optim(par=c(0.01,0.01),fn=computeParmsNull,gr=NULL,
# n=nsVect,method='L-BFGS-B',lower=c(1e-15,1e-15), upper=c(Inf,Inf),control=list(fnscale=-20,maxit=10000)))
# while (!exists('opNull')) { try(opNull<-optim(par=c(0.01+abs(rnorm(1)),0.01+
# abs(rnorm(1))),fn=computeParmsNull,gr=NULL,n=nsVect, method='L-BFGS-B',lower=c(1e-15,1e-15),
# upper=c(Inf,Inf),control=list(fnscale=-20,maxit=10000), silent=TRUE)) }
opNull <- optim(par = c(0.01, 0.01), fn = computeParmsNull, gr = NULL, n = nsVect, method = "L-BFGS-B",
lower = c(1e-15, 1e-15), upper = c(Inf, Inf), control = list(fnscale = -20, maxit = 10000))
lamiEstME[gene1, gene2] <- opMu$par[1]
lamiEstME[gene2, gene1] <- opMu$par[2]
lamjEstME[gene1, gene2] <- opMu$par[2]
lamjEstME[gene2, gene1] <- opMu$par[1]
muEst[gene1, gene2] <- opMu$par[3]
lamiEstNull[gene1, gene2] <- opNull$par[1]
lamiEstNull[gene2, gene1] <- opNull$par[2]
lamjEstNull[gene1, gene2] <- opNull$par[2]
lamjEstNull[gene2, gene1] <- opNull$par[1]
# compute likelihoods
likeNull[gene1, gene2] <- computeParmsNull(c(lamiEstNull[gene1, gene2], lamjEstNull[gene1, gene2]),
nsVect)
likeAlt[gene1, gene2] <- computeParmsMu(c(lamiEstME[gene1, gene2], lamjEstME[gene1, gene2], muEst[gene1,
gene2]), nsVect)
# perform the LRT
LRT[gene1, gene2] <- 2 * likeAlt[gene1, gene2] - 2 * likeNull[gene1, gene2]
pvalueLRT[gene1, gene2] <- pchisq(LRT[gene1, gene2], df = 1, ncp = 0, lower.tail = FALSE)
}
}
# correct the pvalues
pvalueLRTCorrect <- list()
pvalueLRTCorrect$bonferroni <- matrix(p.adjust(as.vector(pvalueLRT), method = "bonferroni"), nrow = noGenes,
ncol = noGenes)
colnames(pvalueLRTCorrect$bonferroni) <- colnames(pvalueLRT)
rownames(pvalueLRTCorrect$bonferroni) <- rownames(pvalueLRT)
pvalueLRTCorrect$fdr <- matrix(p.adjust(as.vector(pvalueLRT), method = "fdr"), nrow = noGenes, ncol = noGenes)
colnames(pvalueLRTCorrect$fdr) <- colnames(pvalueLRT)
rownames(pvalueLRTCorrect$fdr) <- rownames(pvalueLRT)
pvalueLRTCorrect$uncorrected <- pvalueLRT
# transform to symmetric matrices
muEstSym <- makeSym(muEst)
likeNull <- makeSym(likeNull)
likeAlt <- makeSym(likeAlt)
LRT <- makeSym(LRT)
pvalueLRTSym <- makeSym(pvalueLRT)
pvalueLRTCorrectSym <- list()
pvalueLRTCorrectSym$bonferroni <- makeSym(pvalueLRTCorrect$bonferroni)
pvalueLRTCorrectSym$fdr <- makeSym(pvalueLRTCorrect$fdr)
pvalueLRTCorrectSym$uncorrected <- makeSym(pvalueLRTCorrect$uncorrected)
results <- list(lamiEstNull = lamiEstNull, lamjEstNull = lamjEstNull, lamiEstME = lamiEstME, lamjEstME = lamjEstME,
likeNull = likeNull, likeAlt = likeAlt, LRT = LRT, pvalueLRTCorrectSym = pvalueLRTCorrectSym, muEstSym = muEstSym)
return(results)
}
###############################################################################
#' @title Identifies maximal cliques from pairwise testing information
#'
#' @description \code{doMaxCliques} performs step 2 of the TiMEx procedure:
#' identifies maximal cliques using information from pairwise testing. The
#' maximal clique detection routine only uses the connections between gene
#' pairs which satisfy the thresholds on mu (\code{pairMu}) and pvalue
#' (\code{pairPvalue}).
#'
#' @param pairs list resulting after pairwise testing, as returned by
#' \code{\link{analyzePairs}}
#' @param pairMu pair-level threshold on mu (real number between 0 and 1).
#' Default is 0.5.
#' @param pairPvalue pair-level threshold on p-value (real number between 0 and
#' 1). Default is 0.01.
#'
#' @details In the second step, the TiMEx procedure for identifying mutually
#' exclusive groups of alterations detects maximal cliques using pairwise
#' testing information from step 1. A graph is constructed, in which genes
#' are vertices, and an edge is drawn between any pair \emph{(i,j)} if both the
#' estimated intensity of mutual exclusivity and the computed p-value satisfy
#' the chosen thresholds \code{pairMu} and \code{pairPvalue}. Maximal
#' cliques are detected on this graph.
#'
#' The two thresholds can be set by the
#' user, and are recommended to be chosen based on the sensitivity and
#' specificity levels to which they correspond, as assessed in simulated data.
#' For details, see 'TiMEx: A Waiting Time Model For Mutually Exclusive Cancer
#' Alterations', by Constantinescu \emph{et al.} (2015). The default values are
#' 0.5 for \code{pairMu} and 0.01 for \code{pairPvalue}.
#'
#' This function needs functions from the packages \emph{RBGL} and
#' \emph{igraph} to run.
#'
#' @return list consisting of:
#' \itemize{
#' \item{\code{detectedLengths}} {vector of lengths of the identified maximal
#' cliques.}
#' \item{\code{idxInCliques}} {list with as many elements as lengths of
#' the identified maximal cliques. Each element of the list is a matrix, in
#' which each row represents the indices of genes in an identified maximal
#' clique.}
#' \item{\code{genesInCliques}} {list with as many elements as lengths of
#' the identified maximal cliques. Each element of the list is a matrix, in
#' which each row represents the names of genes in an identified maximal
#' clique.}
#' \item{\code{noMaxCliques}} {vector of numbers of identified maximal cliques
#' corresponding to each length present in the field \code{detectedLengths}.}
#' \item{\code{Mus}} {list of two elements: \code{OrderedGenesInCliques} and
#' \code{OrderedIdxInCliques}, which have the same structure as the elements
#' \code{genesInCliques} and \code{idxInCliques}. The only difference is that
#' the identified maximal cliques are now ordered by their averge pairwise
#' mu.}
#' \item \code{pairMu} input pair-level threshold on mu.
#' \item \code{pairPvalue} input pair-level threshold on p-value.
#' }
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso \code{\link{analyzePairs}} for step 1 of the TiMEx procedure;
#' \code{\link{findSignifCliques}} for step 3 of the TiMEx procedure;
#' the wrapper function \code{\link{TiMEx}} for combining these three steps,
#' and identifying mutually exclusive groups in a binary dataset with the
#' TiMEx model.
#'
#' @examples
#' # First, test all pairs from the ovarian cancer dataset for mutual
#' # exclusivity (take approximately 5 minutes)
#' data(ovarian)
#' \donttest{ovarianPairs<-analyzePairs(ovarian)}
#'
#' # Then, identify all maximal cliques using the default thresholds
#' \donttest{ovarianMaxCliques<-doMaxCliques(ovarianPairs)}
#'
#' @importFrom igraph graph.adjacency
#' @importFrom igraph igraph.to.graphNEL
#' @import RBGL
#'
#' @aliases doMaxCliques
#'
#' @export
doMaxCliques <- function(pairs, pairMu, pairPvalue) {
# check inputs
if (missing(pairs))
stop("a list containing pairwise testing information needs to be
specified")
if (!class(pairs) == "list")
stop("a list containing pairwise testing information needs to be
specified")
if (!length(setdiff(c("muEstSym", "pvalueLRTCorrectSym"), names(pairs))) == 0)
stop("the input list 'pairs' should have contained a field with
pairwise information on mu and a field with pairwise information
on p-value")
if (class(pairs$pvalueLRTCorrectSym) != "list")
stop("the field 'pvalueLRTCorrectSym' of the input list should have
an element named 'uncorrected'")
if (length(setdiff(c("uncorrected"), names(pairs$pvalueLRTCorrectSym)) == 0))
stop("the field 'pvalueLRTCorrectSym' of the input list should have
an element named 'uncorrected'")
if (!length(unique(c(dim(pairs$muEstSym), dim(pairs$pvalueLRTCorrectSym$uncorrected)))) == 1)
stop("the matrices with pvalue and mu information should have the
same dimension and be squared")
if (is.null(colnames(pairs$pvalueLRTCorrectSym$uncorrected)))
colnames(pairs$pvalueLRTCorrectSym$uncorrected) <- paste("gene", c(1:dim(pairs$pvalueLRTCorrectSym$uncorrected)[1]),
sep = "")
if (is.null(rownames(pairs$pvalueLRTCorrectSym$uncorrected)))
rownames(pairs$pvalueLRTCorrectSym$uncorrected) <- paste("gene", c(1:dim(pairs$pvalueLRTCorrectSym$uncorrected)[1]),
sep = "")
if (is.null(colnames(pairs$muEstSym)))
colnames(pairs$muEstSym) <- paste("gene", c(1:dim(pairs$muEstSym)[1]), sep = "")
if (is.null(rownames(pairs$muEstSym)))
rownames(pairs$muEstSym) <- paste("gene", c(1:dim(pairs$muEstSym)[1]), sep = "")
if (missing(pairMu))
pairMu <- 0.5
if (!(class(pairMu) == "numeric"))
stop("the threshold on mu needs to be a real number between 0 and 1")
if (length(pairMu) > 1)
stop("the threshold on mu needs to be a real number between 0 and 1")
if (!(0 <= pairMu && pairMu <= 1))
stop("the threshold on mu needs to be a real number between 0 and 1")
if (missing(pairPvalue))
pairPvalue <- 0.01
if (!(class(pairPvalue) == "numeric"))
stop("the threshold on p-value needs to be a real number between 0
and 1")
if (length(pairPvalue) > 1)
stop("the threshold on p-value needs to be a real number between 0
and 1")
if (!(0 <= pairPvalue && pairPvalue <= 1))
stop("the threshold on p-value needs to be a real number between 0
and 1")
cliques <- doClique(pairs$muEstSym, pairs$pvalueLRTCorrectSym$uncorrected, pairMu, pairPvalue)
mcStruct <- analyzeMaxCliquesBySize(cliques)
mcStruct$Mus <- getMusAllMaxCliques(mcStruct, pairs$muEstSym)
# do not return these fields anymore (redundant or unnecessary)
mcStruct$Mus$detectedLengths <- NULL
mcStruct$Mus$idxInCliques <- NULL
mcStruct$Mus$genesInCliques <- NULL
mcStruct$Mus$noMaxCliques <- NULL
mcStruct$pairMu <- pairMu
mcStruct$pairPvalue <- pairPvalue
return(mcStruct)
}
###############################################################################
#' @title Tests whether a given group is mutually exclusive
#'
#' @description \code{testCliqueAsGroup} tests whether a group, given as gene
#' indices, is mutually exclusive.
#'
#' @param geneIdx vector of indices in the input matrix of the genes to be
#' tested
#' @param mat binary alteration matrix, with rows representing patients and
#' columns representing genes
#' @param lo rate of observation time. Default is 1.
#'
#' @details For deciding whether a group is mutually exclusive, the group of
#' genes is fitted to both the Null (Conditional Independence) and the Mutual
#' Exclusivity models. Parameters under the two models are estimated, and,
#' since they are nested, a likelihood ratio test is performed between the
#' corresponding log likelihoods, in order to test whether mu (the intensity of
#' mutual exclusivity) is different from 0. For computing the likelihood of
#' the data under both models, an exhaustive enumeration of all possible
#' orders of the input alterations needs to be performed. Therefore, the
#' complexity of the test is exponential in the number of genes to be tested,
#' which makes it unfeasible for large number of genes (usually more than 10).
#'
#' \code{lo} (the rate of observation time) is by default set to 1, as both
#' models are otherwise unindentifiable. We recommend leaving the value of this
#' parameter unchanged, as otherwise the estimated waiting time rates of the
#' genes require additional interpretation.
#'
#' For more details on the TiMEx procedure, as well as on the underlying
#' mathematical model, see Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @return List consisting of:
#' \itemize{
#' \item{\code{opMu}} {list as returned by \code{\link[stats]{optim}}. The
#' field \code{par} is a vector of \code{n+1} positions (\code{n} being the
#' number of genes) containing the estimates of the waiting time rates
#' (lambda) for the \code{n} genes under the mutual exclusivity model,
#' followed by the estimate for mu.}
#' \item{\code{opNull}} {list as returned by \code{\link[stats]{optim}}. The
#' field \code{par} is a vector of \code{n} positions (\code{n} being the
#' number of genes) containing the estimates of the waiting time rates
#' (lambda) for the \code{n} genes under the null model.}
#' \item{\code{countsVec}} {the contingency table of the input genes, as a
#' vector. The first element is the count of the samples where no gene was
#' altered, the next \code{n} elements are the counts of the samples where
#' exactly one gene was altered (starting with gene 1 and ending with gene
#' \code{n}), the next \code{n} elements are the counts of the samples where
#' exactly two genes were altered (starting with genes 1 and 2, continuing with
#' genes 1 and 3, and ending with genes \code{n-1} and \code{n}), and so on.}
#' \item{\code{genes}} {subset of the input binary matrix, corresponding to the
#' genes to be tested}
#' \item{\code{LRT}} {log likelihood ratio (LRT)}
#' \item{\code{pvalueLRT}} {the p-value corresponding to the LRT}
#' }
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso the wrapper function \code{\link{TiMEx}} for identifying
#' mutually exclusive groups in a binary dataset with the TiMEx model.
#'
#' @examples
#' # Tests for mutual exclusivity the group of genes with indices 13, 204, and
#' # 310 in the ovarian cancer dataset
#' data(ovarian)
#' testGroup<-testCliqueAsGroup(c(13,204,310),ovarian)
#'
#' @import stats
#'
#' @aliases testCliquesAsGroup
#'
#' @export
testCliqueAsGroup <- function(geneIdx, mat, lo) {
# check inputs
if (missing(geneIdx))
stop("need to specify a vector of gene indices to be tested for
mutual exclusivity")
if (!(class(geneIdx) %in% c("numeric", "integer")))
stop("the gene indices need to be a numeric vector")
if (!sum(geneIdx%%1) == 0)
stop("the gene indices need to be positive integers")
if (any(geneIdx <= 0))
stop("the gene indices need to be positive integers")
if (length(unique(geneIdx)) != length(geneIdx))
stop("the gene indices need to be different")
if (missing(mat))
stop("need to specify a binary matrix as input")
if (is.null(dim(mat)))
stop("the input 'mat' needs to be a binary matrix")
if ((all.equal(c(0, 1), sort(unique(as.vector(mat)))) != 1))
stop("the input 'mat' needs to be a binary matrix")
if (any(geneIdx > dim(mat)[2]))
stop("the gene indices need to be lower than the maximum number of
input genes")
if (missing(lo))
lo <- 1
if (lo != 1)
warning("you are changing the default value of lo")
genes <- mat[, geneIdx]
n <- length(geneIdx)
l <- list()
for (i in 1:n) {
l[[i]] <- seq(0, 1)
}
allGenos <- as.matrix(expand.grid(l))
tabNonZero <- table(unlist(sapply(c(1:dim(genes)[1]), function(y, genes, allGenos) {
which(apply(allGenos, 1, function(x, y) {
w <- genes[y, ]
names(w) <- NULL
all.equal(as.vector(x), w)
}, y = y) == "TRUE")
}, genes = genes, allGenos = allGenos)))
countsVec <- rep(0, dim(allGenos)[1])
countsVec[as.numeric(names(tabNonZero))] <- as.vector(tabNonZero)
opMu <- optim(par = c(rep(0.1, (n + 1))), fn = optimizeParamsMuGroup, gr = NULL, countsVec = countsVec, n = n,
lo = lo, model = "ME", method = "L-BFGS-B", lower = c(rep(1e-10, n), 0), upper = c(rep(Inf, n), (1 - (1e-15))),
control = list(fnscale = -20))
opNull <- optim(par = c(rep(0.1, n)), fn = optimizeParamsMuGroup, gr = NULL, countsVec = countsVec, n = n,
lo = lo, model = "Null", method = "L-BFGS-B", lower = rep(1e-10, n), upper = rep(Inf, n), control = list(fnscale = -20))
likeMu <- opMu$value
likeNull <- opNull$value
LRT <- 2 * likeMu - 2 * likeNull
pvalueLRT <- pchisq(LRT, df = 1, ncp = 0, lower.tail = FALSE)
return(l = list(opMu = opMu, opNull = opNull, countsVec = countsVec, genes = genes, LRT = LRT, pvalueLRT = pvalueLRT))
}
###############################################################################
#' @title Tests all maximal cliques for mutual exclusivity
#'
#' @description \code{findSignifCliques} performs step 3 of the TiMEx
#' procedure, namely tests all candidate maximal cliques for mutual
#' exclusivity and reports the significant ones.
#'
#' @param mat binary alteration matrix, with rows representing patients and
#' columns representing genes
#' @param mcStruct list containing maximal cliques, as returned by
#' \code{\link{doMaxCliques}}
#' @param groupPvalue threshold for the corrected p-value of the groups, lower
#' than which cliques are significant (real number between 0 and 1). Default
#' is 0.1.
#'
#' @details This function displays progress messages, namely the size of the
#' clique currently being tested, and the number of cliques to test.
#'
#' Note that sequentially performing steps 1, 2, and 3 of the TiMEx procedure
#' (functions \code{\link{analyzePairs}}, \code{\link{doMaxCliques}}, and
#' \code{\link{findSignifCliques}}) is equivalent to simply running the
#' function \code{\link{TiMEx}}.
#'
#' @return list consisting of:
#' \itemize{
#' \item{\code{genesSignif}} {list of significantly mutually exclusive groups,
#' as gene names, sorted by corrected p-value. The list contains as many
#' elements as identified lengths of groups. For example,
#' \code{genesSignif[[2]]}
#' is a list containing the gene names of the significant groups of size 2.
#' Each list of this type further has two elements, \code{fdr} and
#' \code{bonf}, corresponding to different multiple testing correction
#' methods. Each element is a matrix, in which rows represent gene names of
#' significantly mutually exclusive groups.}
#'
#' \item{\code{idxSignif}} {list of significantly mutually exclusive groups, as
#' indices in the input matrix, sorted by corrected p-value. The list
#' contains as many elements as identified lengths of groups. For example,
#' \code{idxSignif[[2]]} is a list containing the indices of the
#' significant groups of size 2. Each list of this type further has two
#' elements, \code{fdr} and \code{bonf}, corresponding to different multiple
#' testing correction methods. Each element is a matrix, in which rows
#' represent indices of significantly mutually exclusive groups.}
#'
#' \item{\code{pvals}} {list of corrected significant p-values corresponding to
#' the tested cliques, ordered ascendingly. The list contains as many elements
#' as identified lengths of significant groups. For example, \code{pvals[[2]]}
#' is a list containing the p-values of the significant maximal cliques of
#' size 2. Each list of this type further has two elements, \code{fdr} and
#' \code{bonf}, corresponding to different multiple testing correction
#' methods. Each element is a vector, of length the number of significant
#' maximal cliques of a given size.}
#'
#' \item{\code{posSignif}} {list of positions of the significant groups in the
#' input list of maximal cliques, ordered ascendingly by corrected p-value.
#' The list contains as many elements as identified lengths of significant
#' groups. For example, \code{posSignif[[2]]} is a list containing the
#' positions of the significant groups of size 2. Each list of this type
#' further has two elements, \code{fdr} and \code{bonf}, corresponding to
#' different multiple correction methods. Each element is a vector, of length
#' the number of significant maximal cliques of a given size.}
#'
#' \item{\code{MusGroup}} {list of inferred mu values corresponding to
#' the tested cliques, ordered ascendingly by the corresponding corrected
#' p-value. The list contains as many elements as identified lengths of
#' significant groups. For example, \code{MusGroup[[2]]} is a list containing
#' the mu values of the significant maximal cliques of size 2. Each list of
#' this type further has two elements, \code{fdr} and \code{bonf},
#' corresponding to different multiple testing correction methods. Each
#' element is a vector, of length the number of significant maximal cliques
#' of a given size.}
#'
#' \item{\code{mcStruct}} {input structure of maximal cliques to be tested
#' for mutual exclusivity, as returned by \code{\link{doMaxCliques}}}
#'
#' \item{\code{matrix}} {input binary alteration matrix}
#'
#' \item{\code{groupPvalue}} {input threshold for the corrected p-value, lower
#' than which cliques are significant}
#' }
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso \code{\link{analyzePairs}} for step 1 of the TiMEx procedure;
#' \code{\link{doMaxCliques}} for step 2 of the TiMEx procedure;
#' the wrapper function \code{\link{TiMEx}} for combining these three steps,
#' and identifying mutually exclusive groups in a binary dataset with the
#' TiMEx model. The data structures \code{\link{ovarianOutput}},
#' \code{\link{breastOutput}}, \code{\link{gbmDendrixOutput}}, and
#' \code{\link{gbmMuexOutput}} are examples of structures resulting after
#' running TiMEx on large cancer datasets.
#'
#' @examples
#' # First, test all pairs from the ovarian dataset for mutual exclusivity
#' # (takes approximately 5 minutes).
#' data(ovarian)
#' \donttest{ovarianPairs<-analyzePairs(ovarian)}
#'
#' # Second, identify all maximal cliques using the default thresholds
#' \donttest{ovarianMaxCliques<-doMaxCliques(ovarianPairs)}
#'
#' # Then, test all maximal cliques for mutual exclusivity and report the
#' # significant ones, based on a corrected p-value threshold of 0.1 (default).
#' \donttest{ovarianMEgroups<-findSignifCliques(ovarian,ovarianMaxCliques)}
#'
#' @aliases findSignifCliques
#'
#' @export
findSignifCliques <- function(mat, mcStruct, groupPvalue) {
# check inputs
if (missing(mat))
stop("need to specify a binary matrix as input")
if (is.null(dim(mat)))
stop("need to specify a binary matrix as input")
if ((all.equal(c(0, 1), sort(unique(as.vector(mat)))) != 1))
stop("input needs to be a binary matrix")
if (is.null(colnames(mat)))
colnames(mat) <- paste("gene", c(1:dim(mat)[2]), sep = "")
if (missing(mcStruct))
stop("need to specify a list of identified maximal cliques as input")
if (length(setdiff(c("detectedLengths", "genesInCliques", "idxInCliques", "Mus", "noMaxCliques", "pairMu",
"pairPvalue"), names(mcStruct))) != 0)
stop("check the names of the elements of the input list")
if (!(class(mcStruct$detectedLengths) == "numeric" || class(mcStruct$detectedLengths) == "integer"))
stop("the elements of the field 'detectedLengths' of the input list
need to be numeric")
if (!sum(mcStruct$detectedLengths%%1) == 0)
stop("the elements of the field 'detectedLengths' of the input list
need to be positive integers")
if (any(mcStruct$detectedLengths <= 0))
stop("the elements of the field 'detectedLengths' of the input list
need to be positive integers")
# if (!(length(mcStruct$idxInCliques)==length(mcStruct$detectedLengths))) stop('the fields of the input list
# detectedLengths and idxInCliques do not match up in size')
if (sum(is.na(match(unlist(mcStruct$idxInCliques), c(1:dim(mat)[2])))))
stop("at least one index provided in the element 'idxInCliques' of
the input list is not part of the indices of the input genes")
# if (!(length(mcStruct$genesInCliques)==length(mcStruct$detectedLengths))) stop('the fields of the input list
# detectedLengths and genesInCliques do not match up in size')
if (sum(is.na(match(unlist(mcStruct$genesInCliques), colnames(mat)))) > 0)
stop("at least one gene name provided in the element 'genesInCliques'
of the input list is not part of the input genes")
if (length(setdiff(c("OrderedIdxInCliques", "OrderedGenesInCliques"), names(mcStruct$Mus))) != 0)
stop("at least one of the fields 'OrderedIdxInCliques'
and 'OrderedGenesInCliques' of the element 'Mus' of the input
list is missing")
if (sum(is.na(match(unlist(mcStruct$Mus$OrderedIdxInCliques), c(1:dim(mat)[2])))))
stop("at least one index provided in the element 'idxInCliques'
of the element 'Mus' of the input list is not part of the indices
of the input genes")
# if (!(length(mcStruct$Mus$OrderedIdxInCliques)== length(mcStruct$detectedLengths))) stop('the fields of the
# input list detectedLengths and Mus$OrderedIdxInCliques do not match up in size')
if (sum(is.na(match(unlist(mcStruct$Mus$OrderedGenesInCliques), colnames(mat)))) > 0)
stop("at least one gene name provided in the element
'OrderedGenesInCliques' of the element 'Mus' of the input list
is not part of the input genes")
# if (!(length(mcStruct$Mus$OrderedGenesInCliques)== length(mcStruct$detectedLengths))) stop('the fields of
# the input list detectedLengths and Mus$OrderedGenesInCliques do not match up in size')
if (!(class(mcStruct$noMaxCliques) == "numeric" || class(mcStruct$noMaxCliques) == "integer"))
stop("the field 'noMaxCliques' of the input list needs to be numeric")
if (!sum(mcStruct$noMaxCliques%%1) == 0)
stop("the elements of the field 'noMaxCliques' of the input
list need to be positive integers")
if (any(mcStruct$noMaxCliques <= 0))
stop("the elements of the field 'noMaxCliques' of the input
list need to be positive integers")
if (!(class(mcStruct$pairMu) == "numeric"))
stop("the field 'pairMu' of the input list should have been a real
number between 0 and 1")
if (length(mcStruct$pairMu) > 1)
stop("the field 'pairMu' of the input list should have been a real
number between 0 and 1")
if (!(0 <= mcStruct$pairMu && mcStruct$pairMu <= 1))
stop("the field 'pairMu' of the input list should have been a real
number between 0 and 1")
if (!(class(mcStruct$pairPvalue) == "numeric"))
stop("the field 'pairPvalue' of the input list should have been a real
number between 0 and 1")
if (length(mcStruct$pairPvalue) > 1)
stop("the field 'pairPvalue' of the input list should have been a real
number between 0 and 1")
if (!(0 <= mcStruct$pairPvalue && mcStruct$pairPvalue <= 1))
stop("the field 'pairPvalue' should have been a real
number between 0 and 1")
if (missing(groupPvalue))
groupPvalue <- 0.1
if (!(class(groupPvalue) == "numeric"))
stop("the level of the corrected p value has to be a real number
between 0 and 1")
if (length(groupPvalue) > 1)
stop("the level of the corrected p value has to be a real number
between 0 and 1")
if (!(0 <= groupPvalue && groupPvalue <= 1))
stop("the level of the corrected p value needs to be a real number
between 0 and 1")
testedCand <- testAllCandidateGroups(mcStruct, mat)
signifCliques <- filterSignifCliques(mcStruct, testedCand, groupPvalue)
signifCliques$mcStruct <- mcStruct
signifCliques$matrix <- mat
signifCliques$groupPvalue <- groupPvalue
signifCliques$uncorrectedPvalues <- testedCand$pvalueLRTMu
return(signifCliques)
}
###############################################################################
#' @title Finds mutually exclusive groups
#'
#' @description \code{TiMEx} is the main function of this package. It
#' identifies all groups of mutually exclusive cancer alterations in a large
#' binary input dataset.
#'
#' @param mat binary alteration matrix, with rows representing patients and
#' columns representing genes
#' @param pairMu pair-level threshold on mu (real number between 0 and 1).
#' Default is 0.5.
#' @param pairPvalue pair-level threshold on p-value (real number between 0
#' and 1). Default is 0.01.
#' @param groupPvalue threshold for the corrected p-value, lower than which
#' cliques are significant. Default to 0.1
#'
#' @details Dependning on the size of the dataset (both in terms of samples
#' and alterations), TiMEx can require a reasonable time to run. For example,
#' the approximate running time is 10 minutes for the
#' \code{\link{ovarian}} cancer dataset, and 45 minutes for the
#' \code{\link{breast}} cancer dataset included in this package, on a
#' personal computer.
#'
#' \code{TiMEx} displays progress messages. In a fist step, it indicates
#' the gene which is currently being tested against the remaining genes. In a
#' later step, it indicates the size of the clique currently being tested,
#' and the number of cliques to test.
#'
#' @return list consisting of:
#' \itemize{
#' \item{\code{genesSignif}} {list of significantly mutually exclusive groups,
#' as gene names, sorted by corrected p-value. The list contains as many
#' elements as identified lengths of groups. For example,
#' \code{genesSignif[[2]]} is a list containing the gene names of the
#' significant groups of size 2. Each list of this type further has two
#' elements, \code{fdr} and \code{bonf}, corresponding to different multiple
#' testing correction methods. Each element is a matrix, in which rows
#' represent gene names of significantly mutually exclusive groups.}
#'
#' \item{\code{idxSignif}} {list of significantly mutually exclusive groups,
#' as indices in the input matrix, sorted by corrected p-value. The list
#' contains as many elements as identified lengths of groups. For example,
#' \code{idxSignif[[2]]} is a list containing the indices of the
#' significant groups of size 2. Each list of this type further has two
#' elements, \code{fdr} and \code{bonf}, corresponding to different multiple
#' testing correction methods. Each element is a matrix, in which rows
#' represent indices of significantly mutually exclusive groups.}
#'
#' \item{\code{pvals}} {list of corrected significant p-values corresponding to
#' the tested cliques, ordered ascendingly. The list contains as many elements
#' as identified lengths of significant groups. For example, \code{pvals[[2]]}
#' is a list containing the p-values of the significant maximal cliques of
#' size 2. Each list of this type further has two elements, \code{fdr} and
#' \code{bonf}, corresponding to different multiple testing correction
#' methods. Each element is a vector, of length the number of significant
#' maximal cliques of a given size.}
#'
#' \item{\code{posSignif}} {list of positions of the significant groups in the
#' input list of maximal cliques, ordered ascendingly by corrected p-value.
#' The list contains as many elements as identified lengths of significant
#' groups. For example, \code{posSignif[[2]]} is a list containing the
#' positions of the significant groups of size 2. Each list of this type
#' further has two elements, \code{fdr} and \code{bonf}, corresponding to
#' different multiple correction methods. Each element is a vector, of
#' length the number of significant maximal cliques of a given size.}
#'
#' \item{\code{MusGroup}} {list of inferred mu values corresponding to
#' the tested cliques, ordered ascendingly by the corresponding corrected
#' p-value. The list contains as many elements as identified lengths of
#' significant groups. For example, \code{MusGroup[[2]]} is a list containing
#' the mu values of the significant maximal cliques of size 2. Each list of
#' this type further has two elements, \code{fdr} and \code{bonf},
#' corresponding to different multiple testing correction methods. Each
#' element is a vector, of length the number of significant maximal cliques of
#' a given size.}
#'
#' \item{\code{mcStruct}} {input structure of maximal cliques to be tested
#' for mutual exclusivity, as returned by \code{\link{doMaxCliques}}.}
#'
#' \item{\code{matrix}} {input binary alteration matrix.}
#'
#' \item{\code{groupPvalue}} {input threshold for the corrected p-value, lower
#' than which cliques are significant.}
#' }
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso \code{\link{analyzePairs}} for step 1 of the TiMEx procedure;
#' \code{\link{doMaxCliques}} for step 2 of the TiMEx procedure, and
#' \code{\link{findSignifCliques}} for step 3 of the TiMEx procedure. The data
#' structures \code{\link{ovarianOutput}}, \code{\link{breastOutput}},
#' \code{\link{gbmDendrixOutput}}, and \code{\link{gbmMuexOutput}}
#' are examples of structures resulting after running TiMEx on large cancer
#' datasets.
#'
#' @examples
#' # Run TiMEx on the ovarian cancer dataset with default parameters
#' # (takes approximately 10 minutes)
#' data(ovarian)
#' \donttest{ovarianMEGroups<-TiMEx(ovarian)}
#'
#' @aliases TiMEx
#'
#' @export
TiMEx <- function(mat, pairMu, pairPvalue, groupPvalue) {
# check inputs
if (missing(mat))
stop("need to specify a binary matrix as input")
if (is.null(dim(mat)))
stop("input needs to be a binary matrix")
if ((all.equal(c(0, 1), sort(unique(as.vector(mat)))) != 1))
stop("input needs to be a binary matrix")
if (missing(pairMu))
pairMu <- 0.5
if (!(class(pairMu) == "numeric"))
stop("the threshold on mu needs to be a real number between 0 and 1")
if (length(pairMu) > 1)
stop("the threshold on mu needs to be a real number between 0 and 1")
if (!(0 <= pairMu && pairMu <= 1))
stop("the threshold on mu needs to be a real number between 0 and 1")
if (missing(pairPvalue))
pairPvalue <- 0.01
if (!(class(pairPvalue) == "numeric"))
stop("the threshold on p-value needs to be a real number between
0 and 1")
if (length(pairPvalue) > 1)
stop("the threshold on p-value needs to be a real number between
0 and 1")
if (!(0 <= pairPvalue && pairPvalue <= 1))
stop("the threshold on p-value needs to be a real number between
0 and 1")
if (missing(groupPvalue))
groupPvalue <- 0.1
if (length(groupPvalue) > 1)
stop("the level of the corrected p v-alue has to be a real
number between 0 and 1")
if (!(class(groupPvalue) == "numeric"))
stop("the level of the corrected p v-alue has to be a real
number between 0 and 1")
if (!(0 <= groupPvalue && groupPvalue <= 1))
stop("the level of the corrected p v-alue needs to be a real
number between 0 and 1")
pairs <- analyzePairs(mat)
mcStruct <- doMaxCliques(pairs, pairMu, pairPvalue)
signifCliques <- findSignifCliques(mat, mcStruct, groupPvalue)
return(signifCliques)
}
###############################################################################
#' @title Creates metagroups of genes
#'
#' @description \code{doMetagene} collapses genes with identical alteration
#' patterns across patients into metagroups. It returns a new matrix with
#' the collapsed genes, as well as the members of the metagroups.
#'
#' @param mat binary alteration matrix, with rows representing patients and
#' columns representing genes
#'
#' @details It is recommended to run this function on the input binary matrix
#' before applying TiMEx, because genes with identical alteration patterns
#' across patients will otherwise be indistinguishable.
#'
#' Note that in the datasets provided in this package, the genes
#' have already been collapsed into metagroups.
#'
#' @return List consisting of:
#' \itemize{
#' \item {\code{newMat}} {the collapsed input binary matrix, with metagenes
#' instead of genes.}
#' \item {\code{groups}} {list of metagenes, with as many elements as input
#' genes which had an identical alteration pattern with at least one other
#' input gene.}
#' }
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso \code{\link{ovarianGroups}}, \code{\link{breastGroups}}
#' for examples of metagroups in large cancer datasets.
#'
#' @examples
#' # Simulate genes and extract groups
#' simGenes<-simulateGenes(c(0.5,1,0.3),0.8,4000)
#' genes<-cbind(simGenes$genes,simGenes$genes)
#' genesNew<-doMetagene(genes)
#'
#' # In the datasets provided in this package, the genes have already been
#' # collapsed into metagroups, hence the new matrix will be identical to the
#' # old one.
#' data(ovarian)
#' \donttest{ovarianNew<-doMetagene(ovarian)}
#'
#' @aliases doMetagene
#'
#' @export
doMetagene <- function(mat) {
if (missing(mat))
stop("need to specify a binary matrix as input")
if (is.null(dim(mat)))
stop("input needs to be a binary matrix")
if ((all.equal(c(0, 1), sort(unique(as.vector(mat)))) != 1))
stop("input needs to be a binary matrix")
table.combos <- mat
matcRow <- function(mine, table.combos) {
row.is.a.match <- apply(table.combos, 2, identical, mine)
match.idx <- which(row.is.a.match)
return(match.idx)
}
ll <- apply(mat, 2, matcRow, table.combos = mat)
if (class(ll) == "matrix") {
ll <- split(ll, rep(1:ncol(ll), each = nrow(ll)))
}
# list with the groups of metagenes
groups <- ll[which(unlist(lapply(ll, length)) != 1)]
# the new alteration matrix with the identical genes collapsed into metagroups
newMat <- mat[, which(duplicated(t(mat)) == 0)]
return(result = list(newMat = newMat, groups = groups))
}
###############################################################################
#' @title Removes alterations based on frequency
#'
#' @description \code{removeLowFreqs} returns a binary matrix from which genes
#' altered with lower frequency than the input level are removed.
#'
#' @param mat binary alteration matrix, with rows representing patients and
#' columns representing genes
#' @param level frequency level under which the genes are to be removed
#' (real number between 0 and 1). Default is 0.03.
#'
#' @details It is only recommended to run this function if the user is not at
#' all interested in the role of low-frequently altered genes.
#'
#' @return the binary input matrix without the low-frequently altered genes.
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso \code{\link{ovarian}}, \code{\link{breast}},
#' and \code{\link{gbmDendrix}} for examples of biological large cancer
#' datasets.
#'
#' @examples
#' # Remove genes altered in less than 3% (the default level) of the samples
#' # in the ovarian cancer dataset
#' data(ovarian)
#' ovarianNew<-removeLowFreqs(ovarian)
#'
#' @aliases removeLowFreqs
#'
#' @export
removeLowFreqs <- function(mat, level) {
# check inputs
if (missing(mat))
stop("need to specify a binary matrix as input")
if (is.null(dim(mat)))
stop("input needs to be a binary matrix")
if ((all.equal(c(0, 1), sort(unique(as.vector(mat)))) != 1))
stop("input needs to be a binary matrix")
if (missing(level))
level <- 0.03
if (!(class(level) == "numeric"))
stop("the level parameter needs to be a real number between 0 and 1")
if (length(level) > 1)
stop("the level parameter needs to be a real number between 0 and 1")
if (!(0 <= level && level <= 1))
stop("the level parameter needs to be a real number between 0 and 1")
nrpats <- dim(mat)[1]
freqs <- apply(mat, 2, sum)/nrpats
small <- which(freqs < level)
if (length(small) > 0) {
mat <- mat[, -small]
}
return(mat)
}
###############################################################################
#' @title Produces tables with groups
#'
#' @description \code{produceTablesSignifGroups} produces tables with the
#' significant groups. These tables include the names of the genes part of the
#' groups, their respective frequency in the dataset, and the mu and corrected
#' pvalue corresponding to each group.
#'
#' @param signifGroups result structure with the significant groups, as
#' returned by either \code{\link{TiMEx}} or \code{\link{findSignifCliques}}
#' @param mat binary alteration matrix, with rows representing patients and
#' columns representing genes
#' @param noToShow maximum number of groups to include in the table. Default
#' is 30.
#'
#' @details This function summarizes information on the mutually exclusive
#' groups identified by TiMEx in a dataset, as tables.
#'
#' @return list with as many elements as lengths of the identified mutually
#' exclusive groups, containing tables with the significant groups for each
#' size.
#' Each list of this type further has two elements, \code{fdr} and
#' \code{bonf}, corresponding to different multiple testing correction
#' methods. Each element is a matrix, in which rows represent significantly
#' mutually exclusive groups.
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso the wrapper function \code{\link{TiMEx}} for identifying
#' mutually exclusive groups in a binary dataset with the TiMEx model.
#'
#' @examples
#' # Produce tables on the output of TiMEx on the ovarian cancer dataset
#' data(ovarian)
#' data(ovarianOutput)
#' ovarianTables<-produceTablesSignifGroups(ovarianOutput,ovarian)
#'
#' @aliases produceTablesSignifGroups
#'
#' @export
produceTablesSignifGroups <- function(signifGroups, mat, noToShow) {
# check inputs
if (missing(signifGroups))
stop("need to specify a result structure as returned by function
TiMEx as input")
if (length(setdiff(c("idxSignif", "mcStruct", "genesSignif", "MusGroup", "pvals"), names(signifGroups))) !=
0)
stop("check that the input list contains the following elements:
'idxSignif', 'mcStruct', 'genesSignif', 'MusGroup', and 'pvals' ")
if (class(signifGroups$mcStruct) != "list")
stop("the element 'mcStruct' of the input list should have been a
list")
if (is.null(signifGroups$mcStruct$detectedLengths))
stop("the element 'mcStruct' of the input list should have contained
a field, 'detectedLengths'")
if (!(class(signifGroups$mcStruct$detectedLengths) == "numeric" || class(signifGroups$mcStruct$detectedLengths) ==
"integer"))
stop("the elements of the 'detectedLengths' field of
'mcStruct' in the input list should have been positive integers")
if (!sum(signifGroups$mcStruct$detectedLengths%%1) == 0)
stop("the elements of the 'detectedLengths' field of
'mcStruct' in the input list should have been positive integers")
if (any(signifGroups$mcStruct$detectedLengths <= 0))
stop("the elements of the 'detectedLengths' field of
'mcStruct' in the input list should have been positive integers")
if (is.null(signifGroups$mcStruct$noMaxCliques))
stop("the element 'mcStruct' of the input list should
have contained a field, 'noMaxCliques'")
if (!(class(signifGroups$mcStruct$noMaxCliques) == "numeric" || class(signifGroups$mcStruct$noMaxCliques) ==
"integer"))
stop("the elements of the 'noMaxCliques' field of
'mcStruct' in the input list should have been positive integers")
if (!sum(signifGroups$mcStruct$noMaxCliques%%1) == 0)
stop("the elements of the 'noMaxCliques' field of
'mcStruct' in the input list should have been positive integers")
if (any(signifGroups$mcStruct$noMaxCliques <= 0))
stop("the elements of the 'noMaxCliques' field of
'mcStruct' in the input list should have been positive integers")
if (length(signifGroups$mcStruct$noMaxCliques) != length(signifGroups$mcStruct$detectedLengths))
stop("the lengths of the fields 'detectedLengths' and
'noMaxCliques' of 'mcStruct' in the input list do not coincide")
if (class(signifGroups$idxSignif) != "list")
stop("the field 'idxSignif' of the input list
should have been a non-empty list")
if (length(signifGroups$idxSignif) == 0)
stop("the field 'idxSignif' of the input list
should have been a non-empty list")
for (i in 1:length(signifGroups$idxSignif)) if (!is.null(signifGroups$idxSignif[[i]])) {
if (length(setdiff(c("fdr", "bonf"), names(signifGroups$idxSignif[[i]]))) != 0)
stop("each element of the 'idxSignif' field of the input list
should have been a list with two elements,
'fdr' and 'bonf'")
}
if (class(signifGroups$genesSignif) != "list")
stop("the field 'genesSignif' of the input list should have been
a non-empty list")
if (length(signifGroups$genesSignif) == 0)
stop("the field 'genesSignif' of the input list should have been
a non-empty list")
for (i in 1:length(signifGroups$genesSignif)) if (!is.null(signifGroups$genesSignif[[i]])) {
if (length(setdiff(c("fdr", "bonf"), names(signifGroups$genesSignif[[i]]))) != 0)
stop("each element of the 'genesSignif' field of the
input list should have been a list with two elements,
'fdr' and 'bonf'")
}
if (class(signifGroups$pvals) != "list")
stop("the field 'pvals' of the input list should have been
a non-empty list")
if (length(signifGroups$pvals) == 0)
stop("the field 'pvals' of the input list should have been
a non-empty list")
for (i in 1:length(signifGroups$pvals)) if (!is.null(signifGroups$pvals[[i]])) {
if (length(setdiff(c("fdr", "bonf"), names(signifGroups$pvals[[i]]))) != 0)
stop("each element of the 'pvals' field of the input list
should have been a list with two elements,
'fdr' and 'bonf'")
}
if (class(signifGroups$MusGroup) != "list")
stop("the field 'MusGroup' of the input list should have been
a non-empty list")
if (length(signifGroups$MusGroup) == 0)
stop("the field 'MusGroup' of the input list should have been
a non-empty list")
for (i in 1:length(signifGroups$MusGroup)) if (!is.null(signifGroups$MusGroup[[i]])) {
if (length(setdiff(c("fdr", "bonf"), names(signifGroups$MusGroup[[i]]))) != 0)
stop("each element of the 'MusGroup' field of the input list
should have been a list with two elements,
'fdr' and 'bonf'")
}
if (missing(mat))
stop("need to specify a binary matrix as input")
if (is.null(dim(mat)))
stop("input needs to be a binary matrix")
if ((all.equal(c(0, 1), sort(unique(as.vector(mat)))) != 1))
stop("input needs to be a binary matrix")
if (missing(noToShow))
noToShow <- 30
if (length(noToShow) > 1)
stop("the number of groups to show needs to be a positive integer")
if (!(class(noToShow) == "numeric"))
stop("the number of groups to show needs to be a positive integer")
if (!(noToShow%%1 == 0))
stop("the number of groups to show needs to be a positive integer")
if (noToShow <= 0)
stop("the number of groups to show needs to be a positive integer")
ff <- getFreqsForCliquesAfterTest(signifGroups$idxSignif, mat)
namesFreqs <- list()
if (length(signifGroups$mcStruct$noMaxCliques) >= 2) {
for (i in 2:length(signifGroups$mcStruct$noMaxCliques)) {
namesFreqs[[i]] <- list()
namesFreqs[[i]]$fdr <- combineNameWithFreq(signifGroups$genesSignif[[i]]$fdr, ff[[i]]$fdr, signifGroups$MusGroup[[i]]$fdr,
signifGroups$pvals[[i]]$fdr)
if (!is.null(dim(namesFreqs[[i]]$fdr))) {
if (dim(namesFreqs[[i]]$fdr)[1] > noToShow)
namesFreqs[[i]]$fdr <- namesFreqs[[i]]$fdr[c(1:noToShow), ]
}
namesFreqs[[i]]$fdr <- as.data.frame(namesFreqs[[i]]$fdr)
namesFreqs[[i]]$bonf <- combineNameWithFreq(signifGroups$genesSignif[[i]]$bonf, ff[[i]]$bonf, signifGroups$MusGroup[[i]]$bonf,
signifGroups$pvals[[i]]$bonf)
if (!is.null(dim(namesFreqs[[i]]$bonf))) {
if (dim(namesFreqs[[i]]$bonf)[1] > noToShow)
namesFreqs[[i]]$bonf <- namesFreqs[[i]]$bonf[c(1:noToShow), ]
}
namesFreqs[[i]]$bonf <- as.data.frame(namesFreqs[[i]]$bonf)
}
} else {
i <- signifGroups$mcStruct$detectedLengths
namesFreqs[[i]] <- list()
namesFreqs[[i]]$fdr <- combineNameWithFreq(signifGroups$genesSignif[[i]]$fdr, ff[[i]]$fdr, signifGroups$MusGroup[[i]]$fdr,
signifGroups$pvals[[i]]$fdr)
if (!is.null(dim(namesFreqs[[i]]$fdr))) {
if (dim(namesFreqs[[i]]$fdr)[1] > noToShow)
namesFreqs[[i]]$fdr <- namesFreqs[[i]]$fdr[c(1:noToShow), ]
}
namesFreqs[[i]]$fdr <- as.data.frame(namesFreqs[[i]]$fdr)
namesFreqs[[i]]$bonf <- combineNameWithFreq(signifGroups$genesSignif[[i]]$bonf, ff[[i]]$bonf, signifGroups$MusGroup[[i]]$bonf,
signifGroups$pvals[[i]]$bonf)
if (!is.null(dim(namesFreqs[[i]]$bonf))) {
if (dim(namesFreqs[[i]]$bonf)[1] > noToShow)
namesFreqs[[i]]$bonf <- namesFreqs[[i]]$bonf[c(1:noToShow), ]
}
namesFreqs[[i]]$bonf <- as.data.frame(namesFreqs[[i]]$bonf)
}
return(namesFreqs)
}
###############################################################################
#' @title Assesses the stability of groups by subsampling
#'
#' @description \code{subsampleAnalysis} subsamples the set of patients and
#' assess the stability of the identified mutually exclusive groups.
#'
#' @param subsampl a vector with subsampling frequencies
#' @param noReps number of repetitions of subsampling. Default is 100.
#' @param signifGroups result structure with the significant groups, as
#' returned by either \code{\link{TiMEx}} or \code{\link{findSignifCliques}}
#'
#' @details As this function runs TiMEx many times sequentially, it is
#' computationally very intensive. For a version of this function which can be
#' directly ran on a cluster, please contact me (see e-mail below).
#'
#' For example, after loading \code{data(ovarianOutput)} and running
#'
#' \code{counts<-subsampleAnalysis(subsampl=c(0.3,0.5,0.8),
#' signifGroups=signifGroups)}
#'
#' \code{counts[[1]][[3]]} will represent
#' the relative counts of the identified mutually exclusive groups of size 3
#' for a subsampling frequency of 30\%, for both \code{fdr} and \code{bonf}
#' (bonferroni) multiple correction methods.
#'
#' @return list with as many elements as subsampling frequencies provided. Each
#' element is further a list with as many elemenets as number of sizes of the
#' significantly mutually exclusive groups identified. Aditionally, \code{bonf}
#' and \code{fdr} are two lists corresponding to each of these elements,
#' representing different multiple correction methods. Finally, each element
#' is a vector of subsampling frequencies of the significantly mutually
#' exclusive groups identified. For an example, see \emph{Details} above.
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso the wrapper function \code{\link{TiMEx}} for identifying mutually
#' exclusive groups in a binary dataset with the TiMEx model,
#' \code{\link{ovarianSubsampling}},
#' \code{\link{breastSubsampling}},
#' \code{\link{gbmDendrixSubsampling}}, and
#' \code{\link{gbmMuexSubsampling}} for examples of outputs after
#' performing the subsampling analysis.
#'
#' @examples
#' # running this function is time-intensive
#' data(ovarianOutput)
#' \dontrun{subsampleOvarian<-subsampleAnalysis(c(0.3,0.5,0.8),ovarianOutput)}
#'
#' @aliases subsampleAnalysis
#'
#' @export
subsampleAnalysis <- function(subsampl, noReps, signifGroups) {
# check inputs
if (missing(subsampl))
stop("a vector of subsampling frequencies is needed as input")
if (!(class(subsampl) == "numeric"))
stop("the subsampling frequencies need to be real numbers between
0 and 1")
if (sum(sapply(subsampl, function(x) {
if (!(x > 0 && x <= 1)) p <- 1 else p <- 0
return(p)
})) > 0)
stop("the subsampling frequencies need to be real numbers between
0 and 1")
if (missing(noReps))
noReps <- 100
if (!(class(noReps) == "numeric"))
stop("the number of repetitions needs to be positive integer")
if (!(noReps%%1 == 0))
stop("the number of repetitions needs to be positive integer")
if (!noReps > 0)
stop("the number of repetitions needs to be positive integer")
if (missing(signifGroups))
stop("need to specify a result structure as returned by
function TiMEx as input")
if (length(setdiff(c("idxSignif", "mcStruct", "genesSignif", "MusGroup", "pvals"), names(signifGroups))) !=
0)
stop("check that the input list contains the following elements:
'idxSignif', 'mcStruct', 'genesSignif', 'MusGroup', and 'pvals' ")
if (class(signifGroups$mcStruct) != "list")
stop("the element 'mcStruct' of the input list should have been a
list")
if (is.null(signifGroups$mcStruct$detectedLengths))
stop("the element 'mcStruct' of the input list should have contained
a field, 'detectedLengths'")
if (!(class(signifGroups$mcStruct$detectedLengths) == "numeric" || class(signifGroups$mcStruct$detectedLengths) ==
"integer"))
stop("the elements of the 'detectedLengths' field of
'mcStruct' in the input list should have been positive integers")
if (!sum(signifGroups$mcStruct$detectedLengths%%1) == 0)
stop("the elements of the 'detectedLengths' field of
'mcStruct' in the input list should have been positive integers")
if (any(signifGroups$mcStruct$detectedLengths <= 0))
stop("the elements of the 'detectedLengths' field of
'mcStruct' in the input list should have been positive integers")
if (is.null(signifGroups$mcStruct$noMaxCliques))
stop("the element 'mcStruct' of the input list
should have contained a field, 'noMaxCliques'")
if (!(class(signifGroups$mcStruct$noMaxCliques) == "numeric" || class(signifGroups$mcStruct$noMaxCliques) ==
"integer"))
stop("the elements of the 'noMaxCliques' field of
'mcStruct' in the input list should have been positive integers")
if (!sum(signifGroups$mcStruct$noMaxCliques%%1) == 0)
stop("the elements of the 'noMaxCliques' field of
'mcStruct' in the input list should have been positive integers")
if (any(signifGroups$mcStruct$noMaxCliques <= 0))
stop("the elements of the 'noMaxCliques' field of
'mcStruct' in the input list should have been positive integers")
if (length(signifGroups$mcStruct$noMaxCliques) != length(signifGroups$mcStruct$detectedLengths))
stop("the lengths of the fields 'detectedLengths' and
'noMaxCliques' of 'mcStruct' in the input list do not coincide")
if (class(signifGroups$idxSignif) != "list")
stop("the field 'idxSignif' of the input list should have been a
non-empty list")
if (length(signifGroups$idxSignif) == 0)
stop("the field 'idxSignif' of the input list should have been a
non-empty list")
for (i in 1:length(signifGroups$idxSignif)) if (!is.null(signifGroups$idxSignif[[i]])) {
if (length(setdiff(c("fdr", "bonf"), names(signifGroups$idxSignif[[i]]))) != 0)
stop("each element of the 'idxSignif'
field of the input list should have been a list with two elements,
'fdr' and 'bonf'")
}
if (class(signifGroups$genesSignif) != "list")
stop("the field 'genesSignif' of the input list should have been
a non-empty list")
if (length(signifGroups$genesSignif) == 0)
stop("the field 'genesSignif' of the input list should have been
a non-empty list")
for (i in 1:length(signifGroups$genesSignif)) if (!is.null(signifGroups$genesSignif[[i]])) {
if (length(setdiff(c("fdr", "bonf"), names(signifGroups$genesSignif[[i]]))) != 0)
stop("each element of the 'genesSignif' field of the input
list should have been a list with two elements, 'fdr' and
'bonf'")
}
if (class(signifGroups$pvals) != "list")
stop("the field 'pvals' of the input list should have been a non-empty
list")
if (length(signifGroups$pvals) == 0)
stop("the field 'pvals' of the input list should have been a non-empty
list")
for (i in 1:length(signifGroups$pvals)) if (!is.null(signifGroups$pvals[[i]])) {
if (length(setdiff(c("fdr", "bonf"), names(signifGroups$pvals[[i]]))) != 0)
stop("each element of the 'pvals' field of the input list
should have been a list with two elements,
'fdr' and 'bonf'")
}
if (class(signifGroups$MusGroup) != "list")
stop("the field 'MusGroup' of the input list should have
been a non-empty list")
if (length(signifGroups$MusGroup) == 0)
stop("the field 'MusGroup' of the input list should have
been a non-empty list")
for (i in 1:length(signifGroups$MusGroup)) if (!is.null(signifGroups$MusGroup[[i]])) {
if (length(setdiff(c("fdr", "bonf"), names(signifGroups$MusGroup[[i]]))) != 0)
stop("each element of the 'MusGroup' field of the input list
should have been a list with two elements,
'fdr' and 'bonf'")
}
if (length(setdiff(c("pairMu", "pairPvalue"), names(signifGroups$mcStruct))) > 0)
stop("the 'mcStruct' field of the input list should have included
the parameters pairMu and pairPvalue")
if (!(class(signifGroups$mcStruct$pairPvalue) == "numeric"))
stop("the field 'pairPvalue' of 'mcStruct' of the input list should
have been a real number between 0 and 1")
if (!(0 <= signifGroups$mcStruct$pairPvalue && signifGroups$mcStruct$pairPvalue <= 1))
stop("the field 'pairPvalue' of 'mcStruct' of the input list should
have been a real number between 0 and 1")
if (length(signifGroups$mcStruct$pairPvalue) != 1)
stop("the field 'pairPvalue' of 'mcStruct' of the input list should
have been a real number between 0 and 1")
if (!(class(signifGroups$mcStruct$pairMu) == "numeric"))
stop("the field 'pairMu' of 'mcStruct' of the input list should
have been a real number between 0 and 1")
if (!(0 <= signifGroups$mcStruct$pairMu && signifGroups$mcStruct$pairMu <= 1))
stop("the field 'pairMu' of 'mcStruct' of the input list should
have been a real number between 0 and 1")
if (length(signifGroups$mcStruct$pairMu) != 1)
stop("the field 'pairMu' of 'mcStruct' of the input list should
have been a real number between 0 and 1")
if (!(class(signifGroups$groupPvalue) == "numeric"))
stop("the field 'groupPvalue' of the input list should
have been a real number between 0 and 1")
if (!(0 <= signifGroups$groupPvalue && signifGroups$groupPvalue <= 1))
stop("the field 'groupPvalue' of the input list should have been
a real number between 0 and 1")
if (length(signifGroups$groupPvalue) != 1)
stop("the field 'groupPvalue' of the input list should have been
a real number between 0 and 1")
if (is.null(signifGroups$matrix))
stop("the input list should have had a binary matrix, 'matrix',
as one of the fields")
if (is.null(dim(signifGroups$matrix)))
stop("the input list should have had a binary matrix, 'matrix',
as one of the fields")
if ((all.equal(c(0, 1), sort(unique(as.vector(signifGroups$matrix)))) != 1))
stop("the input list should have had a binary matrix, 'matrix',
as one of the fields")
if (sum(is.na(match(unlist(signifGroups$genesSignif), colnames(signifGroups$matrix)))) > 0)
stop("at least one gene name provided in the element 'genesInCliques'
of the input list is not part of the input genes")
pairMu <- signifGroups$mcStruct$pairMu
pairPvalue <- signifGroups$mcStruct$pairPvalue
groupPvalue <- signifGroups$groupPvalue
countsAll <- list()
if (length(signifGroups$genesSignif) > 0) {
for (idxSubs in 1:length(subsampl)) {
countsFreq <- list()
for (io in 1:length(signifGroups$mcStruct$detectedLengths)) {
countsFreq[[signifGroups$mcStruct$detectedLengths[io]]] <- list()
countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$fdr <- rep(0, max(1, dim(signifGroups$genesSignif[[signifGroups$mcStruct$detectedLengths[io]]]$fdr)[1]))
countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$bonf <- rep(0, max(1, dim(signifGroups$genesSignif[[signifGroups$mcStruct$detectedLengths[io]]]$bonf)[1]))
}
for (idxR in 1:noReps) {
matrixSubsample <- signifGroups$matrix[sample(c(1:dim(signifGroups$matrix)[1]), ceiling(subsampl[idxSubs] *
dim(signifGroups$matrix)[1])), ]
signifGroupsSubsample <- TiMEx(matrixSubsample, pairMu, pairPvalue, groupPvalue)
genesSignifCombined <- list()
for (io in 1:length(signifGroups$mcStruct$detectedLengths)) {
genesSignifCombined[[signifGroups$mcStruct$detectedLengths[io]]] <- list()
if (signifGroups$mcStruct$detectedLengths[io] > 1) {
# if groups of the tested length were detected at all
if (length(signifGroupsSubsample$genesSignif) >= signifGroups$mcStruct$detectedLengths[io]) {
genesSignifCombined[[signifGroups$mcStruct$detectedLengths[io]]]$bonf <- rbind(signifGroups$genesSignif[[signifGroups$mcStruct$detectedLengths[io]]]$bonf,
signifGroupsSubsample$genesSignif[[signifGroups$mcStruct$detectedLengths[io]]]$bonf)
dph <- duplicated(genesSignifCombined[[signifGroups$mcStruct$detectedLengths[io]]]$bonf) |
duplicated(genesSignifCombined[[signifGroups$mcStruct$detectedLengths[io]]]$bonf, fromLast = TRUE)
# if more than one group was detected initially
if (length(dim(signifGroups$genesSignif[[signifGroups$mcStruct$detectedLengths[io]]]$bonf)[1]) >
0)
countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$bonf <- countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$bonf +
dph[1:dim(signifGroups$genesSignif[[signifGroups$mcStruct$detectedLengths[io]]]$bonf)[1]] +
0 else countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$bonf <- countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$bonf +
1
genesSignifCombined[[signifGroups$mcStruct$detectedLengths[io]]]$fdr <- rbind(signifGroups$genesSignif[[signifGroups$mcStruct$detectedLengths[io]]]$fdr,
signifGroupsSubsample$genesSignif[[signifGroups$mcStruct$detectedLengths[io]]]$fdr)
dph <- duplicated(genesSignifCombined[[signifGroups$mcStruct$detectedLengths[io]]]$fdr) |
duplicated(genesSignifCombined[[signifGroups$mcStruct$detectedLengths[io]]]$fdr, fromLast = TRUE)
# if more than one group was detected initially
if (length(dim(signifGroups$genesSignif[[signifGroups$mcStruct$detectedLengths[io]]]$fdr)[1]) >
0)
countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$fdr <- countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$fdr +
dph[1:dim(signifGroups$genesSignif[[signifGroups$mcStruct$detectedLengths[io]]]$fdr)[1]] +
0 else countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$fdr <- countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$fdr +
1
}
}
}
}
for (io in 1:length(signifGroups$mcStruct$detectedLengths)) {
countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$bonf <- countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$bonf/noReps
countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$fdr <- countsFreq[[signifGroups$mcStruct$detectedLengths[io]]]$fdr/noReps
}
countsAll[[idxSubs]] <- countsFreq
}
names(countsAll) <- paste("SubsamplingFreq:", subsampl, sep = "")
for (jo in 1:length(countsAll)) {
names(countsAll[[jo]]) <- paste("groupSize=", c(1:length(countsAll[[jo]])), sep = "")
}
}
return(countsAll)
}
###############################################################################
#' @title Plots a Mutually Exclusive group
#'
#' @description \code{plotGroupByName} plots a mutually exclusive group
#' (including the frequencies of the genes), given the names of the genes in
#' the group.
#'
#' @param group vector of gene names to be plotted
#' @param mat binary alteration matrix, with rows representing patients and
#' columns representing genes
#'
#' @details The plotting is done based on the function
#' \code{\link[graphics]{image}}.
#'
#' @return None
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references 'Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso the wrapper function \code{\link{TiMEx}} for identifying
#' mutually exclusive groups in a binary dataset with the TiMEx model.
#'
#' @examples
#' # Plot the group consisting of the copy number aberrations of MIEN1 and
#' # CDKN1B, and the point mutations of CDH1, GATA3, and MAP3K1, in breast
#' # cancer.
#' data(breast)
#' group<-c('MIEN1-CNA','CDH1-Mut','GATA3-Mut','MAP3K1-Mut','CDKN1B-CNA')
#' plotGroupByName(group,breast)
#'
#' @importFrom graphics image
#' @importFrom graphics axis
#'
#' @aliases plotGroupByName
#'
#' @export
plotGroupByName <- function(group, mat) {
# check inputs
if (missing(group))
stop("a group of genes needs to be specified")
if (missing(mat))
stop("need to specify a binary matrix as input")
if (is.null(dim(mat)))
stop("input needs to be a binary matrix")
if ((all.equal(c(0, 1), sort(unique(as.vector(mat)))) != 1))
stop("input needs to be a binary matrix")
if (sum(is.na(match(group, colnames(mat)))) > 0)
stop("at least one gene name is not part of the input genes")
l <- length(group)
idx <- c()
for (i in 1:l) {
idx[i] <- which(colnames(mat) == group[i])
}
submatrix <- mat[, idx]
freqs <- round(apply(submatrix, 2, sum) * 100/dim(submatrix)[1], 2)
ord <- order(freqs, decreasing = TRUE)
submatrix <- submatrix[, ord]
mat2 <- submatrix
for (i in l:1) mat2 <- mat2[order(mat2[, i]), ]
image(t(mat2), col = c("white", "black"), yaxt = "n", xaxt = "n", ylab = "samples")
axis(1, at = seq(from = 0, to = 1, length.out = l), tck = -0.03, cex.axis = 0.7, labels = paste(group[ord],
"\n ", freqs[ord], " %", sep = ""))
}
###############################################################################
#' @title Recovers members of the metagroups
#'
#' @description \code{recoverAllNameGroups} recovers, from a metagroup, the
#' names of the genes part of an identified mutually exclusive group.
#'
#' @param groupsMeta list containing groups of equivalent genes, as returned
#' by the field \code{groups} of \code{\link{doMetagene}}
#' @param clGenes matrix of mutually exclusive groups of same size, as gene
#' names. This type of matrix is returned by either \code{\link{TiMEx}} or
#' \code{\link{findSignifCliques}}, as one of the
#' matrix elements of the \code{genesSignif} field.
#'
#' @details This function can be used if the input binary matrix contains
#' identical events that need to be merged into metagenes using
#' \code{\link{doMetagene}}. Running \code{recoverAllNamesGroups} provides the
#' set of identical alterations which are part of the identified mutually
#' exclusive groups.
#'
#' In order to run this function on all the identified mutually exclusive
#' groups as returned by \code{\link{TiMEx}} or
#' \code{\link{findSignifCliques}}, it is necessary to run it separately on
#' each matrix element (corresponding to different group sizes and different
#' correction methods) of the \code{genesSignif} field in the structure
#' returned by either \code{\link{TiMEx}} or \code{\link{findSignifCliques}}.
#'
#' For example, after loading \code{data(ovarianGroups)} and
#' \code{data(ovarianOutput)}, and running
#'
#' \code{rGroups<-recoverAllNamesGroups(ovarianGroups,
#' signifGroups$genesSignif[[3]]$bonf)}
#'
#' \code{rGroups[[14]]} has 3 elements (as many as genes part
#' of the identified mutually exclusive groups). Each element is the metagroup
#' of each of the genes part of the 14th mutually exclusive group in the input
#' matrix
#' \code{signifGroups$genesSignif[[3]]$bonf}. Namely, \emph{BRD4-CNA} and
#' \emph{MYC-CNA} have unique alteration patterns among samples, and are alone
#' in their metagroup, while \emph{CASC1-CNA} has an identical alteration
#' pattern with \emph{KRAS-CNA} and \emph{LYRM5-CNA}. The numbers
#' below the gene names are the indices of the genes in the initial input
#' binary matrix of patients.
#'
#' @return list with as many elements as number of identified mutually
#' exclusive groups, \emph{i.e.} number of rows in the input matrix. Each of
#' its elements is further a list, containing, at each position, the metagroup
#' of the genes in the initial group at that resepective position. For an
#' example, see \emph{Details} above.
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso \code{\link{doMetagene}} for collapsing the genes of an input
#' matrix with identical alteration patterns into metagroups.
#'
#' @examples
#' data(ovarianGroups)
#' data(ovarianOutput)
#' r<-recoverAllNamesGroups(ovarianGroups,ovarianOutput$genesSignif[[3]]$bonf)
#'
#' @aliases recoverAllNamesGroups
#'
#' @export
recoverAllNamesGroups <- function(groupsMeta, clGenes) {
# check inputs
if (missing(groupsMeta))
stop("a list of groups needs to be provided as input")
if (class(groupsMeta) != "list" && class(groupsMeta) != "character")
stop("a list of groups needs to be provided as input")
if (missing(clGenes))
stop("a matrix of identified groups, as gene names, needs to be
provided as input")
if (class(clGenes) != "matrix" && class(clGenes) != "character")
stop("a matrix of identified groups, as gene names, needs to be
provided as input")
newGroups <- list()
for (i in 1:dim(clGenes)[1]) {
newGroups[[i]] <- list()
k <- 1
for (j in 1:dim(clGenes)[2]) {
pos <- which(names(groupsMeta) == clGenes[i, j])
if (length(pos) > 0)
newGroups[[i]][[k]] <- groupsMeta[pos] else newGroups[[i]][[k]] <- clGenes[i, j]
k <- k + 1
}
}
return(newGroups)
}
###############################################################################
#' @title Generates data from the TiMEx model
#'
#' @description \code{simulateGenes} returns a list containing a binary matrix
#' simulated from the TiMEx model, for given lambdas (exponential rates),
#' mu (intensity of mutual exclusivity), and N (sample size).
#'
#' @param lambdas vector of exponential rates (positive real numbers).
#' The length of the vector equals the number of simulated genes.
#' @param mu intensity of mutual exclusivity (real number between
#' 0 and 1). Default is 1 (perfect mutual exclusivity).
#' @param N sample size (positive integer).
#'
#' @details This function needs \code{\link[gtools]{permutations}} in order to
#' run.
#'
#' For details on how the values of the exponential rates correspond to
#' frequencies, see \emph{References} below.
#'
#' @return list consisting of:
#' \itemize{
#' \item{\code{genes}} {the simulated dataset, as a binary matrix.}
#' \item{\code{genoMat}} {the matrix with genotype probabilities, from which
#' the dataset was simulated. This matrix has as many dimensions as number of
#' genes, i.e. 2x2x...x2. For each dimension, the first position corresponds to
#' the probability of observing a 0 for that gene, and the second position
#' corresponds to the probability of observing an 1. For example, in the case
#' of 4 genes, the probability of observing the null genotype \code{(0000)} is
#' given by \code{genoMat[1,1,1,1]}; the probability of observing the genotype
#' \code{(1011)} is given by \code{genoMat[2,1,2,2]}; the probability of
#' observing the genotype \code{(1111)} is given by \code{genoMat[2,2,2,2]}.
#' The entries of this matrix are nonnegative and sum up to 1.}
#' }
#'
#' @author Simona Cristea, \email{scristea@@jimmy.harvard.edu}
#'
#' @references Constantinescu
#' \emph{et al.}: \emph{TiMEx: A Waiting Time Model for Mutually Exclusive
#' Cancer Alterations}. Bioinformatics (2015).
#'
#' @seealso the wrapper function \code{\link{TiMEx}} for identifying
#' mutually exclusive groups in a binary dataset with the TiMEx model,
#' \code{\link{ovarian}}, \code{\link{breast}}, and
#' \code{\link{gbmDendrix}} for examples of biological large cancer
#' datasets.
#'
#' @examples
#' simGenes<-simulateGenes(c(0.5,1,0.3),0.8,4000)
#'
#' @importFrom gtools permutations
#'
#' @aliases simulateGenes
#'
#' @export
simulateGenes <- function(lambdas, mu, N) {
# check inputs
if (missing(lambdas))
stop("a vector of nonegative exponential rates needs to be
provided as input")
if (!(class(lambdas) == "numeric"))
stop("a vector of nonegative exponential rates needs to be
provided as input")
if (any(lambdas <= 0))
stop("a vector of nonegative exponential rates needs to be
provided as input")
if (missing(mu))
mu <- 1
# if (class(mu)!='numeric') stop('mu needs to be a real number between 0 and 1')
if (!(mu >= 0 && mu <= 1))
stop("mu needs to be a real number between 0 and 1")
if (missing(N))
stop("the sample size N needs to be a positive integer")
if (!(N%%1 == 0))
stop("the sample size N needs to be a positive integer")
if (N <= 0)
stop("the sample size N needs to be a positive integer")
# set the observation lambda lo=1
lo <- 1
n <- length(lambdas)
genoMat <- array(NA, dim = rep(2, n))
for (i in 1:n) {
dimnames(genoMat)[[i]] <- paste("pos", i, "=", c(0, 1), sep = "")
}
l <- list()
for (i in 1:n) {
l[[i]] <- seq(0, 1)
}
allGenos <- as.matrix(expand.grid(l))
allGenosAtLeastTwo <- allGenos[which(apply(allGenos, 1, sum) > 1), ]
allGenosOne <- allGenos[which(apply(allGenos, 1, sum) == 1), ]
allGenosZero <- allGenos[which(apply(allGenos, 1, sum) == 0), ]
allGenoPositive <- rbind(allGenosOne, allGenosAtLeastTwo)
# for the full0 case
p1 <- compProbFullZeroSims(allGenosZero, lambdas, lo)
# for all the genotypes which are not full 0
p2 <- apply(allGenoPositive, 1, compAllProbsSims, lambdas = lambdas, lo = lo, mu = mu)
genoMat[1] <- p1
genoMat[(allGenoPositive + 1)] <- p2
simGenes <- produceGenesGroup(N, genoMat, n)
genes <- simGenes$genes
return(list(genes = genes, genoMat = genoMat))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.