R/user_functions.R
In globalSeq: Global Test for Counts

Documented in cursus omnibus proprius

############### Genome-wide analysis ###########################################

#' Genome-wide analysis
#' 
#' This function tests for associations between gene expression
#' or exon abundance (\code{Y})
#' and genetic or epigenetic alterations (\code{X}).
#' Using the locations of genes (\code{Yloc}),
#' and the locations of genetic
#' or epigenetic alterations (\code{Xloc}),
#' the expression of each gene is tested for associations with 
#' alterations on the same chromosome that are closer to the gene
#' than a given distance (\code{window}).
#' 
#' @export
#' @keywords methods
#' 
#' @inheritParams omnibus
#' @param Y
#' \strong{RNA-Seq data}\strong{:}
#' numeric matrix with \code{q} rows (genes)
#' and \code{n} columns (samples);
#' or a SummarizedExperiment object
#' @param Yloc
#' \strong{location RNA-Seq}\strong{:}
#' numeric vector of length \code{q}
#' (point location)\strong{;}
#' numeric matrix with \code{q} rows
#' and two columns (start and end locations)
#' @param X
#' \strong{genomic profile}\strong{:}
#' numeric matrix with \code{p} rows (covariates)
#' and \code{n} columns (samples)
#' @param Xloc
#' \strong{location covariates}\strong{:}
#' numeric vector of length \code{p}
#' @param Ychr
#' chromosome RNA-Seq\strong{:}
#' factor of length \code{q}
#' @param Xchr
#' chromosome covariates\strong{:}
#' factor of length \code{p}
#' @param window
#' \strong{maximum distance}\strong{:}
#' non-negative real number
#' @param nodes
#' number of cluster nodes for parallel computation
#' @param phi
#' dispersion parameters\strong{:} vector of length \code{q}
#' 
#' @details
#' Note that \code{Yloc}, \code{Xloc} and \code{window} must
#' be given in the same unit, usually in base pairs.
#' If \code{Yloc} indicates interval \strong{locations},
#' and \code{window} is zero,
#' then only covariates between the start and end location
#' of the gene are of interest.
#' Typically \code{window} is larger than one million base pairs.
#' 
#' If \code{Y} and \code{X} include data from a single chromosome,
#' \code{Ychr} and \code{Xchr} are redundant.
#' If \code{Y} or \code{X} include data
#' from \strong{multiple chromosomes},
#' \code{Ychr} and \code{Xchr} should be specified
#' in order to prevent confusion between chromosomes. 
#' 
#' For the simultaneous analysis of
#' \strong{multiple genomic profiles}
#' \code{X} should be a list of numeric matrices with
#' \code{n} columns (samples),
#' \code{Xloc} a list of numeric vectors,
#' and \code{window} a list of non-negative real numbers.
#' If provided, \code{Xchr} should be alist of of numeric vectors.
#' 
#' The \code{offset} is meant to account for
#' different \strong{libary sizes}.
#' By default the \code{offset} is calculated based on \code{Y}.
#' Different library sizes can be ignored by 
#' setting the \code{offset} to \code{rep(1,n)}.
#' 
#' The user can provide the \strong{confounding} variable \code{group}.
#' Note that each level of \code{group} must appear at least twice
#' in order to allow stratified permutations.
#' 
#' Efficient alternatives to classical \strong{permutation} (\code{kind=1})
#' are the method of control variates (\code{kind=0})
#' and permutation in chunks (0 < \code{kind} < 1)
#' \link[=intern.crude]{details}.
#' 
#' @return
#'
#' The function returns a dataframe,
#' with the p-values in the first row
#' and the test statistics in the second row.
#' 
#' @references
#' 
#' A Rauschenberger, MA Jonker, MA van de Wiel, and RX Menezes (2016).
#' "Testing for association between RNA-Seq and high-dimensional data",
#' \emph{BMC Bioinformatics}. 17:118.
#' \href{http://dx.doi.org/10.1186/s12859-016-0961-5}{html}
#' \href{http://www.biomedcentral.com/content/pdf/s12859-016-0961-5.pdf}{pdf}
#' (open access)
#' 
#' RX Menezes, M Boetzer, M Sieswerda, GJB van Ommen, and JM Boer (2009).
#' "Integrated analysis of DNA copy number
#' and gene expression microarray data using gene sets",
#' \emph{BMC Bioinformatics}. 10:203.
#' \href{http://dx.doi.org/10.1186/1471-2105-10-203}{html}
#' \href{http://www.biomedcentral.com/content/pdf/1471-2105-10-203.pdf}{pdf}
#' (open access)
#' 
#' @seealso The function \code{\link{omnibus}} tests for associations
#' between an overdispersed response variable
#' and a high-dimensional covariate set.
#' The function \code{\link{proprius}} calculates the contributions
#' of individual samples or covariates to the test statistic.
#' All other function of the R package
#' \code{\link{globalSeq}} are \code{\link{internal}}.
#' 
#' @examples
#' # simulate high-dimensional data
#' n <- 30; q <- 10; p <- 100
#' Y <- matrix(rnbinom(q*n,mu=10,
#'     size=1/0.25),nrow=q,ncol=n)
#' X <- matrix(rnorm(p*n),nrow=p,ncol=n)
#' Yloc <- seq(0,1,length.out=q)
#' Xloc <- seq(0,1,length.out=p)
#' window <- 1
#' 
#' # hypothesis testing
#' cursus(Y,Yloc,X,Xloc,window)
#' 
#' @usage
#' cursus(Y, Yloc, X, Xloc, window,
#'         Ychr = NULL, Xchr = NULL,
#'         offset = NULL, group = NULL,
#'         perm = 1000, nodes = 2,
#'         phi = NULL, kind = 0.01)
#'           
cursus <- function(Y, Yloc, X, Xloc, window, Ychr = NULL, Xchr = NULL, 
    offset = NULL, group = NULL, perm = 1000,
    nodes = 2, phi = NULL, kind = 0.01) {
    Y <- globalSeq::intern.matrix(Y)
    if (is.vector(Yloc)) {
        Ystart <- Yend <- Yloc
    } else {
        Ystart <- Yloc[, 1]
        Yend <- Yloc[, 2]
    }
    if (length(perm) == 1) {
        perm <- globalSeq::intern.permu(n = ncol(Y), it = perm - 1, group = group, kind=kind) # new: kind=kind
    }
    if (is.null(offset)) {
        ls <- colSums(Y)
        gm <- exp(1/length(ls) * sum(log(ls)))
        offset <- ls/gm
    }
    ###  alternative with edgeR
    # offset <- edgeR::calcNormFactors(Y)
    # utils::capture.output({phi <- edgeR::estimateDisp(edgeR::DGEList(counts=Y,norm.factors=offset))})
    # phi <- phi$tagwise.dispersion
    if (is.null(Ychr) | is.null(Xchr)) {
        ########## Analysing a single chromosome ##########
        message("Analysing a single chromosome.")
        out <- globalSeq::intern.chromo(Y = Y, Ystart = Ystart, Yend = Yend, X = X, 
            Xloc = Xloc, window = window, offset = offset, group = group, 
            perm = perm, nodes = nodes, phi = phi, kind = kind) # was disp = disp instead of phi = phi
    } else if (!is.null(Ychr) & !is.null(Xchr)) {
        ########## Analysing multiple chromosomes ##########
        out <- list()
        chr <- unique(Ychr)
        message("Analysing multiple chromosomes:")
        for (i in 1:length(chr)) {
            message(paste("chromosome", chr[i],"\n"))
            if (is.list(X)) {
                Xpass <- lapply(1:length(X), function(j) X[[j]][Xchr[[j]] == 
                  chr[i], , drop = FALSE])
            } else {
                Xpass <- X[Xchr == chr[i], , drop = FALSE]
            }
            if (is.list(Xloc)) {
                Xlocpass <- lapply(1:length(Xloc),
                function(j) Xloc[[j]][Xchr[[j]] == chr[i]])
            } else {
                Xlocpass <- Xloc[Xchr == chr[i]]
            }
            out[[i]] <- globalSeq::intern.chromo(Y = Y[Ychr == chr[i], , drop = FALSE], 
                Ystart = Ystart[Ychr == chr[i]], Yend = Yend[Ychr == chr[i]], 
                X = Xpass, Xloc = Xlocpass, window = window, perm = perm, 
                offset = offset, group = group, nodes = nodes, phi = phi, # was disp = disp instead of phi = phi
                kind = kind)
        }
        out <- do.call(cbind, out)
    } else {
        ### Ambiguous arguments ###
        stop("Please provide either both or none of \"Ychr\" and \"Xchr\".")
    }
    col <- matrix(NA, nrow = ncol(out), ncol = nrow(out))
    for (i in 1:nrow(out)) {
        col[, i] <- unlist(out[i, ])
    }
    colnames(col) <- rownames(out)
    rownames(col) <- rownames(Y)
    as.data.frame(col)
}


############### OMNIBUS ########################################################

#' Omnibus test
#' 
#' Test of association between a count response and
#' one or more covariate sets.
#' This test may be conceptualised as
#' a test of overall significance in regression analysis,
#' where the response variable is overdispersed, and where
#' the number of explanatory variables (\code{p})
#' exceeds the sample size (\code{n}).
#' The negative binomial distribution accounts for overdispersion
#' and a random effect model accounts for high dimensionality
#' (\code{p}>>\code{n}).
#' 
#' @export
#' @keywords methods
#' 
#' @param y
#' \strong{response variable}\strong{:}
#' numeric vector of length \code{n}
#' @param X
#' \strong{one covariate set}\strong{:}
#' numeric matrix with \code{n} rows (samples)
#' and \code{p} columns (covariates);
#' \cr \strong{multiple covariate sets}\strong{:}
#' list of numeric matrices with \code{n} rows (samples)
#' @param offset
#' numeric vector of length \code{n}
#' @param group
#' confounding variable\strong{:}
#' factor of length \code{n}
#' @param perm
#' number of iterations\strong{:}
#' positive integer
#' @param mu
#' mean parameters\strong{:}
#' numeric vector of length \code{1} or \code{n}
#' @param phi
#' dispersion parameter\strong{:}
#' non-negative real number
#' @param kind
#' computation \strong{:}
#' number between 0 and 1
#' 
#' @details
#' 
#' The user can provide a common \code{mu} for all samples
#' or sample-specific \code{mu}, and a common \code{phi}.
#' Setting \code{phi} equal to zero is equivalent
#' to using the Poisson model.
#' If \code{mu} is missing, then \code{mu} is estimated from \code{y}.
#' If \code{phi} is missing, then \code{mu} and \code{phi}
#' are estimated from \code{y}.
#' The \code{offset} is only taken into account
#' for estimating \code{mu} or \code{phi}.
#' By default the offset is \code{rep(1,n)}.
#' 
#' The user can provide the \strong{confounding} variable \code{group}.
#' Note that each level of \code{group} must appear at least twice
#' in order to allow stratified permutations.
#' 
#' Efficient alternatives to classical \strong{permutation} (\code{kind=1})
#' are the method of control variates (\code{kind=0})
#' and permutation in chunks (0 < \code{kind} < 1)
#' \link[=intern.crude]{details}.
#' 
#' @return
#' 
#' The function returns a dataframe,
#' with the p-value in the first column,
#' and the test statistic in the second column.
#' 
#' @references
#' 
#' A Rauschenberger, MA Jonker, MA van de Wiel, and RX Menezes (2016).
#' "Testing for association between RNA-Seq and high-dimensional data",
#' \emph{BMC Bioinformatics}. 17:118.
#' \href{http://dx.doi.org/10.1186/s12859-016-0961-5}{html}
#' \href{http://www.biomedcentral.com/content/pdf/s12859-016-0961-5.pdf}{pdf}
#' (open access)
#' 
#' RX Menezes, L Mohammadi, JJ Goeman, and JM Boer (2016).
#' "Analysing multiple types of molecular profiles simultaneously:
#' connecting the needles in the haystack",
#' \emph{BMC Bioinformatics}. 17:77.
#' \href{http://dx.doi.org/10.1186/s12859-016-0926-8}{html}
#' \href{http://www.biomedcentral.com/content/pdf/s12859-016-0926-8.pdf}{pdf}
#' (open access)
#' 
#' S le Cessie, and HC van Houwelingen (1995).
#' "Testing the fit of a regression model 
#' via score tests in random effects models",
#' \emph{Biometrics}. 51:600-614.
#' \href{http://dx.doi.org/10.2307/2532948}{html}
#' \href{http://www.jstor.org/stable/pdf/2532948.pdf?acceptTC=true}{pdf}
#' (restricted access)
#' 
#' @seealso
#' 
#' The function \code{\link{proprius}} calculates
#' the contributions of individual samples or covariates
#' to the test statistic.
#' The function \code{\link{cursus}} tests for association
#' between RNA-Seq and local genetic or epigenetic alternations
#' across the whole genome.
#' All other functions of the R package \code{\link{globalSeq}}
#' are \code{\link{internal}}.
#' 
#' @examples
#' # simulate high-dimensional data
#' n <- 30; p <- 100
#' y <- rnbinom(n,mu=10,size=1/0.25)
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#'
#' # hypothesis testing
#' omnibus(y,X)
#' 
#' @usage
#' omnibus(y, X, offset = NULL, group = NULL,
#'         mu = NULL, phi = NULL,
#'         perm = 1000, kind = 1)
#'         
omnibus <- function(y, X, offset = NULL, group = NULL, mu = NULL, phi = NULL, 
    perm = 1000, kind = 1) {
    ########## initialisation ##########
    n <- length(y) # number of samples
    if (is.null(mu) | is.null(phi)) {
        est <- globalSeq::intern.estim(y = y, offset = offset)
        mu <- est$mu
        if (is.null(phi)) {
            phi <- est$phi
        }
    } else {
        if (length(mu) == 1) {
            mu <- rep(mu, n)
        }
    }
    if (length(perm) == 1) {
        it <- perm
        perm <- globalSeq::intern.permu(n = n, it = perm - 1, group = group, kind = kind) # new: kind=kind
        it <- ncol(perm)
    } else {
        it <- ncol(perm)
    }
    ########## testing one covariate set ##########
    if (is.matrix(X) | is.data.frame(X)) {
        if (kind == 1) {
            globalSeq::intern.crude(y = y, X = X, mu = mu, phi = phi, perm = perm)
        } else if (kind == 0) {
            globalSeq::intern.conva(y = y, X = X, mu = mu, phi = phi, perm = perm,
                offset = offset)
        } else if (kind > 0 & kind < 1) {
            globalSeq::intern.focus(y = y, X = X, mu = mu, phi = phi, perm = perm, 
                focus = kind)
        } else {
            stop("Argument \"kind\" must be within 0 and 1.")
        }
    } else {
        ########## testing multiple sets ##########
        sets <- X
        k <- length(sets)  # number of sets
        # # # # # # # # # # # separate testing # # # # # # # # # # #
        single <- covs <- rep(NA, times = k)  # single p-values
        sim <- matrix(NA, nrow = k, ncol = it)  # simulated test statistics
        for (i in 1:k) {
            X <- sets[[i]]
            R <- X %*% t(X)/ncol(X)
            temp <- apply(perm, 2, function(perm) globalSeq::intern.score(y = y[perm], 
                R = R, mu = mu[perm], phi = phi))
            single[i] <- sum(temp >= temp[1])/it  # single p-values
            covs[i] <- ncol(X)
            sim[i, ] <- temp
        }
        # # # # # # # # # joint testing # # # # # # # # #
        sim_mu <- rowMeans(sim)
        sim_sd <- apply(sim, 1, stats::sd)
        com <- rep(NA, times = it)  # simulated test statistics
        for (i in 1:it) {
            score <- sim[, i, drop = FALSE]
            com[i] <- sum((score - sim_mu)/sim_sd)  # standardisation
        }
        joint <- sum(com >= com[1])/it  # joint p-value
        data.frame(joint = joint, teststat = com[1], single = matrix(single, 
            nrow = 1),covs=matrix(covs,nrow=1))                                                              # new: p=p
    }
}


############### PROPRIUS #######################################################

#' Decomposition
#' 
#' Even though the function \code{\link{omnibus}} tests
#' a single hypothesis on a whole covariate set,
#' this function allows to calculate
#' the individual contributions of \code{n} samples or
#' \code{p} covariates to the test statistic.
#' 
#' @export
#' @keywords methods
#' 
#' @inheritParams omnibus
#' @param X \strong{covariate set}\strong{:}
#' numeric matrix with \code{n} rows (samples) 
#' and \code{p} columns (covariates)
#' @param type
#' character '\strong{covariates}' or '\strong{samples}'
#' @param plot
#' plot of results\strong{:} logical
#' @param alpha
#' significance level\strong{:} real number between 0 and 1
#' 
#' @details
#' 
#' The user can provide a common \code{mu} for all samples
#' or sample-specific \code{mu}, and a common \code{phi}.
#' Setting \code{phi} equal to zero
#' is equivalent to using the Poisson model.
#' If \code{mu} is missing, then \code{mu} is estimated from \code{y}.
#' If \code{phi} is missing, then \code{mu} and \code{phi}
#' are estimated from \code{y}.
#' The \code{offset} is only taken into account
#' for estimating \code{mu} or \code{phi}.
#' 
#' The user can provide the confounding variable \code{group}.
#' Note that each level of \code{group} must appear at least twice
#' in order to allow stratified permutations.
#' 
#' @return
#' 
#' If \code{alpha=NULL}, then the function returns a numeric vector,
#' and else a list of numeric vectors.
#' 
#' @references
#' 
#' A Rauschenberger, MA Jonker, MA van de Wiel, and RX Menezes (2016).
#' "Testing for association between RNA-Seq and high-dimensional data",
#' \emph{BMC Bioinformatics}. 17:118.
#' \href{http://dx.doi.org/10.1186/s12859-016-0961-5}{html}
#' \href{http://www.biomedcentral.com/content/pdf/s12859-016-0961-5.pdf}{pdf}
#' (open access)
#' 
#' JJ Goeman, SA van de Geer, F de Kort, and HC van Houwelingen (2004).
#' "A global test for groups of genes:
#' testing association with a clinical outcome",
#' \emph{Bioinformatics}. 20:93-99.
#' \href{http://dx.doi.org/10.1093/bioinformatics/btg382}{html}
#' \href{http://bioinformatics.oxfordjournals.org/content/20/1/93.full.pdf}{pdf}
#' (open access)
#' 
#' @seealso
#' 
#' The function \code{\link{omnibus}} tests for associations
#' between an overdispersed response variable and a high-dimensional
#' covariate set.
#' The function \code{\link{cursus}} tests for association
#' between RNA-Seq and local genetic or epigenetic alternations
#' across the whole genome.
#' All other functions of the R package \code{\link{globalSeq}}
#' are \code{\link{internal}}.
#' 
#' @examples
#' # simulate high-dimensional data
#' n <- 30; p <- 100
#' y <- rnbinom(n,mu=10,size=1/0.25)
#' X <- matrix(rnorm(n*p),nrow=n,ncol=p)
#'
#' # decomposition
#' proprius(y,X,type="samples")
#' proprius(y,X,type="covariates")
#' 
#' @usage
#' proprius(y, X, type, offset = NULL, group = NULL,
#'         mu = NULL, phi = NULL,
#'         alpha = NULL, perm = 1000, plot = TRUE)
#' 
proprius <- function(y, X, type, offset = NULL, group = NULL, mu = NULL, 
    phi = NULL, alpha = NULL, perm = 1000, plot = TRUE) {
    if (is.null(mu) | is.null(phi)) {
        est <- globalSeq::intern.estim(y = y, offset = offset)
        mu <- est$mu
        if (is.null(phi)) {
            phi <- est$phi
        }
    } else {
        if (length(mu) == 1) {
            mu <- rep(mu, length(y))
        }
    }
    ### decomposition ###
    if (is.null(alpha)) {
        if (type == "samples") {
            u <- globalSeq::intern.sam(y, X, mu, phi)
        }
        if (type == "covariates") {
            u <- globalSeq::intern.cov(y, X, mu, phi)
        }
        upper <- NULL
    } else {
        if (length(perm) == 1) 
            {
                perm <- globalSeq::intern.permu(n = length(y),
                it = perm - 1, group = group, kind = Inf)
            }
        if (type == "covariates") {
            sim <- apply(perm, 2, function(perm) globalSeq::intern.cov(y = y[perm], 
                X = X, mu = mu[perm], phi = phi))
        }
        if (type == "samples") {
            sim <- apply(perm, 2, function(perm) globalSeq::intern.sam(y = y[perm], 
                X = X, mu = mu[perm], phi = phi))
        }
        u <- sim[, 1]
        upper <- apply(sim, 1, function(x) stats::quantile(x, p = 1 - alpha))
    }
    ### plot ###
    if (plot == "TRUE") {
        globalSeq::intern.plot(u = u, upper = upper, xlab = paste("indices of", type))
    }
    ### output ###
    if (is.null(alpha)) {
        u
    } else {
        list(u = u, upper = upper)
    }
}