R/randRot.R

Defines functions qqunif pFdr .pFdr .fdr.q .fdr.qu df_estimate randpermut rotateStat contrastModel initBatchRandrot initRandrotW initRandrot X_decomp

Documented in contrastModel df_estimate .fdr.q .fdr.qu initBatchRandrot initRandrot initRandrotW pFdr .pFdr qqunif randpermut rotateStat X_decomp

#' @details
#' Please refer to the package vignette for further details on usage and for
#' a "quick start". \code{\link[randRotation:rotateStat]{rotateStat}} is
#' the central function of the package. Methods are described in
#' \insertCite{Hettegger2021}{randRotation}.
#' @importFrom Rdpack reprompt
#' @references \insertAllCited{}
"_PACKAGE"




#' Decomposition of the design matrix for random rotation generation
#'
#' Full QR decomposition of the design matrix \code{X}. No argument checks are
#' performed, see \code{Details}.
#'
#' @param X Design matrix as generated by
#'   \code{\link[stats:model.matrix]{model.matrix}}.
#' @param coef.d Non-\code{H0} coefficients.
#'
#' @return A \code{\link[base:list]{list}} object containing matrices \code{Xd},
#'   \code{Xhe} and rank of the qr decomposition.
#' @export
#'
#' @details The design matrix \code{X} is QR decomposed into \code{X = Xq Xr}.
#' By performing a full QR decomposition, \code{Xq} is automatically extended to
#' a full basis. \code{Xq} is further split into \code{Xd} and \code{Xhe}, where
#' \code{Xd} corresponds to columns \code{coef.d} (non-\code{H0} or
#' non-Null-Hypothesis columns) and \code{Xhe} correspond to all other columns
#' (\code{H0} and error columns), see \code{\link[randRotation:initRandrot]{initRandrot}}.
#' No argument checks are performed for reasons
#' of performance as this function is called frequently by
#' \code{\link[randRotation:initRandrot]{initRandrot}} when weights are used.
#' See \insertCite{Hettegger2021}{randRotation} and \insertCite{Langsrud2005}{randRotation} for further details.
#'
#' @references \insertAllCited{}
#' @author Peter Hettegger
#' @examples
#' design <- cbind(1, rep(0:1, 5))
#' X_decomp(design)

X_decomp <- function(X = NULL, coef.d = seq_len(ncol(X)-1))
{
  qrX <- qr(X)
  Xq <- qr.Q(qrX, complete = TRUE)

  list(Xd = Xq[, coef.d, drop = FALSE],
       Xhe = Xq[, setdiff(seq_len(ncol(Xq)), coef.d), drop = FALSE],
       rank = qrX$rank)
}


initRandrot <- function(Y = NULL, X = NULL, coef.h = NULL, weights = NULL,
                        cormat = NULL)
{

  if(is.null(X)) stop("Please specify X")
  if(any(dim(X) < 1)) stop("Dimensions of X (design matrix) invalid")
  if(is.null(Y) || ncol(Y) != nrow(X))
    stop("Number of rows in X (design matrix) and number of columns in Y do not match")

  if(is.null(coef.h)){
    if(!is.null(attr(X, "coef.h"))){
      coef.h <- attr(X, "coef.h")
    } else {
      coef.h <- seq_len(ncol(X))
    }
  }
  if(length(coef.h) < 1) stop("length(coef.h) must be at least 1")
  if(length(coef.h) == 1 && coef.h == -1) coef.h = NULL
  if(!is.null(coef.h) && !all(coef.h %in% seq_len(ncol(X))))
    stop("coef.h not contained in X (design matrix)")


  if(is.null(colnames(X))) colnames(X) <- seq_len(ncol(X))

  if(nrow(X) != ncol(Y)) stop("nrow(X) must match ncol(Y)")

  if(any(is.na(Y)))
    warning("Missing values found in Y. Missing values are tolerated as much as possible, but should be avoided. See \"initRandrot\" help page.")
  if(any(is.na(X)))
    warning("Missing values found in X. Missing values are tolerated as much as possible, but should be avoided. See \"initRandrot\" help page.")
  if(any(is.na(weights)))
    warning("Missing values found in weights. Missing values are tolerated as much as possible, but should be avoided. See \"initRandrot\" help page.")
  if(any(is.na(cormat)))
    warning("Missing values found in cormat. Missing values are tolerated as much as possible, but should be avoided. See \"initRandrot\" help page.")


  coef.h <- sort(coef.h)
  coef.d <- setdiff(seq_len(ncol(X)), coef.h)
  if(length(coef.d > 0) && length(coef.h > 0) && (min(coef.h) < max(coef.d))){
    message("coef.h ", ifelse(length(coef.d)==1, "does", "do"), " not correspond to the last columns in the design matrix.\nColumns of design matrix are rearranged.")
    X <- X[, c(coef.d, coef.h), drop = FALSE]
    coef.d <- seq_len(length(coef.d))
    coef.h <- seq_len(length(coef.h)) + length(coef.d)
  }

  rank.xd <- qr(X[,coef.d])$rank
  if(rank.xd < length(coef.d)){
    coef.d <- seq_len(qr(X[,coef.d])$rank)
    warning("Partitioned design matrix X[,coef.d] does not have full rank, see ?initRandrot.")
  }


  # Cholesky factorisation of correlationmatrix for whitening transformation
  if(!is.null(cormat)){
    if(any(abs(cormat) > 1) | !isSymmetric(cormat))
      stop("cormat has elements >1, < -1 and/or is not symmetric")
    if(ncol(cormat) != nrow(X))
      stop("Dimensions of cormat must match sample size")
    if(all(abs(cormat[upper.tri(cormat),drop = FALSE]) < 1e-10))
      warning("all off-diagonal entries of cormat are < 1e-10 or > -1e-10")
    if(!all(diag(cormat) == 1))
      stop("Correlation matrix must have 1s in the diagonal.")

    ### cormat = t(tcholC) %*% tcholC, to be in usual naming convention of
    ### cholesky decomposition (x = LL')
    tcholC <- chol(cormat)
    cholCinv <- drop(forwardsolve(t(tcholC), diag(ncol(tcholC))))
    tcholC <- drop(tcholC)
  } else {
    tcholC <- NULL
    cholCinv <- NULL
  }


  if(!is.null(weights)) return(
    initRandrotW(Y = Y, X = X, coef.h = coef.h, coef.d = coef.d,
                   weights = weights, cormat = cormat, cholCinv = cholCinv,
                   tcholC = tcholC))

  #### (Whitening) transformation of X and Y
  if(!is.null(cormat)){
    Y.w <- Y %*% t(cholCinv)
    X.w <- cholCinv %*% X
  } else {
    Y.w <- Y
    X.w <- X
  }

  #### QR-decomposition of X
  decomp <- X_decomp(X.w, coef.d)

  if(ncol(decomp$Xhe) < 1) warning("Dimension of random rotation matrix is 0x0.")
  if(ncol(decomp$Xhe) == 1) warning("Dimension of random rotation matrix is 1x1.")

  Yd <- Y.w %*% decomp$Xd %*% t(decomp$Xd)
  dimnames(Yd) <- dimnames(Y)

  Xhe.Y.w <- t(decomp$Xhe) %*% t(Y.w)

  new("initRandrot", list(X = X, Xhe.Y.w = Xhe.Y.w, Yd = Yd, Xhe = decomp$Xhe,
                           coef.d = coef.d, coef.h = coef.h, cormat = cormat,
                           tcholC = tcholC, rank = decomp$rank))
}

#' Internal function
#'
#' @inheritParams initRandrot
#' @param coef.d Determined coefficients. These are all other coefficients that
#'   are not hypothesis coefficients (see also
#'   \code{\link[randRotation:initRandrot]{initRandrot}}).
#' @param cholCinv Inverse of the Cholesky factorisation of \code{cormat}.
#' @param tcholC Transposed of the Cholesky factorisation of \code{cormat}.
#'
#' @return An initialised
#'   \code{\link[randRotation:initRandrot-class]{initRandrotW}} object.
#' @author Peter Hettegger
#' @keywords internal
#' @examples
#' # For further examples see '?rotateStat' and package vignette.
#'
#' #set.seed(0)
#'
#' # Dataframe of phenotype data (sample information)
#' # We simulate 2 sample classes processed in 3 batches
#' pdata <- data.frame(phenotype = rep(c("Control", "Cancer"), c(5,5)))
#' features <- 100
#'
#' # Matrix with random gene expression data
#' edata <- matrix(rnorm(features * nrow(pdata)), features)
#' rownames(edata) <- paste("feature", 1:nrow(edata))
#'
#' mod1 <- model.matrix(~phenotype, pdata)
#'
#' ### Simulate weights
#' weights <- matrix(rbeta(features * nrow(pdata), 2, 2)+0.1, features)
#'
#' # Initialisation of the random rotation class
#' init1 <- initRandrot(Y = edata, X = mod1, coef.h = 2,
#'                         weights = weights)
#' init1
initRandrotW <- function(Y, X, coef.h, coef.d, weights, cormat,
                           cholCinv, tcholC)
{
  if(any(weights <= 0) | any(!is.finite(weights)))
    stop("Weights must be finite > 0")
  if(!all(dim(weights) == dim(Y)))
    stop("Y and weights must have equal dimensions.")

  w <- sqrt(weights)



  #### (Whitening) transformation of Y and X and QR-decomposition of X
  if(!is.null(cormat)){
    Y.w <- (w * Y) %*% t(cholCinv)

    decomp.list <- apply(w, 1, function(w.i){
      X_decomp(cholCinv %*% (w.i * X), coef.d)
    })
  } else {
    Y.w <- (w * Y)

    decomp.list <- apply(w, 1, function(w.i){
      X_decomp(w.i * X, coef.d)
    })
  }


  Yd <- t(vapply(seq_len(nrow(Y.w)), function(i){
    Y.w[i,,drop = FALSE] %*% decomp.list[[i]]$Xd %*% t(decomp.list[[i]]$Xd)
  }, numeric(ncol(Y.w))))

  dimnames(Yd) <- dimnames(Y)

  Xhe.Y.w <- vapply(seq_len(nrow(Y.w)), function(i){
    if(ncol(decomp.list[[i]]$Xhe) < 1)
      warning(paste0("Feature ", i, ": Dimension of random rotation matrix is 0x0."))
    if(ncol(decomp.list[[i]]$Xhe) == 1)
      warning(paste0("Feature ", i, ": Dimension of random rotation matrix is 1x1."))

    Y.w[i,,drop = FALSE] %*% decomp.list[[i]]$Xhe
  }, numeric(ncol(Y.w)-length(coef.d)))

  new("initRandrotW", list(X = X, Xhe.Y.w = Xhe.Y.w, Yd = Yd,
                             decomp.list = decomp.list, coef.d = coef.d,
                             coef.h = coef.h, cormat = cormat, tcholC = tcholC,
                             w = w))
}


#' Initialisation of a random rotation Object
#'
#' Initialization of a linear model for subsequent generation of randomly
#' rotated data (\code{\link[randRotation:randrot]{randrot}}) associated with
#' the null hypothesis \eqn{H_{0}: \beta_{coef.h} = 0}{H0: \beta_coef.h = 0}.
#' Basics of rotation tests are found in \insertCite{Hettegger2021}{randRotation} and
#' \insertCite{Langsrud2005}{randRotation}.
#'
#' @param Y a data matrix with \code{features x samples} dimensions or a list
#'   with elements \code{E}, \code{design} and \code{weights} (see
#'   \code{Details}). Missing values (\code{\link[base:NA]{NA}}) are allowed but
#'   e.g. lead to NAs for all samples of the respective features in the rotated
#'   dataset and should thus be avoided. We highly recommend avoiding missing
#'   values by e.g. replacing them by imputation or removing features containing
#'   NAs.
#' @param X the design matrix of the experiment with \code{samples x
#'   coefficients} dimensions. For \code{initBatchRandrot}, specify the design
#'   matrix without the batch variable. A warning is generated if
#'   \code{X[,coef.d]} does not have full rank, see Details.
#' @param coef.h single integer or vector of integers specifying the "hypothesis
#'   coefficients" (\code{H0} coefficients). \code{coef.h} should correspond to
#'   the last columns in \code{X} (see \code{Details}). If available,
#'   \code{attr(X, "coef.h")} is used, see
#'   \code{\link[randRotation:contrastModel]{contrastModel}}. By default, all
#'   coefficients are set as \code{H0} coefficients. If \code{coef.h} is set
#'   \code{-1}, no coefficient is set as \code{H0} coefficient.
#' @param weights numerical matrix of finite positive weights > 0 (as in
#'   weighted least squares regression. Dimensions must be equal to dimensions
#'   of \code{Y}.
#' @param cormat the sample correlation matrix with \code{samples x samples}
#'   dimensions. Must be a real symmetric positive-definite square matrix. See
#'   \code{Details} for usage in \code{initBatchRandrot}.
#'
#' @rdname initRandrot
#' @return An initialised
#'   \code{\link[randRotation:initRandrot-class]{initRandrot}},
#'   \code{\link[randRotation:initRandrot-class]{initRandrotW}} or
#'   \code{\link[randRotation:initBatchRandrot-class]{initBatchRandrot}}
#'   object.
#' @author Peter Hettegger
#' @references \insertAllCited{}
#'
#' @details
#'
#' This function performs basic initial checks and preparatory calculations for
#' random rotation data generation.
#' Nomenclature of variables is mainly as in
#' \insertCite{Langsrud2005}{randRotation} and \insertCite{Hettegger2021}{randRotation}. See also package vignette for
#' application examples.
#'
#' \code{Y} can also be a list with elements \code{E}, \code{design} and
#' \code{weights}. \code{Y$E} is thereby used as \code{Y}, \code{Y$design} is
#' used as \code{X} and \code{Y$weights} is used as \code{weights}. By this, the
#' functions are compatible with results from e.g. \code{voom} (\code{limma}
#' package), see \code{Examples}.
#'
#' \code{coef.h} specifies the model coefficients associated with the null
#' hypothesis ("hypothesis coefficients"). All other model coefficients are
#' considered as "determined coefficients" \code{coef.d}
#' \insertCite{Langsrud2005}{randRotation}. The design matrix is rearranged so
#' that \code{coef.h} correspond to the last columns of the design matrix and
#' \code{coef.d} correspond to the first columns of the design matrix. This
#' is necessary for adequate transformation of the combined null-hypothesis
#' \eqn{H_{0}: \beta_{coef.h} = 0}{H0: \beta_coef.h = 0} by QR decomposition.
#' If \code{X[,coef.d]} does not have full rank, a warning is generated and
#' \code{coef.d} is set to \code{coef.d <- seq_len(qr(X[,coef.d])$rank)}.
#'
#'
#' Weights must be finite positive \code{numerics} greater zero. This is
#' necessary for model (QR) decomposition and for back transformation of the
#' rotated data into the original variance structure, see also
#' \code{\link[randRotation:randrot]{randrot}}. Weights as estimated e.g. by
#' voom \insertCite{Law2014}{randRotation} are suitable and can be used without
#' further processing. Note that due to the whitening transformation (i.e. by
#' using the arguments \code{weights} and/or \code{cormat}) the rank of the
#' transformed (whitened) design matrix \code{X} could change (become smaller),
#' which could become dangerous for the fitting procedures. If you get errors
#' using \code{weights} and/or \code{cormat}, try the routine without using
#' \code{weights} and/or \code{cormat} to exclude this source of errors.
#'
#'
#' The following section provides a brief summary how rotations are calculated.
#' A more general introduction is given in
#' \insertCite{Langsrud2005}{randRotation}. For reasons of readability, we omit
#' writing \code{\%*\%} for matrix multiplication and write \code{*} for
#' transposed matrix. The rotation is done by multiplying the \code{features x
#' samples} data matrix \code{Y} with the transpose of the restricted random
#' rotation matrix \code{Rt}
#'
#' \code{Rt = Xd Xd* + [Xh  Xe] R [Xh  Xe]*}
#'
#' with \code{R} being a (reduced) random rotation matrix and \code{Xd},
#' \code{Xh} and \code{Xe} being columns of the full QR decomposition of the
#' design matrix \code{X}. \code{[Xd Xh Xe] = qr.Q(qr(X), complete = TRUE)},
#' where \code{Xd} correspond to columns \code{coef.d}, \code{Xh} to columns
#' \code{coef.h} and \code{Xe} to the remaining columns.
#'
#' If \code{weights} and/or \code{cormat} are specified, each feature
#' \code{Y[i,]} and the design matrix \code{X} are whitening transformed before
#' rotation. The whitening matrix \code{T} is defined as \code{T = solve(C) w},
#' where \code{solve(C)} is the inverse Cholesky decompostion of the correlation
#' matrix (\code{cormat = CC*}) and \code{w} is a diagonal matrix of the square
#' roots of the sample weights for the according feature (\code{w =
#' diag(sqrt(weights[i,]}))).
#'
#' The rotated data for one feature \code{y.r[i,]} is thus calculated as
#'
#' \code{y.r[i,] = ( solve(T) Rt T (y[i,])* )*} and \code{[Xd Xh Xe] =
#' qr.Q(qr(TX), complete = TRUE)}
#'
#' For \code{weights = NULL} and \code{cormat = NULL}, \code{T} is the identity
#' matrix.
#'
#' Note that a separate QR decomposition is calculated for each feature if
#' \code{weights} are specified. The restricted random orthogonal matrix
#' \code{Rt} is calculated with the same reduced random orthogonal matrix \code{R}
#' for all features.
#'
#' @export
#' @seealso \code{\link[randRotation:randrot]{randrot}},
#'   \code{\link[randRotation:rotateStat]{rotateStat}}
#'
#' @import methods
#' @importFrom stats approx ecdf p.adjust ppoints qqline qqplot quantile qunif
#'   rnorm spline
#' @importFrom utils head
#' @examples
#' # For further examples see '?rotateStat' and package vignette.
#'
#' # Example 1: Compatibility with limma::voom
#'
#' \dontrun{
#' v <- voom(counts, design)
#' ir <- initRandrot(v)}
#'
#' # Example 2:
#'
#' #set.seed(0)
#'
#' # Dataframe of phenotype data (sample information)
#' # We simulate 2 sample classes processed in 3 batches
#' pdata <- data.frame(batch = rep(1:3, c(10,10,10)),
#'                    phenotype = rep(c("Control", "Cancer"), c(5,5)))
#' features <- 100
#'
#' # Matrix with random gene expression data
#' edata <- matrix(rnorm(features * nrow(pdata)), features)
#' rownames(edata) <- paste("feature", 1:nrow(edata))
#'
#' mod1 <- model.matrix(~phenotype, pdata)
#'
#' # Initialisation of the random rotation class
#' init1 <- initBatchRandrot(Y = edata, X = mod1, coef.h = 2, batch = pdata$batch)
#' init1
setGeneric("initRandrot", function(Y = NULL, X = NULL, coef.h = NULL,
                                    weights = NULL, cormat = NULL)
  {standardGeneric("initRandrot")})



#' Initialised random rotation class
#'
#' List-based S4 class containing all information necessary to generate randomly
#' rotated data with the \code{\link[randRotation:randrot]{randrot}} method.
#' \code{initRandrot} and \code{initRandrotW} objects are created with the
#' \code{\link[randRotation:initRandrot]{initRandrot}} method.
#' @section Components: The following components are included as list elements:
#' \describe{
#'   \item{\code{X}}{Original (non-transformed) design matrix.}
#'   \item{\code{Xhe}, \code{Xhe.Y.w}, \code{Yd}}{Pre-multiplied matrix products needed for generation of rotated data (\code{\link[randRotation:randrot]{randrot}}).}
#'   \item{\code{coef.h}, \code{coef.d}}{Indices of \eqn{H_0}{H0} coefficients (\code{coef.h} or "hypothesis coefficients") and indices of all other coefficients (\code{coef.d} or "determined coefficients").}
#'   \item{\code{cormat}}{Correlation matrix, see \code{\link[randRotation:initRandrot]{initRandrot}}.}
#'   \item{\code{tcholC}}{Cholesky decomposition of cormat: \code{cormat = crossprod(tcholC)}.}
#'   \item{\code{rank}}{Rank of the qr decomposition of (transformed/whitened) \code{X}}
#' }
#' @export
#' @rdname initRandrot-class
setClass("initRandrot", contains = "list")

#' Initialised random rotation class with weights
#'
#' \code{initRandrotW} is organised as its base class \code{initRandrot},
#' altough some components are changed or added.
#' @section Components:
#' The following components are changed or added in \code{initRandrotW-class}
#' as compared to \code{initRandrot-class}:
#' \describe{
#'   \item{\code{decomp.list}}{List containing \code{Xd}, \code{Xhe} and rank of the transformed/whitened design matrix for each feature, see also \code{\link[randRotation:X_decomp]{X_decomp}}.}
#'   \item{\code{w}}{Numeric matrix with dimensions \code{features x samples} containing component wise square root of the weight matrix, see \code{\link[randRotation:initRandrot]{initRandrot}}.}
#' }
#' @export
#' @rdname initRandrot-class
#' @author Peter Hettegger
setClass("initRandrotW", contains = c("initRandrot", "list"))



#' Initialised random rotation batch object
#'
#' This class contains \code{initRandrot} or \code{initRandrotW} class
#' objects for each batch. See also descriptions in
#' \code{\link[randRotation:initRandrot]{initRandrot}} and
#' \code{\link[randRotation:initRandrot-class]{initRandrot-class}}.
#' @section Components:
#' \describe{
#'   \item{\code{batch.obj}}{List of \code{initRandrot} or \code{initRandrotW} class objects for each batch.}
#'   \item{\code{split.by}}{List of sample indices for each batch.}
#' }
#' @export
#' @author Peter Hettegger
setClass("initBatchRandrot", contains = c("list"))


#' Rotated object containing rotated and non-rotated statistics
#'
#' This list based class contains calculated statistics for the original data
#' (\code{s0}) and rotated data (\code{stats}). See also
#' \code{\link[randRotation:rotateStat]{rotateStat}}.
#' @section Components:
#' \describe{
#'   \item{\code{s0}}{Calculated statistics for original (non-rotated) data as returned by the \code{statistic} function (\code{\link[randRotation:rotateStat]{rotateStat}}).}
#'   \item{\code{stats}}{List of length \code{ncol.s} containing statistics on rotated data for each column returned by the \code{statistic} function.}
#'   \item{\code{ncol.s}}{Number of columns returned by the \code{statistic} function.}
#'   \item{\code{R}}{Number of resamples/rotations.}
#' }
#' @export
#' @author Peter Hettegger
setClass("rotateStat", contains = "list")




initBatchRandrot <- function(Y = NULL, X = NULL, coef.h = NULL, batch = NULL,
                               weights = NULL, cormat = NULL)
{
  if(is.null(batch) || (length(batch) != ncol(Y)))
    stop("Please specify batch variable with length(batch) being equal to ncol(Y)")

  tmp1 <- seq_along(batch)
  names(tmp1) <- colnames(Y)
  spl1 <- split(tmp1, batch)


  Y.l <- lapply(spl1, function(i){
    Y[,i,drop = FALSE]
  })

  X.l <- lapply(spl1, function(i){
    X[i,,drop = FALSE]
  })

  weights.l <- lapply(spl1, function(i){
    weights[,i,drop = FALSE]
  })


  initRandrot.l <- lapply(seq_len(length(spl1)), function(i){
    cat("Initialising batch \"", names(spl1)[i], "\"\n", sep = "")
    initRandrot(Y = Y.l[[i]], X = X.l[[i]], coef.h = coef.h,
                 weights = weights.l[[i]], cormat = cormat[[i]])
  })


  new("initBatchRandrot", list(batch.obj = initRandrot.l, split.by = spl1))

}



#' @param batch Batch covariate of the same length as \code{ncol(Y)}.
#' @export
#' @details When using \code{initBatchRandrot}, \code{initRandrot} is called
#'   for each batch separately. When using \code{initBatchRandrot} with
#'   \code{cormat}, \code{cormat} needs to be a list of correlation matrices
#'   with one matrix for each batch. Note that this implicitly assumes a block
#'   design of the sample correlation matrix, where sample correlation
#'   coefficients between batches are zero. For a more general sample
#'   correlation matrix, allowing non-zero sample correlation coefficients
#'   between batches, see package vignette. Batches are split according to
#'   \code{split(seq_along(batch), batch)}.
#'
#' @rdname initRandrot
#'
#' @examples # See '?rotateStat'
setGeneric("initBatchRandrot",
           function(Y = NULL, X = NULL, coef.h = NULL, batch = NULL,
                    weights = NULL, cormat = NULL) {standardGeneric("initBatchRandrot")})


#' @export
#' @rdname initRandrot
setMethod("initBatchRandrot", "list",
          function(Y = NULL, X = Y$design, coef.h = NULL, batch = NULL, weights = Y$weights, cormat = NULL){
            initBatchRandrot(Y = Y$E, X = X, coef.h = coef.h, batch = batch, weights = weights, cormat = cormat)
          })



#' @export
#' @rdname initRandrot
setMethod("initRandrot", "list",
          function(Y = NULL, X = Y$design, coef.h = NULL, weights = Y$weights, cormat = NULL){
            initRandrot(Y = Y$E, X = X, coef.h = coef.h, weights = weights, cormat = cormat)
          })





#' Create transformed model matrix for contrast rotation
#'
#' This function takes a model matrix \code{X} and a contrast matrix \code{C}
#' and creates a transformed model matrix corresponding to a transformed set
#' of coefficients.
#'
#' @param X \code{(numeric)} model matrix with dimensions
#' \code{samples x coefficients}.
#' @param C \code{(numeric)} contrast matrix with dimensions
#' \code{coefficients x contrasts}. The contrast matrix must have full column rank.
#' @param coef.h column numbers of contrasts (in \code{C}) which should be set
#' as \code{coef.h} in the transformed model, see
#' \code{\link[randRotation:initRandrot]{initRandrot}}. All columns are set as
#' \code{coef.h} by default.
#'
#' @details
#' The last n coefficients of the transformed model matrix correspond to the n
#' contrasts. By default, all contrasts are set as \code{coef.h}.
#' See package vignette for examples of data rotations with contrasts.
#'
#' @return A transformed model matrix with \code{coef.h} set as attribute.
#' @export
#' @author Peter Hettegger
#' @examples
#'
#' group <- c("A", "A", "B", "B")
#' X <- model.matrix(~0+group)
#' C <- cbind(contrast1 = c(1, -1))
#' X2 <- contrastModel(X, C)
contrastModel <- function(X, C, coef.h = seq_len(ncol(C))){

  if(!(is.matrix(X)) || (!is.matrix(C)))
    stop("X and C must be numeric matrices.")

  stopifnot(ncol(X) == nrow(C))

  c1 <- ncol(C)
  k1 <- nrow(C)

  qr1 <- qr(C)

  if(qr1$rank != c1) stop("C must have full column rank.")

  Q.s <- qr.Q(qr1, TRUE)
  Q.s <- Q.s[,c((c1+1):k1, seq_len(c1))]

  R <- qr.R(qr1)
  R.sti <- diag(k1)
  R.sti[(k1-c1+1):k1, (k1-c1+1):k1] <- forwardsolve(t(R), diag(ncol(R)))

  X.transformed <- X %*% Q.s %*% R.sti
  coef.h <- ((k1-c1+1):k1)[coef.h]
  attr(X.transformed, "coef.h") <- coef.h
  colnames(X.transformed)[coef.h] <- colnames(C)
  X.transformed
}




#' Show an Object
#'
#' Display the object by printing structured summary information.
#'
#' @param object An object of class
#'   \code{\link[randRotation:initRandrot-class]{initRandrot-class}},
#'   \code{\link[randRotation:initRandrot-class]{initRandrotW-class}} or
#'   \code{\link[randRotation:initBatchRandrot-class]{initBatchRandrot-class}}.
#'
#'
#' @export
#' @rdname show
#' @return \code{show} returns an invisible \code{NULL}.
#' @details The show method always displays the original design matrix
#'   (\code{X}), not the transformed (whitened) versions.

setMethod("show", "initRandrot",
          function(object)
          {
            cat("Initialised random rotation object", if(!is.null(object$w))"(with weights)","\n\n")
            cat(dim(object$Yd)[1], "features   ", dim(object$Yd)[2], "samples\n\n")
            cat("Coefficients to test (coef.h):", colnames(object$X)[object$coef.h], "\n\n")

            cat("design matrix (X):\n")
            print(head(object$X, n = 6))
            if(nrow(object$X) > 6)cat(".\n.\n",nrow(object$X)-6, "more rows\n")

            if(!is.null(object$cormat)){
              cat("\ncorrelation matrix (cormat) - showing first 6x6 entries:\n")
              show.i <- seq_len(min(6, nrow(object$cormat)))
              show.j <- seq_len(min(6, ncol(object$cormat)))
              print(object$cormat[show.i, show.j, drop = FALSE])
              cat("\n")
            }

            if(!is.null(object$w)){
              cat("\nweights - showing first 6x6 entries:\n")
              show.i <- seq_len(min(6, nrow(object$w)))
              show.j <- seq_len(min(6, ncol(object$w)))
              print((object$w[show.i, show.j, drop = FALSE])^2)
              cat("\n")
            }

            cat("\n\n")

            invisible() # return NULL

          }
)


#' @rdname show
#' @export

setMethod("show", "initBatchRandrot",
          function(object)
          {
            cat("Initialised batch random rotation ojbect - In the following, the structure of each batch is listed separately:\n\n")

            invisible(lapply(seq_along(object$batch.obj), function(i){
              cat("##########################\n")
              cat("     BATCH ", i)
              cat("\n##########################\n\n")

              show(object$batch.obj[[i]])
            }))
            invisible() # return NULL
          }
)

#' @rdname show
#' @export

setMethod("show", "rotateStat",
          function(object)
          {
            cat("Rotate stat object\n\n")
            cat("R =", object$R)
            cat("\n\ndim(s0):", dim(object$s0))
            cat("\n\nStatistic:\n")
            cat(show(object$statistic))
            cat("\nCall:\n")
            cat(show(object$call))
            invisible() # return NULL
          }
)


#' Extract model weights
#'
#' \code{weights} is a generic function which extracts fitting weights from objects returned by modeling functions.
#' NOTE: This man page is for the weights S4 generic function defined in the \link[randRotation:randRotation-package]{randRotation} package.
#'
#' @inheritParams show,initRandrot-method
#' @param ... Kept for compatibility with the default method, see
#' \code{?stats::\link[stats:weights]{weights}}.
#' For objects defined in package \link[randRotation:randRotation-package]{randRotation},
#' this argument is currently not needed.
#'
#' @export
#' @rdname weights
#' @return Weights extracted from the object \code{object}. \code{NULL} if no
#' weights were specified.
#' See \code{?stats::\link[stats:weights]{weights}} for the value returned by the default method.
#'
#' @examples
#' weights
#' showMethods("weights")
#' selectMethod("weights", "ANY")  # the default method
#'
setMethod("weights", "initRandrot",
          function(object,...){
            object$w^2
          }
)

#' @rdname weights
#' @export
setMethod("weights", "initBatchRandrot",
          function(object,...){
            do.call(cbind, lapply(object$batch.obj, weights))
          }
)



#' Random rotation of initialised object
#'
#' Perform random data rotation of a previously initialised object (see
#' \code{\link[randRotation:initRandrot]{initRandrot}}) associated with the
#' null hypothesis \eqn{H_{0}: \beta_{coef.h} = 0}{H0: \beta_coef.h = 0}.
#'
#'
#' @param object An initialised object of class
#'   \code{\link[randRotation:initRandrot-class]{initRandrot-class}} or
#'   \code{\link[randRotation:initBatchRandrot-class]{initBatchRandrot-class}}.
#'
#' @param ... further arguments passed to
#'   \code{\link[randRotation:randorth]{randorth}}
#'
#' @return \code{numeric} matrix of rotated data under the specified combined
#'   null hypothesis.
#' @export
#' @rdname randrot
#' @details
#'
#' This function generates a randomly rotated dataset from an initialised
#' randrot object (see \code{\link[randRotation:initRandrot]{initRandrot}}).
#' See also package vignette for application examples. Only the numerical matrix
#' of rotated data is returned, no design matrix, weights or other info is
#' return for efficiency purposes. Please consider that, if you e.g. use
#' \code{weights} or if you use
#' \code{\link[randRotation:rotateStat]{rotateStat}}, you may need to forward
#' the design matrix \code{X}, \code{weights} etc. to subsequent analyses. See
#' the example in \code{\link[randRotation:rotateStat]{rotateStat}}.
#'
#' Details on the calculation of a rotated dataset are given in
#' \code{\link[randRotation:initRandrot]{initRandrot}},
#' \insertCite{Langsrud2005}{randRotation} and \insertCite{Hettegger2021}{randRotation}.
#'
#' @author Peter Hettegger
#' @references \insertAllCited{}
#' @examples
#' # For further examples see '?rotateStat' and package vignette.
#'
#' #set.seed(0)
#'
#' # Dataframe of phenotype data (sample information)
#' # We simulate 2 sample classes processed in 3 batches
#' pdata <- data.frame(batch = rep(1:3, c(10,10,10)),
#'                    phenotype = rep(c("Control", "Cancer"), c(5,5)))
#' features <- 100
#'
#' # Matrix with random gene expression data
#' edata <- matrix(rnorm(features * nrow(pdata)), features)
#' rownames(edata) <- paste("feature", 1:nrow(edata))
#'
#' mod1 <- model.matrix(~phenotype, pdata)
#'
#' # Initialisation of the random rotation class
#' init1 <- initBatchRandrot(Y = edata, X = mod1, coef.h = 2,
#'                             batch = pdata$batch)
#' init1
#'
#' ### Fit model to original data
#'
#' fit.orig <- lm.fit(mod1, t(edata))
#' head(t(coef(fit.orig)))
#'
#' ### Fit model to rotated data
#'
#' edata.rot <- randrot(init1)
#' fit.rot <- lm.fit(mod1, t(edata.rot))
#' head(t(coef(fit.rot)))
#'
#'
#' # Note that the coefficients stay equal if we regress only on the
#' # non-hypothesis coefficients
#'
#' mod0 <- model.matrix(~1, pdata)
#' fit.orig0 <- lm.fit(mod0, t(edata))
#' fit.rot0  <- lm.fit(mod0, t(edata.rot))
#' head(t(coef(fit.orig0)))
#' head(t(coef(fit.rot0)))
#'
setGeneric("randrot", function(object, ...) standardGeneric("randrot"))






#' @export
#' @rdname randrot

setMethod("randrot", "initRandrot",
          function(object,...){
            ### No excessive initial checks for efficiency purposes.
            R <- randorth(nrow(object$Xhe.Y.w),...)

            if(is.null(object$cormat))
              (object$Yd + t(object$Xhe %*% R %*% object$Xhe.Y.w))
            else
              (object$Yd + t(object$Xhe %*% R %*% object$Xhe.Y.w)) %*% object$tcholC
          })


#' @rdname randrot
#' @export
setMethod("randrot", "initRandrotW",
          function(object,...){
            ### No excessive initial checks for efficiency purposes.
            R <- randorth(nrow(object$Xhe.Y.w),...)

            R.Xhe.Y.w <- R %*% object$Xhe.Y.w

            n <- length(object$decomp.list)
            Yhe <- t(vapply(seq_len(n), function(i){
              object$decomp.list[[i]]$Xhe %*% R.Xhe.Y.w[,i,drop = FALSE]
            }, numeric(ncol(object$Yd))))

            # de-withening of Y.w
            if(is.null(object$cormat))
              (1/object$w) *  (object$Yd + Yhe)
            else
              (1/object$w) * ((object$Yd + Yhe) %*% object$tcholC)
          })




#' @rdname randrot
#' @export
setMethod("randrot", "initBatchRandrot",
          function(object,...){
            ord1 <- order(unlist(object$split.by))

            ### Perform randrot for each batch an sort back to initial order as
            ### in the design matrix.
            do.call(cbind,
                    lapply(object$batch.obj, randrot, ...)
                    )[,ord1,drop=FALSE]
          })




#' Generate data rotations and calculate statistics on it
#'
#' This function generates rotations of data and calculates the provided \code{statistic} on each rotation and the non-rotated (original) data.
#' This is the central function of the package.
#'
#' @param initialised.obj An initialised random rotation object as returned by \code{\link[randRotation:initRandrot]{initRandrot}} and \code{\link[randRotation:initRandrot]{initBatchRandrot}}.
#' @param R The number of resamples/rotations. Single \code{numeric} larger than 1.
#' @param statistic A function which takes a data matrix (same dimensions as \code{Y} - see also \code{\link[randRotation:initRandrot]{initRandrot}}) as first argument and returns a statistic of interest. Any further arguments are passed to it by \code{...}.
#' We highly recommend using pivotal quantities as \code{statistic} if possible (see also \code{Details} in \code{\link[randRotation:pFdr]{pFdr}}).
#' Note that \code{\link[randRotation:pFdr]{pFdr}} considers larger values of statistics as more significant, so one-tailed tests may require reversal of the sign and two-tailed tests may require taking absolute values, see \code{Examples}.
#' The results of \code{statistic} for each resample are finally combined with \code{as.matrix} and \code{cbind}, so ensure that \code{statistic} returns either a vector or a matrix.
#' Results with multiple columns are possible and handled adequately in subsequent functions (e.g. \code{\link[randRotation:pFdr]{pFdr}}).
#' @param ... Further named arguments for \code{statistic} which are passed unchanged each time it is called.
#' Avoid partial matching to arguments of \code{rotateStat}. See also the \code{Examples}.
#' @param parallel \code{logical} if parallel computation should be performed, see details for use of parallel computing.
#' @param BPPARAM An optional \code{\link[BiocParallel:BiocParallelParam-class]{BiocParallelParam}} instance, see documentation of \code{BiocParallel} package of Bioconductor.
#'
#' @return An object of class \code{\link[randRotation:rotateStat-class]{rotateStat}}.
#' @rdname rotateStat
#' @export
#' @details
#'
#' The function takes an initialised randrot object
#' (\code{\link[randRotation:initRandrot]{initRandrot}}) and a function that
#' calculates a statistic on the data. The statistic function thereby takes the
#' a matrix \code{Y} as first argument. Any further arguments are passed to it
#' by \code{...}.
#'
#' Together with \code{\link[randRotation:pFdr]{pFdr}}, this function implements
#' the \code{workflow} described in \insertCite{Hettegger2021}{randRotation}.
#'
#' Be aware that only data is rotated (see also
#' \code{\link[randRotation:randrot]{randrot}}), so any additional information
#' including \code{weights}, \code{X} etc. need to be provided to
#' \code{statistic}. See also package vignette and \code{Examples}.
#'
#' Parallel processing is implemented with the BiocParallel package of Bioconductor.
#' The default argument \code{\link[BiocParallel:register]{BiocParallel::bpparam()}} for \code{BPPARAM}
#' returns the registered default backend.
#' See package documentation for further information and usage options.
#' If \code{parallel = TRUE} the function calls in \code{statistic} need to be called explicitly
#' with package name and "::". So e.g. calling \code{lmFit} from the
#' \code{limma} package is done with \code{limma::lmFit(...)}, see also the
#' examples in the package vignette.
#'
#' @author Peter Hettegger
#' @references \insertAllCited{}
#' @examples
#' #set.seed(0)
#'
#' # Dataframe of phenotype data (sample information)
#' # We simulate 2 sample classes processed in 3 batches
#' pdata <- data.frame(batch = rep(1:3, c(10,10,10)),
#'                    phenotype = rep(c("Control", "Cancer"), c(5,5)))
#' features <- 100
#'
#' # Matrix with random gene expression data
#' edata <- matrix(rnorm(features * nrow(pdata)), features)
#' rownames(edata) <- paste("feature", 1:nrow(edata))
#'
#' mod1 <- model.matrix(~phenotype, pdata)
#'
#' # Initialisation of the random rotation class
#' init1 <- initBatchRandrot(Y = edata, X = mod1, coef.h = 2, batch = pdata$batch)
#' init1
#'
#' # Definition of the batch effect correction procedure with subsequent calculation
#' # of two-sided test statistics
#' statistic <- function(., batch, mod, coef){
#'
#'   # The "capture.output" and "suppressMessages" simply suppress any output
#'   capture.output(suppressMessages(
#'     Y.tmp <- sva::ComBat(., batch = batch, mod)
#'   ))
#'
#'   fit1 <- lm.fit(mod, t(Y.tmp))
#'   abs(coef(fit1)[coef,])
#' }
#'
#' # We calculate test statistics for the second coefficient
#'
#' res1 <- rotateStat(initialised.obj = init1,
#'                     R = 10,
#'                     statistic = statistic,
#'                     batch = pdata$batch, mod = mod1, coef = 2)
#'
#' hist(pFdr(res1))

rotateStat <- function(initialised.obj, R = 10, statistic, ...,
                        parallel = FALSE,
                        BPPARAM = BiocParallel::bpparam()){

  if(R<1) stop("R must be at least 1")

  FUN <- {
    # This definition is necessary for parallel processing
    statistic
    initialised.obj
    function(j, ...) statistic(randrot(initialised.obj, I.matrix = (j == 1)), ...)
    }

  i <- seq_len(R+1)

  if(parallel){
    stats <- BiocParallel::bplapply(X = i, FUN = FUN, ..., BPPARAM = BPPARAM)
  } else {
    stats <- lapply(X = i, FUN = FUN, ...)
  }

  ## The "statistic" function can return results with multiple columns. Thus the
  ## results are organized accordingly in lists (one list element for each
  ## column).

  stats <- lapply(stats, as.matrix)

  s0 <- stats[[1]]
  ncol.s <- ncol(s0)

  stats <- stats[-1]

  stats <- lapply(
    seq_len(ncol.s),
    function(i)
      do.call(cbind,lapply(stats, function(tmp.stat) tmp.stat[,i,drop = FALSE]))
    )

  names(stats) <- colnames(s0)

  new("rotateStat", list(s0 = s0, stats = stats, ncol.s = ncol.s, R = R,
                          call = sys.call(), statistic = statistic,
                          w = (length(weights(initialised.obj)) > 0)))
}


#' Dimensions of an Object
#'
#' Retrieve the dimensions of an object.
#'
#' @param x An object of class
#'   \code{\link[randRotation:initRandrot-class]{initRandrot-class}} or
#'   \code{\link[randRotation:initBatchRandrot-class]{initBatchRandrot-class}}.
#'
#'
#' @export
#' @rdname dim
#' @return Vector of length two with number of \code{features} and number of
#'   \code{samples}. See also
#'   \code{\link[randRotation:initRandrot]{initRandrot}}.
setMethod("dim", "initRandrot", function(x)dim(x$Yd))

#' @export
#' @rdname dim
setMethod("dim", "initBatchRandrot",
          function(x)c(nrow(x$batch.obj[[1]]), sum(lengths(x$split.by))))

#' Dimnames of an Object
#'
#' Retrieve the dimnames of an object.
#'
#' @param x An object of class
#'   \code{\link[randRotation:initRandrot-class]{initRandrot-class}} or
#'   \code{\link[randRotation:initBatchRandrot-class]{initBatchRandrot-class}}.
#'
#'
#' @return A \code{list} with names of \code{features} and \code{samples}, see
#'   \code{\link[randRotation:initRandrot]{initRandrot}}.
#' @export
#' @rdname dimnames
setMethod("dimnames", "initRandrot", function(x)dimnames(x$Yd))


#' @export
#' @rdname dimnames
setMethod("dimnames", "initBatchRandrot",
          function(x)list(rownames(x$batch.obj[[1]]$Yd),
                          unlist(lapply(x$split.by, names), use.names = FALSE)))

#' Random orthogonal matrix
#'
#' Generation of a random orthogonal \code{n x n} matrix.
#'
#' @param n \code{numeric} of length 1 defining the dimensions of the \code{n x n} square matrix.
#' @param type Either \code{"orthonormal"} or \code{"unitary"} defining whether a real orthonormal matrix or a complex unitary matrix should be returned.
#' @param I.matrix If \code{TRUE}, the identity matrix is returned.
#'
#' @return A random orthogonal matrix of dimension \code{n x n}.
#' @export
#' @details
#' A random orthogonal matrix \code{R} is generated in order that \code{t(R)} (for \code{"orthonormal"}) or \code{Conj(t(R))} (for \code{"unitary"}) equals the inverse matrix of \code{R}.
#'
#' This function was adapted from the pracma package (\code{\link[pracma:randortho]{pracma::randortho}}).
#'
#' The random orthogonal matrices are distributed with Haar measure over
#' \code{O(n)}, where \code{O(n)} is the set of orthogonal matrices of order \code{n}.
#' The random orthogonal matrices are basically distributed "uniformly" in the
#' space of random orthogonal matrices of dimension \code{n x n}.
#' See also the \code{Examples} and \insertCite{Stewart1980a,Mezzadri2007}{randRotation}.
#'
#'
#' @references \insertAllCited{}
#' @author Peter Hettegger
#' @importFrom graphics smoothScatter
#' @examples
#'
#' # The following example shows the orthogonality of the random orthogonal matrix:
#' R1 <- randorth(4)
#' zapsmall(t(R1) %*% R1)
#'
#' R1 <- randorth(4, "unitary")
#' zapsmall(Conj(t(R1)) %*% R1)
#'
#' # The following example shows the distribution of 2-dimensional random orthogonal vectors
#' # on the unit circle.
#' tmp1 <- vapply(1:400, function(i)randorth(2)[,1], numeric(2))
#' plot(t(tmp1), xlab = "x", ylab = "y")
#'
randorth <- function (n, type = c("orthonormal", "unitary"), I.matrix = FALSE)
{
  if(I.matrix) return(diag(n))
  ### this function was adapted from the pracma package (randortho function)

  stopifnot(is.numeric(n), length(n) == 1, floor(n) == ceiling(n),
            n >= 0)

  if(n == 0) return(matrix(1,0,0))

  type <- match.arg(type)
  if (type == "orthonormal") {
    z <- matrix(rnorm(n^2), n)
  }
  else {
    z <- (matrix(rnorm(n^2), n) + (0+1i) * matrix(rnorm(n^2), n))
  }
  Z <- qr(z)
  q <- qr.Q(Z)
  r <- qr.R(Z)
  d <- diag(r)
  ph <- d/abs(d)
  q %*% diag(ph, length(ph))
}




#' Generate random permutation matrix for n samples
#'
#' Generate a random permutation matrix for \code{n} samples.
#'
#' @param n Number of samples
#'
#' @return A random permutation matrix of dimension \code{n x n}
#' @export
#' @details This methods generates an orthogonal matrix with only
#'   one entry in each row and column being \code{1}, all other entries being
#'   \code{0}.
#' @examples
#' tmp1 <- randpermut(5)
#' t(tmp1) %*% tmp1
#' @author Peter Hettegger
randpermut <- function(n){
  m1 <- matrix(0,n,n)
  m1[cbind(seq_len(n),sample(n))] <- 1
  m1
}




#' Estimation of degrees of freedom (df) for an arbitrary mapping function
#'
#' This function has been deprecated and will be defunct in the next release !
#' This function estimates the local degrees of freedom (df) of mapped data for an arbitrary mapping function.
#' The estimation is done for a set of selected features.
#'
#' @param data A numerical data matrix.
#' @param features Features for which the df should be estimated (default \code{sample(nrow(data),10)}).
#' @param mapping A mapping function that takes a matrix \code{features x samples} dimensions as first argument and returns a matrix of
#' mapped data with the same dimensions. Any further arguments can be passed to \code{mapping} through \code{...}.
#' @param ... Additional arguments passed to \code{mapping}.
#' @param delta A numeric delta for the finite differences (default \code{sqrt(.Machine$double.eps)}).
#'
#' @return A named numeric vector of estimated df for each feature. Names correspond to \code{features}.
#' @export
#'
#' @details
#' The df are estimated as the rank of the local Jacobian matrix. It is thus the rank of the local linear approximation of the mapping function,
#' where linearisation is performed around \code{data}. The Jacobian matrix \code{J} for a certain feature \code{j} is calculated with finite differences:
#'
#' \code{data2 <- data}
#'
#' \code{data2[j,i] <- data2[j,i] + delta}
#'
#' \code{J[,i] = (mapping(data2, ...) - mapping(data,...))/delta}
#'
#' In the current implementation, the rank of \code{J} is calculated as sum of the singular values of \code{J}.
#' So for each feature, a SVD decomposition of the \code{ncol(data) x ncol(data)} matrix \code{J} is calculated.
#'
#' This function should be considered experimental due to the common numerical issues associated with
#' finite differences and numerical calculation of matrix ranks. So always check results for plausibility.
#'
#' An estimation of df is generated for each feature specified in \code{features}.
#' @examples
#' #set.seed(0)
#'
#' # Dataframe of phenotype data (sample information)
#' # We simulate 2 sample classes processed in 3 batches
#' pdata <- data.frame(batch = rep(1:3, c(10,10,10)),
#'                    phenotype = rep(c("Control", "Cancer"), c(5,5)))
#' features <- 100
#'
#' # Matrix with random gene expression data
#' edata <- matrix(rnorm(features * nrow(pdata)), features)
#' rownames(edata) <- paste("feature", 1:nrow(edata))
#'
#' mod1 <- model.matrix(~phenotype, pdata)
#'
#' # The limma::removeBatchEffect function is a commonly used function for batch effect correction:
#' mapping <- function(Y, batch, mod) {
#'   limma::removeBatchEffect(x = Y, batch = batch, design = mod)
#' }
#'
#' #The following 2 lines were commented out, as df_estimate() is deprecated.
#' #dfs <- df_estimate(edata, features = 1, mapping = mapping, batch = pdata$batch, mod = mod1)
#' #dfs

df_estimate <- function(data, features = sample(nrow(data), 10), mapping,..., delta = sqrt(.Machine$double.eps))
{
  .Deprecated()
  if(!is.matrix(data) || !is.numeric(data))stop("data must be a numeric matrix.")
  if(abs(delta) == 0) stop("abs(delta) must be > 0.")

  m <- mapping(data, ...)

  n <- ncol(data)
  dat2 <- data

  dfs <- vapply(features, function(feature){

    # Calculate Jacobian matrix with finite differences
    jacobi <- vapply(seq_len(n), function(i){
      dat2[feature,i] <- dat2[feature,i] + delta
      diff <- mapping(dat2, ...)[feature,] - m[feature,]
      dat2[feature,i] <- dat2[feature,i] - delta
      diff / delta
    }, data[1,])

    sum(svd(jacobi,0,0)$d) # df are the rank of the Jacobian matrix
  }, 1)

  names(dfs) <- features
  dfs
}


#' Internal functions for p-value and FDR estimation
#'
#' @param s0 \code{numeric} vector of original (non-rotated) statistics.
#' @param stats \code{numeric} matrix of rotated statistics.
#' @param beta \code{numeric} between 0 and 1. See \insertCite{Yekutieli1999}{randRotation}.
#' @param na.rm \code{logical}. Should missing values be removed ?
#' @param ref.vector Reference vector defining at which grid points of \code{s0}
#'   and (\code{stats}) the FDRs are approximated. All other points are
#'   approximated by spline interpolation. NAs are removed from ref.vector
#'
#' @return \code{numeric} vector of (adjusted) p-value or FDR estimations for \code{s0}.
#' @rdname fdr_p
#' @keywords internal
#' @references \insertAllCited{}

.fdr.qu <- function(s0, stats, beta = 0.05, na.rm = FALSE,
                    ref.vector = sort(s0, decreasing = TRUE, na.last = TRUE))
{
  ref.vector <- ref.vector[!is.na(ref.vector)]
  stats.max <- max(stats, na.rm = na.rm)
  ref.vector <- ref.vector[ref.vector <= stats.max]
  ref.vector.size <- length(ref.vector)

  r.vector <- vapply(ref.vector,
                     function(i) sum(s0 >= i, na.rm = na.rm), numeric(1))

  qu.value <- vector("numeric", ref.vector.size)
  est.95qu.exc <- vector("numeric", ref.vector.size)
  su.hat <- vector("numeric", ref.vector.size)

  for (i in seq_len(ref.vector.size)) {
    v <- colSums(stats >= ref.vector[i], na.rm = na.rm)
    est.95qu.exc[i] <- quantile(v, prob = (1 - beta),
                                names = FALSE, na.rm = na.rm)
    su.hat[i] <- r.vector[i] - est.95qu.exc[i]
    su.hat[i] <- max(su.hat[i], 0, na.rm = na.rm) # negative su.hat makes no sense
    resamp.q <- ifelse((v + su.hat[i]) != 0, v/(v + su.hat[i]),0)
    qu.value[i] <- mean(resamp.q, na.rm = na.rm)
  }

  qu.value <- cummax(qu.value)


  approx(spline(ref.vector, qu.value), xout = abs(s0), rule = 2)$y
}

#' @rdname fdr_p
#' @keywords internal
.fdr.q <- function(s0, stats, beta = 0.05, na.rm = FALSE,
                   ref.vector = sort(s0, decreasing = TRUE, na.last = TRUE))
{
  ref.vector <- ref.vector[!is.na(ref.vector)]
  stats.max <- max(stats, na.rm = na.rm)
  ref.vector <- ref.vector[ref.vector <= stats.max]
  ref.vector.size <- length(ref.vector)

  r.vector <- vapply(ref.vector,
                     function(i) sum(s0 >= i, na.rm = na.rm), numeric(1))

  q.value <- vector("numeric", ref.vector.size)
  est.95qu.exc <- vector("numeric", ref.vector.size)
  s.hat <- vector("numeric", ref.vector.size)

  for (i in seq_len(ref.vector.size)) {
    v <- colSums(stats >= ref.vector[i], na.rm = na.rm)
    est.95qu.exc[i] <- quantile(v, prob = (1 - beta),
                                names = FALSE, na.rm = na.rm)
    s.hat[i] <- r.vector[i] - mean(v, na.rm = na.rm)
    s.hat[i] <- ifelse(s.hat[i] < est.95qu.exc[i], 0, s.hat[i])
    resamp.q <- ifelse((v + s.hat[i]) != 0, v/(v + s.hat[i]),
                       0)
    q.value[i] <- mean(resamp.q, na.rm = na.rm)
  }
  q.value <- rev(cummin(rev(q.value)))

  approx(spline(ref.vector, q.value), xout = abs(s0), rule = 2)$y
}

#' @param method A p-value or FDR adjustment method, see
#'   \code{\link[randRotation:pFdr]{pFdr}}.
#' @param pooled \code{logical}. \code{TRUE} if marginal distributions are
#'   exchangeable for all features so that rotated stats can be pooled, see
#'   \code{\link[randRotation:pFdr]{pFdr}}.
#' @rdname fdr_p
#' @keywords internal
.pFdr <- function(s0, stats, method, pooled, na.rm, beta){

  if(method %in% c("fdr.q", "fdr.qu") && pooled == FALSE)
    stop("Methods \"fdr.q\" and \"fdr.qu\" can only be used with pooled = TRUE")

  if(pooled == "FALSE"){
    ### The +1 are according to (Phipson and Smyth 2010).
    res <- (rowSums(stats >= s0, na.rm = na.rm)+1)/(rowSums(!is.na(stats))+1)
    res <- p.adjust(p = res, method = method)
  } else {

    if(method == "fdr.q"){
      res <- .fdr.q(s0, stats, beta = beta, na.rm = na.rm)
    } else if(method == "fdr.qu"){
      res <- .fdr.qu(s0, stats, beta = beta, na.rm = na.rm)
    } else {
      if(na.rm == FALSE && any(is.na(stats))){
        #### Calculating p-values with "stats" containing NAs and "na.rm =
        #### FALSE" results in only NAs
        warning("NAs found in rotated stats, but na.rm = FALSE.")
        res <- s0
        res[] <- NA
      } else {
        ### The "-" in ecdf and it's argument is necessary in order that the
        ### function behaves correctly for discrete statistics (i.e. in order
        ### that it is equivalent to ">=" instead of ">")
        ### The solution using ecdf is faster than iterating over all s0.
        ### The m and the +1 are according to (Phipson and Smyth 2010).
        m <- sum(!is.na(stats))
        ps <- (((ecdf(x = -stats)(-s0)) * m) + 1) / (m+1)
        res <- p.adjust(p = ps, method = method)
      }
    }
  }

  if(all(is.na(res))) warning("p-values / FDRs resulted only in NAs")

  res
}



#' Calculate resampling based p-values and FDRs
#'
#' This function calculates either (1) resampling based p-values with subsequent
#' p-value adjustment using \code{\link[stats:p.adjust]{stats::p.adjust}} or (2)
#' resampling based false-discovery-rates (FDRs) for rotated statistics from a
#' \code{\link[randRotation:rotateStat]{rotateStat}} object.
#'
#'
#' @param obj A \code{rotateStat} object as returned by \code{\link[randRotation:rotateStat]{rotateStat}}.
#' @param method Can be either \code{"none"} (default), \code{"fdr.q"}, \code{"fdr.qu"} or any term that can be passed as \code{method} argument to \code{\link[stats:p.adjust]{stats::p.adjust}}, see \code{Details}. If \code{method = "none"}, resampling based
#' p-values without further adjustment are calculated.
#' @param pooled \code{logical}. \code{TRUE} (default) if marginal distributions are exchangeable for all features so that rotated stats can be pooled, see \code{Details}.
#' @param na.rm \code{logical}. \code{NA} values are ignored if set \code{TRUE}. \code{NA} values should be avoided and could e.g. be removed by imputation in original data or by removing features that contain \code{NA} values. Few \code{NA} values do not have a large effect, but many \code{NA} values can lead to wrong estimations of p-values and FDRs. We highly recommend avoiding \code{NA} values.
#' @param beta \code{numeric} between \code{0} and \code{1}. Corresponds to beta in \insertCite{Yekutieli1999}{randRotation}.
#'
#' @return A \code{numeric} matrix of corrected p-values or FDRs with dimension \code{dim(obj$s0)}.
#' @export
#' @seealso \code{\link[randRotation:rotateStat]{rotateStat}}
#' @details Larger values of obj$s0 are considered more significant when
#'   compared to the empirical distribution. E.g. for calculation of resampling
#'   based p-values (with \code{pooled = FALSE}) we in principle use
#'   \code{p.val <- (rowSums(obj$stats >= obj$s0)+1)/(ncol(obj$stats)+1)} according to
#'   \insertCite{Phipson2010}{randRotation}.
#'
#'   \code{method = "fdr.q"} and \code{method = "fdr.qu"} are resampling based
#'   fdr estimates and can only be used with \code{pooled = TRUE}. \code{method
#'   = "fdr.q"} is the FDR local estimator and \code{method = "fdr.qu"} is the
#'   FDR upper limit, see \insertCite{Reiner2003,Yekutieli1999}{randRotation}.
#'   For all other \code{method} arguments resampling based p-values are
#'   calculated and passed to \code{\link[stats:p.adjust]{stats::p.adjust}} for
#'   p-value adjustment. So these methods provide resampling based p-values with
#'   (non-resampling based) p-value adjustment.
#'   \code{method = "fdr.q"} and \code{method = "fdr.qu"} were
#'   adapted from package \code{fdrame}
#'   \insertCite{Fdrame2019,Reiner2003}{randRotation}.
#'
#'   When \code{pooled = TRUE},
#'   marginal distributions of the test statistics are considered exchangeable
#'   for all features. The resampling based p-values of each feature are then
#'   calculated from all rotated statistics (all features, all rotations). For
#'   these cases, if the number of features is reasonably large, usually only
#'   few resamples (argument \code{R} in
#'   \code{\link[randRotation:rotateStat]{rotateStat}}) are required.
#'   We want to emphasize that in order for the marginal distributions to be
#'   exchangeable, the statistics must be a pivotal quantity (i.e. it must be
#'   scale independent).
#'   Pivotal quantities are e.g. t values. Using e.g. linear models
#'   with \code{coef} as statistics is questionable if the different features
#'   are measured on different scales. The resampled coefficients then
#'   have different variances and \code{pooled = TRUE} is not applicable.
#'   We thus highly recommend using pivotal quantities as \code{statistics} in
#'   \code{\link[randRotation:rotateStat]{rotateStat}}
#'   if possible.
#'
#'   When \code{pooled = FALSE} the resampling based p-values are calculcated
#'   for each feature separately. This is required if one expects the resampling
#'   based statistics to be distributed differently for individual features. For
#'   most common applications this should not be the case and the marginal
#'   distribution are exchangeable for all features, hence \code{pooled = TRUE}
#'   by default.
#'
#'   If \code{method = "fdr.q"} or \code{method = "fdr.qu"} and
#'   \code{weights} were specified when initialising the random rotation
#'   object (see parameter \code{initialised.obj} in
#'   \code{\link[randRotation:rotateStat]{rotateStat}}), a warning is displayed.
#'   The correlation structure (dependence structure) of linear model
#'   coefficients between different features is not preserved if
#'   different weights are used for different features.
#'   Methods \code{fdr.q} and \code{fdr.qu} rely on preserved correlation
#'   structure of dependent statistics and thus should not be used if statistics
#'   based on model coefficients (e.g. t statistics of model coefficients) are
#'   used in combination with different weights.
#'
#'   P-values and FDRs are calculated for each column of \code{obj$s0}
#'   separately.
#' @examples # See also '?rotateStat':
#'
#' #set.seed(0)
#'
#' # Dataframe of phenotype data (sample information)
#' # We simulate 2 sample classes processed in 3 batches
#' pdata <- data.frame(batch = rep(1:3, c(10,10,10)),
#'                    phenotype = rep(c("Control", "Cancer"), c(5,5)))
#' features <- 100
#'
#' # Matrix with random gene expression data
#' edata <- matrix(rnorm(features * nrow(pdata)), features)
#' rownames(edata) <- paste("feature", 1:nrow(edata))
#'
#' mod1 <- model.matrix(~phenotype, pdata)
#'
#' # Initialisation of the random rotation class
#' init1 <- initBatchRandrot(Y = edata, X = mod1, coef.h = 2, batch = pdata$batch)
#' init1
#'
#' # Definition of the batch effect correction procedure with subsequent calculation
#' # of two-sided test statistics
#' statistic <- function(., batch, mod, coef){
#'
#'   # The "capture.output" and "suppressMessages" simply suppress any output
#'   capture.output(suppressMessages(
#'     Y.tmp <- sva::ComBat(., batch = batch, mod)
#'   ))
#'
#'   fit1 <- lm.fit(mod, t(Y.tmp))
#'   abs(coef(fit1)[coef,])
#' }
#'
#' # We calculate test statistics for the second coefficient
#'
#' res1 <- rotateStat(initialised.obj = init1,
#'                     R = 10,
#'                     statistic = statistic,
#'                     batch = pdata$batch, mod = mod1, coef = 2)
#'
#' hist(pFdr(res1))
#' @references \insertAllCited{}
#' @author Peter Hettegger
pFdr <- function(obj, method = "none", pooled = TRUE, na.rm = FALSE, beta = 0.05){
  if(!is(obj, "rotateStat")) stop("class(obj) must be \"rotateStat\"")
  if(beta < 0 || beta > 1) stop("beta must be between 0 and 1")
  if(any(is.na(obj$s0)) || any(is.na(unlist(obj$stats))))
    warning("Missing values found in obj$s0 or obj$stats. Missing values are tolerated as much as possible, but should be avoided. See \"pFdr\" help page.")

  method <- match.arg(method, c("fdr.q", "fdr.qu", stats::p.adjust.methods))
  if(isTRUE(obj$w) && (method %in% c("fdr.q", "fdr.qu")))
    warning("Methods fdr.q and fdr.qu should not be used with weights (see ?pFdr).")

  res <- vapply(seq_len(obj$ncol.s),
                 function(i).pFdr(obj$s0[,i,drop = TRUE], obj$stats[[i]], method, pooled, na.rm, beta),
                 obj$s0[,1])

  # For the case that there is only one feature.
  # Otherwise the "colnames(res) <- colnames(obj$s0)" fails
  if(is.null(dim(res)) && (length(res) > 0)){
    if(obj$ncol.s > 1) res <- rbind(res)
    else
      res <- cbind(res)
  }


  colnames(res) <- colnames(obj$s0)
  res
}


#' Quantile-Quantile plot of data sample against uniform theoretical quantiles
#'
#' \code{qqunif} produces a QQ plot of the values in \code{ps} against the
#' theoretical quantiles of the uniform distribution.
#'
#' This function can e.g. be used for comparing p-values against the uniform
#' distribution. The log scale of the x and y axes allow a closer look at low
#' p-values.
#'
#' This function is a modified version of the examples in the
#' \code{\link[stats:qqnorm]{qqnorm}} documentation page.
#'
#' @importFrom graphics abline
#' @param ps \code{numeric} vector of values (e.g. p-values). Values must be
#'   between 0 and 1. Values like \code{NA}, NaN, Inf etc. produce an error.
#' @param log \code{character} indicating whether axis should be plotted in log
#'   scale. Either \code{""}, \code{"x"}, \code{"y"} or \code{"xy"}.
#' @param pch Point symbol, see \code{\link[graphics:par]{par}}.
#' @param xlab Label for the x axis.
#' @param ylab Label for the y axis.
#' @param ... Graphical parameters forwarded to \code{\link[stats:qqnorm]{qqplot}}
#'
#' @return A list of \code{x} and {y} coordinates, as in \code{\link[stats:qqnorm]{qqplot}}.
#' @export
#'
#' @examples
#' qqunif(runif(100))
qqunif <- function(ps, log = "xy", pch = 20, xlab = "theoretical quantiles",
ylab = "sample quantiles", ...){

  if(any(!is.finite(ps) | ps > 1 | ps < 0))
    stop("Values of ps must be between 0 and 1.")

  ## Q-Q plot for Unif data against true theoretical distribution
  res <- qqplot(ppoints(ps), ps, main = expression("Q-Q plot for" ~~ {Unif(0,1)}),
         log = log, pch = pch, xlab = xlab, ylab = ylab, ...)
  abline(0,1, col = 2,lwd=2,lty=2)
  invisible(res)
}

Try the randRotation package in your browser

Any scripts or data that you put into this service are public.

randRotation documentation built on April 14, 2021, 6:01 p.m.