R/ruv-mixed-model.R

#' Remove unwanted variation using mixed model
#'
#' This function is a modified version of limma gls.series
#' function. We changed to gls.series function to include
#' model residuals that are computed after removing variation
#' due to the random effect component. Otherwise, no changes was
#' made to the gls.series algorithm.
#'
#' @param M Data matrix of genes by samples.
#' @param design Design matrix of the model fixed effects.
#' @param block A vector that uniquely identifies levels of each random
#'          effect.
#' @param correlation Consensus correlation coefficient describing the
#'          average correlation between biological replicates. This is
#'          usually computed via the duplicateCorrelation() function in
#'          the limma package.
#'
#' @keywords batch
#'
#' @export
#' @examples
#' ruv_mixed_model()
#'

ruv_mixed_model <- function (M, design = NULL, per_gene = FALSE,
                             ndups = 1, spacing = 1,
                             block = NULL,
                             correlation = NULL,
                             weights = NULL) {

  M <- as.matrix(M)
  narrays <- ncol(M)
  if (is.null(design))
    design <- matrix(1, narrays, 1)
  design <- as.matrix(design)

  if (nrow(design) != narrays)
    stop("Number of rows of design matrix does not match number of arrays")

  if (!is.null(weights)) {
    weights[is.na(weights)] <- 0
    weights <- asMatrixWeights(weights, dim(M))
    M[weights < 1e-15] <- NA
    weights[weights < 1e-15] <- NA
  }

  nbeta <- ncol(design)
  coef.names <- colnames(design)

  # Prepare design matrix for the random effect
  block <- as.vector(block)
  if (length(block) != narrays)
    stop("Length of block does not match number of arrays")
  ub <- unique(block)
  nblocks <- length(ub)
  Z <- matrix(block, narrays, nblocks) == matrix(ub, narrays,
                                                 nblocks, byrow = TRUE)

  # Prepare variance-covariance matrix for the randome effect
  if ( length(correlation) == 1 & per_gene == FALSE ) {
      correlation <- rep(correlation, dim(M)[1])
  } else if ( length(correlation) != 1 & per_gene == FALSE ) {
    correlation <- correlation
  } else if ( length(correlation) == 1 & per_gene == TRUE ) {
    stop("Correlation vector is of length 1. We need correlation coeff.
          for each gene to perform genewise analysis.")
  } else if ( length(correlation) == 1 & per_gene == FALSE ) {
    stop("Correlation vector length is more than 1. We need
          a single correlation coef. to perform gene-wide analysis.")
  }

  ngenes <- nrow(M)
  stdev.unscaled <- matrix(NA, ngenes, nbeta, dimnames = list(rownames(M),
                                                              coef.names))
  # Compute estimates one gene at a time
  beta <- stdev.unscaled
  sigma <- rep(NA, ngenes)
  df.residual <- rep(0, ngenes)
  residuals <- vector("list", ngenes)
  for (i in 1:ngenes) {
    y <- drop(M[i, ])
    o <- is.finite(y)
    y <- y[o]
    n <- length(y)
    cormatrix <- Z %*% (correlation[i] * t(Z))
    diag(cormatrix) <- 1
    if (n > 0) {
      X <- design[o, , drop = FALSE]
      V <- cormatrix[o, o]
      if (!is.null(weights)) {
        wrs <- 1/sqrt(drop(weights[i, o]))
        V <- wrs * t(wrs * t(V))
      }
      # handle cormatrix with negative eigenvalues
      cholV <- tryCatch( chol(V), error = function(err) {
          if (correlation[i] < 1) {
              cormatrix <- Z %*% ( round( correlation[i], 2) * t(Z))
          }
          if (correlation[i] == 1) {
              cormatrix <- Z %*% ( ( correlation[i] - runif(1, 0, 10^(-6))) * t(Z))
          }
          diag(cormatrix) <- 1
          X <- design[o, , drop = FALSE]
          V <- cormatrix[o, o]
          if (!is.null(weights)) {
            wrs <- 1/sqrt(drop(weights[i, o]))
            V <- wrs * t(wrs * t(V))
          }
          return(chol(V))
        } )
      y <- backsolve(cholV, y, transpose = TRUE)
      if (all(X == 0)) {
        df.residual[i] <- n
        sigma[i] <- sqrt(array(1/n, c(1, n)) %*% y^2)
      }
      else {
        X <- backsolve(cholV, X, transpose = TRUE)
        out <- lm.fit(X, y)
        est <- !is.na(out$coefficients)
        beta[i, ] <- out$coefficients
        stdev.unscaled[i, est] <- sqrt(diag(chol2inv(out$qr$qr,
                                                     size = out$rank)))
        df.residual[i] <- out$df.residual
        if (df.residual[i] > 0)
          sigma[i] <- sqrt(array(1/out$df.residual,
                        c(1,n)) %*% out$residuals^2)
        residuals[[i]] <- out$residuals
      }
    }
  }

#  Below computes variance-covariance matrix for the regression
#  coeffcients for the entire set of genes altogether,
#  under the scenaior of a single correlation for the random effecs
#  across genes
#
#  cholV <- chol(cormatrix)
#  QR <- qr(backsolve(cholV, design, transpose = TRUE))
#  cov.coef <- chol2inv(QR$qr, size = QR$rank)
#  est <- QR$pivot[1:QR$rank]
#  dimnames(cov.coef) <- list(coef.names[est], coef.names[est])

  return(
    list(coefficients = beta, stdev.unscaled = stdev.unscaled,
       sigma = sigma, df.residual = df.residual,
       block = block, correlation = correlation,
       residuals = do.call(rbind, residuals) ) )
}
jhsiao999/Humanzee documentation built on May 19, 2019, 9:28 a.m.