R/acPCA.R

Defines functions calAv acPCA

Documented in acPCA

#' Perform AC-PCA for simultaneous dimension reduction and adjustment for confounding variation
#'
#' @param X the n by p data matrix, where n is the number of samples, p is the number of variables. Missing values in X should be labeled as NA. If a whole sample in X is missing, it should be removed.
#' @param Y the n by q confounder matrix, where n is the number of samples, q is the number of confounding factors. Missing values in Y should be labeled as NA.
#' @param lambda the tuning parameter, non-negative.
#' @param nPC number of principal components to compute
#' @param eval True or False. eval=T evaluates the significance of the PCs. Default is F.
#' @param numPerm the number of permutations to evaluate the significance of the PCs. Default is 100.
#' @param alpha the significance level. Default is 0.05. If the kth PC is not significant, we don't consider the PCs after it.
#' If the eigenvalue and variance explained by the PCs give inconsistent results, we choose the maximum number of significant PCs.
#' @param plot True or False. plot=T generates the plots. Default is True.
#' @param centerX center the columns in X. Default is True.
#' @param centerY center the columns in Y. Default is True.
#' @param scaleX scale the columns in X to unit standard deviation. Default is False.
#' @param scaleY scale the columns in Y to unit standard deviation. Default is False.
#' @param kernel the kernel to use: "linear", "gaussian".
#' @param bandwidth bandwidth h for Gaussian kernel. Optional.
#' @return The principal components and the projected data
#' \item{v}{the principal components, p by nPC matrix}
#' \item{Xv}{the projected data, i.e. X times v}
#' \item{eigenvalueX}{eigenvalues for the PCs}
#' \item{varianceX}{variance explained by the PCs}
#' \item{varianceX_perc}{percentage of total variance in X explained by the PCs. If eval=F, NA is returned.}
#' \item{eigenvalueXperm}{eigenvalues for the PCs, permutation. If eval=F, NA is returned.}
#' \item{varianceXperm}{variance explained by the PCs, permutation. If eval=F, NA is returned.}
#' \item{sigPC}{the significant PCs. If eval=F, NA is returned.}
#' \item{...}{Input parameters for the function}
#' @export


acPCA <- function(X, Y, nPC, lambda, eval=F, numPerm=100, alpha=0.05, plot=T, centerX=T, centerY=T, scaleX=F, scaleY=F, kernel=c("linear", "gaussian"), bandwidth=1){
  ####check whether a whole row in X is missing
  Xmis <- apply(X, 1, function(row){sum(!is.na(row))})
  if (sum(Xmis==0)){
    stop(paste("The following samples in X is missing, please remove them in X and Y: rows ", paste(which(Xmis==0), collapse =  " "), sep=""))
  }
  ####check whether the number of samples in X and Y match
  if (dim(X)[1]!=dim(Y)[1]){
    stop("The numbers of samples in X ( nrow(X) ) and Y ( nrow(Y) ) do not match")
  }
  ####check whether lambda is non-negative
  if (lambda<0){
    stop("lambda should be non-negative")
  }
  ####check alpha
  if (alpha<=0 | alpha>1){
    stop("alpha must be between 0 and 1")
  }
  nsam <- dim(X)[1]
  p <- dim(X)[2]
  if (is.null(dim(Y))){
    Y <- matrix(Y, ncol=1)
  }

  ####missing data
  X[is.na(X)] <- mean(X, na.rm=T)
  Y[is.na(Y)] <- mean(Y, na.rm=T)
  X <- scale(X, center = centerX, scale = scaleX)
  Y <- scale(Y, center = centerY, scale = scaleY)

  ####calculate kernel matrix for Y
  K <- calkernel(Y, kernel, bandwidth)

  ####AC-PCA
  #result_acpca <- eigs_sym(calAv, k=nPC, which = "LA", n=p, args=list(X=X, K=K, lambda=lambda))
  A = calAv(args = list(X = X, K = K, lambda = lambda))
  result_acpca = eigen(A,symmetric=TRUE)
  result_acpca$values = result_acpca$values[1:nPC]
  result_acpca$vectors = result_acpca$vectors[,1:nPC]
  v <- matrix(result_acpca$vectors, ncol=nPC)
  ####eigenvalues
  eigenX <- result_acpca$values
  ####the projection, Xv
  Xv <- X%*%v
  ####variance in Xv
  varX <- apply(Xv, 2, var)
  ####total variance in X
  totvar <- sum(apply(X, 2, var))
  ####percentage
  varX_perc <- varX/totvar

  if (eval){
    eigenXperm <- c()
    varXperm <- c()
    varXperm_perc <- c()
    for (i in 1:numPerm){
      Xperm <- X[sample(nrow(X)),]
      #tmp <- eigs_sym(calAv, k = nPC, which = "LA", n = p,
      #                args = list(X = Xperm, K = K, lambda = lambda))
      A = calAv(args = list(X = X, K = K, lambda = lambda))
      tmp = eigen(A,symmetric=TRUE)
      tmp$values = tmp$values[1:nPC]
      tmp$vectors = tmp$vectors[,1:nPC]
      eigenXperm <- rbind(eigenXperm, tmp$values)
      varPC <- apply(Xperm%*%tmp$vectors, 2, var)
      varXperm <- rbind(varXperm, varPC)
      varXperm_perc <- rbind(varXperm_perc, varPC/totvar)
    }
    if (plot){
      par(mfrow=c(1, 2))
      ylimrange <- c(min(c(as.numeric(eigenXperm), eigenX)),
                     max(c(as.numeric(eigenXperm), eigenX)) )
      boxplot(eigenXperm, ylim=ylimrange, xlab="PC", ylab="Eigenvalue", main="")
      points(1:nPC, eigenX, col="red", pch = 4, cex = 1.5, lwd=1.5)

      ylimrange <- c(min(c(as.numeric(varXperm), varX)),
                     max(c(as.numeric(varXperm), varX)) )
      boxplot(varXperm, ylim=ylimrange, xlab="PC", ylab="Variance", main="")
      points(1:nPC, varX, col="red", pch = 4, cex = 1.5, lwd=1.5)

      mtext("Data: red cross; Permutation: boxplot", side = 3, outer = TRUE, line=-2)
    }
    labs1 <- which(eigenX < apply(eigenXperm, 2, function(x){quantile(x, 1-alpha)}))
    if (length(labs1)==0){
      sigPC1 <- 0
    } else {
      sigPC1 <- min(labs1)-1
    }
    labs2 <- which(varX < apply(varXperm, 2, function(x){quantile(x, 1-alpha)}))
    if (length(labs2)==0){
      sigPC2 <- 0
    } else {
      sigPC2 <- min(labs2)-1
    }
    sigPC <- max(sigPC1, sigPC2)
  } else {
    sigPC <- NA
    eigenXperm <- NA
    varXperm <- NA
    varXperm_perc <- NA
  }
  return(list(Xv=Xv, v=v, sigPC=sigPC, eigenX=eigenX,
              varX=varX, varX_perc=varX_perc,
              eigenXperm=eigenXperm, varXperm=varXperm, varXperm_perc=varXperm_perc,
              lambda=lambda, kernel=kernel, bandwidth=bandwidth))
}

calAv <- function(args){
  X <- args$X
  K <- args$K
  lambda <- args$lambda
  #return( crossprod(X, (diag(dim(K)[1])-lambda*K)%*%(X%*%matrix(v, ncol=1))) )
  return( crossprod(X, (diag(dim(K)[1])-lambda*K)%*%X) )
}
YuWang28/acPCoA documentation built on Dec. 18, 2021, 8:20 p.m.