R/total.effect.estimation.R

Defines functions EigenPrism Dicker.2013 GCTA.inter GCTA.rr

Documented in Dicker.2013 EigenPrism GCTA.inter GCTA.rr

# Variance estimation methods for high-dimensional data

#' GCTA: Genome-wide complex trait analysis
#'
#' @description
#'
#' \code{GCTA.rr} estimates the \strong{total effect} of an environmental mixture exposure.
#'
#' To estimate the \strong{total effect}, we use a method called GCTA. Genome-wide complex trait analysis (GCTA) with
#' genome-based restricted maximum likelihood (GREML) is a statistical method for variance component estimation in genetics.
#' It quantifies the total narrow-sense (additive) contribution of a particular subset of genetic variants to a trait's
#' heritability. More details on GCTA can be found \href{https://en.wikipedia.org/wiki/Genome-wide_complex_trait_analysis}{here}.
#'
#' @param y A continuous vector.
#' @param x A numeric matrix.
#' @param target A character value. \code{beta2} requests the total effect and \code{r2} requests the proportion of the variance of y explained by x.
#' @param bs.iter A numeric value. The number of bootstrap iterations used for variance estimation.
#' @param verbose A boolean variable. If \code{TRUE}, the function prints the progress of the bootstrap.
#' @return A list.
#' If \code{target} = \code{r2},
#' \enumerate{
#'   \item \code{r2}: The estimated proportion of the variance of y explained by x, i.e. the proportion of the variance in a health outcome that is predictable from the environmental mixture
#'   \item \code{r2.var}: The estimated variance of that proportion, calculated by a bootstrap resampling procedure
#'   \item \code{bootstrap.sample}: The raw bootstrap samples
#' }
#' If \code{target} = \code{beta2},
#' \enumerate{
#'   \item \code{beta2}: The estimated total effect
#'   \item \code{beta2.var}: The estimated variance of the total effect via a bootstrap procedure
#'   \item \code{bootstrap.sample}: The raw bootstrap samples
#' }
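#' @examples
#' # A minimal simulated-data sketch, assuming the package (with its internal
#' # standardization helper) and rrBLUP are available; the data and coefficients
#' # are made up purely for illustration, and a small bs.iter keeps it fast.
#' \dontrun{
#' set.seed(1)
#' n <- 100; p <- 50
#' x <- matrix(rnorm(n * p), n, p)
#' beta <- rnorm(p, sd = 0.2)
#' y <- drop(x \%*\% beta) + rnorm(n)
#' fit <- GCTA.rr(x = x, y = y, target = "beta2", bs.iter = 50, verbose = FALSE)
#' fit$beta2
#' fit$beta2.var
#' }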
#' @import rrBLUP svMisc
#' @export
GCTA.rr <- function(x,y,target = c("beta2","r2")[1], bs.iter = 1000, verbose = TRUE){
  x <- std.fn(x)
  res <- mixed.solve(y = y, Z = x)
  # in the GCTA setting Vu = Vg/p, so to estimate Vg we multiply by p = ncol(x)
  G.beta2 <- res$Vu * ncol(x)
  G.r2 <- G.beta2/(G.beta2 + res$Ve)

  # parametric bootstrap to estimate the variance of the total effect
  G.beta2.bs <- numeric(bs.iter)
  G.r2.bs <- numeric(bs.iter)
  if (bs.iter < 1) {
    var.r2 <- var.beta2 <- NA
  } else {
    # the resampling covariance is fixed across iterations, so build it once
    K <- x %*% t(x) / ncol(x)
    In <- diag(nrow(x))
    Vh <- G.r2 * K + (1 - G.r2) * In
    for (i in 1:bs.iter) {
      y.bs <- MASS::mvrnorm(n = 1, mu = numeric(nrow(x)), Sigma = Vh)
      res <- mixed.solve(y = y.bs, Z = x)
      tmp.beta2 <- res$Vu * ncol(x)
      tmp.r2 <- tmp.beta2 / (tmp.beta2 + res$Ve)
      G.beta2.bs[i] <- tmp.beta2
      G.r2.bs[i] <- tmp.r2
      if (verbose) {
        progress(i, max.value = bs.iter)
      }
    }
    # calculate the variances from the bootstrap samples
    var.beta2 <- var(G.beta2.bs)
    var.r2 <- var(G.r2.bs)
  }
  if(target == "r2") {
    names(G.r2) <- "proportion of the variance explained by x"
    names(var.r2) <- "sample variance via bootstrap"
    result <- list(r2 = G.r2, r2.var = var.r2, bootstrap.sample = G.r2.bs)
  } else {
    names(G.beta2) <- "total effect"
    names(var.beta2) <- "sample variance via bootstrap"
    result <- list(beta2 = G.beta2, beta2.var = var.beta2, bootstrap.sample = G.beta2.bs)
  }
  result
}

#' GCTA.inter
#'
#' The modified GCTA method to estimate the total effect with interaction terms.
#' @param y A continuous vector.
#' @param x A numeric matrix of covariates.
#' @param interact A numeric flag for interaction terms: 0 fits the model without interactions; any non-zero value adds a pairwise-interaction variance component.
#' @param target A character value giving the estimation target, \code{beta2} or \code{r2}.
#' @return A named numeric vector of length 3: the main-effect component, the interaction component, and the error variance.
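#' @examples
#' # A minimal simulated sketch, assuming the package's internal helpers are
#' # available; interact = 1 adds the pairwise-interaction component, and the
#' # data are made up purely for illustration.
#' \dontrun{
#' set.seed(1)
#' n <- 60; p <- 30
#' x <- matrix(rnorm(n * p), n, p)
#' y <- drop(x \%*\% rnorm(p, sd = 0.2)) + rnorm(n)
#' GCTA.inter(y = y, x = x, interact = 1, target = "r2")
#' }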
#' @export

GCTA.inter <- function(y,x, interact=0, target = c("beta2","r2")[1]){
  x <- std.fn(x)
  y <- matrix(y, ncol = 1)
  ## data y[1:nr], x[1:nr,1:nc]
  nr=dim(x)[1]
  nc=dim(x)[2]
  WW=x%*%t(x)/nc
  II=diag(rep(1,nr))-rep(1,nr)%*%t(rep(1,nr))/nr

  RACT=0 # interaction effects
  if(interact==0){
    s <- tryCatch(svd(WW,nu=nr,nv=0), error = function(e) e) # singular value decomposition (svd's LINPACK argument is defunct in modern R)
    if(inherits(s, "error")) return(list(G=NA,E=NA,RACT=NA)) # return NA if the decomposition fails
    YD=(t(s$u)%*%y)^2
    ID=rep(1,nr)-(t(s$u)%*%rep(1,nr))^2/nr

    # REML estimator
    sigmaG=var(y)[1,1]/2 # need sigmaG to be a numeric instead of a matrix
    sigmaE=sigmaG
    for(i in 1:50){
      nlen=nr-1  # number of nonzero eigenvalues (column-centering removes one)
      A11=sum(s$d[1:nlen]^2/(sigmaE+sigmaG*s$d[1:nlen])^2)
      A12=sum(s$d[1:nlen]/(sigmaE+sigmaG*s$d[1:nlen])^2)
      A22=sum(1/(sigmaE+sigmaG*s$d[1:nlen])^2)
      B1=sum(s$d[1:nlen]*YD[1:nlen]/(sigmaE+sigmaG*s$d[1:nlen])^2)
      B2=sum(YD[1:nlen]/(sigmaE+sigmaG*s$d[1:nlen])^2)
      den=A11*A22-A12*A12
      sigmaGnew=(A22*B1-A12*B2)/den
      sigmaEnew=(A11*B2-A12*B1)/den

      if(is.na(sigmaGnew) | is.na(sigmaEnew)) return(list(G=NA,E=NA,RACT=NA)) # return NA if there is an error
      #use complementary slackness
      if(sigmaGnew<0 && sigmaEnew>=0){sigmaGnew=0; sigmaEnew=B2/A22}
      if(sigmaGnew>=0 && sigmaEnew<0){sigmaGnew=B1/A11; sigmaEnew=0}
      if(sigmaGnew<0 && sigmaEnew<0){sigmaGnew=0; sigmaEnew=0}

      delta=abs(sigmaGnew-sigmaG)+abs(sigmaEnew-sigmaE)
      sigmaG=sigmaGnew
      sigmaE=sigmaEnew

      if(delta<1e-6){break}
    }
  }else{
    WW2=WW^2
    WW2=II%*%WW2%*%II
    YY=(y-mean(y))%*%t(y-mean(y))

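    # Method-of-moments step: solve the 3 x 3 linear system inf %*% delta = sc,
    # where sc[j] = tr(M_j %*% YY) and inf[j,k] = tr(M_j %*% M_k) for
    # M = (II, WW, WW2), yielding delta = (sigmaE, sigmaG, RACT).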
    sc=rep(0,3)
    inf=matrix(0,ncol=3,nrow=3)

    sc[1]=sum(diag(II%*%YY))
    sc[2]=sum(diag(WW%*%YY))
    sc[3]=sum(diag(WW2%*%YY))

    inf[1,1]=sum(diag(II%*%II))
    inf[1,2]=sum(diag(II%*%WW))
    inf[1,3]=sum(diag(II%*%WW2))
    inf[2,2]=sum(diag(WW%*%WW))
    inf[2,3]=sum(diag(WW%*%WW2))
    inf[3,3]=sum(diag(WW2%*%WW2))

    inf[2,1]=inf[1,2]
    inf[3,1]=inf[1,3]
    inf[3,2]=inf[2,3]

    delta=solve(inf)%*%sc

    sigmaE=delta[1]
    sigmaG=delta[2]
    RACT=delta[3]
  }

  if(target == 'r2'){
    tot <- sigmaE + sigmaG  # scale both components by the same total variance
    sigmaG <- sigmaG/tot
    RACT <- RACT/tot
  }
  res <- c(sigmaG, RACT, sigmaE)
  if(target == "r2"){
    names(res) <- c("r2", "r2.inter", "sigmaE")
  } else {
    names(res) <- c("beta2", "beta2.inter", "sigmaE")
  }
  res
}

#' Dicker 2013
#'
#' @description
#'
#' The \code{Dicker.2013} function estimates the total effect based on the paper \href{http://www.stat.rutgers.edu/home/ldicker/papers/variance.pdf}{Variance estimation in high-dimensional linear models}.
#'
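#' @details
#' For reference, the moment estimators implemented below are, writing \eqn{n} for the
#' number of rows and \eqn{d} for the number of columns of the standardized \code{x}:
#' \deqn{\hat{\sigma}^2 = \frac{d+n+1}{n(n+1)}\|y\|^2 - \frac{1}{n(n+1)}\|x^\top y\|^2,}
#' \deqn{\hat{\tau}^2 = -\frac{d}{n(n+1)}\|y\|^2 + \frac{1}{n(n+1)}\|x^\top y\|^2,}
#' with \eqn{\hat{r}^2 = \hat{\tau}^2 / (\hat{\tau}^2 + \hat{\sigma}^2)}.
#'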
#' @param y A numeric vector.
#' @param x A numeric matrix.
#' @param target A character value. \code{beta2} requests the total effect and \code{r2} requests the proportion of the variance of y explained by x.
#' @export
#' @return A list.
#' If \code{target} = \code{r2},
#' \enumerate{
#'   \item \code{r2}: The estimated proportion of the variance of y explained by x, i.e. the proportion of the variance in a health outcome that is predictable from the environmental mixture
#'   \item \code{r2.var}: The estimated variance, based on an asymptotic normal distribution
#' }
#' If \code{target} = \code{beta2},
#' \enumerate{
#'   \item \code{beta2}: The estimated total effect
#'   \item \code{beta2.var}: The estimated variance of the total effect, based on an asymptotic normal distribution
#' }
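#' @examples
#' # A minimal simulated sketch; the method needs p >= n, so the example uses
#' # more covariates than observations (data made up purely for illustration).
#' \dontrun{
#' set.seed(1)
#' n <- 50; p <- 100
#' x <- matrix(rnorm(n * p), n, p)
#' y <- drop(x \%*\% rnorm(p, sd = 0.1)) + rnorm(n)
#' Dicker.2013(x = x, y = y, target = "beta2")
#' }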
Dicker.2013 <- function(x, y, target = c("beta2","r2")[1]){
  x <- std.fn(x) # note that we need to center the covariates
  n <- nrow(x)
  d <- ncol(x)
  if(d < n){
    warning("n is larger than p, this method can only work when n is smaller than p")
    return(rep(NA,4))
  }
  norm2.y <- sum(y^2)
  norm2.xy <- sum((t(x)%*%y)^2)
  sigma2.hat <- (d + n + 1)/(n*(n+1))*norm2.y - 1/(n*(n+1))*norm2.xy
  tau2.hat <- -d/(n*(n+1))*norm2.y + 1/(n*(n+1))*norm2.xy
  r2.hat <- tau2.hat/(tau2.hat + sigma2.hat)
  phi2.1 <- 2*(d/n*(sigma2.hat + tau2.hat)^2 + sigma2.hat^2 + tau2.hat^2)
  phi2.2 <- 2*((1+d/n)*(sigma2.hat + tau2.hat)^2 - sigma2.hat^2 + 3*tau2.hat^2)
  phi2.0 <- 2/(tau2.hat + sigma2.hat)^2 * ((1+d/n)*(tau2.hat + sigma2.hat)^2 - sigma2.hat^2)
  sigma2.var <- phi2.1/n
  tau2.var <- phi2.2/n
  r2.var <- phi2.0/n

  # add variable names to make the output easier to interpret

  names(tau2.hat) <- "total effect"
  names(tau2.var) <- "sample variance"
  names(r2.hat) <- "proportion of the variance explained by x"
  names(r2.var) <- "sample variance"
  result <- list(sigma2.hat = sigma2.hat,
              sigma2.var = sigma2.var,
              beta2 = tau2.hat,
              beta2.var = tau2.var,
              r2 = r2.hat,
              r2.var = r2.var)
  if(target == "beta2"){
    result[3:4]
  } else if (target == "r2"){
    result[5:6]
  }
}



#' EigenPrism
#'
#' @description
#'
#' EigenPrism calculates the total effect based on the paper
#' \href{https://statweb.stanford.edu/~candes/publications/downloads/EigenPrism.pdf}{EigenPrism: Inference for High-Dimensional Signal-to-Noise Ratios}.
#' The source code of this function can be found on the author's \href{http://lucasjanson.fas.harvard.edu/software.html}{website}.
#'
#'
#' @author Lucas Janson (statweb.stanford.edu/~ljanson)
#'
#' @param x n by p design matrix; columns will automatically be centered and scaled to variance 1;
#' should not contain an intercept column, since both y and x will be centered
#' @param y response vector of length n (will automatically be centered)
#' @param invsqrtSig if the columns of x are not independent, a p by p positive definite matrix which is the
#' square root of the inverse of Sig, where Sig is the *correlation* matrix of the x (default is identity)
#' @param alpha significance level for the confidence interval (default = 0.05)
#' @param target target of estimation/inference
#' \enumerate{
#'   \item \code{beta2}: the total effect, which is the squared 2-norm of the coefficient vector: sum(beta^2)
#'   \item \code{r2}: the proportion of the variance of y explained by x, i.e. the fraction of the variance of y explained by x\%*\%beta: t(beta)\%*\%Sig\%*\%beta/var(y)
#' }
#' @param zero.ind vector of indices of the weight vector w to constrain to zero (default is none)
#' @param diagnostics boolean (default = FALSE) for whether to generate diagnostic plots for the V.i, lambda.i, and w.i
#' @return A list.
#' \enumerate{
#'   \item \code{beta2} or \code{r2}: unbiased estimate of the target
#'   \item \code{CI1}, \code{CI2}: lower and upper bounds of the 100*(1-alpha)\% confidence interval for the target
#' }
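#' @examples
#' # A minimal simulated sketch, assuming the cccp solver is installed; the
#' # method needs n < p, so the example uses more covariates than observations
#' # (data made up purely for illustration).
#' \dontrun{
#' set.seed(1)
#' n <- 50; p <- 200
#' x <- matrix(rnorm(n * p), n, p)
#' y <- drop(x \%*\% rnorm(p, sd = 0.1)) + rnorm(n)
#' EigenPrism(y = y, x = x, target = "beta2")
#' }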
#' @import cccp
#' @export

EigenPrism <- function(y,
                       x,
                       invsqrtSig=NULL,
                       alpha=0.05,
                       target = c("beta2","r2")[1],
                       zero.ind=c(),
                       diagnostics=F){

  # Get dimensionality of problem
  n = nrow(x)
  p = ncol(x)
  if(n > p) {
    warning("n is larger than p; this method only works when n is smaller than p")
    return(as.list(rep(NA,3)))
  }
  # Transform y and x to proper form
  y = y-mean(y)
  x = std.fn(x)
  if(!is.null(invsqrtSig)) x = x%*%invsqrtSig

  # Take singular value decomposition and rescale singular values
  svd = svd(x)
  lambda = svd$d^2/p

  # Define cone-constrained linear problem to optimize weights; [v; w] is vector of optimization variables
  q = c(1,rep(0,n)) #coefficient vector in objective function
  A = rbind(c(0,rep(1,n)),c(0,lambda)) #matrix for linear constraints
  b = c(0,1) #vector for linear constraints
  if(target=='sigma2') b = c(1,0) #switch constraints if target is sigma^2
  # Constrain some weights to be zero if desired
  if(!is.null(zero.ind)){
    A = rbind(A,cbind(rep(0,length(zero.ind)),diag(rep(1,n))[zero.ind,]))
    b = c(b,rep(0,length(zero.ind)))
  }
  # Define second-order cone constraints
  soc1 = socc(diag(c(1/4,rep(1,n))),c(-1/2,rep(0,n)),c(1/4,rep(0,n)),1/2)
  soc2 = socc(diag(c(1/4,lambda)),c(-1/2,rep(0,n)),c(1/4,rep(0,n)),1/2)
  prob = dlp(as.vector(q),A,as.vector(b),list(soc1,soc2))
  # Solve optimization problem and extract variables
  opt = cps(prob,ctrl(trace=F))
  v = getx(opt)[1]
  w = getx(opt)[-1]
  # Compute estimate and y's variance
  est = sum(w*(t(svd$u)%*%y)^2)
  yvar = sum(y^2)/n

  # Compute confidence interval
  CI = est + yvar*sqrt(v)*qnorm(1-alpha/2)*c(-1,1)
  if(target=='r2'){
    est = est/yvar
    CI = CI/yvar
    names(est) <- "proportion of the variance explained by x"
  } else {
    names(est) <- "total effect"
  }
  names(CI) <- c("confidence interval: lower bound", "confidence interval: upper bound")

  # Generate list with results
  if(target=='r2'){
    result <- list(r2 = est,
                   CI1 = CI[1],
                   CI2 = CI[2])
  } else {
    result <- list(beta2 = est,
                   CI1 = CI[1],
                   CI2 = CI[2])
  }

  # Generate diagnostic plots
  if(diagnostics){
    par(mfrow=c(1,3))

    # Check that eigenvectors are approximately Gaussian
    nV = floor(log10(n))
    srtV = svd$v[,10^(0:nV)]
    labs = c()
    for(i in 1:(nV+1)){
      srtV[,i] = sort(srtV[,i])
      ind = 10^(i-1)
      labs = c(labs,bquote(V[.(ind)]))
    }
    matplot(qnorm((1:p)/(p+1)),srtV,type="l",lwd=2,
            ylab="Quantiles of Eigenvectors",xlab="Gaussian Quantiles",
            main=expression(paste("Check Gaussianity of Eigenvectors ",V[i])))
    legend("topleft",as.expression(labs),col=1:(nV+1),lty=1:(nV+1),lwd=2)

    # Check that there are no outliers in the eigenvalues
    hist(lambda,main=expression(paste("Histogram of Normalized Eigenvalues ",lambda[i])),
         xlab=expression(lambda[i]))

    # Check that the weights are not dominated by just a few values
    srtw = sort(abs(w),T)
    plot(1:n,cumsum(srtw)/sum(srtw),type="l",lwd=2,
         main=expression(paste("Fraction of Total Weight in Largest k ",w[i])),
         xlab="k",ylab="Fraction of Total Weight")
  }

  return(result)
}