# R/remlOptimization_algorithms.R
#
# Defines functions: gradFun, ai, em, reml
#
# Documented in: ai, em, gradFun, reml

###############################
# reml()
###############################
#' REML optimization algorithms for mixed-effect models.
#'
#' Evaluate the REML likelihood and algorithms for iterating to find maximum 
#'   REML estimates.
#'
#' @aliases reml ai em gradFun
#' @param nu,nuvin A \code{list} or \code{vector} of (co)variance parameters to
#'   estimate on the transformed, or nu, scale.
#' @param skel A skeleton for reconstructing the list of (co)variance parameters.
#' @param thetaG,thetaR \code{Integer} vectors indexing the G-structure or
#'   R-structure of the (co)variance parameters.
#' @param sigma2e A \code{numeric} value for the residual variance estimate
#'   when it has been factored out of the Coefficient matrix of the Mixed Model
#'   Equations, thus converting the (co)variance components to ratios
#'   (represented by the variable lambda).
#' @param conv A \code{character} vector of (co)variance parameter constraints.
#' @param sLc A sparse \code{Matrix} containing the symbolic Cholesky
#'   factorization of the coefficient matrix of the Mixed Model Equations.
#' @param Cinv A sparse \code{Matrix} containing the inverse of the Coefficient
#'   matrix to the Mixed Model Equations.
#' @param modMats A \code{list} of the model matrices used to construct the
#'   mixed model equations.
#' @param W,tWW A sparse \code{Matrix} containing the design matrices for the fixed
#'   and random effects (\code{W}) and the cross-product of this (\code{tWW}).
#' @param Bpinv A matrix inverse of the matrix containing the prior specification
#'   for fixed effects.
#' @param nminffx,nminfrfx,rfxlvls \code{Integers} specifying: (1) the difference
#'   between the number of observations and fixed effects (of the full rank fixed
#'   effects design matrix (X), (2) \code{nminffx} minus the total number of
#'   random effects, and (3) a \code{vector} of levels for each term in the 
#'   random effects.
#' @param rfxIncContrib2loglik A \code{numeric} indicating the sum of constraint
#'   contributions to the log-likelihood across all terms in the random effects
#'   that have non-diagonal generalized inverse matrices (\code{ginverse}).
#' @param ndgeninv A \code{logical} vector indicating if each random term is
#'   associated with a generalized inverse (\code{ginverse}).
#' @param RHS A sparse \code{Matrix} containing the Right-Hand Side to the 
#'   Mixed Model Equations.
#' @param sln,r Sparse \code{Matrices} containing the solutions or residuals
#'   of the Mixed Model Equations.
#'
#' @return A \code{list} or \code{matrix} containing any of the previous
#'   parameters described above, or the following that are in addition to or
#'   instead of parameters above:
#'   \describe{
#'     \item{loglik }{The REML log-likelihood.}
#'     \item{tyPy,logDetC }{Components of the REML log-likelihood derived from the 
#'       Cholesky factor of the Coefficient matrix to the Mixed Model Equations.}
#'     \item{Cinv_ii }{A vector containing the diagonal elements of the inverse
#'       of the Coefficient matrix to the Mixed Model Equations (i.e., the
#'       diagonal entries of \code{Cinv}).}
#'     \item{AI }{A \code{matrix} of values containing the Average Information
#'       matrix, or second partial derivatives of the likelihood with respect to
#'       the transformed (co)variance components (nu). The inverse of this matrix
#'       gives the sampling variances of these transformed (co)variance components.}
#'     \item{dLdnu }{A single column \code{matrix} of first derivatives of
#'       the log-likelihood with respect to the transformed (co)variance
#'       parameters (nu).}
#'   }
#'
#' @author \email{matthewwolak@@gmail.com}
#' @export
reml <- function(nu, skel, thetaG, sLc,
	modMats, W, Bpinv, nminffx, nminfrfx, rfxlvls, rfxIncContrib2loglik,
	thetaR = NULL,  #<-- non-NULL if lambda==FALSE
	tWW = NULL, RHS = NULL){  #<-- non-NULL if lambda==TRUE

  # Evaluate the REML log-likelihood at the (co)variance parameters in `nu`:
  ## (1) build the coefficient matrix (C) of the Mixed Model Equations (MME),
  ## (2) (re)use the symbolic Cholesky factorization of C,
  ## (3) solve the MME for BLUEs/BLUPs, and
  ## (4) assemble the log-likelihood from y'Py and log(|C|).
  # On the `lambda` scale the residual variance is factored out of C, so the
  ## caller supplies pre-computed `tWW` and `RHS` and leaves `thetaR` NULL.
  lambda <- is.null(thetaR)
  #FIXME `[[thetaG+1]]` is just a kluge below: check when/if have > R matrices
  # Invert the residual (R-structure) and each G-structure (co)variance matrix
  Rinv <- as(solve(nu[[length(thetaG)+1]]), "symmetricMatrix")
  Ginv <- lapply(thetaG, FUN = function(x){as(solve(nu[[x]]), "symmetricMatrix")}) # Meyer 1991, p.77 (<-- also see MMA on p70/eqn4)

  ##1c Now make coefficient matrix of MME
  if(lambda){
    tWKRinvW <- tWW  #<-- if lambda, data "quadratic" with R factored out
    tyRinvy <- crossprod(modMats$y)
  } else{
      ### Rinv Kronecker with Diagonal/I
      KRinv <- kronecker(Rinv, Diagonal(x = 1, n = modMats$Zr@Dim[[2L]]))
      tyRinvy <- as(crossprod(modMats$y, KRinv) %*% modMats$y, "sparseMatrix")
        ### Components of Meyer 1989 eqn 2
        tWKRinv <- crossprod(W, KRinv)
        tWKRinvW <- tWKRinv %*% W  
      RHS <- Matrix(tWKRinv %*% modMats$y, sparse = TRUE)
       ## Meyer '97 eqn 11
       ### (Note different order of RHS from Meyer '89 eqn 6; Meyer '91 eqn 4)
    }

  ##`sapply()` to multiply inverse G_i with ginverse element (e.g., I or geninv)
  # C = t(W) KRinv W + blockdiag(Bpinv, geninv_1 (x) Ginv_1, ...) where (x) is
  ## a Kronecker product; Bpinv contributes any fixed-effect prior
  if(modMats$nG > 0){
    C <- as(tWKRinvW + bdiag(c(Bpinv,
      sapply(1:modMats$nG, FUN = function(u){kronecker(modMats$listGeninv[[u]], Ginv[[u]])}))), "symmetricMatrix")
  } else C <- as(tWKRinvW + Bpinv, "symmetricMatrix")
  ## Make/Update symbolic factorization with variance parameters from this iteration
  if(is.null(sLc)){
    ##1d Find best order of MMA/C/pivots!
    # Graser et al. 1987 (p1363) when singular C, |C| not invariant to order
    ##of C, therefore same order (of pivots) must be used in each loglik iteration
    ## Hadfield (2010, MCMCglmm paper App B): details on chol(), ordering, and updating
    # supernodal decomposition FALSE
    ## ?Cholesky documentation demos simplicial smaller sized object/more sparse
    ### However, tested `warcolak` trait1 with VA and VD univariate model
    #### `super = FALSE` faster and smaller than when `super = TRUE`
#if(any(eigen(C)$values < 0)) cat(which(eigen(C)$values < 0), "\n") #FIXME
    #TODO test if super=TRUE better
    ## supernodal might be better as coefficient matrix (C) becomes more dense
    #### e.g., with a genomic relatedness matrix (see Masuda et al. 2014 & 2015)
    sLc <- Cholesky(C, perm = TRUE, LDL = FALSE, super = FALSE)
    # original order obtained by: t(P) %*% L %*% P or `crossprod(P, L) %*% P`
  } else sLc <- update(sLc, C)  #<-- re-use symbolic pattern; numeric update only



  # solve MME for BLUEs/BLUPs
  ## Do now, because needed as part of calculating the log-likelihood
#XXX Do I need to solve for BLUEs (see Knight 2008 for just getting BLUPs eqn 2.13-15)
  # chol2inv: Cinv in same permutation as C (not permutation of sLc/sLm)
  #Cinv <<- chol2inv(sLc) #<-- XXX ~10x slower than `solve(C)` at least for warcolak
#      Cinv <<- solve(a = sLc, b = Ic, system = "A") #<-- XXX comparable speed to `solve(C)` at least for warcolak
  #Cinv <<- solve(C)
  ##XXX NOTE Above Cinv is in original order of C and NOT permutation of M

  sln <- solve(a = sLc, b = RHS, system = "A")
  ## Cholesky is more efficient and computationally stable
  ### see Matrix::CHMfactor-class expand note about fill-in causing many more non-zeroes of very small magnitude to occur
  #### see Matrix file "CHMfactor.R" method for "determinant" (note differences with half the logdet of original matrix) and the functions:
  ##### `ldetL2up` & `destructive_Chol_update`



  # 5 record log-like, check convergence, & determine next varcomps to evaluate  
  ##5a determine log(|C|) and y'Py
  # Boldman and Van Vleck 1991 eqn. 6 (tyPy) and 7 (log|C|)    
  # Meyer 1997 eqn. 13 (tyPy)
  tyPy <- tyRinvy - crossprod(sln, RHS)
  # log|C| = 2*sum(log(diag(L))): `sLc@x[sLc@p+1]` takes the first stored
  ## entry of each column of the (lower-triangular) factor, i.e. its diagonal
  logDetC <- 2 * sum(log(sLc@x[sLc@p+1][1:sLc@Dim[[1L]]]))
  # alternatively see `determinant` method for CHMfactor

  # Factored out residual variance (only for lambda scale)
  sigma2e <- if(lambda) tyPy / nminffx else NA

  # Construct the log-likelihood (Meyer 1997, eqn. 8)
  ## (first put together as `-2log-likelihood`)
  ## 'log(|R|)'
  ### assumes X of full rank
  if(lambda){
    loglik <- nminfrfx * log(sigma2e)
  } else{
      #  Meyer 1997 eqn. 10
      loglik <- modMats$ny * log(nu[[thetaR]])
    }

  ## 'log(|G|)'
  #FIXME: Only works for independent random effects right now!
  if(lambda){
    logDetGfun <- function(x){rfxlvls[x] * log(as.vector(nu[[x]]*sigma2e))}
  } else{
      # Meyer 1997 eqn. 9
      logDetGfun <- function(x){rfxlvls[x] * log(as.vector(nu[[x]]))}
    }
  if(modMats$nG != 0){
    loglik <- loglik + sum(sapply(seq(modMats$nG), FUN = logDetGfun)) + rfxIncContrib2loglik
  }

  ## log(|C|) + tyPy
  ### if `lambda` then nminffx=tyPy/sigma2e, simplified because sigma2e=tyPy/nminffx
  ### and multiply by -0.5 to calculate `loglik` from `-2loglik`
  loglik <- -0.5 * (loglik + logDetC + if(lambda) nminffx else tyPy)




  # calculate residuals
  r <- modMats$y - W %*% sln

 # `@x` extracts the scalar value from 1x1 Matrix objects
 return(list(loglik = loglik@x,
		sigma2e = if(lambda) sigma2e@x else NA,
		tyPy = tyPy@x, logDetC = logDetC,
		sln = sln, r = r, sLc = sLc))
}  #<-- end `reml()` 


################################################################################



















################################################################################
## Meyer and Smith 1996 for algorithm using derivatives of loglik
### eqn 15-18 (+ eqn 33-42ish) for derivatives of tyPy and logDetC
### Smith 1995 for very technical details
# EM refs: Hofer 1998 eqn 10-12
## note Hofer eqn 12 has no sigma2e in last term of non-residual formula
### ?mistake? in Mrode 2005 (p. 241-245), which does include sigma2e
#' @rdname reml
#' @export
em <- function(nuvin, thetaG, thetaR, conv,
	modMats, nminffx, sLc, ndgeninv, sln, r){

  # Expectation-Maximization (EM) update of the (co)variance parameters.
  ## For each G-structure term g: nu_g <- (u_g' geninv_g u_g + trace) / q_g,
  ## where the trace involves the corresponding diagonal block of Cinv.
  ## Only the needed columns of Cinv are computed (one sparse solve per level)
  ## rather than forming the full inverse.
  Cinv_ii <- matrix(0, nrow = nrow(sln), ncol = 1)  #<-- make even if no varcomps
  # if no variance components (except residual), skip to the end
  if(length(thetaG) > 0){
    ## go "backwards" so can fill in Lc with lower triangle of Cinv
    # `ei` starts at the last random-effect equation; blocks are peeled off
    ## from the end as `g` runs in reverse over the G-structure terms
    ei <- modMats$nb + sum(sapply(modMats$Zg, FUN = ncol))
    Ig <- Diagonal(n = sLc@Dim[1L], x = 1)  #<-- identity; columns used as RHS
    for(g in rev(thetaG)){
      if(conv[g] == "F") next  #<-- skip if parameter Fixed
      qi <- ncol(modMats$Zg[[g]])  #<-- number of levels for term g
      si <- ei - qi + 1            #<-- start row of term g's block
      # Note: trace of a product == the sum of the element-by-element product
      ## Consequently, don't have to make `Cinv`, just diagonals
#XXX TODO see Knight 2008 thesis eqns 2.36 & 2.42 (and intermediates) for more generalized form of what is in Mrode (i.e., multivariate/covariance matrices instead of single varcomps)
##XXX eqn. 2.44 is the score/gradient! for a varcomp
      trace <- 0
      if(ndgeninv[g]){
        # Non-diagonal generalized inverse: quadratic form u' geninv u
        o <- (crossprod(sln[si:ei, , drop = FALSE], modMats$listGeninv[[g]]) %*% sln[si:ei, , drop = FALSE])@x
        for(k in si:ei){
          # k-th column of Cinv restricted to this block (one solve per level)
          Cinv_siei_k <- solve(sLc, b = Ig[, k], system = "A")[si:ei, , drop = TRUE]
          Cinv_ii[k] <- Cinv_siei_k[k-si+1]       
          # trace(geninv %*% Cinv_block) accumulated row-by-row
          trace <- trace + sum(modMats$listGeninv[[g]][(k-si+1), , drop = TRUE] * Cinv_siei_k)
        }  #<-- end for k
      } else{
          ## first term
          o <- crossprod(sln[si:ei, , drop = FALSE])
          for(k in si:ei){
            # geninv is identity, so the trace is just sum of Cinv diagonals
            Cinv_ii[k] <- solve(sLc, b = Ig[, k], system = "A")[k,]
            trace <- trace + Cinv_ii[k]
          }  #<-- end for k
        }  #<-- end if/else ndgeninv
      #TODO check `*tail(nuv,1)` correctly handles models with covariance matrices
      nuvin[g] <- as(as(matrix((o + trace) / qi),  #XXX was `trace*tail(nuvin,1)` 
        "symmetricMatrix"),
        "dsCMatrix")
      ei <- si-1  #<-- move to the block for the previous term
    }  #<-- end `for g`
  }  #<-- end if no variance components besides residuals

  # Residual variance EM update (unless constrained Fixed): y'e / (n - rank(X))
  if(conv[thetaR] != "F") nuvin[thetaR] <- crossprod(modMats$y, r) / nminffx

 return(list(nuv = nuvin, Cinv_ii = Cinv_ii))
}  #<-- end `em()`
################################################################################















################################################################################
#' @rdname reml
#' @export
ai <- function(nuvin, skel, thetaG,
		   modMats, W, sLc, sln, r,
		   thetaR = NULL,  #<-- non-NULL if lambda==FALSE
		   sigma2e = NULL){  #<-- non-NULL if lambda==TRUE

  # Compute the Average Information (AI) matrix: average of observed and
  ## expected information, used as the (approximate) Hessian in AI-REML.
  ## Builds the "working variates" B (one column per parameter), then forms
  ## 0.5 * B'PB using the existing Cholesky factorization of C.
  lambda <- is.null(thetaR)
  p <- length(nuvin)  #<-- number of (co)variance parameters
  # Reconstruct the list-of-matrices form of the parameters from the vector
  nuin <- vech2matlist(nuvin, skel)

  if(lambda){
    Rinv <- as(solve(matrix(sigma2e)), "symmetricMatrix")
  } else{
      Rinv <- as(solve(nuin[[thetaR]]), "symmetricMatrix")
    }
  # B: ny x p matrix of working variates (one column per parameter)
  B <- matrix(0, nrow = modMats$ny, ncol = p)

  # if no variance components (except residual), skip to the end
  if(length(thetaG) > 0){
    Ginv <- lapply(thetaG, FUN = function(x){as(solve(nuin[[x]]), "symmetricMatrix")})
    si <- modMats$nb+1
    for(g in thetaG){ #FIXME assumes thetaG is same length as nuvin
      qi <- ncol(modMats$Zg[[g]])
      ei <- si - 1 + qi
      # Meyer 1997: eqn. 20 [also f(theta) in Johnson & Thompson 1995 eqn 9c)]
      # Knight 2008: eqn 2.32-2.35 and 3.11-3.13 (p.34)
      #TODO for covariances see Johnson & Thompson 1995 eqn 11b
      Bg <- modMats$Zg[[g]] %*% sln[si:ei, , drop = FALSE] %*% Ginv[[g]] #FIXME is order correct? See difference in order between Johnson & Thompson (e.g., eqn. 11b) and Meyer 1997 eqn 20
      # Scatter non-zero entries of Bg into column g (0-based slot `@i`)
      B[cbind(Bg@i+1, g)] <- Bg@x
      si <- ei+1
    }  #<-- end `for g`
  } else g <- p-1  #<-- end if no varcomps TODO: `g` assignment temporary until >1 residual


  #FIXME TODO Check what to do if more than 1 residual variance parameter
  # Last column of B is the residual working variate
  if(g < p){
    if(lambda){
      B[, p] <- (modMats$y %*% Rinv)@x
    } else{
        B[, p] <- (r %*% Rinv)@x
      }
  }
  # Set up modified MME like the MMA of Meyer 1997 eqn. 23
  ## Substitute `B` instead of `y`
  if(lambda){
    BRHS <- Matrix(crossprod(W, B), sparse = TRUE) 
    tBRinvB <- crossprod(B)
  } else{
      # could pass KRinv and tWKRinv (if lambda=FALSE) from reml() into ai() 
      KRinv <- kronecker(Rinv, Diagonal(x = 1, n = modMats$Zr@Dim[[2L]]))
      tWKRinv <- crossprod(W, KRinv)
      BRHS <- Matrix(tWKRinv %*% B, sparse = TRUE)
      tBRinvB <- crossprod(B, KRinv) %*% B
    }
  ## tBPB
  ### Johnson & Thompson 1995 eqns 8,9b,9c
  #### (accomplishes same thing as Meyer 1997 eqns 22-23 for a Cholesky
  #### factorization as Boldman & Van Vleck eqn 6 applied to AI calculation)
  ## how to do this in c++ with Csparse
#   tS <- matrix(NA, nrow = ncol(BRHS), ncol = nrow(BRHS))
#    for(c in 1:p){
#      slv1P <- solve(sLc, BRHS[, c], system = "P")
#      slv1L <- solve(sLc, slv1P, system = "L")
#      slv1Lt <- solve(sLc, slv1L, system = "Lt")
#      tS[c, ] <- solve(sLc, slv1Lt, system = "Pt")@x
#    }
#    tBPB <- tBRinvB - (tS %*% BRHS)
  # B'PB = B'RinvB - B'RinvW Cinv W'RinvB, using one sparse system solve
  tBPB <- tBRinvB - crossprod(solve(sLc, BRHS, system = "A"), BRHS)
  AI <- 0.5 * tBPB 
  if(lambda) AI <- AI / sigma2e  #<-- rescale back from the lambda scale

 as(AI, "matrix")
}  #<-- end `ai()`
################################################################################    









################################################################################
#' @rdname reml
#' @export
gradFun <- function(nuvin, thetaG, modMats, Cinv, sln,
	sigma2e = NULL,   #<-- non-NULL if lambda==TRUE
	r = NULL, nminfrfx = NULL){  #<-- non-NULL if lambda==FALSE

  # First derivatives (gradient/score) of the REML log-likelihood with
  ## respect to the transformed (co)variance parameters (nu).
  ## Returns a p x 1 matrix (p = length(nuvin)), rows named after `nuvin`.
  ## Uses Johnson & Thompson 1995 eqns 9-10 and Appendix 2 eqn B3.
  lambda <- is.null(r)  #<-- lambda scale model
  p <- length(nuvin)
  dLdnu <- matrix(0, nrow = p, ncol = 1, dimnames = list(names(nuvin), NULL))
  # tee = e'e (residual sum of squares); only needed off the lambda scale
  if(!lambda) tee <- crossprod(r)

  # Skip most of below if there are no variance components other than residual
  if(length(thetaG) > 0){
    # `trCinvGeninv_gg` = trace[Cinv_gg %*% geninv_gg]
    # `tugug` = t(u_gg) %*% geninv_gg %*% u_gg
    ## `g` is the gth component of the G-structure to model
    ## `geninv` is the generalized inverse
    ### (not the inverse of the G-matrix/(co)variance components)
    trCinvGeninv_gg <- tugug <- as.list(rep(0, length(thetaG)))
    si <- modMats$nb+1
    for(g in thetaG){
      qi <- ncol(modMats$Zg[[g]])
      ei <- si - 1 + qi
#TODO XXX for using covariance matrices, see Johnson & Thompson 1995 eqn 11a
      ## Johnson & Thompson 1995 equations 9 & 10
      # Use S4 inheritance test instead of `class(x) == "ddiMatrix"`:
      ## `is()` also matches subclasses and avoids fragile string comparison
      if(is(modMats$listGeninv[[g]], "ddiMatrix")){
      ### No generalized inverse associated with the random effects
      ### Johnson & Thompson 1995 eqn 10a
        #### 3rd term in the equation
        tugug[[g]] <- crossprod(sln[si:ei, , drop = FALSE])
        #### 2nd term in the equation
        trCinvGeninv_gg[[g]] <- tr(Cinv[si:ei, si:ei])

      } else{
        ### Generalized inverse associated with the random effects
        ### Johnson & Thompson 1995 eqn 9a
          #### 3rd term in the equation
          tugug[[g]] <- crossprod(sln[si:ei, , drop = FALSE], modMats$listGeninv[[g]]) %*% sln[si:ei, , drop = FALSE]
          #### 2nd term in the equation
          # the trace of a product = the sum of the element-by-element product
          ## trace(geninv[[g]] %*% Cinv[si:ei, si:ei]
          ###                       = sum((geninv[[g]] * Cinv[si:ei, si:ei])@x)
          trCinvGeninv_gg[[g]] <- tr(modMats$listGeninv[[g]] %*% Cinv[si:ei, si:ei])
        }  #<-- end if else whether a diagonal g-inverse associated with rfx

      si <- ei+1

    }  #<-- end `for g in thetaG`

    # First derivatives (gradient/score)
    if(lambda){
      for(g in thetaG){
        # Johnson and Thompson 1995 Appendix 2 eqn B3 and eqn 9a and 10a
        dLdnu[g] <- (ncol(modMats$Zg[[g]]) / nuvin[g]) - (1 / nuvin[g]^2) * (trCinvGeninv_gg[[g]] + tugug[[g]] / sigma2e)
      }  #<-- end for g

    } else{  #<-- else when NOT lambda scale
        ## Johnson and Thompson 1995 eqn 9b
#FIXME change `[p]` below to be number of residual (co)variances
        dLdnu[p] <- (nminfrfx / tail(nuvin, 1)) - (tee / tail(nuvin, 1)^2)
        for(g in thetaG){
          dLdnu[p] <- dLdnu[p] + (1 / tail(nuvin, 1)) * (trCinvGeninv_gg[[g]] /nuvin[g]) 
          # Johnson and Thompson 1995 eqn 9a and 10a
          dLdnu[g] <- (ncol(modMats$Zg[[g]]) / nuvin[g]) - (1 / nuvin[g]^2) * (trCinvGeninv_gg[[g]] + tugug[[g]])
        }  #<-- end for g
      }
  } else{  #<-- end if varcomps besides residuals
      # First derivatives (gradient/score) if only residual variance in model
      ## if lambda==TRUE then don't do this at all and `dLdnu` is empty
###FIXME when >1 residual (co)variance need to change whether skip with lambda=TRUE
      if(!lambda) dLdnu[p] <- (nminfrfx / tail(nuvin, 1)) - (tee / tail(nuvin, 1)^2)
    }  #<-- end if/else no varcomps besides residual

 # Johnson and Thompson 1995 don't use -0.5, because likelihood is -2 log likelihood
 ## see `-2` on left-hand side of Johnson & Thompson eqn 3
 -0.5 * dLdnu
}  #<-- end `gradFun()`
##########################
# Changing ginverse elements to `dsCMatrix` doesn't speedup traces, since
## they end up more or less as dense matrices but in dgCMatrix from the product
##########################

# Try the gremlin package in your browser
#
# Any scripts or data that you put into this service are public.
#
# gremlin documentation built on July 1, 2020, 10:22 p.m.