radish: Fast gradient-based optimization of resistance surfaces

Documented in mlpe

#' Maximum likelihood population effects
#'
#' A function of class \code{measurement_model} that calculates likelihood,
#' gradient, hessian, and partial derivatives of nuisance parameters and the
#' Laplacian generalized inverse, using the "maximum likelihood population
#' effects" model of Clarke et al (2002) with a non-negative slope
#' between genetic and resistance distance.
#'
#' @param E A submatrix of the generalized inverse of the graph Laplacian (e.g. a covariance matrix)
#' @param S A matrix of observed genetic distances
#' @param phi Nuisance parameters (see details)
#' @param nu Unused
#' @param gradient Compute gradient of negative loglikelihood with regard to \code{phi}?
#' @param hessian Compute Hessian matrix of negative loglikelihood with regard to \code{phi}?
#' @param partial Compute second partial derivatives of negative loglikelihood with regard to \code{phi}, \code{E}, \code{S}?
#' @param nonnegative Force slope to be nonnegative?
#' @param validate Numerical validation via package \code{numDeriv} (very slow, use for debugging small examples)
#'
#' @details The nuisance parameters \code{phi} are the intercept ("alpha"), slope ("beta"), negative log residual
#' deviation ("tau"), and logit-transformed correlation parameter ("rho") of the MLPE regression. If not supplied, \code{phi} is
#' is estimated via maximum likelihood using package \code{corMLPE} (github.com/nspope/corMLPE) and \code{nlme::gls}.
#'
#' TODO: formula
#'
#' @seealso \code{\link{radish_measurement_model}}
#'
#' @return A list containing:
#'  \item{covariance}{rows/columns of the generalized inverse of the graph Laplacian for a subset of target vertices}
#'  \item{objective}{(if \code{objective}) the negative loglikelihood}
#'  \item{fitted}{((if \code{objective}) a matrix of expected genetic distances among target vertices}
#'  \item{boundary}{(if \code{objective}) is the MLE on the boundary (e.g. no genetic structure)?}
#'  \item{gradient}{(if \code{gradient}) gradient of negative loglikelihood with respect to phi}
#'  \item{hessian}{(if \code{hessian}) Hessian matrix of the negative loglikelihood with respect to phi}
#'  \item{gradient_E}{(if \code{partial}) gradient with respect to the generalized inverse of the graph Laplacian}
#'  \item{partial_E}{(if \code{partial}) Jacobian of \code{gradient_E} with respect to phi}
#'  \item{partial_S}{(if \code{partial}) Jacobian of \code{gradient} with respect to S}
#'  \item{jacobian_E}{(if \code{partial}) a function used for reverse algorithmic differentiation}
#'  \item{jacobian_S}{(if \code{partial}) a function used for reverse algorithmic differentiation}
#'
#' @references
#' Clarke et al. TODO
#'
#' @examples
#' library(raster)
#' 
#' data(melip)
#' 
#' covariates <- raster::stack(list(altitude=melip.altitude, forestcover=melip.forestcover))
#' surface <- conductance_surface(covariates, melip.coords, directions = 8)
#'
#' # inverse of graph Laplacian at null model (IBD) 
#' laplacian_inv <- radish_distance(theta = matrix(0, 1, 2), 
#'                                  formula = ~forestcover + altitude,
#'                                  data = surface,
#'                                  radish::loglinear_conductance, 
#'                                  covariance = TRUE)$covariance[,,1]
#' 
#' mlpe(laplacian_inv, melip.Fst) #without 'phi': return MLE of phi
#' mlpe(laplacian_inv, melip.Fst, phi = c(0., 0.5, -0.1))
#'
#' @export

mlpe <- function(E, S, phi, nu = NULL, gradient = TRUE, hessian = TRUE, partial = TRUE, nonnegative = TRUE, validate = FALSE)
{
  symm <- function(X) (X + t(X))/2

  if (missing(phi)) #return starting values and boundaries for optimization of phi
  {
    ones <- matrix(1, nrow(E), 1)
    Ed   <- diag(E)
    R    <- Ed %*% t(ones) + ones %*% t(Ed) - 2 * symm(E)
    Rl   <- R[lower.tri(R)]
    Sl   <- S[lower.tri(S)]
    Ind  <- which(lower.tri(R), arr.ind = TRUE)

    fit  <- nlme::gls(Sl ~ Rl, method = "ML", correlation = corMLPE::corMLPE(form = ~Ind1 + Ind2), 
                      data = data.frame(Sl, Rl, Ind1 = Ind[,1], Ind2 = Ind[,2]))

    if (!nonnegative || coef(fit)[2] > 0) 
    {
      phi <- coef(fit)
      names(phi) <- NULL
      rho <- fit$modelStruct$corStruct[[1]] #already unconstrained
      phi <- c("alpha" = phi[1], "beta" = phi[2], "tau" = -2 * log(sigma(fit)), 
               "rho" = rho)
    }
    else
    {
      fit  <- nlme::gls(Sl ~ 1, method = "ML", correlation = corMLPE::corMLPE(form = ~Ind1 + Ind2), 
                        data = data.frame(Sl, Ind1 = Ind[,1], Ind2 = Ind[,2]))
      phi <- coef(fit)
      names(phi) <- NULL
      rho <- fit$modelStruct$corStruct[[1]] #already unconstrained
      phi <- c("alpha" = phi[1], "beta" = 0, "tau" = -2 * log(sigma(fit)), 
               "rho" = rho)
    }

    return(list(phi   = phi, 
                lower = if (nonnegative) c(-Inf, 0, -Inf, -Inf) 
                        else c(-Inf, -Inf, -Inf, -Inf), 
                upper = c(Inf, Inf, Inf, Inf)))
  }
  else if (!(is.matrix(E)    & 
             is.matrix(S)    & 
             all(dim(E)  == dim(S)) &
             is.numeric(phi) & 
             length(phi) == 4 ))
    stop ("invalid inputs")

  names(phi) <- c("alpha", "beta", "tau", "rho")

  alpha <- phi["alpha"]
  beta  <- phi["beta"]
  tau   <- exp(phi["tau"])
  rho   <- 0.5 * plogis(phi["rho"])

  ones <- matrix(1, nrow(E), 1)
  Ed   <- diag(E)
  R    <- Ed %*% t(ones) + ones %*% t(Ed) - 2 * symm(E)

  Rl  <- R[lower.tri(R)]
  Sl  <- S[lower.tri(S)]
  Ind <- which(lower.tri(R), arr.ind = TRUE)

  unos   <- matrix(1, length(Sl), 1)
  U      <- Matrix::sparseMatrix(i = rep(1:length(Sl), 2), j = c(Ind), x = c(unos))

  if (nrow(E) > length(.mlpe_eigen))
  {
    warning("Cannot use precomputed eigendecomposition, consider updating `.mlpe_eigen`")
    eigUtU <- eigen(as.matrix(Matrix::t(U) %*% U)) #ideally this is precomputed
  }
  else
    eigUtU <- .mlpe_eigen[[nrow(E)]]
  D      <- eigUtU$values
  P      <- eigUtU$vectors
  Dr     <- D/(1-2*rho) + 1/rho

  SigmaInv <- function(x)
  {
    Ax <- 1/(1-2*rho) * x
    x <- Matrix::t(U) %*% Ax
    x <- t(P) %*% x
    x <- x / Dr
    x <- P %*% x
    x <- Ax - 1/(1-2*rho) * U %*% x
    as.matrix(x)
  }

  SigmaLogDet <- sum(log(Dr)) + length(D) * log(rho) + length(Sl) * log(1 - 2*rho)

  # products against inverse correlation matrix
  e       <- Sl - alpha * unos - beta * Rl
  Si_e    <- SigmaInv(e)
  loglik  <- -0.5 * tau * t(e) %*% Si_e + 0.5 * nrow(e) * log(tau) - 0.5 * SigmaLogDet 
  #check here that matches corMLPE

  distance <- matrix(0, nrow(S), ncol(S))
  distance[lower.tri(distance)] <- Rl
  distance <- distance + t(distance)
  fitted <- alpha + beta * distance

  # gradients, hessians, mixed partial derivatives
  if (gradient || hessian || partial)
  {
    dPhi    <- matrix(0, length(phi), 1)
    ddPhi   <- matrix(0, length(phi), length(phi))
    ddEdPhi <- matrix(0, length(Rl),  length(phi))
    ddPhidS <- matrix(0, length(phi), length(Sl))
    rownames(dPhi) <- colnames(ddPhi) <- 
      rownames(ddPhi) <- colnames(ddEdPhi) <- 
        rownames(ddPhidS) <- names(phi)

    drho_Si_e  <- Matrix::t(U) %*% Si_e
    drho_Si_e  <- as.matrix(2*Si_e - U %*% drho_Si_e)
    drho_trans <- rho * (1 - 2*rho)

    # gradient, phi
    dPhi["alpha",] <- t(unos) %*% Si_e * tau
    dPhi["beta",]  <- t(Rl) %*% Si_e * tau
    dPhi["tau",]   <- -0.5 * tau * t(e) %*% Si_e + 0.5 * length(e)
    dPhi["rho",]   <- 
      (-0.5 * tau * t(Si_e) %*% drho_Si_e - 
       0.5 * sum((2*D/(1-2*rho)^2 - 1/rho^2)/Dr) - 
       0.5 * length(D)/rho + length(Sl)/(1-2*rho)) * drho_trans

    if (hessian || partial)
    {
      Si_unos      <- SigmaInv(unos)
      Si_Rl        <- SigmaInv(Rl)
      Si_Sl        <- SigmaInv(Sl)
      Si_drho_Si_e <- SigmaInv(drho_Si_e)

      # hessian, phi x phi
      ddPhi["alpha", "alpha"] <- -tau * t(unos) %*% Si_unos
      ddPhi["alpha",  "beta"] <- -tau * t(unos) %*% Si_Rl
      ddPhi["alpha",   "tau"] <- tau * t(unos) %*% Si_e
      ddPhi["alpha",   "rho"] <- tau * t(Si_unos) %*% drho_Si_e * drho_trans
      ddPhi[ "beta",  "beta"] <- -tau * t(Rl) %*% Si_Rl
      ddPhi[ "beta",   "tau"] <- tau * t(Rl) %*% Si_e
      ddPhi[ "beta",   "rho"] <- tau * t(Si_Rl) %*% drho_Si_e * drho_trans
      ddPhi[  "tau",   "tau"] <- -0.5 * tau * t(e) %*% Si_e
      ddPhi[  "tau",   "rho"] <- -0.5 * tau * t(Si_e) %*% drho_Si_e * drho_trans
      ddPhi[  "rho",   "rho"] <- 
        (-tau * t(drho_Si_e) %*% Si_drho_Si_e +
         -0.5 * sum((8*D/(1-2*rho)^3 + 2/rho^3)/Dr) + 0.5 * sum((2*D/(1-2*rho)^2 - 1/rho^2)^2/Dr^2) + 
         0.5 * length(D)/rho^2 + 2 * length(Sl)/(1-2*rho)^2) * drho_trans^2 + dPhi["rho",] * (1 - 4*rho)
      ddPhi                   <- ddPhi + t(ddPhi)
      diag(ddPhi)             <- diag(ddPhi)/2

      if (partial)
      {
        # gradient wrt E
        dR <- matrix(0, nrow(R), ncol(R))
        dR[lower.tri(dR)] <- 2 * beta * tau * as.vector(Si_e)
        dR <- symm(dR)
        dE <- diag(nrow(R)) * (dR %*% ones %*% t(ones)) - dR

        # hessian offdiagonal, E x phi
        ddEdPhi[, "alpha"] <- -2 * beta * tau * Si_unos
        ddEdPhi[,  "beta"] <- 2 * tau * (Si_e - beta * Si_Rl)
        ddEdPhi[,   "tau"] <- 2 * beta * tau * Si_e
        ddEdPhi[,   "rho"] <- 2 * beta * tau * Si_drho_Si_e * drho_trans
        ddEdPhi            <- apply(ddEdPhi, 2, function(x) { X <- matrix(0,nrow(E),ncol(E)); X[lower.tri(X)] <- x; X <- symm(X); diag(nrow(E)) * (X %*% ones %*% t(ones)) - X })

        # hessian offdiagonal, S x phi
        ddPhidS["alpha",] <- tau * Si_unos
        ddPhidS["beta",]  <- tau * Si_Rl
        ddPhidS["tau",]   <- -tau * Si_e
        ddPhidS["rho",]   <- -tau * Si_drho_Si_e * drho_trans

        # jacobian products (label these properly)
        jacobian_E <- function(dE)
        {
          ddEdE <- diag(dE) %*% t(ones) + ones %*% t(diag(dE)) - 2 * symm(dE)
          ddEdE[lower.tri(ddEdE)] <- SigmaInv(ddEdE[lower.tri(ddEdE)])
          ddEdE[upper.tri(ddEdE)] <- 0
          ddEdE <- ddEdE + t(ddEdE)
          ddEdE <- -beta^2 * tau * ddEdE
          ddEdE <- diag(nrow(dE)) * (ddEdE %*% ones %*% t(ones)) - ddEdE
          -ddEdE
        }

        jacobian_S <- function(dE)
        {
          ddEdE <- diag(dE) %*% t(ones) + ones %*% t(diag(dE)) - 2 * symm(dE)
          ddEdE[lower.tri(ddEdE)] <- SigmaInv(ddEdE[lower.tri(ddEdE)])
          ddEdE[upper.tri(ddEdE)] <- 0
          ddEdE <- ddEdE + t(ddEdE)
          ddEdE <- -beta * tau * ddEdE
          ddEdE <- diag(nrow(dE)) * (ddEdE %*% ones %*% t(ones)) - ddEdE
          diag(ddEdE) <- 0
          -ddEdE
        }
      }
    }
  }

  # numerical validation
  if (validate)
  {
    num_gradient <- numDeriv::grad(function(x) 
                                   mlpe(E = E, 
                                        phi = x, 
                                        S = S)$objective, 
                                   phi)

    num_hessian <- numDeriv::hessian(function(x) 
                                     mlpe(E = E, 
                                          phi = x, 
                                          S = S)$objective, 
                                     phi)

    num_gradient_E <- symm(matrix(numDeriv::grad(function(x) 
                                                 mlpe(E = x, 
                                                      phi = phi, 
                                                      S = S)$objective, 
                                                 E), 
                                  nrow(E), ncol(E)))

    num_partial_E <- numDeriv::jacobian(function(x) 
                                        mlpe(E = E, 
                                             phi = x, 
                                             S = S)$gradient_E, 
                                        phi)

    num_partial_S <- numDeriv::jacobian(function(x) 
                                        mlpe(E = E, 
                                             phi = phi, 
                                             S = x)$gradient, 
                                        S)[,lower.tri(S)]

    num_jacobian_E <- function(X) 
      matrix(c(X) %*% numDeriv::jacobian(function(x) 
                                         mlpe(E = x, 
                                              phi = phi, 
                                              S = S)$gradient_E, 
                                         E), 
             nrow(X), ncol(X))

    num_jacobian_S <- function(X) 
      matrix(c(X) %*% numDeriv::jacobian(function(x) 
                                         mlpe(E = E, 
                                              phi = phi, 
                                              S = x)$gradient_E, 
                                         S), 
             nrow(X), ncol(X))
  }

  list(objective        = -c(loglik), 
       fitted           = fitted,
       boundary         = nonnegative && beta == 0,
       gradient         = if(!gradient) NULL else -dPhi,
       hessian          = if(!hessian)  NULL else -ddPhi,
       gradient_E       = if(!partial)  NULL else -dE, 
       partial_E        = if(!partial)  NULL else -ddEdPhi,   # partial_E[i,k] is d(dl/dE_i)/dPhi_k where i is linearized matrix index
       partial_S        = if(!partial)  NULL else -ddPhidS,   # partial_S[i,k] is d(dl/dPhi_i)/dS_k where k is linearized matrix index
       jacobian_E       = if(!partial)  NULL else jacobian_E, # function mapping vectorized dg/dE to d(dg/dE)/dE
       jacobian_S       = if(!partial)  NULL else jacobian_S, # function mapping vectorized dg/dE to d(dg/dE)/dS
       num_gradient     = if(!validate) NULL else num_gradient,
       num_hessian      = if(!validate) NULL else num_hessian,
       num_gradient_E   = if(!validate) NULL else num_gradient_E,
       num_partial_E    = if(!validate) NULL else num_partial_E,
       num_partial_S    = if(!validate) NULL else num_partial_S,
       num_jacobian_E   = if(!validate) NULL else num_jacobian_E,
       num_jacobian_S   = if(!validate) NULL else num_jacobian_S)
}
class(mlpe) <- "radish_measurement_model"

# pre-compute needed eigendecomposition for 3-100 demes
# eventually I could store this for larger range in /include?
.mlpe_eigen <- lapply(1:300, function(n) {
                        if (n == 1) NA
                        else {
                          Ind    <- which(lower.tri(diag(n)), arr.ind = TRUE)
                          U      <- Matrix::sparseMatrix(i = rep(1:nrow(Ind), 2), j = c(Ind), x = rep(1, nrow(Ind)*2))
                          eigen(as.matrix(Matrix::t(U) %*% U))
                        }
       })