R/prior.R

#' Prior distributions
#' 
#' Prior distributions regularize the model's weights during training. Each prior
#' provides \code{getLogGrad} (the gradient of its log density) and \code{sample};
#' some priors also provide an \code{update} method for refitting their parameters
#' to the current weights.
#' 
#' @export
#' @exportClass prior
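#' @examples
#' # Illustrative sketch: concrete priors extend this class and override getLogGrad()
#' # and sample(); the base methods are stubs that signal an error. The
#' # `unit.gaussian.prior` class below is hypothetical and exists only for this example.
#' unit.gaussian.prior = setRefClass(
#'   Class = "unit.gaussian.prior",
#'   contains = "prior",
#'   methods = list(
#'     getLogGrad = function(x) -x,   # gradient of a standard-normal log density
#'     sample = function(n) rnorm(n)
#'   )
#' )
#' unit.gaussian.prior$new()$getLogGrad(c(-1, 0, 2))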
prior = setRefClass(
  Class = "prior",
  fields = list(),
  methods = list(
    getLogGrad = function(x){stop("log gradient not defined for this prior")},
    sample = function(n){stop("sampler not defined for this prior")},
    update = function(...){stop("update not defined for this prior")}
  )
)

# Prior parameters such as mean/sd or location/scale can be either a numeric
# vector or a numeric matrix (see the update methods below)
setClassUnion("any.numeric", c("numeric", "matrix"))

#' \code{gaussian.prior} Gaussian prior with mean and sd
#' @rdname prior
#' @export gaussian.prior
#' @exportClass gaussian.prior
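#' @examples
#' # Minimal usage sketch for gaussian.prior; the numbers are illustrative only.
#' g = gaussian.prior$new(mean = 0, sd = 1)
#' g$sample(5)                  # five draws from N(mean, sd)
#' g$getLogGrad(c(-1, 0, 2))    # -(x - mean) / sd^2
#' # Refit the per-row mean and sd from a matrix of weights
#' w = matrix(rnorm(20), nrow = 4)
#' g$update(w, update.mean = TRUE, update.sd = TRUE, min.sd = 0.01)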
gaussian.prior = setRefClass(
  Class = "gaussian.prior",
  fields = list(
    mean = "any.numeric",
    sd = "any.numeric"
  ),
  contains = "prior",
  methods = list(
    getLogGrad = function(x){
      - (x - .self$mean) / .self$sd^2
    },
    sample = function(n){
      structure(
        rnorm(n, mean = mean, sd = sd),
        dim = dim(mean)
      )
    },
    update = function(weights, update.mean, update.sd, min.sd){
      if(update.mean){
        mean <<- rowMeans(weights)
      }
      if(update.sd){
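        # Shrink each row's sample variance halfway toward the average variance
        # across rows, with a floor of min.sd^2; sd becomes a matrix with one
        # column per column of weights. (mean() below is base::mean, not the field.)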
        var.vector = apply(weights, 1, var)
        var.vector = pmax(
          (var.vector + mean(var.vector)) / 2,
          min.sd^2
        )
        sd <<- replicate(ncol(weights), sqrt(var.vector))
      }
    }
  )
)

#' \code{laplace.prior} Laplace "double-exponential" prior with location and scale
#' @rdname prior
#' @export laplace.prior
#' @exportClass laplace.prior
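#' @examples
#' # Minimal usage sketch for laplace.prior; the numbers are illustrative only.
#' lp = laplace.prior$new(location = 0, scale = 0.5)
#' lp$sample(5)
#' lp$getLogGrad(c(-2, 2))      # -sign(x - location) / scale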
laplace.prior = setRefClass(
  Class = "laplace.prior",
  fields = list(
    location = "numeric",
    scale = "numeric"
  ),
  contains = "prior",
  methods = list(
    getLogGrad = function(x){
      # The sign-based gradient is exact away from zero. With a large learning
      # rate an update could overshoot past zero, but that's probably not a big
      # deal in practice.
    },
    sample = function(n){
      # Exponential magnitudes with random signs, shifted by the location parameter
      location + rexp(n, rate = 1 / scale) * base::sample(c(-1, 1), size = n, replace = TRUE)
    }
  )
) 


#' \code{logistic.prior} logistic prior distribution with location and scale
#' @rdname prior
#' @export logistic.prior
#' @exportClass logistic.prior
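#' @examples
#' # Minimal usage sketch for logistic.prior; the numbers are illustrative only.
#' lg = logistic.prior$new(location = 0, scale = 1)
#' lg$sample(5)
#' lg$getLogGrad(c(-3, 0, 3))
#' # Refit location and scale row-by-row from a matrix of weights
#' w = matrix(rlogis(200, location = 0, scale = 0.5), nrow = 4)
#' lg$update(w, update.location = TRUE, update.scale = TRUE, min.scale = 0.01)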
logistic.prior = setRefClass(
  Class = "logistic.prior",
  fields = list(
    location = "numeric",
    scale = "numeric"
  ),
  contains = "prior",
  methods = list(
    getLogGrad = function(x){
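      # d/dx log f(x) for the logistic density with location m and scale s is
      # -tanh((x - m) / (2 * s)) / s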
      -tanh((x - location)/scale/2) / scale
    },
    sample = function(n){
      rlogis(n, location, scale)
    },
    update = function(weights, update.location, update.scale, min.scale){
      if(update.location){
        location_loss = function(location, w){
          sum(-dlogis(w, location = location, log = TRUE))
        }
        location_vector = apply(
          weights,
          1,
          function(w){
            optimize(
              f = location_loss,
              interval = quantile(w, c(.01, .99)),
              w = w
            )$minimum
          }
        )
        location <<- replicate(ncol(weights), location_vector)
      }
      if(update.scale){
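        # Note: the scale fit below uses dlogis's default location of 0, i.e. it
        # assumes the weights in each row are roughly centered at zero.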
        scale_loss = function(scale, w){
          sum(-dlogis(w, scale = scale, log = TRUE))
        }
        scale_vector = apply(
          weights,
          1,
          function(w){
            optimize(
              f = scale_loss, 
              interval = c(min.scale, 10 * sd(w)),
              w = w
            )$minimum
          }
        )
        scale <<- replicate(ncol(weights), scale_vector)
      }
    }
  )
) 

#' \code{lognormal.prior} lognormal prior with meanlog and sdlog parameters
#' @rdname prior
#' @export lognormal.prior
#' @exportClass lognormal.prior
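#' @examples
#' # Minimal usage sketch for lognormal.prior; defined for positive weights only.
#' ln = lognormal.prior$new(meanlog = 0, sdlog = 1)
#' ln$sample(5)
#' ln$getLogGrad(c(0.5, 1, 2))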
lognormal.prior = setRefClass(
  Class = "lognormal.prior",
  fields = list(
    meanlog = "numeric",
    sdlog = "numeric"
  ),
  contains = "prior",
  methods = list(
    getLogGrad = function(x){
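      # d/dx log f(x) for the lognormal density is
      # -1/x - (log(x) - meanlog) / (sdlog^2 * x), written over a common denominator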
      - (sdlog^2 + log(x) - meanlog) / (sdlog^2 * x)
    },
    sample = function(n){
      rlnorm(n, meanlog, sdlog)
    }
  )
)



# The GP prior assumes that the covariance matrices (K) have already been determined
# by some other analysis. The prior can be re-initialized with a new K and noise_sd
# whenever the kernel hyperparameters are updated. Alternatively, one can update
# only the means, which should be faster.
#' \code{gp.prior} Gaussian process prior
#' @rdname prior
#' @export gp.prior
#' @exportClass gp.prior
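#' @examples
#' # Minimal usage sketch for gp.prior: a squared-exponential kernel over five input
#' # locations, shared by two rows of coefficients. The kernel and all numbers here
#' # are illustrative and not part of the package itself.
#' x = seq(0, 1, length.out = 5)
#' K1 = exp(-outer(x, x, "-")^2 / (2 * 0.3^2))
#' K = array(c(K1, K1), dim = c(5, 5, 2))   # one covariance matrix per coefficient row
#' coefs = matrix(rnorm(10), nrow = 2, ncol = 5)
#' gp = gp.prior$new(K = K, noise_sd = c(0.1, 0.1), coefs = coefs)
#' gp$getLogGrad(coefs)                     # one gradient row per coefficient row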
gp.prior = setRefClass(
  Class = "gp.prior",
  fields = list(
    noise_sd = "numeric",
    means = "matrix",
    K = "array",
    L = "array",
    alpha = "array",
    v = "array",
    posterior_var = "array",
    inverse_var = "array"
  ),
  contains = "prior",
  methods = list(
    initialize = function(K, noise_sd, coefs){
      
      if(any(missing(K), missing(noise_sd), missing(coefs))){
        return()
      }
      
      # Notation roughly follows Rasmussen & Williams 2006
      # Gaussian Processes for Machine Learning
      # Algorithm 2.1, page 19
      assert_that(length(noise_sd) == dim(K)[3])
      
      noise_sd <<- noise_sd  # Noise SD is SD of data around the true function.
                             # Vector, with one element per covariance matrix.
      
      K <<- K                # K[ , , j] is the jth prior covariance matrix.
                             # An old comment said K should include noise along the
                             # diagonal, but that appears to be incorrect: noise is
                             # added to the diagonal below.
      
      # Initialize L, v, posterior_var with correct dimensions
      L             <<- K * 0
      v             <<- K * 0
      posterior_var <<- K * 0
      
      I = diag(ncol(K)) # identity matrix, gets scaled by SD & added to diagonal of K
      
      # For each covariance matrix in K...
      for(i in seq_len(dim(K)[[3]])){
        # K matrix plus noise variance along the diagonal
        K_plus = K[ , , i] + I * noise_sd[i]^2
        
        # Cholesky decomposition is transposed to match algorithm in textbook
        L[ , , i] <<- t(chol(K_plus))
        
        # v is used below to compute the variance explained by the data:
        # crossprod(v) equals K^T (K + noise)^{-1} K
        v[ , , i] <<- solve(L[ , , i], K[ , , i])
        
        # posterior variation, used for sampling from posterior
        posterior_var[ , , i] <<- K_plus - crossprod(v[ , , i])
      }
      
      # Set prior mean function
      updateMeans(coefs)
      
    },
    getLogGrad = function(x){
      # Calculate gradient for each row of x independently.
      # sapply's output is transposed compared with what I want
      t_out = sapply(
        1:nrow(x),
        function(i){
          # https://stats.stackexchange.com/questions/90134/gradient-of-gaussian-log-likelihood
          # {1 \over 2}{\partial (\theta - \mu)^T\Sigma^{-1}(\theta - \mu) \over \partial \theta} = \Sigma^{-1}(\theta - \mu).
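          # Only the diagonal of the posterior covariance is used here, i.e. the
          # gradient treats the weights as independent Gaussians with the
          # corresponding posterior variances.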
          - (x[i, ] - means[i, ]) / diag(posterior_var[,,i])
        }
      )
      
      t(t_out)
    },
    update = function(weights, update.mean, update.sd, min.sd){
      if(update.mean){
        updateMeans(weights)
      }
      if(update.sd){
        warning("no update.sd method implemented for gp priors")
      }
    },
    updateMeans = function(coefs, ...){
      # Each row of coefficients has its own posterior mean
      if(length(means) == 0){
        # Initialize means and alphas as NAs if blank
        means <<- matrix(NA, nrow = nrow(coefs), ncol = ncol(coefs))
        alpha <<- matrix(NA, nrow = nrow(coefs), ncol = ncol(coefs))
      }
      for(i in 1:nrow(coefs)){
        y = coefs[i, ]
        centered.y = y - mean(y)
        
        # Does this improperly pull species toward their current values, rather than just
        # toward their neighbors??
        
        # calculate new prior means on centered variables, then add the mean back on
        alpha[i, ] <<- solve(t(L[ , , i]), solve(L[ , , i], centered.y))
        means[i, ] <<- K[, , i] %*% alpha[i, ] + mean(y)
      }
    }
  )
)