R/init.R

#' @title Initialize the parameters of a generalized matrix factorization model
#'
#' @description
#' Provide four initialization methods to set the initial values of
#' a generalized matrix factorization (GMF) model identified by a \code{\link{glm}} family
#' and a linear predictor of the form \eqn{g(\mu) = \eta = X B^\top + A Z^\top + U V^\top},
#' with bijective link function \eqn{g(\cdot)}.
#' See \code{\link{sgdgmf.fit}} for more details on the model specification.
#'
#' @param Y matrix of responses (\eqn{n \times m})
#' @param X matrix of row-specific fixed effects (\eqn{n \times p})
#' @param Z matrix of column-specific fixed effects (\eqn{q \times m})
#' @param ncomp rank of the latent matrix factorization
#' @param family a model family, as in the \code{\link{glm}} interface
#' @param weights matrix of constant weights (\eqn{n \times m})
#' @param offset matrix of constant offset (\eqn{n \times m})
#' @param method optimization method to be used for the initial fit
#' @param type type of residuals to be used for initializing \code{U} via incomplete SVD decomposition
#' @param niter number of iterations to refine the initial estimate (only if \code{method="ols"} or \code{"svd"})
#' @param values a list of custom initial values for \code{B}, \code{A}, \code{U} and \code{V}
#' @param verbose if \code{TRUE}, prints the status of the initialization process
#' @param parallel if \code{TRUE}, allows for parallel computing using the \code{foreach} package (only if \code{method="glm"})
#' @param nthreads number of cores to be used in parallel (only if \code{parallel=TRUE} and \code{method="glm"})
#' @param savedata if \code{TRUE}, stores a copy of the input data
#'
#' @return
#' An \code{initgmf} object, namely a list, containing the initial estimates of the GMF parameters.
#' In particular, the returned object collects the following information:
#' \itemize{
#'   \item \code{Y}: response matrix (only if \code{savedata=TRUE})
#'   \item \code{X}: row-specific covariate matrix (only if \code{savedata=TRUE})
#'   \item \code{Z}: column-specific covariate matrix (only if \code{savedata=TRUE})
#'   \item \code{B}: the estimated col-specific coefficient matrix
#'   \item \code{A}: the estimated row-specific coefficient matrix
#'   \item \code{U}: the estimated factor matrix
#'   \item \code{V}: the estimated loading matrix
#'   \item \code{phi}: the estimated dispersion parameter
#'   \item \code{method}: the selected estimation method
#'   \item \code{family}: the model family
#'   \item \code{ncomp}: rank of the latent matrix factorization
#'   \item \code{type}: type of residuals used for the initialization of \code{U}
#'   \item \code{verbose}: if \code{TRUE}, print the status of the initialization process
#'   \item \code{parallel}: if \code{TRUE}, allows for parallel computing
#'   \item \code{nthreads}: number of cores to be used in parallel
#'   \item \code{savedata}: if \code{TRUE}, stores a copy of the input data
#' }
#'
#' @details
#' If \code{method = "ols"}, the initialization is performed fitting a sequence of linear
#' regressions followed by a residual SVD decomposition.
#' To account for non-Gaussian distribution of the data, regression and
#' decomposition are applied on the transformed response matrix \eqn{Y_h = (g \circ h)(Y)},
#' where \eqn{h(\cdot)} is a function which prevent \eqn{Y_h} to take infinite values.
#' For instance, in the Binomial case \eqn{h(y) = 2 (1-\epsilon) y + \epsilon},
#' while in the Poisson case \eqn{h(y) = y + \epsilon}, where \eqn{\epsilon} is a small
#' positive constant, typically \code{0.1} or \code{0.01}.
#'
#' If \code{method = "glm"}, the initialization is performed by fitting a sequence of
#' generalized linear models followed by a residual SVD decomposition.
#' In particular, to set \eqn{\beta_j}, we use independent GLM fit with \eqn{y_j \sim X \beta_j}.
#' Similarly, to set \eqn{\alpha_i}, we fit the model \eqn{y_i \sim Z \alpha_i + o_i}, with offset \eqn{o_i = B x_i}.
#' Then, we obtain \eqn{U} via SVD on the residuals. Finally, we obtain \eqn{V} via independent GLM fit
#' under the model \eqn{y_j \sim U v_j + o_j}, with offset \eqn{o_i = X \beta_j + A z_j}.
#'
#' Both under \code{method = "ols"} and \code{method = "glm"}, it is possible to specify the
#' parameter \code{type} to change the type of residuals used for the SVD decomposition.
#'
#' If  \code{method = "random"}, the initialization is performed using independent Gaussian
#' random values for all the parameters in the model.
#'
#' If \code{method = "values"}, the initialization is performed using user-specified
#' values provided as an input, which must have compatible dimensions.
#'
#' @examples
#' library(sgdGMF)
#'
#' # Set the data dimensions
#' n = 100; m = 20; d = 5
#'
#' # Generate data using Poisson, Binomial and Gamma models
#' data_pois = sim.gmf.data(n = n, m = m, ncomp = d, family = poisson())
#' data_bin = sim.gmf.data(n = n, m = m, ncomp = d, family = binomial())
#' data_gam = sim.gmf.data(n = n, m = m, ncomp = d, family = Gamma(link = "log"), dispersion = 0.25)
#'
#' # Initialize the GMF parameters assuming 3 latent factors
#' init_pois = sgdgmf.init(data_pois$Y, ncomp = 3, family = poisson(), method = "ols")
#' init_bin = sgdgmf.init(data_bin$Y, ncomp = 3, family = binomial(), method = "ols")
#' init_gam = sgdgmf.init(data_gam$Y, ncomp = 3, family = Gamma(link = "log"), method = "ols")
#'
#' # Get the fitted values in the link and response scales
#' mu_hat_pois = fitted(init_pois, type = "response")
#' mu_hat_bin = fitted(init_bin, type = "response")
#' mu_hat_gam = fitted(init_gam, type = "response")
#'
#' # Compare the results
#' oldpar = par(no.readonly = TRUE)
#' par(mfrow = c(3,3), mar = c(1,1,3,1))
#' image(data_pois$Y, axes = FALSE, main = expression(Y[Pois]))
#' image(data_pois$mu, axes = FALSE, main = expression(mu[Pois]))
#' image(mu_hat_pois, axes = FALSE, main = expression(hat(mu)[Pois]))
#' image(data_bin$Y, axes = FALSE, main = expression(Y[Bin]))
#' image(data_bin$mu, axes = FALSE, main = expression(mu[Bin]))
#' image(mu_hat_bin, axes = FALSE, main = expression(hat(mu)[Bin]))
#' image(data_gam$Y, axes = FALSE, main = expression(Y[Gam]))
#' image(data_gam$mu, axes = FALSE, main = expression(mu[Gam]))
#' image(mu_hat_gam, axes = FALSE, main = expression(hat(mu)[Gam]))
#' par(oldpar)
#'
#' @export sgdgmf.init
sgdgmf.init = function (
    Y,
    X = NULL,
    Z = NULL,
    ncomp = 2,
    family = gaussian(),
    weights = NULL,
    offset = NULL,
    method = c("ols", "glm", "random", "values"),
    type = c("deviance", "pearson", "working", "link"),
    niter = 0,
    values = list(),
    verbose = FALSE,
    parallel = FALSE,
    nthreads = 1,
    savedata = TRUE
) {

  # Set the initialization method
  method = match.arg(method)

  # Initialize the parameters using the selected method
  init = switch(method,
    "ols" = sgdgmf.init.ols(Y, X, Z, ncomp, family, weights, offset, type, verbose),
    "glm" = sgdgmf.init.glm(Y, X, Z, ncomp, family, weights, offset, type, verbose, parallel, nthreads),
    "random" = sgdgmf.init.random(Y, X, Z, ncomp),
    "values" = sgdgmf.init.custom(Y, X, Z, ncomp, family, values, verbose))

  # Data dimensions
  n = nrow(Y)
  m = ncol(Y)
  p = ifelse(is.null(X), 1, ncol(X))
  q = ifelse(is.null(Z), 1, ncol(Z))

  # Save all the initialization options
  init$method = method
  init$family = family
  init$ncomp = ncomp
  init$npar = n * p + m * q + ncomp * (n + m)
  init$type = type
  init$verbose = verbose
  init$parallel = parallel
  init$nthreads = nthreads
  init$savedata = savedata

  if (savedata) {
    init$Y = matrix(NA, nrow = n, ncol = m)
    init$X = matrix(NA, nrow = n, ncol = p)
    init$Z = matrix(NA, nrow = m, ncol = q)
    init$weights = matrix(NA, nrow = n, ncol = m)
    init$offset = matrix(NA, nrow = n, ncol = m)

    init$Y[] = Y
    init$X[] = if (!is.null(X)) X else matrix(1, nrow = n, ncol = p)
    init$Z[] = if (!is.null(Z)) Z else matrix(1, nrow = m, ncol = q)
    init$weights[] = if (!is.null(weights)) weights else matrix(1, nrow = n, ncol = m)
    init$offset[] = if (!is.null(offset)) offset else matrix(0, nrow = n, ncol = m)
  }

  # Set the initialization class
  class(init) = "initgmf"

  # Return the initial estimates
  return (init)
}

#' @rdname sgdgmf.init
#' @keywords internal
sgdgmf.init.ols = function (
    Y,
    X = NULL,
    Z = NULL,
    ncomp = 2,
    family = gaussian(),
    weights = NULL,
    offset = NULL,
    type = c("deviance", "pearson", "working", "link"),
    verbose = FALSE
) {
  # Set the residual type
  type = match.arg(type)

  # Set the covariate matrices
  if (is.null(X)) X = matrix(1, nrow = nrow(Y), ncol = 1)
  if (is.null(Z)) Z = matrix(1, nrow = ncol(Y), ncol = 1)
  if (is.null(weights)) weights = matrix(1, nrow = nrow(Y), ncol = ncol(Y))
  if (is.null(offset)) offset = matrix(0, nrow = nrow(Y), ncol = ncol(Y))

  # Set the minimum and maximum of the data
  minY = min(Y)
  maxY = max(Y)

  # Set the model dimensions
  n = nrow(Y)
  m = ncol(Y)
  p = ncol(X)
  q = ncol(Z)
  d = ncomp

  # Set the data transformation to use for
  # the initialization of the working data
  family = set.family(family)

  # Compute the transformed data
  if (verbose) cat(" Initialization: working data \n")
  if (anyNA(Y)) {
    isna = is.na(Y)
    Y[] = apply(Y, 2, function (x) {
      x[is.na(x)] = mean(x, na.rm = TRUE)
      return (x)
    })
  }
  gY = family$transform(Y)

  # Initialize the parameters
  B = matrix(NA, nrow = m, ncol = p)
  A = matrix(NA, nrow = n, ncol = q)
  U = matrix(NA, nrow = n, ncol = d)
  V = matrix(NA, nrow = m, ncol = d)
  eta = matrix(NA, nrow = n, ncol = m)

  # Fill the initial linear predictor
  eta[] = offset

  # Compute the initial column-specific regression parameters
  if (verbose) cat(" Initialization: column-specific covariates \n")
  B[] = ols.fit.coef(gY, X, offset = eta)
  eta = eta + tcrossprod(X, B)

  # Compute the initial row-specific regression parameter
  if (verbose) cat(" Initialization: row-specific covariates \n")
  A[] = ols.fit.coef(t(gY), Z, offset = t(eta))
  eta = eta + tcrossprod(A, Z)

  # Compute the GLM residuals to be decompose via PCA
  mu = family$linkinv(eta)
  mu[mu > maxY] = maxY
  mu[mu < minY] = minY

  res = switch(type,
    "deviance" = sign(Y - mu) * sqrt(abs(family$dev.resids(Y, mu, 1))),
    "pearson" = (Y - mu) / sqrt(family$variance(mu)),
    "working" = (Y - mu) * family$mu.eta(eta) / abs(family$variance(mu)),
    "link" = (gY - eta))

  if (anyNA(res)) res[is.nan(res) | is.na(res)] = 0

  # Compute the initial latent factors via incomplete SVD
  if (verbose) cat(" Initialization: latent scores \n")
  U[] = RSpectra::svds(res, ncomp)$u

  # Compute the initial loading matrix via OLS
  if (verbose) cat(" Initialization: latent loadings \n")
  V[] = ols.fit.coef(gY, U, offset = eta)
  eta = eta + tcrossprod(U, V)

  # Compute the initial vector of dispersion parameters
  mu = family$linkinv(eta)
  var = family$variance(mu)
  phi = colMeans((Y - mu)^2 / var, na.rm = TRUE)

  # Return the obtained initial values
  list(U = U, V = V, A = A, B = B, phi = phi)
}


#' @rdname sgdgmf.init
#' @keywords internal
sgdgmf.init.glm = function (
    Y,
    X = NULL,
    Z = NULL,
    ncomp = 2,
    family = gaussian(),
    weights = NULL,
    offset = NULL,
    type = c("deviance", "pearson", "working", "link"),
    verbose = FALSE,
    parallel = FALSE,
    nthreads = 1
) {

  # Model dimensions
  n = nrow(Y)
  m = ncol(Y)
  d = ncomp

  # Set the residual type
  type = match.arg(type)

  # Set the data transformation to use for
  # the initialization of the working data
  family = set.family(family)

  # Compute the transformed data
  if (verbose) cat(" Initialization: working data \n")
  if (anyNA(Y)) {
    isna = is.na(Y)
    Y[] = apply(Y, 2, function (x) {
      x[is.na(x)] = mean(x, na.rm = TRUE)
      return (x)
    })
  }

  # Set the covariate matrices
  if (is.null(X)) X = matrix(1, nrow = n, ncol = 1)
  if (is.null(Z)) Z = matrix(1, nrow = m, ncol = 1)
  if (is.null(weights)) weights = matrix(1, nrow = n, ncol = m)
  if (is.null(offset)) offset = matrix(0, nrow = n, ncol = m)

  # Set the minimum and maximum of the data
  minY = min(Y)
  maxY = max(Y)

  # Initialize the mean and variance matrices
  eta = matrix(NA, nrow = n, ncol = m)
  mu  = matrix(NA, nrow = n, ncol = m)
  var = matrix(NA, nrow = n, ncol = m)
  res = matrix(NA, nrow = n, ncol = m)

  # Register and open the connection to the clusters
  clust = NULL
  if (parallel) {
    ncores = parallel::detectCores() - 1
    ncores = max(1, min(nthreads, ncores))
    clust = parallel::makeCluster(ncores)
    doParallel::registerDoParallel(clust)
  }

  # Fill the initial linear predictor
  eta[] = offset

  # Column-specific covariate vector initialization
  if (verbose) cat(" Initialization: column-specific covariates \n")
  B = vglm.fit.coef(Y, X, family = family, weights = weights, offset = eta,
                    parallel = parallel, nthreads = nthreads, clust = clust)

  # Update the linear predictor
  eta[] = eta + tcrossprod(X, B)

  # Row-specific covariate vector initialization
  if (verbose) cat(" Initialization: row-specific covariates \n")
  A = vglm.fit.coef(t(Y), Z, family = family, weights = t(weights), offset = t(eta),
                    parallel = parallel, nthreads = nthreads, clust = clust)

  # Update the linear predictor and the conditional mean matrix
  eta[] = eta + tcrossprod(A, Z)
  mu[] = family$linkinv(eta)

  mu[mu > maxY] = maxY
  mu[mu < minY] = minY

  # Compute the residuals
  res[] = switch(type,
    "deviance" = sign(Y - mu) * sqrt(abs(family$dev.resids(Y, mu, 1))),
    "pearson" = (Y - mu) / sqrt(abs(family$variance(mu))),
    "working" = (Y - mu) * family$mu.eta(eta) / abs(family$variance(mu)),
    "link" = (family$transform(Y) - eta))

  if (anyNA(res)) res[is.na(res) | is.nan(res)] = 0

  # Initialize the latent factors via residual SVD
  if (verbose) cat(" Initialization: latent scores \n")
  U = RSpectra::svds(res, k = ncomp)$u

  # Initialize the loading matrix via GLM regression
  if (verbose) cat(" Initialization: latent loadings \n")
  V = vglm.fit.coef(Y, U, family = family, weights = weights, offset = eta,
                    parallel = parallel, nthreads = nthreads, clust = clust)

  # Close the connection to the clusters
  if (parallel) parallel::stopCluster(clust)

  # Compute the initial vector of dispersion parameters
  eta[] = eta + tcrossprod(U, V)
  mu[] = family$linkinv(eta)
  var[] = family$variance(mu)
  phi = colMeans((Y - mu)^2 / var, na.rm = TRUE)

  # Output
  list(U = U, V = V, A = A, B = B, phi = phi)
}


#' @rdname sgdgmf.init
#' @keywords internal
sgdgmf.init.random = function (
    Y,
    X = NULL,
    Z = NULL,
    ncomp = 2,
    family = gaussian(),
    weights = NULL,
    offset = NULL,
    sigma = 1
) {

  # Derive data dimensions
  n = nrow(Y)
  m = ncol(Y)
  d = ncomp

  # Derive covariate dimensions
  p = 1
  q = 1
  if (!is.null(X)) p = ncol(X) # n x p matrix
  if (!is.null(Z)) q = ncol(Z) # m x q matrix

  # parameter dimensions
  dimU = c(n, d)
  dimV = c(m, d)
  dimB = c(m, p)
  dimA = c(n, q)

  # parameter generation
  sd = 1e-01 * sigma
  U = array(stats::rnorm(prod(dimU)) / prod(dimU) * sd, dimU)
  V = array(stats::rnorm(prod(dimV)) / prod(dimV) * sd, dimV)
  B = array(stats::rnorm(prod(dimB)) / prod(dimB) * sd, dimB)
  A = array(stats::rnorm(prod(dimA)) / prod(dimA) * sd, dimA)
  phi = rep(1, length = m)

  # output
  list(U = U, V = V, A = A, B = B, phi = phi)
}


#' @rdname sgdgmf.init
#' @keywords internal
sgdgmf.init.custom = function (
    Y,
    X = NULL,
    Z = NULL,
    ncomp = 2,
    family = gaussian(),
    values = list(),
    verbose = FALSE
) {

  # Set the covariate matrices
  if (is.null(X)) X = matrix(1, nrow = n, ncol = 1)
  if (is.null(Z)) Z = matrix(1, nrow = n, ncol = 1)

  # Data dimensions
  n = nrow(Y)
  m = ncol(Y)
  p = ncol(X)
  q = ncol(Z)
  d = ncomp

  # Set the error message
  error.message = function (mat, nr, nc)
    gettextf("Incompatible dimensions: dim(%s) != c(%d, %d).", mat, nr, nc)

  # Check if all the dimensions are compatible
  if (any(dim(values$B) != c(m, p))) stop(error.message("B", m, p), call. = FALSE)
  if (any(dim(values$A) != c(n, q))) stop(error.message("A", n, q), call. = FALSE)
  if (any(dim(values$U) != c(n, d))) stop(error.message("U", n, d), call. = FALSE)
  if (any(dim(values$V) != c(m, d))) stop(error.message("V", m, d), call. = FALSE)

  # Compute the initial vector of dispersion parameters
  eta = tcrossprod(cbind(X, values$A, values$U), cbind(values$B, Z, values$V))
  mu = family$linkinv(eta)
  var = family$variance(mu)
  values$phi = colMeans((Y - mu)^2 / var, na.rm = TRUE)

  # Return the obtained initial values
  return(values)
}

Try the sgdGMF package in your browser

Any scripts or data that you put into this service are public.

sgdGMF documentation built on April 3, 2025, 7:37 p.m.