# R/crossval.R

#' @title Model selection via cross-validation for generalized matrix factorization models
#'
#' @description
#' K-fold cross-validation for generalized matrix factorization (GMF) models.
#'
#' @param Y matrix of responses (\eqn{n \times m})
#' @param X matrix of row fixed effects (\eqn{n \times p})
#' @param Z matrix of column fixed effects (\eqn{q \times m})
#' @param family a \code{glm} family (see \code{\link{family}} for more details)
#' @param ncomps ranks of the latent matrix factorization used in cross-validation (default 1 to 10)
#' @param weights an optional matrix of weights (\eqn{n \times m})
#' @param offset an optional matrix of offset values (\eqn{n \times m}) specifying a known component to be included in the linear predictor.
#' @param method estimation method to minimize the negative penalized log-likelihood
#' @param sampling sub-sampling strategy to use if \code{method = "sgd"}
#' @param penalty list of penalty parameters (see \code{\link{set.penalty}} for more details)
#' @param control.init list of control parameters for the initialization (see \code{\link{set.control.init}} for more details)
#' @param control.alg list of control parameters for the optimization (see \code{\link{set.control.alg}} for more details)
#' @param control.cv list of control parameters for the cross-validation (see \code{\link{set.control.cv}} for more details)
#'
#' @return
#' If \code{refit = FALSE} (see \code{\link{set.control.cv}}), the function returns a list containing \code{control.init},
#' \code{control.alg}, \code{control.cv} and \code{summary.cv}. The latter is a matrix
#' collecting the cross-validation results for each combination of fold and latent
#' dimension.
#'
#' If \code{refit = TRUE} (see \code{\link{set.control.cv}}), the function returns an object of class \code{sgdgmf},
#' obtained by refitting the model on the whole data matrix using the latent dimension
#' selected via cross-validation. The returned object also contains the \code{summary.cv}
#' information along with the other standard output of the \code{\link{sgdgmf.fit}} function.
#'
#' @details
#' Cross-validation is performed by minimizing the estimated out-of-sample error, which
#' can be measured in terms of averaged deviance, AIC or BIC calculated on fold-specific
#' test sets. Within each fold, the test set is defined as a fixed proportion of entries
#' in the response matrix which are held out from the estimation process.
#' To this end, the test set entries are hidden by \code{NA} values when training the
#' model. Then, the predicted, i.e. imputed, values are used to compute the fold-specific
#' out-of-sample error.
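#'
#' As an illustration, the hold-out mechanism can be sketched as follows. This is
#' a simplified stand-in for the internal partitioning routine, where \code{prop}
#' plays the role of the \code{proportion} entry of \code{control.cv}:
#' \preformatted{
#'   mask = matrix(runif(n * m) < prop, n, m) # entries held out for testing
#'   train = Y; train[mask] = NA   # hidden from the estimation process
#'   test = Y; test[!mask] = NA    # used to compute the out-of-sample error
#' }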
#'
#' @examples
#' # Load the sgdGMF package
#' library(sgdGMF)
#'
#' # Set the data dimensions
#' n = 100; m = 20; d = 5
#'
#' # Generate data using Poisson, Binomial and Gamma models
#' data_pois = sim.gmf.data(n = n, m = m, ncomp = d, family = poisson())
#' data_bin = sim.gmf.data(n = n, m = m, ncomp = d, family = binomial())
#' data_gam = sim.gmf.data(n = n, m = m, ncomp = d, family = Gamma(link = "log"), dispersion = 0.25)
#'
#' # Set RUN = TRUE to run the example; it may take some time. To speed up
#' # the computation, the cross-validation can be run in parallel by passing
#' # control.cv = list(parallel = TRUE, nthreads = <number_of_workers>)
#' # as an argument to sgdgmf.cv()
#' RUN = FALSE
#' if (RUN) {
#'   # Fit the GMF models, selecting the rank via cross-validation for the
#'   # Poisson data and using a fixed rank of 3 for the Binomial and Gamma data
#'   gmf_pois = sgdgmf.cv(data_pois$Y, ncomps = 1:10, family = poisson())
#'   gmf_bin = sgdgmf.cv(data_bin$Y, ncomps = 3, family = binomial())
#'   gmf_gam = sgdgmf.cv(data_gam$Y, ncomps = 3, family = Gamma(link = "log"))
#'
#'   # Get the fitted values on the response scale
#'   mu_hat_pois = fitted(gmf_pois, type = "response")
#'   mu_hat_bin = fitted(gmf_bin, type = "response")
#'   mu_hat_gam = fitted(gmf_gam, type = "response")
#'
#'   # Compare the results
#'   oldpar = par(no.readonly = TRUE)
#'   par(mfrow = c(1,3), mar = c(1,1,3,1))
#'   image(data_pois$Y, axes = FALSE, main = expression(Y[Pois]))
#'   image(data_pois$mu, axes = FALSE, main = expression(mu[Pois]))
#'   image(mu_hat_pois, axes = FALSE, main = expression(hat(mu)[Pois]))
#'   image(data_bin$Y, axes = FALSE, main = expression(Y[Bin]))
#'   image(data_bin$mu, axes = FALSE, main = expression(mu[Bin]))
#'   image(mu_hat_bin, axes = FALSE, main = expression(hat(mu)[Bin]))
#'   image(data_gam$Y, axes = FALSE, main = expression(Y[Gam]))
#'   image(data_gam$mu, axes = FALSE, main = expression(mu[Gam]))
#'   image(mu_hat_gam, axes = FALSE, main = expression(hat(mu)[Gam]))
#'   par(oldpar)
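#'
#'   # Inspect the cross-validation summary stored in the fitted object
#'   print(head(gmf_pois$summary.cv))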
#' }
#'
#' @export sgdgmf.cv
sgdgmf.cv = function (
    Y,
    X = NULL,
    Z = NULL,
    family = gaussian(),
    ncomps = seq(from = 1, to = 10, by = 1),
    weights = NULL,
    offset = NULL,
    method = c("airwls", "newton", "sgd"),
    sampling = c("block", "coord", "rnd-block"),
    penalty = list(),
    control.init = list(),
    control.alg = list(),
    control.cv = list()
) {

  method = match.arg(method)

  # Sanity checks for the control options
  control.cv = do.call("set.control.cv", control.cv)
  control.init = do.call("set.control.init", control.init)
  control.alg = set.control.alg(method, sampling, control.alg)

  # Data dimensions
  n = nrow(Y)
  m = ncol(Y)
  p = ifelse(is.null(X), 1, ncol(X)) # if X = NULL, sgdgmf.fit sets X = rep(1, n)
  q = ifelse(is.null(Z), 1, ncol(Z)) # if Z = NULL, sgdgmf.fit sets Z = rep(1, m)

  # Maximum rank
  ncomps = floor(ncomps)
  maxcomp = max(ncomps)

  weights = set.mat.weights(weights, n, m)
  offset = set.mat.offset(offset, n, m)

  criterion = control.cv$criterion
  refit = control.cv$refit
  nfolds = control.cv$nfolds
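  # TRUE when a single initialization is computed once and shared across all folds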
  common = isTRUE(control.cv$init == "common")
  parallel = control.cv$parallel
  nthreads = control.cv$nthreads

  # Check for control conflicts
  if (method %in% c("airwls", "newton")) {
    if (control.cv$parallel && (control.init$parallel || control.alg$parallel)) {
      stop("Nested parallelization is not allowed.")
    }
  }

  if (control.alg$verbose) {
    control.cv$verbose = TRUE
  }

  if (control.cv$parallel && (control.init$verbose || control.alg$verbose || control.cv$verbose)) {
    control.init$verbose = FALSE
    control.alg$verbose = FALSE
    control.cv$verbose = FALSE
    warning("With 'parallel=TRUE' no output is printed and 'verbose' has been disabled.",
            call. = FALSE, immediate. = TRUE, domain = NULL)
  }

  # Common initialization
  if (common) {
    time.init = proc.time()
    control.init$values = sgdgmf.init(
      Y = Y, X = X, Z = Z, ncomp = maxcomp,
      family = family, method = control.init$method,
      type = control.init$type, niter = control.init$niter,
      values = control.init$values, verbose = control.init$verbose,
      parallel = control.init$parallel, nthreads = control.init$nthreads)
    control.init$method = "values"
    time.init = as.numeric(proc.time() - time.init)[3]
  }

  # Create the fold-specific train-test partitions, each holding out a
  # fraction f of the entries of Y
  data = list()
  f = control.cv$proportion
  for (fold in 1:nfolds) {
    data[[fold]] = partition(Y, f)
  }

  # Cartesian product of groups and folds
  groups = expand.grid(fold = 1:nfolds, ncomp = ncomps)
  niter = nrow(groups)
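  # Declare the foreach iterator locally to avoid an R CMD check NOTE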
  iter = NULL

  # Register the clusters
  if (parallel) {
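    # Use at most 'nthreads' workers, always leaving one core free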
    ncores = parallel::detectCores() - 1
    ncores = max(1, min(nthreads, ncores))
    clust = parallel::makeCluster(ncores)
    doParallel::registerDoParallel(clust)
  }

  if (!parallel) {
    # Sequential estimation loop
    cv = foreach (iter = 1:niter, .combine = "rbind") %do% {

      # Set the number of components and the group
      ncomp = groups$ncomp[iter]
      fold = groups$fold[iter]

      # Set the train and test data
      train = data[[fold]]$train
      test = data[[fold]]$test

      # Run the estimation and compute the GoF statistics
      sgdgmf.cv.step(
        train = train, test = test, X = X, Z = Z, family = family,
        ncomp = ncomp, maxcomp = maxcomp, fold = fold, nfolds = nfolds,
        weights = weights, offset = offset, method = method,
        sampling = sampling, penalty = penalty, control.init = control.init,
        control.alg = control.alg, control.cv = control.cv)
    }
  } else {
    # Parallel estimation loop
    cv = foreach (iter = 1:niter, .packages = c("sgdGMF"), .export = c("sgdgmf.cv.step"), .combine = "rbind") %dopar% {

      # Set the number of components and the group
      ncomp = groups$ncomp[iter]
      fold = groups$fold[iter]

      # Set the train and test data
      train = data[[fold]]$train
      test = data[[fold]]$test

      # Run the estimation and compute the GoF statistics
      sgdgmf.cv.step(
        train = train, test = test, X = X, Z = Z, family = family,
        ncomp = ncomp, maxcomp = maxcomp, fold = fold, nfolds = nfolds,
        weights = weights, offset = offset, method = method,
        sampling = sampling, penalty = penalty, control.init = control.init,
        control.alg = control.alg, control.cv = control.cv)
    }
  }

  # Close the connection to the clusters
  if (parallel) {
    parallel::stopCluster(clust)
  }

  # Rank selection
  if (nfolds > 1) {
    # Average the goodness-of-fit statistics over the folds
    avgcv = data.frame()
    for (ncomp in ncomps) {
      idx = (cv$ncomp == ncomp)
      avgstat = data.frame(
        ncomp = ncomp,
        df = mean(cv$df[idx]),
        dev = mean(cv$dev[idx], na.rm = TRUE),
        mae = mean(cv$mae[idx], na.rm = TRUE),
        mse = mean(cv$mse[idx], na.rm = TRUE),
        aic = mean(cv$aic[idx], na.rm = TRUE),
        bic = mean(cv$bic[idx], na.rm = TRUE)
      )
      avgcv = rbind(avgcv, avgstat)
    }
    ncomp = switch(criterion,
      "dev" = avgcv$ncomp[which.min(avgcv$dev)],
      "aic" = avgcv$ncomp[which.min(avgcv$aic)],
      "bic" = avgcv$ncomp[which.min(avgcv$bic)],
      "mae" = avgcv$ncomp[which.min(avgcv$mae)],
      "mse" = avgcv$ncomp[which.min(avgcv$mse)])
  } else {
    ncomp = switch(criterion,
      "dev" = cv$ncomp[which.min(cv$dev)],
      "aic" = cv$ncomp[which.min(cv$aic)],
      "bic" = cv$ncomp[which.min(cv$bic)],
      "mae" = cv$ncomp[which.min(cv$mae)],
      "mse" = cv$ncomp[which.min(cv$mse)])
  }

  if (refit) {
    # Re-fit the model on the whole data using the selected rank
    if (control.cv$verbose) cat("Final refit with rank =", ncomp, "\n")

    # Select the correct number of columns of the initial U and V matrices
    if (control.init$method == "values") {
      control.init$values$U = control.init$values$U[, 1:ncomp, drop = FALSE]
      control.init$values$V = control.init$values$V[, 1:ncomp, drop = FALSE]
    }

    fit = sgdgmf.fit(
      Y = Y, X = X, Z = Z, family = family, ncomp = ncomp,
      weights = weights, offset = offset, method = method,
      sampling = sampling, penalty = penalty,
      control.init = control.init, control.alg = control.alg)

    if (common) fit$exe.time[1] = time.init
  } else {
    # Do not re-fit the model, just return the control parameters
    fit = list()
    fit$control.init = control.init
    fit$control.alg = control.alg
  }

  # Store the cross-validation parameters and statistics in the output
  fit$control.cv = control.cv
  fit$summary.cv = cv

  # Return the fitted model
  return (fit)
}
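
# A minimal sketch of how one could summarize the returned 'summary.cv' across
# folds (assuming an object 'gmf' returned by sgdgmf.cv() with refit = TRUE):
#   aggregate(cbind(dev, aic, bic) ~ ncomp, data = gmf$summary.cv, FUN = mean)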

#' @title Single step of cross-validation for generalized matrix factorization models
#'
#' @description
#' Internal function running a single step of cross-validation for generalized
#' matrix factorization (GMF) models and calculating some goodness-of-fit measures
#' on the train and test sets.
#'
#' @param train train-set matrix of responses (\eqn{n \times m})
#' @param test test-set matrix of responses (\eqn{n \times m})
#' @param X matrix of row fixed effects (\eqn{n \times p})
#' @param Z matrix of column fixed effects (\eqn{q \times m})
#' @param family a \code{glm} family (see \code{\link{family}} for more details)
#' @param ncomp rank of the latent matrix factorization used in the current cross-validation step
#' @param maxcomp maximum rank allowed in the cross-validation exploration
#' @param fold integer number identifying the current fold
#' @param nfolds maximum number of folds in the cross-validation
#' @param weights an optional matrix of weights (\eqn{n \times m})
#' @param offset an optional matrix of offset values (\eqn{n \times m}) specifying a known component to be included in the linear predictor.
#' @param method estimation method to minimize the negative penalized log-likelihood
#' @param sampling sub-sampling strategy to use if \code{method = "sgd"}
#' @param penalty list of penalty parameters (see \code{\link{set.penalty}} for more details)
#' @param control.init list of control parameters for the initialization (see \code{\link{set.control.init}} for more details)
#' @param control.alg list of control parameters for the optimization (see \code{\link{set.control.alg}} for more details)
#' @param control.cv list of control parameters for the cross-validation (see \code{\link{set.control.cv}} for more details)
#'
#' @return
#' Returns a \code{data.frame} containing the current number of latent factors
#' in the model (\code{ncomp}), the fold identifier (\code{fold}), the degrees of
#' freedom, i.e. the number of parameters, of the model (\code{df}), the train and
#' test sample sizes (\code{train}, \code{test}), the AIC and BIC computed on the
#' train set (\code{aic}, \code{bic}), and the deviance, mean absolute error and
#' mean squared error computed on the test set (\code{dev}, \code{mae}, \code{mse}),
#' each scaled by the corresponding sample size.
#'
#' @keywords internal
sgdgmf.cv.step = function (
    train, test, X, Z, family, ncomp, maxcomp, fold,
    nfolds, weights, offset, method, sampling, penalty,
    control.init, control.alg, control.cv
) {
  # Set the model dimensions
  n = nrow(train)
  m = ncol(train)
  p = ifelse(is.null(X), 1, ncol(X))
  q = ifelse(is.null(Z), 1, ncol(Z))

  # Fraction of the data to use as a test set
  f = control.cv$proportion

  # Effective number of parameters: p coefficients for each of the m columns,
  # q for each of the n rows, plus n scores and m loadings per latent factor
  df = m * p + n * q + (n + m) * ncomp

  # Print the estimation status
  if (control.cv$verbose)
    cat(" Rank:", paste(ncomp, maxcomp, sep = "/"),
        " Fold:", paste(fold, nfolds, sep = "/"), "\n")

  # Select the correct number of columns of the initial U and V matrices
  if (control.init$method == "values") {
    control.init$values$U = control.init$values$U[, 1:ncomp, drop = FALSE]
    control.init$values$V = control.init$values$V[, 1:ncomp, drop = FALSE]
  }

  # Estimated mean matrix
  mu = sgdgmf.fit(
    Y = train, X = X, Z = Z, family = family, ncomp = ncomp, weights = weights,
    offset = offset, method = method, sampling = sampling, penalty = penalty,
    control.init = control.init, control.alg = control.alg)$mu

  # Train and test sample sizes
  n.train = n * m * (1 - f)
  n.test = n * m * f

  # Train and test goodness-of-fit measures
  dev.train = sum(weights * family$dev.resids(train, mu, 1), na.rm = TRUE)
  dev.test = sum(weights * family$dev.resids(test, mu, 1), na.rm = TRUE)
  aic.train = dev.train + 2 * df
  bic.train = dev.train + df * log(n.train)
  mae.train = sum(abs(train - mu), na.rm = TRUE)
  mae.test = sum(abs(test - mu), na.rm = TRUE)
  mse.train = sum((train - mu)^2, na.rm = TRUE)
  mse.test = sum((test - mu)^2, na.rm = TRUE)

  # Return a data frame with all the GoF statistics: AIC and BIC are computed
  # on the train set, while deviance, MAE and MSE are computed on the test set
  data.frame(
    ncomp = ncomp,
    fold = fold,
    df = df,
    train = n.train,
    test = n.test,
    aic = aic.train / n.train,
    bic = bic.train / n.train,
    dev = dev.test / n.test,
    mae = mae.test / n.test,
    mse = mse.test / n.test
  )
}
