R/mmpca.R
In mmpca: Integrative Analysis of Several Related Data Matrices

Documented in mmpca

# x list of data blocks
# k rank of approximation
# p list of view dimensions
# n number of views
# m number of data blocks

#' Multiview principal component analysis
#'
#' Analyzes several related matrices of data.
#'
#' @param x List of matrices to analyze
#' @param inds Matrix containing view indices. The matrix should have two
#'   columns and the same number of rows as the length of \code{x}. The first
#'   (second) column contains the view index of the rows (columns) of the
#'   corresponding matrix.
#' @param k Integer giving the maximum rank of the analysis, i.e. the maximum
#'   number of principal components for each view.
#' @param lambda Vector or matrix of lambda values. The length (or width if it
#'   is a matrix) depends on the number of active penalties (2, 3 or 4). If it
#'   is a matrix, try different lambda values (one try for each row). Default: a
#'   matrix where each column is the sequence \code{exp(seq(-6, 0)))}.
#' @param trace Integer selecting the amount of log messages. 0 (default): no
#'   output, 3: all output.
#' @param max_iter Maximum number of iterations
#' @param init_theta NULL, functions or numeric. NULL (default) use initial
#'   values based on ordinary SVD. If init_theta is a list of three functions
#'   (\code{CMF}, \code{matrix_to_triplets} and \code{getCMFopts} from package
#'   \code{CMF}) use the supplied functions to find initial values with
#'   collaborative matrix factorization (CMF). If init_theta is a numeric vector
#'   it is used as initial value.
#' @param cachepath Character vector with path to directory to store
#'   intermediate results. If NULL (default) intermediate results are not
#'   stored. For caching to work it is required that the random number
#'   generation seed is constant between calls to mmpca, so \code{set.seed}
#'   needs to be called before mmpca.
#' @param enable_rank_selection Boolean deciding if the second penalty that
#'   imposes a low rank model should be enabled.
#' @param enable_sparsity Boolean deciding if the third penalty that imposes
#'   sparsity in V should be enabled.
#' @param enable_variable_selection Boolean deciding if the fourth penalty that
#'   increases the tendency for sparsity structure of different V columns to be
#'   similar. Defaults to FALSE meaning this penalty is not used.
#' @param parallel Boolean deciding if computations should be run on multiple
#'   cores simultaneously.
#'
#' @return A list with components
#'   \item{initial}{initial values used in optimization}
#'   \item{cmf}{solution found with CMF (if init_theta == c(CMF,
#'     matrix_to_triplets, getCMFopts))}
#'   \item{training}{solutions for different values of lambda}
#'   \item{solution}{solution for optimal lambda value}
#'
#' @examples
#' # Create model with three views, two data matrices of low-rank 3
#' max_rank <- 3
#' v <- list(
#'   qr.Q(qr(matrix(rnorm(10 * max_rank), 10, max_rank))),
#'   qr.Q(qr(matrix(rnorm(11 * max_rank), 11, max_rank))),
#'   qr.Q(qr(matrix(rnorm(12 * max_rank), 12, max_rank)))
#' )
#' d <- matrix(
#'   c(1, 1, 1, 1, 1, 0, 1, 0, 1),
#'   nrow = max_rank, ncol = 3
#' )
#' x <- list(
#'   v[[1]] %*% diag(d[, 1] * d[, 2]) %*% t(v[[2]]),
#'   v[[1]] %*% diag(d[, 1] * d[, 3]) %*% t(v[[3]])
#' )
#' inds <- matrix(c(1, 1, 2, 3), 2, 2)
#' result <- mmpca::mmpca(
#'   x, inds, max_rank, parallel = FALSE,
#'   lambda = c(1e-3, 1e-5), enable_sparsity = FALSE,
#'   trace = 3
#' )
#' # Investigate the solution
#' result$solution$D
#'
#' @author Jonatan Kallus, \email{kallus@@chalmers.se}
#' @keywords pca models multivariate
#'
#' @export
mmpca <- function(x, inds, k, lambda = NULL, trace = 0, max_iter = 20000,
    init_theta = NULL, cachepath = NULL, enable_rank_selection = TRUE,
    enable_sparsity = TRUE, enable_variable_selection = FALSE,
    parallel = TRUE) {

  if (enable_variable_selection && !enable_sparsity) {
    stop("Variable selection can not be enabled when sparsity is not enabled.")
  }

  nparam <- 2 + enable_sparsity + enable_variable_selection

  if (is.null(lambda)) {
    lambda <- matrix(rep(exp(seq(-6, 0)), nparam), ncol = nparam)
    if (!enable_rank_selection) {
      lambda[, 2] <- 0
    }
  }

  if (is.null(dim(lambda))) {
    lambda <- matrix(lambda, 1)
  }

  validate_inds(x, inds)

  theta <- init_theta
  p <- init_view_dimensions(x, inds)
  n <- length(p)

  # create cachepath directory unless is exists
  if (!is.null(cachepath)) {
    if (file.exists(cachepath) && !dir.exists(cachepath)) {
      stop("cachepath is not a directory")
    }
    if (!dir.exists(cachepath)) {
      dir.create(cachepath)
    }
  }

  # missing values, training set and test set
  missing_masks <- lapply(x, function(x) !is.na(x))
  for (i in seq_along(x)) {
    x[[i]][is.na(x[[i]])] <- 0
  }
  train_masks <- lapply(x, function(x) {
    0.9 > matrix(stats::runif(prod(dim(x))), nrow(x), ncol(x))
  })
  test_masks <- lapply(train_masks, function(x) !x)
  train_masks <- lapply(seq_along(x), function(i) {
    1 * (train_masks[[i]] & missing_masks[[i]])
  })
  test_masks <- lapply(seq_along(x), function(i) {
    1 * (test_masks[[i]] & missing_masks[[i]])
  })
  missing_masks <- lapply(seq_along(x), function(i) 1 * missing_masks[[i]])

  # rescale data so that biggest sing val is pi^2
  # factor is used to scale found D before returning

  # NOTE: This is done after filling missing values with zeros
  max_sing_val <- max(sapply(x, function(x) svd(x)$d[1]))
  data_scale_factor <- pi^2 / max_sing_val
  x <- lapply(x, function(x) x * data_scale_factor)

  # find scaling factor for lambda
  lambda_factor <- mean(sapply(seq_along(x), function(i) {
    sqrt(sum((x[[i]] * train_masks[[i]])^2))
  }))^(3 / 2)

  # find initial values
  cmf_result <- NULL
  if (
    length(theta) == 3
    && is.function(theta[[1]])
    && is.function(theta[[2]])
    && is.function(theta[[3]])
  ) {
    # find initial values xi and D with CMF
    cmf_fun <- theta
    if (trace) msg("Finding initial values... ")
    if (trace > 1) msg("\n")
    # set NA in data for CMF for missing data and test data
    x_cmf <- lapply(seq_along(x), function(i) {
      xx <- x[[i]]
      xx[test_masks[[i]] | !missing_masks[[i]]] <- NA
      return(xx)
    })
    if (trace > 1) msg("CMF:\n")
    cmf_result <- cmf_cached(x_cmf, inds, k, trace, cachepath, cmf_fun)
    # calculate training and test error of CMF
    cmf_result$loss <- sum(sapply(seq_along(x), function(i) {
      U <- cmf_result$U[inds[i, ]]
      sum(train_masks[[i]] * (x[[i]] - U[[1]] %*% t(U[[2]]))^2)
    }))
    cmf_result$test_loss <- sum(sapply(seq_along(x), function(i) {
      U <- cmf_result$U[inds[i, ]]
      sum(test_masks[[i]] * (x[[i]] - U[[1]] %*% t(U[[2]]))^2)
    }))
    normalize <- function(M) {
      for (i in seq_len(ncol(M))) {
        M[, i] <- M[, i] / sqrt(sum(M[, i]^2))
      }
      return(M)
    }
    V <- lapply(cmf_result$U, normalize)
    # set 0 for test data when initializing singular values
    x_init <- lapply(seq_along(x), function(i) {
      xx <- x[[i]]
      xx[test_masks[[i]]] <- 0
      return(xx)
    })
    singular_values <- init_singular_values(V, x_init, inds)
    if (trace > 1) msg("Inverse Euler transformation...")
    if (parallel) {
      xi <- parallel::mclapply(V, function(v) init_inv_v_cached(v, cachepath),
        mc.preschedule=FALSE)
    } else {
      xi <- lapply(V, function(v) init_inv_v_cached(v, cachepath))
    }
    if (trace > 1) msg("done\n")
    D <- init_d(singular_values)
    theta <- ref_vectorize(xi, D)
    if (trace) {
      msg("done\nCMF training loss: ")
      msg(round(cmf_result$loss, 4))
      msg("\nCMF test loss: ")
      msg(round(cmf_result$test_loss, 4))
      msg("\nOptimizing hyper parameters...\n")
    }
  } else if (is.null(theta)) {
    V <- init_v(x, inds, k)
    singular_values <- init_singular_values(V, x, inds)
    D <- init_d(singular_values)
    if (trace > 1) msg("Inverse Euler transformation...")
    if (parallel) {
      xi <- parallel::mclapply(V, function(v) init_inv_v_cached(v, cachepath),
        mc.preschedule=FALSE)
    } else {
      xi <- lapply(V, function(v) init_inv_v_cached(v, cachepath))
    }
    if (trace > 1) msg("done\n")
    theta <- ref_vectorize(xi, D)
    #theta <- stats::runif(sum(p * k) + k * n, -pi, pi)
  }

  result <- list()
  xiD <- ref_unvectorize(theta, k, max(inds), p)
  result$initial <- list(xi = xiD$xi, D = xiD$D, theta = theta)
  result$cmf <- cmf_result

  if (dim(lambda)[1] > 1) {
    results <- list()

    L <- function(lambda) {
      if (trace > 1) {
        msg("lambda: ", lambda, "\n")
      }
      if (parallel) {
        c_init_parallel()
      }
      res <- optim_mmpca_cached(theta, x, train_masks, inds, k, p, lambda,
        lambda_factor, max_iter, trace > 2, cachepath, 1)
      # change small values to exact zeros in D
      theta <- res[[1]]
      ix <- (1 + sum(p * k)):length(theta)
      theta[ix][abs(theta[ix]) < 1e-5] <- 0
      res[[1]] <- theta
      loss <- c_objective(theta, x, test_masks, inds, k, p,
        rep(0, length(lambda)))
      if (trace > 0) {
        msg("lambda: ", lambda, " -> test loss: ", loss, "\n")
      }

      list(
        value = loss,
        extra = list(
          lambda = lambda,
          theta = theta,
          loss = loss,
          res = res
        )
      )
    }

    if (parallel) {
      c_init_parallel()
      tmp_res <- parallel::mclapply(
        seq_len(nrow(lambda)),
        function(i) L(lambda[i, ]),
        mc.preschedule = FALSE
      )
    } else {
      tmp_res <- lapply(seq_len(nrow(lambda)), function(i) L(lambda[i, ]))
    }

    for (i in seq_along(tmp_res)) {
      results[[i]] <- tmp_res[[i]]$extra
    }

    if (trace) msg("Postprocessing solutions... ")
    solutions <- list()
    losses <- rep(NA, length(results))
    for (i in seq_along(results)) {
      r <- results[[i]]
      losses[i] <- r$loss
      res <- r$res

      solutions[[i]] <- list(lambda=r$lambda, theta=r$theta,
        iterations=res[[2]], status=res[[3]],
        cv_error=r$loss, unpenalized_objective_value=res[[4]],
        stepsize=res[[5]], message=res[[6]])
    }

    # order solutions from worst to best
    result$training <- solutions[order(-losses)]
    theta <- result$training[[length(result$training)]]$theta
    lambda <- result$training[[length(result$training)]]$lambda

    if (trace) msg("done\n")
  }

  # calculate final solution using all data
  if (trace) msg("Calculating final solution using all data... ")
  if (trace > 1) msg("\n")
  res <- optim_mmpca_cached(theta, x, missing_masks, inds, k, p, lambda,
    lambda_factor, max_iter, trace > 1, cachepath,
    ifelse(parallel, parallel::detectCores(), 1))
  # change small values to exact zeros in D
  theta <- res[[1]]
  ix <- (1 + sum(p * k)):length(theta)
  theta[ix][abs(theta[ix]) < 1e-5] <- 0
  res[[1]] <- theta

  xiD <- ref_unvectorize(theta, k, n, p)

  # rescale D
  xiD$D <- xiD$D / sqrt(data_scale_factor)

  component_importances <- calculate_component_importances(
    xiD$D, inds, lapply(x, function(x) x / data_scale_factor)
  )

  # order components by total importance and calculate xhat
  ix <- order(-component_importances$total)
  V <- lapply(xiD$xi, function(xi) c_Vxi(xi)[, ix, drop = FALSE])
  # set exact zeros in V
  for (i in seq_len(n)) {
    V[[i]][abs(V[[i]]) < 1e-5] <- 0
  }
  xiD$D <- xiD$D[ix, ]
  component_importances$block <- component_importances$block[ix, ]
  component_importances$total <- component_importances$total[ix]
  xhat <- list()
  for (j in seq_along(x)) {
    row <- inds[j, 1]
    col <- inds[j, 2]
    xhat[[j]] <- (
      V[[row]] %*% diag(xiD$D[, row] * xiD$D[, col]) %*% t(V[[col]])
    )
  }

  result$solution <- list(
    V = V,
    D = xiD$D,
    xhat = xhat,
    lambda = lambda,
    R2_blockwise = component_importances$block,
    R2_total = component_importances$total,
    test_masks = test_masks,
    iterations = res[[2]],
    status = res[[3]],
    stepsize = res[[5]],
    message = res[[6]]
  )

  if (trace) msg("done\n")

  return(result)
}

optim_mmpca_cached <- function(theta, x, masks, inds, k, p, lambda,
    lambda_factor, max_iter, trace, path, nparallel) {
  hash <- digest::digest(list(x, masks, inds, k, p, lambda_factor))
  lambda_str <- paste(lambda, collapse = "_")
  if (!is.null(path)) {
    filename <- file.path(path, paste(hash, lambda_str, sep = ":"))
    if (file.exists(filename)) {
      return(readRDS(filename))
    }
  }
  # make sparsity factor have same magnitude
  factor <- rep(1, length(lambda))
  if (length(lambda) > 2) {
    factor[3] <- 1 / length(p)
    if (length(lambda) > 3) {
      factor[4] <- 1 / length(p)
    }
  }
  res <- c_optim_mmpca(
    theta,
    x,
    masks,
    inds,
    k,
    p,
    lambda * lambda_factor * factor,
    max_iter,
    trace,
    nparallel
  )

  if (!is.null(path)) {
    saveRDS(res, filename)
  }
  return(res)
}

cmf_cached <- function(data, views, K, trace, path, cmf_fun) {
  hash <- digest::digest(list(data, views, K))
  if (!is.null(path)) {
    filename <- file.path(path, paste(hash, "cmf", sep = ":"))
    if (file.exists(filename)) {
      return(readRDS(filename))
    }
  }
  # make sparsity factor have same magnitude
  res <- cmf(data, views, K, trace, cmf_fun)
  if (!is.null(path)) {
    saveRDS(res, filename)
  }
  return(res)
}

init_inv_v_cached <- function(v, path) {
  hash <- digest::digest(v)
  if (!is.null(path)) {
    filename <- file.path(path, paste(hash, "init_inv_v", sep = ":"))
    if (file.exists(filename)) {
      return(readRDS(filename))
    }
  }
  # make sparsity factor have same magnitude
  res <- init_inv_v(v)
  if (!is.null(path)) {
    saveRDS(res, filename)
  }
  return(res)
}

cmf <- function(data, views, K, trace, cmf_fun) {
  D <- rep(NA, max(views))
  for (i in seq_len(nrow(views))) {
    D[views[i, 1]] <- nrow(data[[i]])
    D[views[i, 2]] <- ncol(data[[i]])
  }
  data <- lapply(data, cmf_fun[[2]])
  opts <- cmf_fun[[3]]()
  opts$verbose <- trace - 1
  cmf_fun[[1]](data, views, K, rep("gaussian", length(data)), D, opts = opts)
}

mmpca_lambda1 <- function(x, inds, k, lambda, nparallel, init = FALSE,
    max_iter = 20000, trace = FALSE) {
  result <- list()
  p <- init_view_dimensions(x, inds)

  # handle missing values in x
  masks <- lapply(x, function(x) 1 * !is.na(x))
  for (i in seq_along(x)) {
    x[[i]][is.na(x[[i]])] <- 0
  }

  if (init) { # find initial values xi and D
    V <- init_v(x, inds, k)
    xi <- lapply(V, init_inv_v)
    singular_values <- init_singular_values(V, x, inds)
    D <- init_d(singular_values)
    theta <- ref_vectorize(xi, D)
  } else { # not using the initialization!
    theta <- seq(0, 1, length=sum(p * k) + k * length(p))
  }
  xiD <- ref_unvectorize(theta, k, max(inds), p)
  solutions <- list()
  result$initial <- list(xi=xiD$xi, D=xiD$D, theta=theta)
  for (i in seq_along(lambda)) {
    if (trace) {
      msg("lambda: ", lambda[i], "\n")
    }

    res <- c_optim_mmpca(
      theta, x, masks, inds, k, p, lambda[i], max_iter, trace, nparallel
    )
    #res <- ref_optim_mmpca(theta, x, inds, k, p, lambda[i], trace)
    theta <- res[[1]]
    xiD <- ref_unvectorize(theta, k, length(p), p)
    component_importances <- calculate_component_importances(xiD$D, inds, x)

    if (i == 1) { # reorder components according to importances
      ix <- order(component_importances$total, decreasing=TRUE)
      for (j in seq_along(xiD$xi)) {
        V <- c_Vxi(xiD$xi[[j]])
        xiD$xi[[j]] <- init_inv_v(V[, ix])
      }
      xiD$D <- xiD$D[ix, ]
      theta <- ref_vectorize(xiD$xi, xiD$D)
      component_importances$block <- component_importances$block[ix, ]
      component_importances$total <- component_importances$total[ix]
    }

    if (trace) {
      msg("iter: ", res[[2]], " status: ", res[[3]])
      msg(" upval: ", res[[4]], " step: ", res[[5]])
      msg(" msg: ", res[[6]], "\n")
    }

    V <- lapply(xiD$xi, function(xi) c_Vxi(xi))
    xhat <- list()
    for (j in seq_along(x)) {
      row <- inds[j, 1]
      col <- inds[j, 2]
      xhat[[j]] <- V[[row]] %*% diag(xiD$D[, row] * xiD$D[, col]) %*%
        t(V[[col]])
    }

    solutions[[i]] <- list(xi=xiD$xi, D=xiD$D, theta=theta, V=V,
      xhat=xhat, iterations=res[[2]], status=res[[3]],
      unpenalized_value=res[[4]], stepsize=res[[5]], message=res[[6]],
      component_importances_blockwise=component_importances$block,
      component_importances_total=component_importances$total)
  }
  result$solutions <- solutions
  result$lambda <- lambda
  return(result)
}

# calculate R^2, the proportion of variance explained
calculate_component_importances <- function(D, inds, x) {
  res <- matrix(NA, nrow(D), nrow(inds))
  normalizer <- 0
  for (i in 1:nrow(inds)) {
    res[, i] <- (D[, inds[i, 1]] * D[, inds[i, 2]])^2
    normalizer <- normalizer + sum(x[[i]]^2)
  }
  list(block=res/normalizer, total=rowSums(res)/normalizer)
}

validate_inds <- function(x, inds) {
  for (i in unique(c(inds))) {
    p <- NA
    for (j in which(inds[, 1] == i)) {
      if (!is.na(p)) {
        if (p != nrow(x[[j]])) {
          stop("Some view has inconsistent size")
        }
      }
      p <- nrow(x[[j]])
    }
    for (j in which(inds[, 2] == i)) {
      if (!is.na(p)) {
        if (p != ncol(x[[j]])) {
          stop("Some view has inconsistent size")
        }
      }
      p <- ncol(x[[j]])
    }
  }
}

msg <- function(...) {
  message(..., appendLF = FALSE)
}