R/reshape.R

Defines functions match_matrix reshape_samples reshape_all_samples melt_reshaped_samples

Documented in match_matrix melt_reshaped_samples reshape_all_samples reshape_samples

#! /usr/bin/env Rscript

## File description -------------------------------------------------------------
## Reshape raw simulated and model output data to a form that is more amenable
## to plotting.
##
## author: kriss1@stanford.edu

#' Identify a permutation that aligns rows of two matrices
#'
#' One way to get confidence intervals for mixtures on the simplex is
#' to first specify the cluster labels for each component, and then
#' study each of these histograms on their own. This is different
#' from, say, smoothing the mixture distribution and identifying
#' modes.
#'
#' The approach taken here is to find the two rows with maximal
#' correlation and put that in the required permutation. Then, remove
#' those rows and repeat.
#'
#' @param X [numeric matrix] A matrix whose rows we want to align with
#'   X.
#' @param Z [numeric matrix] A matrix whose rows we want to align with
#'   Z.
#' @return pi_result [vector] A permutation such that X[pi_result, ] = Z
#' (ideally).
#' @examples
#' X <- matrix(rnorm(100, mean = 4) ^ 2, 20, 5)
#' pi <- sample(1:20)
#' Z <- X[pi, ] + matrix(rnorm(100), 20, 5)
#' pi_hat <- match_matrix(X, Z)
#' cbind(pi_hat, pi)
#' @export
match_matrix <- function(X, Z) {
  n <- nrow(X)
  X_tilde <- X
  Z_tilde <- Z

  rownames(X_tilde) <- seq_len(n)
  rownames(Z_tilde) <- seq_len(n)
  pi_result <- rep(0, n)

  for (i in seq_len(n)) {
    # get maximal correlation in remainding rows
    rho <- cor(t(X_tilde), t(Z_tilde), method = "spearman")
    max_ix0 <- which(rho == max(rho), arr.ind = TRUE)

    max_ix <- c(
      as.integer(rownames(X_tilde)[max_ix0[1]]),
      as.integer(rownames(Z_tilde)[max_ix0[2]])
    )

    # input to resulting permutation, and update rows
    pi_result[max_ix[2]] <- max_ix[1]
    X_tilde <- X_tilde[-max_ix0[1],, drop = F]
    Z_tilde <- Z_tilde[-max_ix0[2],, drop = F]
  }

  pi_result
}

#' Reshape fitted scores for easy plotting
#'
#' Both thetas and betas in NMF are reshaped using very similar code, so we
#' might as well combine it explicitly in one function.
#'
#' @param samples [array] The samples generated by STAN; e.g. obtained by
#'   fit${param}. Two dimensions are for the actual matrix, the third are from
#'   sampling iterations.
#' @param truth [matrix] The truth parameter values, known because we are
#'   working from a simulation.
#' @param dims [character vector] The names to use for the rows and columns of
#'   the parameter matrix.
#' @return reshaped [data.table] The melted scores, with truth along with samples.
#' @importFrom magrittr %>%
#' @importFrom data.table setDT dcast.data.table
#' @importFrom dplyr left_join
#' @importFrom reshape2 melt
#' @export
reshape_samples <- function(samples, truth, dims) {
  ## align latent factors (label switchign problem)
  pi_align <- match_matrix(
    t(sqrt(truth)),
    t(sqrt(apply(samples, c(2, 3), mean)))
  )
  truth <- truth[, pi_align]

  ## now combine into one df
  reshaped <- samples %>%
    melt(varnames = c("iteration", dims)) %>%
    left_join(
      melt(
        truth,
        varnames = dims,
        value.name = "truth"
      )
    )

  reshaped[, dims[1]] <- factor(
    reshaped[, dims[1]],
    order(truth[, 1])
  )

  reshaped %>%
    setDT() %>%
    dcast.data.table(
      sprintf("%s + iteration ~ %s", dims[1], dims[2]),
      value.var = c("value", "truth")
    )
}

#' Wrapper for reshape_samples, applying too many fits
#'
#' We might want to extract the true and fitted values across several nmf
#' experiments. Each experiment individually would require calling
#' reshape_samples.
#'
#' @param fits [list of paths] A list of paths to the fitted STAN samples. They
#'   will be loaded and the associated samples will be extracted.
#' @param config_path [character] The path to the configuration JSON containing
#'   all the experiment information for these fits. It is assumed that the names
#'   in fits include the (necessarily numerical) IDs specified in this JSON
#'   object.
#' @param param [character] The name of the parameter to extract data for.
#' @param dims [character vector] The names to use for the rows and columns of
#'   the parameter matrix.
#' @param sim_seed [integer] The seed used in simulating the parameter of
#'   interest. This is necessary because we don't read the true values anywhere,
#'   we regenerate the parameter values.
#' @return reshaped [data.table] The same kind of object as the output of
#'   reshape_samples(), except over experiments with multiple configurations.
#' @importFrom data.table rbindlist
#' @importFrom jsonlite fromJSON
#' @importFrom stringr str_extract
#' @export
reshape_all_samples <- function(fits,
                                config_path,
                                param,
                                dims,
                                sim_seed = 01112017) {
  expers <- fromJSON(config_path, simplifyVector = TRUE, simplifyDataFrame = FALSE)

  fit_ids <- str_extract(basename(fits), "[^\\.]+")
  exper_ids <- sapply(expers, function(x) { x$id })
  samples <- list()
  for (i in seq_along(fits)) {
    set.seed(sim_seed)
    cur_exper <- expers[[which(exper_ids == fit_ids[[i]])]]
    cur_data <- nmf_sim(cur_exper$sim_opts)

    ## retrieve fitted samples
    samples[[i]] <- reshape_samples(
      get(load(fits[[i]]))[[param]],
      cur_data[[param]],
      dims
    )

    ## join in simulation parameters
    cur_exper$sim_opts$a <- cur_exper$sim_opts$prior_params[1]
    cur_exper$sim_opts$b <- cur_exper$sim_opts$prior_params[2]
    cur_exper$sim_opts$prior_params <- NULL
    cur_config <- data.frame(c(cur_exper$sim_opts, cur_exper$model_opts)) %>%
      rename(K_fit = K.1, zero_inf_prob_fit = zero_inf_prob.1)
    samples[[i]] <- cbind(samples[[i]], cur_config)
  }

  rbindlist(samples)
}

#' Melt reshaped samples
#'
#' While for the scatter / contours figures, it's useful to have the dimensions
#' as separate columns, we'll also want to the melted data when computing
#' explicit errors. This takes the output of reshaped_... and melts it so that
#' it's appropriate for histogramming the errors.
#'
#' @param samples [data.frame] The wide samples data, usually the output of
#'   reshape_all_samples.
#' @return melted_samples [data.frame] The same data as samples, but with
#'   different factor dimensions all stacked.
#' @importFrom magrittr %>%
#' @importFrom reshape2 melt
#' @export
melt_reshaped_samples <- function(samples) {
  melted_samples <- samples %>%
    melt(
      variable.name = "dimension",
      value.name = "estimate",
      measure.vars = c("value_1", "value_2")
    ) %>%
    melt(
      variable.name = "truth_dimension",
      measure.vars = c("truth_1", "truth_2"),
      value.name = "truth"
    )

  melted_samples$truth_dimension <- NULL
  melted_samples$dimension <- gsub("value_", "k=", melted_samples$dimension)
  melted_samples
}
krisrs1128/nmfSim documentation built on May 20, 2019, 1:30 p.m.