#! /usr/bin/env Rscript
## File description -------------------------------------------------------------
## Reshape raw simulated and model output data to a form that is more amenable
## to plotting.
##
## author: kriss1@stanford.edu
#' Identify a permutation that aligns rows of two matrices
#'
#' One way to get confidence intervals for mixtures on the simplex is
#' to first specify the cluster labels for each component, and then
#' study each of these histograms on their own. This is different
#' from, say, smoothing the mixture distribution and identifying
#' modes.
#'
#' The approach taken here is to find the two rows with maximal
#' correlation and put that in the required permutation. Then, remove
#' those rows and repeat.
#'
#' @param X [numeric matrix] A matrix whose rows we want to align with
#' X.
#' @param Z [numeric matrix] A matrix whose rows we want to align with
#' Z.
#' @return pi_result [vector] A permutation such that X[pi_result, ] = Z
#' (ideally).
#' @examples
#' X <- matrix(rnorm(100, mean = 4) ^ 2, 20, 5)
#' pi <- sample(1:20)
#' Z <- X[pi, ] + matrix(rnorm(100), 20, 5)
#' pi_hat <- match_matrix(X, Z)
#' cbind(pi_hat, pi)
#' @export
match_matrix <- function(X, Z) {
n <- nrow(X)
X_tilde <- X
Z_tilde <- Z
rownames(X_tilde) <- seq_len(n)
rownames(Z_tilde) <- seq_len(n)
pi_result <- rep(0, n)
for (i in seq_len(n)) {
# get maximal correlation in remainding rows
rho <- cor(t(X_tilde), t(Z_tilde), method = "spearman")
max_ix0 <- which(rho == max(rho), arr.ind = TRUE)
max_ix <- c(
as.integer(rownames(X_tilde)[max_ix0[1]]),
as.integer(rownames(Z_tilde)[max_ix0[2]])
)
# input to resulting permutation, and update rows
pi_result[max_ix[2]] <- max_ix[1]
X_tilde <- X_tilde[-max_ix0[1],, drop = F]
Z_tilde <- Z_tilde[-max_ix0[2],, drop = F]
}
pi_result
}
#' Reshape fitted scores for easy plotting
#'
#' Both thetas and betas in NMF are reshaped using very similar code, so we
#' might as well combine it explicitly in one function.
#'
#' @param samples [array] The samples generated by STAN; e.g. obtained by
#' fit${param}. Two dimensions are for the actual matrix, the third are from
#' sampling iterations.
#' @param truth [matrix] The truth parameter values, known because we are
#' working from a simulation.
#' @param dims [character vector] The names to use for the rows and columns of
#' the parameter matrix.
#' @return reshaped [data.table] The melted scores, with truth along with samples.
#' @importFrom magrittr %>%
#' @importFrom data.table setDT dcast.data.table
#' @importFrom dplyr left_join
#' @importFrom reshape2 melt
#' @export
reshape_samples <- function(samples, truth, dims) {
## align latent factors (label switchign problem)
pi_align <- match_matrix(
t(sqrt(truth)),
t(sqrt(apply(samples, c(2, 3), mean)))
)
truth <- truth[, pi_align]
## now combine into one df
reshaped <- samples %>%
melt(varnames = c("iteration", dims)) %>%
left_join(
melt(
truth,
varnames = dims,
value.name = "truth"
)
)
reshaped[, dims[1]] <- factor(
reshaped[, dims[1]],
order(truth[, 1])
)
reshaped %>%
setDT() %>%
dcast.data.table(
sprintf("%s + iteration ~ %s", dims[1], dims[2]),
value.var = c("value", "truth")
)
}
#' Wrapper for reshape_samples, applying too many fits
#'
#' We might want to extract the true and fitted values across several nmf
#' experiments. Each experiment individually would require calling
#' reshape_samples.
#'
#' @param fits [list of paths] A list of paths to the fitted STAN samples. They
#' will be loaded and the associated samples will be extracted.
#' @param config_path [character] The path to the configuration JSON containing
#' all the experiment information for these fits. It is assumed that the names
#' in fits include the (necessarily numerical) IDs specified in this JSON
#' object.
#' @param param [character] The name of the parameter to extract data for.
#' @param dims [character vector] The names to use for the rows and columns of
#' the parameter matrix.
#' @param sim_seed [integer] The seed used in simulating the parameter of
#' interest. This is necessary because we don't read the true values anywhere,
#' we regenerate the parameter values.
#' @return reshaped [data.table] The same kind of object as the output of
#' reshape_samples(), except over experiments with multiple configurations.
#' @importFrom data.table rbindlist
#' @importFrom jsonlite fromJSON
#' @importFrom stringr str_extract
#' @export
reshape_all_samples <- function(fits,
config_path,
param,
dims,
sim_seed = 01112017) {
expers <- fromJSON(config_path, simplifyVector = TRUE, simplifyDataFrame = FALSE)
fit_ids <- str_extract(basename(fits), "[^\\.]+")
exper_ids <- sapply(expers, function(x) { x$id })
samples <- list()
for (i in seq_along(fits)) {
set.seed(sim_seed)
cur_exper <- expers[[which(exper_ids == fit_ids[[i]])]]
cur_data <- nmf_sim(cur_exper$sim_opts)
## retrieve fitted samples
samples[[i]] <- reshape_samples(
get(load(fits[[i]]))[[param]],
cur_data[[param]],
dims
)
## join in simulation parameters
cur_exper$sim_opts$a <- cur_exper$sim_opts$prior_params[1]
cur_exper$sim_opts$b <- cur_exper$sim_opts$prior_params[2]
cur_exper$sim_opts$prior_params <- NULL
cur_config <- data.frame(c(cur_exper$sim_opts, cur_exper$model_opts)) %>%
rename(K_fit = K.1, zero_inf_prob_fit = zero_inf_prob.1)
samples[[i]] <- cbind(samples[[i]], cur_config)
}
rbindlist(samples)
}
#' Melt reshaped samples
#'
#' While for the scatter / contours figures, it's useful to have the dimensions
#' as separate columns, we'll also want to the melted data when computing
#' explicit errors. This takes the output of reshaped_... and melts it so that
#' it's appropriate for histogramming the errors.
#'
#' @param samples [data.frame] The wide samples data, usually the output of
#' reshape_all_samples.
#' @return melted_samples [data.frame] The same data as samples, but with
#' different factor dimensions all stacked.
#' @importFrom magrittr %>%
#' @importFrom reshape2 melt
#' @export
melt_reshaped_samples <- function(samples) {
melted_samples <- samples %>%
melt(
variable.name = "dimension",
value.name = "estimate",
measure.vars = c("value_1", "value_2")
) %>%
melt(
variable.name = "truth_dimension",
measure.vars = c("truth_1", "truth_2"),
value.name = "truth"
)
melted_samples$truth_dimension <- NULL
melted_samples$dimension <- gsub("value_", "k=", melted_samples$dimension)
melted_samples
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.