# Constants
DEFAULT_UPPER_BOUND <- 100
#' Create simulated data and learn weights for these data
#'
#' Creates simulated data sets by randomly picking an output for each
#' instance of an input.
#' The probability of picking a particular output is its conditional
#' probability given the input.
#' Learns constraint weights for each simulated data set.
#'
#' This function creates multiple simulated data sets, and learns a set of
#' weights that maximizes the likelihood of data for each simulated data set.
#'
#' To create a simulated data set, one output is randomly chosen for each
#' instance of an input.
#' The probability of picking a particular output \eqn{O_i} arising from
#' input \eqn{I_j} is its conditional probability, \eqn{Pr(O_i|I_j)}.
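#'
#' For example, if an input has two candidate outputs with conditional
#' probabilities 0.7 and 0.3, each instance of that input amounts to a single
#' random draw (an illustrative sketch in base R, with hypothetical output
#' names):
#'
#' ```r
#' sample(c("output_1", "output_2"), size = 1, prob = c(0.7, 0.3))
#' ```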
#'
#' The function `optimize_weights()` is called to find a set of weights that
#' maximizes the likelihood of each simulated data set.
#' All optional arguments of `optimize_weights()` that allow the user to
#' specify biases and bounds are likewise available in this function,
#' `monte_carlo_weights()`.
#'
#' The process of simulating a data set and learning weights that maximize
#' the likelihood of the simulated data is repeated `num_simul` times.
#'
#' @section Why use this function?:
#'
#' This function gives us a way to estimate constraint weights via a Monte
#' Carlo process.
#' For example, we might be interested in the effect of temperature on
#' polarizing the predicted probabilities, and hence on the resulting
#' constraint weights.
#' This function can produce a distribution of constraint weights for the
#' simulated polarized data, as well as a distribution of constraint weights
#' for the simulated non-polarized ones, thereby allowing a comparison of the
#' two.
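#'
#' For instance, one could compare the weight distributions recovered under
#' two temperature settings (a schematic sketch; `tableaux_df` and `fit_model`
#' are assumed to be set up as in the examples below):
#'
#' ```r
#' polarized <- predict_probabilities(
#'   tableaux_df, fit_model$weights, temperature = 0.5
#' )
#' neutral <- predict_probabilities(
#'   tableaux_df, fit_model$weights, temperature = 1
#' )
#' wts_pol <- monte_carlo_weights(polarized$predictions, 100)
#' wts_neu <- monte_carlo_weights(neutral$predictions, 100)
#' # Compare the two weight distributions, e.g. via their column means
#' colMeans(wts_pol)
#' colMeans(wts_neu)
#' ```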
#'
#' @param pred_prob A data frame with a column for predicted probabilities.
#' This object should be in the same format as the `predictions` attribute of
#' the object returned by the `predict_probabilities` function.
#' @param num_simul The number of simulations to run.
#' @param bias_file (optional) The path to the file containing the mus and
#' sigmas for the constraint biases. If this argument is provided, the scalar
#' and vector mu and sigma arguments will be ignored. Each row in this file
#' should contain the name of a constraint, followed by its mu, followed by
#' its sigma (separated by whatever the relevant separator is; default is
#' commas).
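#'
#' A hypothetical comma-separated bias file with two constraints might look
#' like:
#'
#' ```
#' Constraint1,0,1
#' Constraint2,0,10
#' ```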
#' @param mu (optional) A scalar or vector that will serve as the mu for each
#' constraint in the bias term. Constraint weights will also be initialized to
#' this value. If a vector, its length must equal the number of constraints in
#' the input file. This value will not be used if `bias_file` is provided.
#' @param sigma (optional) A scalar or vector that will serve as the sigma for
#' each constraint in the bias term. If a vector, its length must equal the
#' number of constraints in the input file. This value will not be used if
#' `bias_file` is provided.
#' @param output_path (optional) A string specifying the path to a file to
#' which the output will be saved. If the file exists it will be overwritten.
#' If this argument isn't provided, the output will not be written to a file.
#' @param out_sep (optional) The delimiter used in the output file.
#' Defaults to commas.
#' @param control_params (optional) A named list of control parameters that
#' will be passed to the \link[stats]{optim} function. See the documentation
#' of that function for details. Note that some parameter settings may
#' interfere with optimization. The parameter `fnscale` will be overwritten
#' with `-1` if specified, since this must be treated as a maximization
#' problem.
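#'
#' For example, one might pass `control_params = list(maxit = 500)` to give
#' the optimizer more iterations.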
#' @param upper_bound (optional) The maximum value for constraint weights.
#' Defaults to 100.
#' @param allow_negative_weights (optional) Whether the optimizer should allow
#' negative weights. Defaults to FALSE.
#' @return A data frame with the following structure:
#' \itemize{
#' \item rows: As many rows as the number of simulations
#' \item columns: As many columns as the number of constraints
#' }
#' @examples
#' # Get path to toy data file
#' data_file <- system.file(
#'   "extdata", "sample_data_frame.csv", package = "maxent.ot"
#' )
#'
#' tableaux_df <- read.csv(data_file)
#'
#' # Fit weights to data with no biases
#' fit_model <- optimize_weights(tableaux_df)
#'
#' # Predict probabilities for the same input with temperature = 2
#' pred_obj <- predict_probabilities(
#'   tableaux_df, fit_model$weights, temperature = 2
#' )
#'
#' # Run 5 Monte Carlo simulations
#' # based on predicted probabilities when temperature = 2,
#' # and learn weights for these 5 simulated data sets
#' monte_carlo_weights(pred_obj$predictions, 5)
#'
#' # Save learned weights to a file
#' tmp_output <- tempfile()
#' monte_carlo_weights(pred_obj$predictions, 5, output_path = tmp_output)
#' @export
# Learns constraint weights for multiple randomly generated SR responses
monte_carlo_weights <- function(pred_prob, num_simul,
                                bias_file = NA, mu = NA, sigma = NA,
                                output_path = NA, out_sep = ",",
                                control_params = NA,
                                upper_bound = DEFAULT_UPPER_BOUND,
                                allow_negative_weights = FALSE) {
  # Compute the probability of each SR conditioned on its trial
  cdnpred_prob <- cdnProb_trial(pred_prob)
  # Initialize matrix to store learned weights
  # 6 non-constraint columns: UR, SR, Freq, plus the 3 trailing columns
  # (Pred p(SR|UR), Pred p(SR|trial), Trial id)
  num_feats <- ncol(cdnpred_prob) - 6
  output <- matrix(nrow = num_simul, ncol = num_feats)
  # Learn weights for each simulation
  for (i in 1:num_simul) {
    # Create a simulated response data frame
    simul_resp_file <- monte_carlo(cdnpred_prob)
    # Learn weights for the simulated response, passing through the
    # user-specified biases and bounds
    curr_model <- optimize_weights(
      simul_resp_file, bias_file = bias_file, mu = mu, sigma = sigma,
      control_params = control_params, upper_bound = upper_bound,
      allow_negative_weights = allow_negative_weights
    )
    # Record learned weights
    output[i, ] <- curr_model$weights
  }
  # Put in names of constraints
  colnames(output) <- colnames(simul_resp_file)[4:ncol(simul_resp_file)]
  # Write to output file if desired
  if (!is.na(output_path)) {
    utils::write.table(output, file = output_path, sep = out_sep,
                       row.names = FALSE)
  }
  return(output)
}
# Function that calculates conditional probability of output given trial
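# A "trial" here is one tableau: a UR together with its full set of candidate
# SRs. For example (an illustrative sketch with hypothetical URs and SRs), rows
#   (UR1, SR1), (UR1, SR2), (UR1, SR1), (UR1, SR2), (UR2, SR1)
# receive trial ids 1, 1, 2, 2, 3: repeating an already-seen (UR, SR) pair
# starts a new trial.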
cdnProb_trial <- function(pred_prob) {
  # Build ourselves a matrix for efficient computation
  # Pre-allocate space
  data_matrix <- matrix(0L, nrow = nrow(pred_prob), ncol = ncol(pred_prob))
  # Port over the violation profiles and p(SR|UR)
  data_matrix[, 1:(ncol(data_matrix) - 2)] <-
    data.matrix(pred_prob[, 1:(ncol(pred_prob) - 2)])
  # Replace empty cells with 0
  data_matrix[is.na(data_matrix)] <- 0
  # Record trial_id in last column of data_matrix
  trial_id <- 0
  curr_ur <- ""
  sr_ls <- list()
  # Index of the last row claimed by the current trial's look-ahead
  trial_end <- 0
  # For each row
  for (i in 1:nrow(pred_prob)) {
    # If this row was already claimed by the current trial
    # (same UR, and its SR is in the list of SRs associated with this UR)
    if (i <= trial_end) {
      # Record current trial_id
      data_matrix[i, ncol(data_matrix)] <- trial_id
    # Else: This row begins a new trial
    } else {
      # Current trial needs a new trial_id
      trial_id <- trial_id + 1
      # Record current trial_id in output file
      data_matrix[i, ncol(data_matrix)] <- trial_id
      # Store new UR of current trial in curr_ur
      curr_ur <- pred_prob[i, 1]
      # Start a new list of SRs for current trial in sr_ls
      sr_ls <- list(pred_prob[i, 2])
      # Track following rows that belong to the same trial
      next_row <- i + 1
      # While the following rows belong to the same trial as row i
      # Conditions: such rows have the same UR and an SR not yet seen in this
      # trial (a repeated (UR, SR) pair starts a new trial)
      while (next_row <= nrow(pred_prob) &&
             pred_prob[next_row, 1] == curr_ur &&
             !(pred_prob[next_row, 2] %in% sr_ls)) {
        # Add alternative SR options to SR list
        sr_ls <- append(sr_ls, pred_prob[next_row, 2])
        # Move on to following row
        next_row <- next_row + 1
      }
      # Rows i through next_row - 1 all belong to the current trial
      trial_end <- next_row - 1
    }
  }
  # Record probability conditioned on trial in the 2nd-last column of output
  data_matrix[, (ncol(data_matrix) - 1)] <- apply(
    data_matrix, 1, normalize_trial, data_matrix, (ncol(data_matrix) - 2)
  )
  # Zero out observed frequencies; monte_carlo() will fill in simulated ones
  data_matrix[, 3] <- 0
  # Re-introduce UR & SR characters
  output <- cbind(pred_prob[, 1:2], data_matrix[, 3:ncol(data_matrix)])
  # Column names
  # Port over column names from pred_prob
  names(output) <- colnames(pred_prob)
  # Change "Predicted Probability" to "Pred p(SR|UR)"
  colnames(output)[colnames(output) == "Predicted Probability"] <-
    "Pred p(SR|UR)"
  # Change "Observed Probability" to "Pred p(SR|trial)"
  colnames(output)[colnames(output) == "Observed Probability"] <-
    "Pred p(SR|trial)"
  # Change "Error" to "Trial id"
  colnames(output)[colnames(output) == "Error"] <- "Trial id"
  return(output)
}
# Helper function that applies normalization over a trial
# Divides a row's p(SR|UR) (column col_num) by the summed p(SR|UR) of all
# rows sharing its trial id (stored in the last column of m)
# Indexing rows and column in one step keeps this robust for single-row trials
normalize_trial <- function(row, m, col_num) {
  return(row[col_num] / sum(m[m[, ncol(m)] == row[ncol(m)], col_num]))
}
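# Worked example: two rows sharing a trial id with p(SR|UR) = 0.2 and 0.6
# normalize to p(SR|trial) = 0.25 and 0.75 (0.2 / 0.8 and 0.6 / 0.8).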
# Function that creates 1 simulated response based on probabilities
# conditioned over trial
# The data_file argument is the data frame that has probs over trial
monte_carlo <- function(data_file) {
  # Get total number of trials
  num_trial <- max(data_file[, ncol(data_file)])
  # For each trial, pick 1 SR response
  for (i in 1:num_trial) {
    # Get row indices of all rows with 'Trial id' == i
    id_vec <- which(data_file$`Trial id` == i)
    # Get conditional prob over trial for all rows in id_vec
    prob_vec <- data_file[id_vec, 'Pred p(SR|trial)']
    # Randomly pick 1 SR according to conditional prob
    # (indexing into id_vec rather than calling sample(id_vec, ...) avoids
    # R's surprise behavior when id_vec has length 1)
    # Currently only 1 random draw per trial
    # To support multiple draws per trial in future?
    picked_sr_id <- id_vec[sample.int(length(id_vec), size = 1,
                                      prob = prob_vec)]
    # Record picked SR
    data_file[picked_sr_id, 'Freq'] <- 1
  }
  # Drop last 3 columns so the result has only the columns required for
  # optimize_weights()
  simulated_resp_file <- data_file[, -(ncol(data_file) - 2):-(ncol(data_file))]
  return(simulated_resp_file)
}
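# Illustrative sketch of the internal pipeline (not run; `pred_obj` as in the
# examples above):
#   cdn <- cdnProb_trial(pred_obj$predictions)  # add trial ids & p(SR|trial)
#   sim <- monte_carlo(cdn)                     # draw one SR per trial
#   optimize_weights(sim)                       # fit weights to the draw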