#' @title Infer Communication Content and Structure
#' @description Performs inference on the content conditional structure of a
#' text valued communication network.
#'
#' @param formula A formula object of the form 'ComNet ~
#' euclidean(d = 2)' where d is the number of dimensions in the latent space
#' that the user would like to include, and ComNet is an object of class
#' 'ComNet' generated by the prepare_data() function. This object will contain
#' all of the relevant information about the corpus so that multiple models may
#' be specified without the need to repreprocess the data. The formula may also
#' include optional terms 'sender("covariate_name")', 'receiver("covariate_name")',
#' 'nodemix("covariate_name", base = value)' and 'netcov("network_covariate")',
#' which are defined analogously to the arguments in the latentnet package.
#' @param interaction_patterns The number of different interaction patterns
#' governing message sending and recieving under the model. Defaults to 4.
#' @param topics The number of topics to be used in the model. Defaults to 40.
#' @param alpha The hyperparameter governing document-topic distributions. Lower
#' values encourage more peaked distributions. Defaults to 1.
#' @param beta The hyperparameter governing the Dirichlet prior on the
#' topic-word distributions. Lower values encourage more peaked distributions.
#' Defaults to 0.01.
#' @param iterations The number of iterations of Metropolis-within-Gibbs
#' sampling to be used in model estimation. Defaults to 1,000.
#' @param metropolis_hastings_iterations The number of Metropolis Hastings
#' iterations to be run during each iteration of Metropolis-within-Gibbs
#' sampling to update interaction pattern parameters. Defaults to 500.
#' @param final_metropolis_hastings_burnin The number of iterations of
#' Metropolis Hastings to run after completing the main iterations of Gibbs
#' sampling to discard before keeping samples. Defaults to 50,000.
#' @param final_metropolis_hastings_iterations The number of iterations to run
#' Metropolis Hastings after completing all main iterations of Gibbs sampling.
#' Defaults to 100,000. This additional number of iterations is required to
#' ensure that the Markov chain of interaction pattern parameters has mixed
#' appropriately and converged to the target distribution. This can occasionally
#' take on the order of 1-10 million iterations, but is often much faster in
#' practice.
#' @param thin The proportion of network samples to keep from the final run of
#' Metropolis Hastings to convergence. Defaults to 1/100, meaning that every
#' 100'th network sample will be returned.
#' @param target_accept_rate The target acceptance rate for the Metropolis
#' Hastings algorithm. Defaults to 0.25 which is standard in the literature.
#' @param tolerance The tolerance for differences between the observed and
#' target Metropolis Hastings accept rates (+-). Defaults to 0.05.
#' @param LSM_proposal_variance The Metropolis Hastings proposal variance for
#' all interaction pattern parameters. Defaults to .5.
#' @param LSM_prior_variance The variance of the multivariate normal prior on
#' all interaction pattern parameters. Defaults to 1.
#' @param LSM_prior_mean The mean of the multivariate normal prior on all
#' interaction pattern parameters. Defaults to 0.
#' @param iterations_before_t_i_p_updates The number of iterations to wait
#' before beginning updates to topic interaction pattern assignments. Defaults
#' to 5. If the user does not wish to update these assignments, the value
#' can be set to a value greater than 'iterations'.
#' @param update_t_i_p_every_x_iterations The number of iterations between
#' updates to topic interaction pattern assignments. Defaults to 5.
#' @param adaptive_metropolis Logical indicating whether adaptive Metropolis
#' should be used (whether the proposal variance should be optimized). Defaults
#' to TRUE.
#' @param adaptive_metropolis_update_size The amount by which the Metroplis Hastings
#' proposal distribution variance is changed (up or down) durring adaptive Metropolis.
#' Defaults to 0.05.
#' @param seed The seed to be used (for replicability across runs). Defaults to
#' 12345.
#' @param adaptive_metropolis_every_x_iterations The nubmer of iterations
#' between proposal variance updates during the final run of Metropolis Hastings
#' to convergence. Defaults to 1000.
#' @param stop_adaptive_metropolis_after_x_updates The number of Metropolis
#' Hastings proposal variance updates to complete during the final run of
#' Metropolis Hastings to convergence before fixing its value.
#' Defualts to 50. Make sure that the selection of this number is such that
#' the proposal variance is fixed after burnin.
#' @param slice_sample_alpha_m Logical indicating whether hyperparameter
#' optimization should be used to determine the optimal value of alpha. Defaults
#' to FALSE. If TRUE, then alpha_m will be slice sampled. This can improve model
#' fit.
#' @param slice_sample_step_size The initial size of the slice to use when slice
#' sampling alpha (hyperparameter optimization). Defaults to 1.
#' @param parallel Argument indicating whether the token topic distributions
#' should be generated in parallel. Defaults to FALSE. Can significantly reduce
#' runtime when training a model with a large number of topics.
#' @param cores The number of cores to be used if the parallel option is set to
#' TRUE. Should not exceed the number of cores available on the machine and will
#' not show performance gains if cores > topics.
#' @param output_directory The directory where the user would like to store
#' output from the model. Defaults to NULL. If NULL, then the current working
#' directory will be used to store output if an output_name_stem is provided.
#' @param output_name_stem Defaults to NULL. If not NULL, then output will be
#' saved to disk using the output_name_stem to differentiate it from output from
#' other model runs.
#' @param generate_plots Logical indicating whether diagnostic and summary plots
#' should be generated, defaults to TRUE.
#' @param verbose Defaults to TRUE, if FALSE, then no output is printed to the
#' screen by the inference code.
#' @return An object of class CCAS containing estimation results.
#' @examples
#' \dontrun{
#' set.seed(12345)
#' # read in data prepared by the prepare_data() function.
#' data(ComNet_data)
#' # specify a formula that we will use for testing.
#' formula <- ComNet_data ~ euclidean(d = 2) +
#' nodemix("Gender", base = "Male")
#' CCAS_Object <- ccas(formula,
#' interaction_patterns = 4,
#' topics = 40,
#' alpha = 1,
#' beta = 0.01,
#' iterations = 20,
#' metropolis_hastings_iterations = 500,
#' final_metropolis_hastings_iterations = 10000,
#' final_metropolis_hastings_burnin = 5000,
#' thin = 1/10,
#' target_accept_rate = 0.25,
#' tolerance = 0.05,
#' adaptive_metropolis_update_size = 0.05,
#' LSM_proposal_variance = .5,
#' LSM_prior_variance = 1,
#' LSM_prior_mean = 0,
#' slice_sample_alpha_m = TRUE,
#' slice_sample_step_size = 1,
#' generate_plots = TRUE,
#' output_directory = NULL,
#' output_name_stem = NULL)
#' }
#' @export
ccas <- function(formula,
interaction_patterns = 4,
topics = 40,
alpha = 1,
beta = 0.01,
iterations = 1000,
metropolis_hastings_iterations = 500,
final_metropolis_hastings_burnin = 50000,
final_metropolis_hastings_iterations = 100000,
thin = 1/100,
target_accept_rate = 0.25,
tolerance = 0.05,
LSM_proposal_variance = .5,
LSM_prior_variance = 1,
LSM_prior_mean = 0,
iterations_before_t_i_p_updates = 5,
update_t_i_p_every_x_iterations = 5,
adaptive_metropolis = TRUE,
adaptive_metropolis_update_size = 0.05,
seed = 12345,
adaptive_metropolis_every_x_iterations = 1000,
stop_adaptive_metropolis_after_x_updates = 50,
slice_sample_alpha_m = FALSE,
slice_sample_step_size = 1,
parallel = FALSE,
cores = 2,
output_directory = NULL,
output_name_stem = NULL,
generate_plots = TRUE,
verbose = TRUE) {
# for now, we only allow a common proposal variance, prior mean, and prior
# variance. In the future, these could be different for each cluster and
# parameter type.
LSM_intercept_proposal_variance = LSM_proposal_variance
LSM_intercept_prior_variance = LSM_prior_variance
LSM_intercept_prior_mean = LSM_prior_mean
LSM_position_proposal_variance = LSM_proposal_variance
LSM_position_prior_variance = LSM_prior_variance
LSM_position_prior_mean = LSM_prior_mean
LSM_coefficient_proposal_variance = LSM_proposal_variance
LSM_coefficient_prior_variance = LSM_prior_variance
LSM_coefficient_prior_mean = LSM_prior_mean
# note that after this point, we will be working with standard deviations,
# since they are what will actually be used in the inference algorithm and
# we want to be precise with what exactly we are passing in.
# set the seed
set.seed(seed)
# set the output directory and save the current one so we can reset it
# afterwards
if (!is.null(output_directory)) {
cur_directory <- getwd()
setwd(output_directory)
}
# set the number of threads to use with parallel
if (parallel) {
RcppParallel::setThreadOptions(numThreads = cores)
}
# possible terms for inclusion in model specification.
possible_structural_terms <- c("euclidean")
possible_covariate_terms <- c("sender","receiver","nodemix")
possible_network_terms <- c("netcov")
# make sure that formula is a formula object
formula <- as.formula(formula)
# if we are slice sampling, then set the inteval to 5, otherwise, it will
# be negative 2, in which case it will never trigger the conditional in our
# main inference algorithm, and thus we will never slice sample alpha_m
slice_sample_alpha_m_every <- -2
if (slice_sample_alpha_m) {
slice_sample_alpha_m_every <- 5
}
# parse the forumla and return a list object containing the ComNet object
# and the latent space model spcification
parsed_specifcation <- parse_specification(formula,
possible_structural_terms,
possible_covariate_terms,
possible_network_terms,
terms_to_parse = "structural")
# get the ComNet object from the specification
ComNet_Object <- parsed_specifcation$ComNet
# now extract indicator of whether we are using covariates
using_covariates <- parsed_specifcation$using_covariates
# if we are not using covariates, we still need to initialize objects of the
# right type to pass in to C++ or the main inference function will throw
# an error.
number_of_covariates <- 0
covariate_array <- array(0,dim = c(10,10,10))
# if we are using covariates, then generate the covariate array:
if (using_covariates) {
temp <- generate_covariate_array(formula,
possible_structural_terms,
possible_covariate_terms,
possible_network_terms,
ComNet_Object)
covariate_array <- temp$covariate_array
number_of_covariates <- temp$number_of_covariates
}
# initialize an object of class CCAS to store everything. This will include
# initializing all latent variables and organizing data in a format that is
# appropriate for the main inference function
CCAS_Object <- new("CCAS",
ComNet_Object = ComNet_Object,
interaction_patterns = interaction_patterns,
number_of_topics = topics,
latent_space_dimensions = parsed_specifcation$d,
target_accept_rate = target_accept_rate,
tolerance = tolerance,
formula = formula,
alpha = alpha,
beta = beta,
iterations = iterations,
metropolis_hastings_iterations = metropolis_hastings_iterations,
final_metropolis_hastings_burnin = final_metropolis_hastings_burnin,
final_metropolis_hastings_iterations = final_metropolis_hastings_iterations,
thin = thin,
number_of_covariates = number_of_covariates,
iterations_before_t_i_p_updates = iterations_before_t_i_p_updates,
update_t_i_p_every_x_iterations = update_t_i_p_every_x_iterations,
perform_adaptive_metropolis = adaptive_metropolis,
adaptive_metropolis_update_size = adaptive_metropolis_update_size,
seed = seed)
# now add in the covariate array (whether or not we are actually including
# them in our model)
CCAS_Object@covariate_array <- covariate_array
# initialie vectors of prior means and standard deviations.
CCAS_Object <- initialize_LSM_priors(CCAS_Object,
LSM_intercept_prior_variance,
LSM_intercept_prior_mean,
LSM_position_prior_variance,
LSM_position_prior_mean,
LSM_coefficient_prior_variance,
LSM_coefficient_prior_mean)
# initialize vectors of proposal standard deviations from our given
# proposal variance. We switch to standard deviations since those are what
# the c++ functions will operate on.
CCAS_Object@LSM_intercept_proposal_standard_deviation <- rep(
sqrt(LSM_intercept_proposal_variance),
interaction_patterns
)
CCAS_Object@LSM_position_proposal_standard_deviation <- rep(
sqrt(LSM_position_proposal_variance),
interaction_patterns
)
CCAS_Object@LSM_coefficient_proposal_standard_deviation <- rep(
sqrt(LSM_coefficient_proposal_variance),
interaction_patterns
)
# as of now, we are using uniform base measures m and n. In the future, these
# could be power law, or some other distribution, but for now we just make
# sure that they sum to alpha/beta.
CCAS_Object@alpha_m <- rep(CCAS_Object@alpha/CCAS_Object@number_of_topics,
CCAS_Object@number_of_topics)
CCAS_Object@beta_n <- rep(CCAS_Object@beta/CCAS_Object@ComNet_Object@vocabulary_size,
CCAS_Object@ComNet_Object@vocabulary_size)
# initialize all latent variable values
CCAS_Object@latent_variables <- initialize_latent_variables(CCAS_Object)
cat("\n#############################################\n\n")
cat("Initialization Complete: Running Inference...\n\n")
cat("#############################################\n\n")
# run inference (Metropolis within Gibbs)
Inference_Results <- model_inference(
CCAS_Object@ComNet_Object@document_authors_zero_indexed,
CCAS_Object@ComNet_Object@document_edge_matrix,
CCAS_Object@latent_variables$LDA_Params$document_topic_counts,
CCAS_Object@latent_variables$LDA_Params$topic_interaction_patterns_zero_indexed,
CCAS_Object@latent_variables$LDA_Params$word_type_topic_counts,
CCAS_Object@latent_variables$LDA_Params$topic_token_counts,
CCAS_Object@latent_variables$LDA_Params$token_topic_assignments,
CCAS_Object@latent_variables$LDA_Params$token_word_types,
CCAS_Object@latent_variables$LSM_Params$intercepts,
CCAS_Object@latent_variables$LSM_Params$coefficients,
CCAS_Object@latent_variables$LSM_Params$positions,
CCAS_Object@covariate_array,
CCAS_Object@alpha_m,
CCAS_Object@beta_n,
CCAS_Object@ComNet_Object@using_covariates,
CCAS_Object@LSM_intercept_prior_mean,
CCAS_Object@LSM_intercept_prior_standard_deviation,
CCAS_Object@LSM_intercept_proposal_standard_deviation,
CCAS_Object@LSM_coefficient_prior_mean,
CCAS_Object@LSM_coefficient_prior_standard_deviation,
CCAS_Object@LSM_coefficient_proposal_standard_deviation,
CCAS_Object@LSM_position_prior_mean,
CCAS_Object@LSM_position_prior_standard_deviation,
CCAS_Object@LSM_position_proposal_standard_deviation,
CCAS_Object@target_accept_rate,
CCAS_Object@tolerance,
CCAS_Object@adaptive_metropolis_update_size,
CCAS_Object@seed,
CCAS_Object@iterations,
CCAS_Object@metropolis_hastings_iterations,
CCAS_Object@ComNet_Object@num_tokens,
CCAS_Object@iterations_before_t_i_p_updates,
CCAS_Object@update_t_i_p_every_x_iterations,
CCAS_Object@perform_adaptive_metropolis,
slice_sample_alpha_m_every,
slice_sample_step_size,
parallel,
verbose)
# make sure all of the output is put into the appropriate named list objects
MCMC_Results <- list(intercepts = Inference_Results[[2]],
coefficients = Inference_Results[[3]],
latent_positions = Inference_Results[[4]],
accept_rates = Inference_Results[[9]],
final_proposal_standard_deviations = Inference_Results[[10]])
# make sure to increment where necessary to get back to 1 indexing
Topic_Model_Results <- list(topic_interaction_patterns = Inference_Results[[1]] + 1,
document_topic_counts = Inference_Results[[5]],
word_type_topic_counts = Inference_Results[[6]],
topic_token_counts = Inference_Results[[7]],
token_topic_assignments = Inference_Results[[8]],
unnoramlized_LDA_log_likelihood = Inference_Results[[11]])
# assign them to the slots in our CCAS Object
CCAS_Object@MCMC_output <- MCMC_Results
CCAS_Object@topic_model_results <- Topic_Model_Results
# increment token topic assignments
for (i in 1:CCAS_Object@ComNet_Object@num_documents) {
CCAS_Object@topic_model_results$token_topic_assignments[[i]] <-
CCAS_Object@topic_model_results$token_topic_assignments[[i]] + 1
}
cat("\n####################################################\n\n")
cat("Main Inference Complete: Running MH to Convergence...\n\n")
cat("####################################################\n\n")
# select the last iteraction of output from MH from the main inference
# to seed our run of MH to convergence
ints <- CCAS_Object@MCMC_output$intercepts[
nrow(CCAS_Object@MCMC_output$intercepts),]
coefs <- CCAS_Object@MCMC_output$coefficients[,,
dim(CCAS_Object@MCMC_output$coefficients)[3]]
ld <- CCAS_Object@latent_space_dimensions
# need to add one back on so we dont over-count the number of dimensions
ind <- dim(CCAS_Object@MCMC_output$latent_positions)[3] - ld + 1
lat_pos <- CCAS_Object@MCMC_output$latent_positions[,,ind:(ind + ld - 1)]
# we also have to determine how many observations to actually store after
# the burnin is complete and the chain has been appropriately thinned. We
# do this outside of C++ since the C++ ceiling function can produce warnings
# that will not pass R CMD Check.
samples_to_store <- ceiling(CCAS_Object@thin *
CCAS_Object@final_metropolis_hastings_iterations)
# now calculate the inverse of the thin parameter to get our sample every
# parameter
sample_every <- (1/CCAS_Object@thin)
if (sample_every < 1) {
sample_every <- 1
cat("You specified a value for thin > 1. You must select a value for thin < 1. All iterations will be kept...\n")
}
# the total iterations is the burnin plus the iterations
iters <- CCAS_Object@final_metropolis_hastings_iterations +
CCAS_Object@final_metropolis_hastings_burnin
# run MH to convergence
final_mh_results <- mh_to_convergence(
CCAS_Object@ComNet_Object@document_authors_zero_indexed,
CCAS_Object@ComNet_Object@document_edge_matrix,
CCAS_Object@topic_model_results$document_topic_counts,
CCAS_Object@topic_model_results$topic_interaction_patterns - 1,
ints,
coefs,
lat_pos,
CCAS_Object@covariate_array,
CCAS_Object@ComNet_Object@using_covariates,
CCAS_Object@LSM_intercept_prior_mean,
CCAS_Object@LSM_intercept_prior_standard_deviation,
CCAS_Object@MCMC_output$final_proposal_standard_deviations,
CCAS_Object@LSM_coefficient_prior_mean,
CCAS_Object@LSM_coefficient_prior_standard_deviation,
CCAS_Object@MCMC_output$final_proposal_standard_deviations,
CCAS_Object@LSM_position_prior_mean,
CCAS_Object@LSM_position_prior_standard_deviation,
CCAS_Object@MCMC_output$final_proposal_standard_deviations,
CCAS_Object@target_accept_rate,
CCAS_Object@tolerance,
CCAS_Object@adaptive_metropolis_update_size,
CCAS_Object@seed,
iters,
adaptive_metropolis_every_x_iterations,
stop_adaptive_metropolis_after_x_updates,
samples_to_store,
sample_every,
CCAS_Object@final_metropolis_hastings_burnin - 1
)
# now update the MCMC_output with the samples from our final run of MH
CCAS_Object@MCMC_output$intercepts = final_mh_results[[1]]
CCAS_Object@MCMC_output$coefficients = final_mh_results[[2]]
CCAS_Object@MCMC_output$latent_positions = final_mh_results[[3]]
CCAS_Object@MCMC_output$final_accept_or_reject = final_mh_results[[4]]
CCAS_Object@MCMC_output$final_proposal_standard_deviations = final_mh_results[[5]]
CCAS_Object@MCMC_output$LSM_log_likelihood = final_mh_results[[6]]
cat("\n####################################\n\n")
cat("Generating Output and Diagnostics...\n\n")
cat("####################################\n\n")
# generate output -- note that we set the working directory before estimation
# (if one was provided) in order to prevent an error at this stage from
# not producing output.
CCAS_Object <- generate_output(
CCAS_Object = CCAS_Object,
output_name_stem = output_name_stem,
generate_plots = generate_plots)
# reset the working directory
if (!is.null(output_directory)) {
setwd(cur_directory)
}
# retrun the CCAS object
return(CCAS_Object)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.