Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----setup--------------------------------------------------------------------
library(conversim)
## ----echo=FALSE---------------------------------------------------------------
library(ggplot2)
combine_sims <- function(similarities, weights = NULL) {
if (is.null(weights)) {
weights <- rep(1, length(similarities))
} else {
weights <- unlist(weights)
}
weighted_sum <- sum(unlist(similarities) * weights)
total_weight <- sum(weights)
return(weighted_sum / total_weight)
}
plot_sims <- function(similarities) {
df <- data.frame(
measure = names(similarities),
score = unlist(similarities)
)
ggplot2::ggplot(df, ggplot2::aes(x = .data$measure, y = .data$score)) +
ggplot2::geom_bar(stat = "identity", fill = "steelblue") +
ggplot2::coord_flip() +
ggplot2::labs(title = "Similarity Scores", x = "Measure", y = "Score") +
ggplot2::theme_minimal() +
ggplot2::scale_y_continuous(limits = c(0, 1))
}
compare_style <- function(stylistic_result) {
features <- names(stylistic_result$text1_features)
speech1_values <- unlist(stylistic_result$text1_features)
speech2_values <- unlist(stylistic_result$text2_features)
df <- data.frame(
feature = rep(features, 2),
value = c(speech1_values, speech2_values),
speech = rep(c("speech1", "speech2"), each = length(features))
)
ggplot2::ggplot(df, ggplot2::aes(x = .data$feature, y = .data$value, fill = .data$speech)) +
ggplot2::geom_bar(stat = "identity", position = "dodge") +
ggplot2::labs(title = "Comparison of Stylistic Features", x = "Feature", y = "Value") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
}
gen_sim_report <- function(speech1, speech2, topic_method = "lda", semantic_method = "tfidf", glove_path = NULL) {
topic_sim <- topic_similarity(speech1, speech2, method = topic_method)
lexical_sim <- lexical_similarity(speech1, speech2)
semantic_sim <- semantic_similarity(speech1, speech2, method = semantic_method, model_path = glove_path)
structural_sim <- structural_similarity(speech1, speech2)
stylistic_result <- stylistic_similarity(speech1, speech2)
sentiment_sim <- sentiment_similarity(speech1, speech2)
similarities <- list(
topic = topic_sim,
lexical = lexical_sim,
semantic = semantic_sim,
structural = structural_sim,
stylistic = stylistic_result$overall_similarity,
sentiment = sentiment_sim
)
combined_sim <- combine_sims(similarities)
report <- list(
similarities = similarities,
combined_similarity = combined_sim,
similarity_plot = plot_sims(similarities),
stylistic_plot = compare_style(stylistic_result)
)
return(report)
}
print_sim_report <- function(report) {
cat("Similarity Report\n")
cat("=================\n\n")
cat("Individual Similarity Scores:\n")
for (measure in names(report$similarities)) {
cat(sprintf(" %s: %.4f\n", measure, report$similarities[[measure]]))
}
cat("\nCombined Similarity Score: %.4f\n", report$combined_similarity)
cat("\nPlots have been generated for overall similarities and stylistic features.\n")
cat("Use 'report$similarity_plot' and 'report$stylistic_plot' to view them.\n")
}
agg_seq <- function(sequence, num_segments) {
segment_size <- ceiling(length(sequence) / num_segments)
aggregated <- numeric(num_segments)
for (i in 1:num_segments) {
start_idx <- (i - 1) * segment_size + 1
end_idx <- min(i * segment_size, length(sequence))
aggregated[i] <- mean(sequence[start_idx:end_idx], na.rm = TRUE)
}
return(aggregated)
}
combine_sim_seq <- function(similarities, weights = NULL) {
if (length(similarities) == 0) {
stop("At least one similarity measure is required")
}
if (is.null(weights)) {
weights <- rep(1 / length(similarities), length(similarities))
}
sequence_lengths <- sapply(similarities, function(x) length(x$sequence))
if (length(unique(sequence_lengths)) > 1) {
warning("Sequence lengths do not match. Results may be unexpected.")
}
max_length <- max(sequence_lengths)
combined_sequence <- numeric(max_length)
for (i in seq_along(similarities)) {
seq <- c(similarities[[i]]$sequence, rep(NA, max_length - length(similarities[[i]]$sequence)))
combined_sequence <- combined_sequence + seq * weights[i]
}
combined_average <- sum(sapply(seq_along(similarities), function(i) {
similarities[[i]]$average * weights[i]
}))
return(list(sequence = combined_sequence, average = combined_average))
}
norm_sim <- function(similarities) {
min_sim <- min(similarities, na.rm = TRUE)
max_sim <- max(similarities, na.rm = TRUE)
if (min_sim == max_sim) {
return(rep(0, length(similarities)))
}
return((similarities - min_sim) / (max_sim - min_sim))
}
cor_sim_seq <- function(similarities, method = "pearson") {
sequences <- lapply(similarities, function(x) x$sequence)
cor_matrix <- cor(do.call(cbind, sequences), use = "pairwise.complete.obs", method = method)
return(cor_matrix)
}
plot_cor_heatmap <- function(cor_matrix, titles) {
# Create a data frame from the correlation matrix
df <- expand.grid(Measure1 = titles, Measure2 = titles)
df$Correlation <- as.vector(cor_matrix)
ggplot2::ggplot(df, ggplot2::aes(x = .data$Measure1, y = .data$Measure2, fill = .data$Correlation)) +
ggplot2::geom_tile() +
ggplot2::scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
ggplot2::labs(x = "", y = "", fill = "Correlation") +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
}
plot_sim_time <- function(similarities, title, y_label) {
df <- data.frame(
dyad = rep(names(similarities), sapply(similarities, length)),
time = unlist(lapply(similarities, seq_along)),
similarity = unlist(similarities)
)
ggplot2::ggplot(df, ggplot2::aes(x = .data$time, y = .data$similarity, color = .data$dyad)) +
ggplot2::geom_line() +
ggplot2::geom_point() +
ggplot2::labs(title = title, x = "Time", y = y_label) +
ggplot2::theme_minimal()
}
calc_sum_stats <- function(similarities) {
if (length(similarities) == 0) {
stop("No data to calculate summary statistics")
}
na_present <- any(sapply(similarities, function(x) any(is.na(x))))
if (na_present) {
warning("NAs present in the data")
}
summary_stats <- lapply(similarities, function(x) {
c(mean = mean(x, na.rm = TRUE),
sd = sd(x, na.rm = TRUE),
min = min(x, na.rm = TRUE),
max = max(x, na.rm = TRUE))
})
do.call(rbind, summary_stats)
}
plot_sum_stats <- function(summary_stats, title) {
df <- as.data.frame(summary_stats)
df$dyad <- rownames(df)
df_long <- data.frame(
dyad = rep(df$dyad, each = 4),
statistic = rep(c("mean", "sd", "min", "max"), nrow(df)),
value = c(t(as.matrix(df[, c("mean", "sd", "min", "max")])))
)
ggplot2::ggplot(df_long, ggplot2::aes(x = .data$dyad, y = .data$value, fill = .data$statistic)) +
ggplot2::geom_bar(stat = "identity", position = "dodge") +
ggplot2::labs(title = title, x = "Dyad", y = "Value") +
ggplot2::theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set2")
}
compare_sim_meas <- function(similarity_list, measure_names) {
if (length(similarity_list) != length(measure_names)) {
stop("The number of similarity lists must match the number of measure names.")
}
result <- data.frame(dyad = rep(names(similarity_list[[1]]),
sapply(similarity_list[[1]], length)))
for (i in seq_along(similarity_list)) {
measure <- measure_names[i]
similarities <- unlist(similarity_list[[i]])
result[[measure]] <- similarities
}
result
}
plot_sim_comp <- function(comparison_df, title) {
measures <- setdiff(names(comparison_df), "dyad")
df_long <- data.frame(
dyad = rep(comparison_df$dyad, length(measures)),
measure = rep(measures, each = nrow(comparison_df)),
similarity = unlist(comparison_df[, measures])
)
ggplot2::ggplot(df_long, ggplot2::aes(x = .data$dyad, y = .data$similarity, fill = .data$measure)) +
ggplot2::geom_boxplot() +
ggplot2::labs(title = title, x = "Dyad", y = "Similarity") +
ggplot2::theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3")
}
calc_sim_cor <- function(comparison_df) {
cor(comparison_df[, -1], use = "pairwise.complete.obs")
}
plot_sim_cor_heatmap <- function(cor_matrix, title) {
cor_df <- as.data.frame(as.table(cor_matrix))
names(cor_df) <- c("Var1", "Var2", "Correlation")
ggplot2::ggplot(cor_df, ggplot2::aes(x = .data$Var1, y = .data$Var2, fill = .data$Correlation)) +
ggplot2::geom_tile() +
ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1)) +
ggplot2::theme_minimal() +
ggplot2::labs(title = title) +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
}
## -----------------------------------------------------------------------------
sims <- list(topic = 0.8, lexical = 0.6, semantic = 0.7, structural = 0.9)
combined_score <- combine_sims(sims)
print(combined_score)
# With custom weights
weighted_score <- combine_sims(sims, weights = list(topic = 2, lexical = 1, semantic = 1.5, structural = 1))
print(weighted_score)
## -----------------------------------------------------------------------------
sims <- list(topic = 0.8, lexical = 0.6, semantic = 0.7, structural = 0.9)
plot_sims(sims)
## -----------------------------------------------------------------------------
# Simulating the result of stylistic_similarity function
stylistic_result <- list(
text1_features = list(sentence_length = 15, word_length = 5, unique_words = 100),
text2_features = list(sentence_length = 12, word_length = 4, unique_words = 80),
overall_similarity = 0.85
)
compare_style(stylistic_result)
## -----------------------------------------------------------------------------
speech1 <- "This is the first speech. It talks about important topics."
speech2 <- "This is the second speech. It covers similar subjects."
# Note: This function call might not work as-is because it depends on other functions
# that are not defined in the utility files. For demonstration purposes, we'll create
# a mock report.
mock_report <- list(
similarities = list(
topic = 0.8,
lexical = 0.6,
semantic = 0.7,
structural = 0.9,
stylistic = 0.85,
sentiment = 0.75
),
combined_similarity = 0.75,
similarity_plot = plot_sims(list(topic = 0.8, lexical = 0.6, semantic = 0.7, structural = 0.9)),
stylistic_plot = compare_style(stylistic_result)
)
# Print the mock report
print_sim_report(mock_report)
## -----------------------------------------------------------------------------
# Combine similarity measures for a single dyad
sim1 <- list(sequence = c(0.8, 0.7, 0.9), average = 0.8)
sim2 <- list(sequence = c(0.6, 0.8, 0.7), average = 0.7)
combined <- combine_sim_seq(list(sim1, sim2))
print(combined)
# Normalize similarity scores
scores <- c(0.2, 0.5, 0.8, 1.0, 0.3)
normalized <- norm_sim(scores)
print(normalized)
# Aggregate similarity sequence
seq <- c(0.5, 0.6, 0.7, 0.6, 0.8, 0.7, 0.9, 0.8, 0.7, 0.8)
aggregated <- agg_seq(seq, 3)
print(aggregated)
# Calculate correlation between similarity measures
cor_matrix <- cor_sim_seq(list(sim1, sim2))
print(cor_matrix)
## -----------------------------------------------------------------------------
# Create mock data for multiple dyads
similarities <- list(
"1" = c(0.5, 0.6, 0.7),
"2" = c(0.4, 0.5, 0.6)
)
# Plot similarity over time for multiple dyads
plot_sim_time(similarities, "Topic Similarity", "Similarity Score")
# Calculate summary statistics
stats <- calc_sum_stats(similarities)
print(stats)
# Plot summary statistics
plot_sum_stats(stats, "Summary Statistics of Similarities")
# Compare multiple similarity measures
topic_similarities <- list("1" = c(0.5, 0.6, 0.7), "2" = c(0.4, 0.5, 0.6))
lexical_similarities <- list("1" = c(0.6, 0.7, 0.8), "2" = c(0.5, 0.6, 0.7))
comparison_df <- compare_sim_meas(
list(topic_similarities, lexical_similarities),
c("Topic", "Lexical")
)
print(head(comparison_df))
# Plot comparison of multiple similarity measures
plot_sim_comp(comparison_df, "Comparison of Similarity Measures")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.