# utilities_functions.R
# In conversim: Conversation Similarity Analysis

## ----include = FALSE----------------------------------------------------------
# Set knitr chunk defaults for the rest of this document: turn `collapse` on
# and use "#>" as the `comment` prefix.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(conversim)

## ----echo=FALSE---------------------------------------------------------------
library(ggplot2)
#' Combine several similarity scores into one weighted average
#'
#' @param similarities A list (or vector) of numeric similarity scores.
#' @param weights Optional list or vector of numeric weights, one per score and
#'   in the same order as `similarities` (weights are applied by position, not
#'   by name). `NULL`, the default, weights every score equally.
#' @return A single number: `sum(score * weight) / sum(weight)`.
combine_sims <- function(similarities, weights = NULL) {
  scores <- unlist(similarities)

  if (is.null(weights)) {
    weights <- rep(1, length(scores))
  } else {
    weights <- unlist(weights)
  }

  # Reject mismatched lengths: R would otherwise recycle the shorter vector in
  # the product below while `sum(weights)` still counts each weight only once,
  # so the result would no longer be a weighted mean.
  if (length(weights) != length(scores)) {
    stop("'weights' must contain exactly one weight per similarity score")
  }

  sum(scores * weights) / sum(weights)
}

#' Plot similarity scores as a horizontal bar chart
#'
#' @param similarities A named list of similarity scores; the names are used
#'   as the measure labels.
#' @return A ggplot object with one bar per measure and the score axis limited
#'   to the range 0 to 1.
plot_sims <- function(similarities) {
  score_table <- data.frame(
    measure = names(similarities),
    score = unlist(similarities)
  )

  bar_mapping <- ggplot2::aes(x = .data$measure, y = .data$score)
  axis_text <- ggplot2::labs(
    title = "Similarity Scores",
    x = "Measure",
    y = "Score"
  )

  chart <- ggplot2::ggplot(score_table, bar_mapping)
  chart <- chart + ggplot2::geom_bar(stat = "identity", fill = "steelblue")
  # Flip the coordinates so the measures are listed down the side.
  chart <- chart + ggplot2::coord_flip()
  chart <- chart + axis_text
  chart <- chart + ggplot2::theme_minimal()
  chart + ggplot2::scale_y_continuous(limits = c(0, 1))
}

#' Plot the stylistic features of two texts side by side
#'
#' @param stylistic_result A list with elements `text1_features` and
#'   `text2_features`, each a named list of feature values. Feature names are
#'   taken from `text1_features`.
#' @return A ggplot object with dodged bars, one pair per feature, filled by
#'   speech ("speech1" / "speech2").
compare_style <- function(stylistic_result) {
  first_text <- unlist(stylistic_result$text1_features)
  second_text <- unlist(stylistic_result$text2_features)
  feature_names <- names(stylistic_result$text1_features)
  n_features <- length(feature_names)

  # Long format: all speech1 rows first, then all speech2 rows.
  feature_table <- data.frame(
    feature = rep(feature_names, times = 2),
    value = c(first_text, second_text),
    speech = rep(c("speech1", "speech2"), each = n_features)
  )

  bar_mapping <- ggplot2::aes(
    x = .data$feature,
    y = .data$value,
    fill = .data$speech
  )

  chart <- ggplot2::ggplot(feature_table, bar_mapping)
  chart <- chart + ggplot2::geom_bar(stat = "identity", position = "dodge")
  chart <- chart + ggplot2::labs(
    title = "Comparison of Stylistic Features",
    x = "Feature",
    y = "Value"
  )
  chart <- chart + ggplot2::theme_minimal()
  # Slant the feature labels on the x axis.
  chart + ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
  )
}

#' Build a similarity report for two speeches
#'
#' Calls the topic, lexical, semantic, structural, stylistic and sentiment
#' similarity functions on the two speeches, combines the six scores with
#' `combine_sims()`, and builds the two plots.
#'
#' @param speech1,speech2 The two speeches, passed unchanged to every
#'   similarity function.
#' @param topic_method Passed as `method` to `topic_similarity()`.
#' @param semantic_method Passed as `method` to `semantic_similarity()`.
#' @param glove_path Passed as `model_path` to `semantic_similarity()`.
#' @return A list with elements `similarities`, `combined_similarity`,
#'   `similarity_plot` and `stylistic_plot`.
gen_sim_report <- function(speech1, speech2, topic_method = "lda", semantic_method = "tfidf", glove_path = NULL) {
  topic_score <- topic_similarity(speech1, speech2, method = topic_method)
  lexical_score <- lexical_similarity(speech1, speech2)
  semantic_score <- semantic_similarity(
    speech1, speech2,
    method = semantic_method,
    model_path = glove_path
  )
  structural_score <- structural_similarity(speech1, speech2)
  # The full stylistic result is kept because compare_style() needs the
  # per-text features, not only the overall score.
  style_result <- stylistic_similarity(speech1, speech2)
  sentiment_score <- sentiment_similarity(speech1, speech2)

  scores <- list(
    topic = topic_score,
    lexical = lexical_score,
    semantic = semantic_score,
    structural = structural_score,
    stylistic = style_result$overall_similarity,
    sentiment = sentiment_score
  )

  overall <- combine_sims(scores)
  score_plot <- plot_sims(scores)
  style_plot <- compare_style(style_result)

  list(
    similarities = scores,
    combined_similarity = overall,
    similarity_plot = score_plot,
    stylistic_plot = style_plot
  )
}

#' Print a similarity report to the console
#'
#' @param report A list with a named list `similarities` of numeric scores and
#'   a numeric `combined_similarity`, as built by `gen_sim_report()`.
#' @return `NULL`, invisibly; the function is called for its printed output.
print_sim_report <- function(report) {
  cat("Similarity Report\n")
  cat("=================\n\n")

  cat("Individual Similarity Scores:\n")
  for (measure in names(report$similarities)) {
    cat(sprintf("  %s: %.4f\n", measure, report$similarities[[measure]]))
  }

  # cat() does not interpret format specifiers, so the score has to be
  # formatted with sprintf() first; otherwise a literal "%.4f" is printed.
  cat(sprintf("\nCombined Similarity Score: %.4f\n", report$combined_similarity))

  cat("\nPlots have been generated for overall similarities and stylistic features.\n")
  cat("Use 'report$similarity_plot' and 'report$stylistic_plot' to view them.\n")

  invisible(NULL)
}

#' Aggregate a sequence into segment means
#'
#' The sequence is cut into consecutive segments of
#' `ceiling(length(sequence) / num_segments)` elements (the last non-empty
#' segment may be shorter) and each segment is replaced by its mean, ignoring
#' `NA`s.
#'
#' @param sequence A numeric vector.
#' @param num_segments The number of segments; a single positive whole number.
#' @return A numeric vector of length `num_segments`. Segments that fall
#'   entirely past the end of `sequence` are `NA`.
agg_seq <- function(sequence, num_segments) {
  if (!is.numeric(num_segments) || length(num_segments) != 1 ||
      is.na(num_segments) || num_segments < 1 ||
      num_segments != floor(num_segments)) {
    stop("'num_segments' must be a single positive whole number")
  }

  n <- length(sequence)
  segment_size <- ceiling(n / num_segments)
  aggregated <- rep(NA_real_, num_segments)

  for (i in seq_len(num_segments)) {
    start_idx <- (i - 1) * segment_size + 1
    # Rounding the segment size up can exhaust the sequence early. Stop here:
    # `start_idx:end_idx` would otherwise count backwards and make the empty
    # segment report the last element instead of NA.
    if (start_idx > n) {
      break
    }
    end_idx <- min(i * segment_size, n)
    aggregated[i] <- mean(sequence[start_idx:end_idx], na.rm = TRUE)
  }

  aggregated
}

#' Combine several similarity sequences into one weighted sequence
#'
#' @param similarities A non-empty list of similarity results, each a list
#'   with a numeric `sequence` and a numeric `average`.
#' @param weights Optional list or vector with one numeric weight per element
#'   of `similarities`, applied by position. `NULL`, the default, gives every
#'   measure the weight `1 / length(similarities)`. Weights are used as given
#'   and are not rescaled.
#' @return A list with `sequence` (the weighted sum of the sequences) and
#'   `average` (the weighted sum of the averages).
combine_sim_seq <- function(similarities, weights = NULL) {
  n_measures <- length(similarities)
  if (n_measures == 0) {
    stop("At least one similarity measure is required")
  }

  if (is.null(weights)) {
    weights <- rep(1 / n_measures, n_measures)
  } else {
    # Accept a list of weights as well as a vector, as combine_sims() does.
    weights <- as.numeric(unlist(weights))
  }

  if (length(weights) != n_measures) {
    stop("'weights' must contain exactly one weight per similarity measure")
  }

  sequence_lengths <- vapply(
    similarities,
    function(x) length(x$sequence),
    integer(1)
  )
  if (length(unique(sequence_lengths)) > 1) {
    warning("Sequence lengths do not match. Results may be unexpected.")
  }

  max_length <- max(sequence_lengths)

  combined_sequence <- numeric(max_length)
  for (i in seq_along(similarities)) {
    # Shorter sequences are padded with NA, so the combined value is NA at
    # every position that some sequence does not cover.
    padded <- c(
      similarities[[i]]$sequence,
      rep(NA, max_length - sequence_lengths[[i]])
    )
    combined_sequence <- combined_sequence + padded * weights[i]
  }

  averages <- vapply(similarities, function(x) x$average, numeric(1))
  combined_average <- sum(averages * weights)

  list(sequence = combined_sequence, average = combined_average)
}

#' Rescale similarity scores to the range 0 to 1
#'
#' @param similarities A numeric vector of scores; `NA`s are ignored when the
#'   minimum and maximum are determined.
#' @return A vector of the same length: min-max scaled scores, or all zeros
#'   when the minimum equals the maximum.
norm_sim <- function(similarities) {
  lowest <- min(similarities, na.rm = TRUE)
  highest <- max(similarities, na.rm = TRUE)

  if (lowest != highest) {
    spread <- highest - lowest
    return((similarities - lowest) / spread)
  }

  # All scores are identical, so there is no spread to scale by.
  rep(0, length(similarities))
}

#' Correlate the sequences of several similarity measures
#'
#' @param similarities A list of similarity results, each with a numeric
#'   `sequence` element.
#' @param method Passed to `cor()` as `method`.
#' @return The correlation matrix of the sequences, one row and column per
#'   measure, computed on pairwise complete observations.
cor_sim_seq <- function(similarities, method = "pearson") {
  # One column per measure.
  sequence_columns <- lapply(similarities, function(sim) sim$sequence)
  sequence_matrix <- do.call(cbind, sequence_columns)

  cor(sequence_matrix, use = "pairwise.complete.obs", method = method)
}

#' Heatmap of a correlation matrix
#'
#' @param cor_matrix A square correlation matrix.
#' @param titles The measure labels, one per row/column of `cor_matrix`.
#' @return A ggplot object with one tile per pair of measures, filled on a
#'   blue-white-red gradient centred on 0.
plot_cor_heatmap <- function(cor_matrix, titles) {
  # expand.grid() varies Measure1 fastest, which lines up with the
  # column-by-column order in which as.vector() unrolls the matrix.
  tile_data <- expand.grid(Measure1 = titles, Measure2 = titles)
  tile_data$Correlation <- as.vector(cor_matrix)

  tile_mapping <- ggplot2::aes(
    x = .data$Measure1,
    y = .data$Measure2,
    fill = .data$Correlation
  )

  heatmap <- ggplot2::ggplot(tile_data, tile_mapping)
  heatmap <- heatmap + ggplot2::geom_tile()
  heatmap <- heatmap + ggplot2::scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0
  )
  heatmap <- heatmap + ggplot2::labs(x = "", y = "", fill = "Correlation")
  heatmap <- heatmap + ggplot2::theme_minimal()
  heatmap + ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)
  )
}

#' Plot similarity over time for several dyads
#'
#' @param similarities A named list with one numeric vector of similarity
#'   scores per dyad; the position within each vector is used as the time.
#' @param title The plot title.
#' @param y_label The label of the y axis.
#' @return A ggplot object with one line (plus points) per dyad, coloured by
#'   dyad.
plot_sim_time <- function(similarities, title, y_label) {
  points_per_dyad <- lengths(similarities)

  # Long format: one row per (dyad, time step).
  timeline <- data.frame(
    dyad = rep(names(similarities), points_per_dyad),
    time = unlist(lapply(similarities, seq_along)),
    similarity = unlist(similarities)
  )

  line_mapping <- ggplot2::aes(
    x = .data$time,
    y = .data$similarity,
    color = .data$dyad
  )

  chart <- ggplot2::ggplot(timeline, line_mapping)
  chart <- chart + ggplot2::geom_line()
  chart <- chart + ggplot2::geom_point()
  chart <- chart + ggplot2::labs(title = title, x = "Time", y = y_label)
  chart + ggplot2::theme_minimal()
}

#' Summary statistics of similarity scores per dyad
#'
#' Stops when `similarities` is empty and warns when any vector contains
#' `NA`; `NA`s are ignored in the statistics themselves.
#'
#' @param similarities A list with one numeric vector of scores per dyad.
#' @return A matrix with one row per dyad and the columns `mean`, `sd`, `min`
#'   and `max`.
calc_sum_stats <- function(similarities) {
  if (length(similarities) == 0) {
    stop("No data to calculate summary statistics")
  }

  has_na <- vapply(
    similarities,
    function(values) any(is.na(values)),
    logical(1)
  )
  if (any(has_na)) {
    warning("NAs present in the data")
  }

  # The four statistics for a single dyad, as a named vector.
  summarise_one <- function(values) {
    c(
      mean = mean(values, na.rm = TRUE),
      sd = sd(values, na.rm = TRUE),
      min = min(values, na.rm = TRUE),
      max = max(values, na.rm = TRUE)
    )
  }

  per_dyad <- lapply(similarities, summarise_one)
  do.call(rbind, per_dyad)
}

#' Bar chart of per-dyad summary statistics
#'
#' @param summary_stats A matrix or data frame with one row per dyad (dyad ids
#'   as row names) and the columns `mean`, `sd`, `min` and `max`.
#' @param title The plot title.
#' @return A ggplot object with dodged bars, one group per dyad, filled by
#'   statistic.
plot_sum_stats <- function(summary_stats, title) {
  stat_names <- c("mean", "sd", "min", "max")

  stats_table <- as.data.frame(summary_stats)
  dyad_ids <- rownames(stats_table)

  # Transposing before flattening lists the four statistics of the first
  # dyad, then those of the second, and so on, matching the `dyad` and
  # `statistic` columns.
  stat_values <- c(t(as.matrix(stats_table[, stat_names])))

  long_stats <- data.frame(
    dyad = rep(dyad_ids, each = 4),
    statistic = rep(stat_names, nrow(stats_table)),
    value = stat_values
  )

  bar_mapping <- ggplot2::aes(
    x = .data$dyad,
    y = .data$value,
    fill = .data$statistic
  )

  chart <- ggplot2::ggplot(long_stats, bar_mapping)
  chart <- chart + ggplot2::geom_bar(stat = "identity", position = "dodge")
  chart <- chart + ggplot2::labs(title = title, x = "Dyad", y = "Value")
  chart <- chart + ggplot2::theme_minimal()
  chart + ggplot2::scale_fill_brewer(palette = "Set2")
}

#' Collect several similarity measures into one data frame
#'
#' @param similarity_list A list of measures, each a named list with one
#'   numeric vector of scores per dyad. The dyad column is built from the
#'   first measure.
#' @param measure_names The column names to use, one per measure.
#' @return A data frame with a `dyad` column followed by one column per
#'   measure.
compare_sim_meas <- function(similarity_list, measure_names) {
  n_measures <- length(similarity_list)
  if (n_measures != length(measure_names)) {
    stop("The number of similarity lists must match the number of measure names.")
  }

  # Repeat each dyad id once per score in the first measure.
  reference <- similarity_list[[1]]
  comparison <- data.frame(dyad = rep(names(reference), lengths(reference)))

  for (idx in seq_len(n_measures)) {
    comparison[[measure_names[idx]]] <- unlist(similarity_list[[idx]])
  }

  comparison
}

#' Box plots comparing several similarity measures per dyad
#'
#' @param comparison_df A data frame with a `dyad` column and one column per
#'   similarity measure.
#' @param title The plot title.
#' @return A ggplot object with one box per dyad and measure, filled by
#'   measure.
plot_sim_comp <- function(comparison_df, title) {
  # Every column except `dyad` is treated as a measure.
  measure_cols <- setdiff(names(comparison_df), "dyad")
  n_rows <- nrow(comparison_df)

  long_scores <- data.frame(
    dyad = rep(comparison_df$dyad, length(measure_cols)),
    measure = rep(measure_cols, each = n_rows),
    similarity = unlist(comparison_df[, measure_cols])
  )

  box_mapping <- ggplot2::aes(
    x = .data$dyad,
    y = .data$similarity,
    fill = .data$measure
  )

  chart <- ggplot2::ggplot(long_scores, box_mapping)
  chart <- chart + ggplot2::geom_boxplot()
  chart <- chart + ggplot2::labs(title = title, x = "Dyad", y = "Similarity")
  chart <- chart + ggplot2::theme_minimal()
  chart + ggplot2::scale_fill_brewer(palette = "Set3")
}

#' Correlation matrix of the similarity measures in a comparison table
#'
#' @param comparison_df A data frame whose first column identifies the dyad
#'   and whose remaining columns are numeric similarity measures.
#' @return The correlation matrix of the measure columns, computed on pairwise
#'   complete observations.
calc_sim_cor <- function(comparison_df) {
  # drop = FALSE keeps a data frame even when only one measure column is
  # left; without it the column collapses to a bare vector and cor() fails.
  measure_cols <- comparison_df[, -1, drop = FALSE]
  cor(measure_cols, use = "pairwise.complete.obs")
}

#' Heatmap of the correlations between similarity measures
#'
#' @param cor_matrix A correlation matrix with row and column names.
#' @param title The plot title.
#' @return A ggplot object with one tile per pair of measures, filled on a
#'   blue-white-red gradient centred on 0 and limited to the range -1 to 1.
plot_sim_cor_heatmap <- function(cor_matrix, title) {
  # as.table() + as.data.frame() gives one row per matrix cell.
  cor_df <- as.data.frame(as.table(cor_matrix))
  names(cor_df) <- c("Var1", "Var2", "Correlation")

  ggplot2::ggplot(cor_df, ggplot2::aes(x = .data$Var1, y = .data$Var2, fill = .data$Correlation)) +
    ggplot2::geom_tile() +
    # Spell out `limits` in full rather than relying on partial argument
    # matching of `limit`.
    ggplot2::scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                                  midpoint = 0, limits = c(-1, 1)) +
    ggplot2::theme_minimal() +
    ggplot2::labs(title = title) +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
}

## -----------------------------------------------------------------------------
# Combine four example scores without weights.
sims <- list(topic = 0.8, lexical = 0.6, semantic = 0.7, structural = 0.9)
combined_score <- combine_sims(sims)
print(combined_score)

# With custom weights, listed in the same order as the scores in `sims`
weighted_score <- combine_sims(sims, weights = list(topic = 2, lexical = 1, semantic = 1.5, structural = 1))
print(weighted_score)

## -----------------------------------------------------------------------------
# Plot the four example scores as a bar chart.
sims <- list(topic = 0.8, lexical = 0.6, semantic = 0.7, structural = 0.9)
plot_sims(sims)

## -----------------------------------------------------------------------------
# Simulate the result of the stylistic_similarity() function: one named list of
# feature values per text, plus an overall similarity score.
stylistic_result <- list(
  text1_features = list(sentence_length = 15, word_length = 5, unique_words = 100),
  text2_features = list(sentence_length = 12, word_length = 4, unique_words = 80),
  overall_similarity = 0.85
)

# Plot the two feature sets side by side.
compare_style(stylistic_result)

## -----------------------------------------------------------------------------
# Two example speeches of the kind gen_sim_report() takes as input. They are
# not used further in this chunk.
speech1 <- "This is the first speech. It talks about important topics."
speech2 <- "This is the second speech. It covers similar subjects."

# Note: gen_sim_report() is not called here because it depends on similarity
# functions that are not defined in this file. For demonstration purposes, a
# mock report with the same four elements is built by hand instead.

mock_report <- list(
  similarities = list(
    topic = 0.8,
    lexical = 0.6,
    semantic = 0.7,
    structural = 0.9,
    stylistic = 0.85,
    sentiment = 0.75
  ),
  combined_similarity = 0.75,
  # NOTE(review): this plot covers only four of the six measures listed in
  # `similarities` above.
  similarity_plot = plot_sims(list(topic = 0.8, lexical = 0.6, semantic = 0.7, structural = 0.9)),
  # Reuses the `stylistic_result` object created in an earlier chunk.
  stylistic_plot = compare_style(stylistic_result)
)

# Print the mock report
print_sim_report(mock_report)

## -----------------------------------------------------------------------------
# Combine similarity measures for a single dyad
sim1 <- list(sequence = c(0.8, 0.7, 0.9), average = 0.8)
sim2 <- list(sequence = c(0.6, 0.8, 0.7), average = 0.7)
combined <- combine_sim_seq(list(sim1, sim2))
print(combined)

# Normalize similarity scores
scores <- c(0.2, 0.5, 0.8, 1.0, 0.3)
normalized <- norm_sim(scores)
print(normalized)

# Aggregate similarity sequence
seq <- c(0.5, 0.6, 0.7, 0.6, 0.8, 0.7, 0.9, 0.8, 0.7, 0.8)
aggregated <- agg_seq(seq, 3)
print(aggregated)

# Calculate correlation between similarity measures
cor_matrix <- cor_sim_seq(list(sim1, sim2))
print(cor_matrix)

## -----------------------------------------------------------------------------
# Create mock data for multiple dyads
# Mock data: one vector of similarity scores per dyad, named by dyad id.
similarities <- list(
  "1" = c(0.5, 0.6, 0.7),
  "2" = c(0.4, 0.5, 0.6)
)

# Plot similarity over time for multiple dyads
plot_sim_time(similarities, "Topic Similarity", "Similarity Score")

# Calculate summary statistics (mean, sd, min, max) for each dyad
stats <- calc_sum_stats(similarities)
print(stats)

# Plot summary statistics
plot_sum_stats(stats, "Summary Statistics of Similarities")

# Compare multiple similarity measures. Both measures cover the same dyads
# with the same number of scores each.
topic_similarities <- list("1" = c(0.5, 0.6, 0.7), "2" = c(0.4, 0.5, 0.6))
lexical_similarities <- list("1" = c(0.6, 0.7, 0.8), "2" = c(0.5, 0.6, 0.7))
comparison_df <- compare_sim_meas(
  list(topic_similarities, lexical_similarities),
  c("Topic", "Lexical")
)
print(head(comparison_df))

# Plot comparison of multiple similarity measures
plot_sim_comp(comparison_df, "Comparison of Similarity Measures")