```r
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```
This vignette demonstrates how to use the functions provided in `conversation_multidyads.R` to analyze conversations across multiple dyads. These functions allow you to preprocess conversation data and calculate various similarity measures between conversation participants.
Load the required libraries:
```r
library(lme4)
library(ggplot2)
library(topicmodels)
library(tm)
library(slam)
library(conversim)
```
We'll use the provided dataset `dyad_example_data.Rdata`, located in the `inst/extdata` directory of the package:
```r
data_path <- system.file("extdata", "dyad_example_data.Rdata", package = "conversim")
load(data_path)

# Display the first few rows and the structure of the data
head(dyad_example_data)
str(dyad_example_data)
```
Before analyzing the conversations, we need to preprocess the text data:
```r
preprocess_dyads <- function(conversations) {
  conversations$processed_text <- sapply(conversations$text, function(text) {
    text <- tolower(text)
    text <- gsub("[[:punct:]]", "", text)
    text <- gsub("[[:digit:]]", "", text)
    text <- gsub("\\s+", " ", trimws(text))
    return(text)
  })

  # Remove rows whose processed text is empty
  conversations <- conversations[nchar(conversations$processed_text) > 0, ]

  return(conversations)
}
```
```r
processed_convs <- preprocess_dyads(dyad_example_data)
head(processed_convs)
```
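To make the expected input format concrete, here is a small invented data frame with the three columns these functions rely on (`dyad_id`, `speaker`, and `text`); the values are purely illustrative:

```r
# A tiny invented example; real data will have many more turns per dyad
toy_convs <- data.frame(
  dyad_id = c(1, 1, 1, 2, 2, 2),
  speaker = c("A", "B", "A", "C", "D", "C"),
  text = c(
    "Hello, how are you?", "I'm fine, thanks!", "Great to hear.",
    "Did you see the game?", "Yes, what a match!", "Incredible finish."
  ),
  stringsAsFactors = FALSE
)
preprocess_dyads(toy_convs)$processed_text
```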
Now, let's calculate various similarity measures for our preprocessed conversations.
```r
topic_sim_dyads <- function(conversations, method = "lda", num_topics = 2, window_size = 3) {
  if (!requireNamespace("lme4", quietly = TRUE)) {
    stop("Package 'lme4' is required for this function. Please install it.")
  }
  if (!requireNamespace("topicmodels", quietly = TRUE)) {
    stop("Package 'topicmodels' is required for this function. Please install it.")
  }
  if (!requireNamespace("tm", quietly = TRUE)) {
    stop("Package 'tm' is required for this function. Please install it.")
  }
  if (!requireNamespace("slam", quietly = TRUE)) {
    stop("Package 'slam' is required for this function. Please install it.")
  }

  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  half <- floor(window_size / 2)

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()

    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window <- dyad_conv$processed_text[i:(i + window_size - 1)]

      # Create a document-term matrix
      corpus <- tm::Corpus(tm::VectorSource(window))
      dtm <- tm::DocumentTermMatrix(corpus)

      # Skip windows whose DTM contains no terms at all
      if (sum(slam::col_sums(dtm) > 0) == 0) {
        similarities <- c(similarities, NA)
        next
      }

      # Remove empty documents using slam
      dtm <- dtm[slam::row_sums(dtm) > 0, ]

      if (method == "lda") {
        # Assign the tryCatch result so a failed LDA fit yields NA
        sim <- tryCatch({
          lda_model <- topicmodels::LDA(dtm, k = num_topics, control = list(seed = 1234))
          topics <- topicmodels::topics(lda_model)
          # Compare the topic assignments of the first and last `half` turns
          sum(topics[1:half] == topics[(window_size - half + 1):window_size]) / half
        }, error = function(e) NA)
      } else {
        stop("Unsupported method. Only 'lda' is currently implemented.")
      }

      similarities <- c(similarities, sim)
    }

    all_similarities[[as.character(dyad)]] <- similarities
  }

  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )

  # Remove NA values
  model_data <- model_data[!is.na(model_data$similarity), ]

  # Fit a multilevel model with a random intercept per dyad
  model <- lme4::lmer(similarity ~ 1 + (1 | dyad_id), data = model_data)

  # The fixed intercept is the overall average similarity,
  # accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```
```r
topic_sim <- topic_sim_dyads(processed_convs, method = "lda", num_topics = 5, window_size = 3)
```
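The returned list has two components: `similarities_by_dyad`, a list of per-window similarity scores keyed by dyad ID, and `overall_average`, the fixed intercept of the multilevel model. A quick way to inspect them (the `"1"` used here is an illustrative dyad ID; check the actual names first):

```r
# Overall average similarity across all dyads (fixed intercept of the model)
topic_sim$overall_average

# Per-window similarity scores for one dyad; "1" is illustrative
names(topic_sim$similarities_by_dyad)
topic_sim$similarities_by_dyad[["1"]]
```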
```r
lexical_sim_dyads <- function(conversations, window_size = 3) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  half <- floor(window_size / 2)

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()

    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      # Split each window into a first and second part and compare them
      window1 <- paste(dyad_conv$processed_text[i:(i + half - 1)], collapse = " ")
      window2 <- paste(dyad_conv$processed_text[(i + half):(i + window_size - 1)], collapse = " ")
      sim <- conversim::lexical_similarity(window1, window2)
      similarities <- c(similarities, sim)
    }

    all_similarities[[as.character(dyad)]] <- similarities
  }

  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )

  # Fit a multilevel model with a random intercept per dyad
  model <- lme4::lmer(similarity ~ 1 + (1 | dyad_id), data = model_data)

  # Extract the overall average similarity, accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```
```r
lexical_sim <- lexical_sim_dyads(processed_convs, window_size = 3)
```
```r
semantic_similarity <- function(conversation1, conversation2, method = "tfidf",
                                model_path = NULL, dim = 100, window = 5, iter = 5) {
  # Internal function to calculate cosine similarity, rescaled to [0, 1]
  cosine_similarity <- function(a, b) {
    if (length(a) == 0 || length(b) == 0) return(0)
    sim <- sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
    return((sim + 1) / 2)
  }

  # Internal function to load pre-trained GloVe embeddings
  load_glove <- function(file_path) {
    tryCatch({
      conn <- file(file_path, "r")
      lines <- readLines(conn)
      close(conn)
      split_lines <- strsplit(lines, " ")
      words <- sapply(split_lines, `[`, 1)
      vectors <- t(sapply(split_lines, function(x) as.numeric(x[-1])))
      rownames(vectors) <- words
      return(vectors)
    }, error = function(e) {
      stop(paste("Error loading GloVe file:", e$message))
    })
  }

  # Internal function to calculate a sentence embedding as the mean word vector
  sentence_embedding <- function(sentence, word_vectors) {
    tokens <- unlist(strsplit(sentence, "\\s+"))
    valid_tokens <- tokens[tokens %in% rownames(word_vectors)]
    if (length(valid_tokens) == 0) {
      return(rep(0, ncol(word_vectors)))
    }
    embeddings <- word_vectors[valid_tokens, , drop = FALSE]
    if (nrow(embeddings) == 0) return(rep(0, ncol(word_vectors)))
    return(colMeans(embeddings))
  }

  if (method == "tfidf") {
    # TF-IDF approach
    corpus <- c(conversation1, conversation2)
    dtm <- tm::DocumentTermMatrix(tm::Corpus(tm::VectorSource(corpus)))
    tfidf <- tm::weightTfIdf(dtm)
    m <- as.matrix(tfidf)

    # Warn for short conversations or little vocabulary overlap
    if (nchar(conversation1) < 50 || nchar(conversation2) < 50 || ncol(m) < 5) {
      warning("The 'tfidf' method may not provide highly meaningful results for short conversations or those with little vocabulary overlap. Consider using 'word2vec' or 'glove' methods for more robust results.")
    }

    # Identical conversations are maximally similar
    if (identical(conversation1, conversation2)) {
      return(1)
    }

    # Ensure we have at least one term in common
    if (ncol(m) == 0) {
      return(0)
    }

    # Calculate cosine similarity between the two TF-IDF vectors
    similarity <- cosine_similarity(m[1, ], m[2, ])
  } else if (method == "word2vec" || method == "glove") {
    if (method == "word2vec") {
      if (!requireNamespace("word2vec", quietly = TRUE)) {
        stop("Package 'word2vec' is required for method = 'word2vec'. Please install it.")
      }
      # Train a Word2Vec model on the two conversations
      all_text <- c(conversation1, conversation2)
      model <- word2vec::word2vec(x = all_text, dim = dim, iter = iter,
                                  window = window, min_count = 1)
      word_vectors <- as.matrix(model)
    } else { # method == "glove"
      if (is.null(model_path)) {
        stop("Please provide a path to the pre-trained GloVe file.")
      }
      # Load pre-trained GloVe vectors
      word_vectors <- load_glove(model_path)
    }

    # Calculate an embedding for each conversation and compare them
    embedding1 <- sentence_embedding(conversation1, word_vectors)
    embedding2 <- sentence_embedding(conversation2, word_vectors)
    similarity <- cosine_similarity(embedding1, embedding2)
  } else {
    stop("Invalid method. Choose 'tfidf', 'word2vec', or 'glove'.")
  }

  return(similarity)
}

semantic_sim_dyads <- function(conversations, method = "tfidf", window_size = 3, ...) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  half <- floor(window_size / 2)

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()

    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window1 <- paste(dyad_conv$processed_text[i:(i + half - 1)], collapse = " ")
      window2 <- paste(dyad_conv$processed_text[(i + half):(i + window_size - 1)], collapse = " ")
      sim <- semantic_similarity(window1, window2, method, ...)
      similarities <- c(similarities, sim)
    }

    all_similarities[[as.character(dyad)]] <- similarities
  }

  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )

  # Fit a multilevel model with a random intercept per dyad
  model <- lme4::lmer(similarity ~ 1 + (1 | dyad_id), data = model_data)

  # Extract the overall average similarity, accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```
```r
semantic_sim <- semantic_sim_dyads(processed_convs, method = "tfidf", window_size = 3)
```
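The same wrapper can be called with the embedding-based methods. This sketch assumes the `word2vec` package is installed and, for GloVe, that you have a pre-trained vector file on disk (the path shown is only a placeholder):

```r
# Word2Vec: trains a small model on the two windows being compared
# (requires the 'word2vec' package)
semantic_sim_w2v <- semantic_sim_dyads(processed_convs, method = "word2vec",
                                       window_size = 3, dim = 50, iter = 5)

# GloVe: uses pre-trained vectors; replace the placeholder path with your own file
# semantic_sim_glove <- semantic_sim_dyads(processed_convs, method = "glove",
#                                          window_size = 3,
#                                          model_path = "path/to/glove.6B.100d.txt")
```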
```r
structural_sim_dyads <- function(conversations) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]

    length_sim <- 1

    # Consistency of turn lengths (in characters) within the dyad
    turn_lengths <- nchar(dyad_conv$processed_text)
    turn_length_sim <- 1 - sd(turn_lengths) / mean(turn_lengths)

    # How close the number of speaker changes is to perfect alternation
    speaker_changes <- sum(dyad_conv$speaker[-1] != dyad_conv$speaker[-nrow(dyad_conv)])
    speaker_change_sim <- 1 - abs(speaker_changes - (nrow(dyad_conv) / 2)) / (nrow(dyad_conv) / 2)

    similarity <- mean(c(length_sim, turn_length_sim, speaker_change_sim))
    all_similarities[[as.character(dyad)]] <- similarity
  }

  # With one observation per dyad, a simple mean replaces multilevel modeling
  overall_average <- mean(unlist(all_similarities))
  warning("Only one observation per dyad. Using simple mean for overall average instead of multilevel modeling.")

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```
```r
structural_sim <- structural_sim_dyads(processed_convs)
```
```r
stylistic_similarity <- function(text1, text2) {
  # Helper function to calculate stylistic features for a single text
  calculate_features <- function(text) {
    words <- strsplit(text, " ")[[1]]
    sentences <- strsplit(text, "\\. ")[[1]]

    # Type-token ratio: vocabulary diversity
    ttr <- length(unique(words)) / length(words)
    avg_sentence_length <- mean(sapply(sentences, function(s) length(strsplit(s, " ")[[1]])))

    # Rough syllable count: number of vowels, with at least one per word
    syllables <- sum(sapply(words, function(w) max(1, nchar(gsub("[^aeiouAEIOU]", "", w)))))

    # Flesch-Kincaid grade level
    fk_grade <- 0.39 * (length(words) / length(sentences)) +
      11.8 * (syllables / length(words)) - 15.59

    c(ttr = ttr, avg_sentence_length = avg_sentence_length, fk_grade = fk_grade)
  }

  features1 <- calculate_features(text1)
  features2 <- calculate_features(text2)

  feature_diff <- abs(features1 - features2)
  overall_similarity <- 1 - mean(feature_diff / pmax(features1, features2))

  # Cosine similarity of the standardized feature vectors
  normalized1 <- (features1 - mean(features1)) / sd(features1)
  normalized2 <- (features2 - mean(features2)) / sd(features2)
  cosine_similarity <- sum(normalized1 * normalized2) /
    (sqrt(sum(normalized1^2)) * sqrt(sum(normalized2^2)))

  list(
    text1_features = features1,
    text2_features = features2,
    feature_differences = feature_diff,
    overall_similarity = overall_similarity,
    cosine_similarity = cosine_similarity
  )
}

stylistic_sim_dyads <- function(conversations, window_size = 3) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  half <- floor(window_size / 2)

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()

    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window1 <- paste(dyad_conv$processed_text[i:(i + half - 1)], collapse = " ")
      window2 <- paste(dyad_conv$processed_text[(i + half):(i + window_size - 1)], collapse = " ")
      sim <- stylistic_similarity(window1, window2)$overall_similarity
      similarities <- c(similarities, sim)
    }

    all_similarities[[as.character(dyad)]] <- similarities
  }

  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )

  # Fit a multilevel model with a random intercept per dyad
  model <- lme4::lmer(similarity ~ 1 + (1 | dyad_id), data = model_data)

  # Extract the overall average similarity, accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```
```r
stylistic_sim <- stylistic_sim_dyads(processed_convs, window_size = 3)
```
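`stylistic_similarity()` also returns the underlying feature values, which can be informative on their own. A quick illustrative call on two invented sentences:

```r
# Invented sentences, purely for illustration
sty <- stylistic_similarity("the cat sat on the mat. it purred.",
                            "a dog ran in the park. it barked loudly.")
sty$text1_features      # ttr, average sentence length, Flesch-Kincaid grade
sty$overall_similarity  # combined similarity across the three features
```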
```r
sentiment_sim_dyads <- function(conversations, window_size = 3) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  half <- floor(window_size / 2)

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()

    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window1 <- paste(dyad_conv$processed_text[i:(i + half - 1)], collapse = " ")
      window2 <- paste(dyad_conv$processed_text[(i + half):(i + window_size - 1)], collapse = " ")
      sim <- conversim::sentiment_similarity(window1, window2)
      similarities <- c(similarities, sim)
    }

    all_similarities[[as.character(dyad)]] <- similarities
  }

  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )

  # Fit a multilevel model with a random intercept per dyad
  model <- lme4::lmer(similarity ~ 1 + (1 | dyad_id), data = model_data)

  # Extract the overall average similarity, accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```
```r
sentiment_sim <- sentiment_sim_dyads(processed_convs, window_size = 3)
```
```r
participant_sim_dyads <- function(conversations) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    speakers <- table(dyad_conv$speaker) / nrow(dyad_conv)

    # Entropy of the speaking-turn distribution measures speaker balance
    entropy <- -sum(speakers * log(speakers))
    max_entropy <- -log(1 / length(speakers))

    # Normalize entropy to the [0, 1] range
    similarity <- entropy / max_entropy
    all_similarities[[as.character(dyad)]] <- similarity
  }

  # With one observation per dyad, a simple mean replaces multilevel modeling
  overall_average <- mean(unlist(all_similarities))
  warning("Only one observation per dyad. Using simple mean for overall average instead of multilevel modeling.")

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```
```r
participant_sim <- participant_sim_dyads(processed_convs)
```
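To see why the normalized entropy equals 1 for a perfectly balanced dyad, consider two speakers who each take half the turns: the entropy is -2 * 0.5 * log(0.5) = log(2), which is exactly the maximum entropy -log(1/2). A quick check:

```r
# Perfectly balanced two-speaker dyad: proportions 0.5 and 0.5
speakers <- c(A = 0.5, B = 0.5)
entropy <- -sum(speakers * log(speakers))       # log(2), about 0.693
entropy / (-log(1 / length(speakers)))          # 1: maximal balance

# A lopsided dyad scores lower
speakers <- c(A = 0.9, B = 0.1)
-sum(speakers * log(speakers)) / (-log(1 / length(speakers)))  # about 0.47
```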
```r
timing_sim_dyads <- function(conversations) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()

  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]

    # Consistency of turn lengths (in characters)
    turn_lengths <- nchar(dyad_conv$processed_text)
    length_sim <- 1 - stats::sd(turn_lengths) / mean(turn_lengths)

    # Rhythm similarity based on consecutive turn-length differences
    rhythm_diffs <- diff(turn_lengths)
    rhythm_sim <- 1 - stats::sd(rhythm_diffs) / mean(abs(rhythm_diffs))

    similarity <- mean(c(length_sim, rhythm_sim))
    all_similarities[[as.character(dyad)]] <- similarity
  }

  # With one observation per dyad, a simple mean replaces multilevel modeling
  overall_average <- mean(unlist(all_similarities))
  warning("Only one observation per dyad. Using simple mean for overall average instead of multilevel modeling.")

  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
```
```r
timing_sim <- timing_sim_dyads(processed_convs)
```
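With all eight measures computed, their overall averages can be collected into a single named vector for a quick side-by-side comparison (this snippet simply reuses the result objects created above):

```r
# Gather the overall average from each result object
overall_averages <- c(
  topic = topic_sim$overall_average,
  lexical = lexical_sim$overall_average,
  semantic = semantic_sim$overall_average,
  structural = structural_sim$overall_average,
  stylistic = stylistic_sim$overall_average,
  sentiment = sentiment_sim$overall_average,
  participant = participant_sim$overall_average,
  timing = timing_sim$overall_average
)
round(overall_averages, 3)
```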
Let's visualize the results of our similarity analyses using ggplot2. Here's an example of how to plot the topic similarity for each dyad:
```r
topic_sim_df <- data.frame(
  dyad = rep(names(topic_sim$similarities_by_dyad),
             sapply(topic_sim$similarities_by_dyad, length)),
  similarity = unlist(topic_sim$similarities_by_dyad),
  index = unlist(lapply(topic_sim$similarities_by_dyad, seq_along))
)

ggplot(topic_sim_df, aes(x = index, y = similarity, color = dyad)) +
  geom_line() +
  geom_point() +
  facet_wrap(~ dyad, ncol = 2) +
  labs(title = "Topic Similarity Across Dyads",
       x = "Conversation Sequence",
       y = "Similarity Score") +
  theme_minimal() +
  theme(legend.position = "none")
```
```r
knitr::include_graphics("../man/figures/dyadconv_plot.jpeg")
```
We can also compare different similarity measures across dyads:
```r
comparison_df <- data.frame(
  dyad = names(topic_sim$similarities_by_dyad),
  topic = sapply(topic_sim$similarities_by_dyad, mean),
  lexical = sapply(lexical_sim$similarities_by_dyad, mean),
  semantic = sapply(semantic_sim$similarities_by_dyad, mean),
  structural = unlist(structural_sim$similarities_by_dyad),
  stylistic = sapply(stylistic_sim$similarities_by_dyad, mean),
  sentiment = sapply(sentiment_sim$similarities_by_dyad, mean),
  participant = unlist(participant_sim$similarities_by_dyad),
  timing = unlist(timing_sim$similarities_by_dyad)
)

# Reshape to long format: one row per dyad-measure combination
measure_cols <- names(comparison_df)[names(comparison_df) != "dyad"]
comparison_long <- reshape(comparison_df,
                           varying = list(measure_cols),
                           v.names = "similarity",
                           timevar = "measure",
                           times = measure_cols,
                           new.row.names = 1:(nrow(comparison_df) * length(measure_cols)),
                           direction = "long")

ggplot(comparison_long, aes(x = measure, y = similarity, fill = measure)) +
  geom_boxplot() +
  labs(title = "Comparison of Similarity Measures Across Dyads",
       x = "Similarity Measure",
       y = "Similarity Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
This vignette demonstrates how to use the functions in `conversation_multidyads.R` to analyze conversations across multiple dyads using real-world data. These tools allow researchers to examine various aspects of conversation dynamics, including topic coherence, lexical alignment, semantic similarity, and more.
The visualizations provide insights into how different similarity measures vary across dyads and how they compare to each other. This can help in identifying patterns or trends in conversational dynamics.
Remember that the effectiveness of these analyses may depend on the size and nature of your dataset. Always consider the context of your conversations and the limitations of each similarity measure when interpreting the results.
For further analysis, you might consider relating these similarity measures to dyad-level outcomes, extending the multilevel models with additional predictors, or experimenting with different window sizes and methods.