## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----echo=FALSE, message=FALSE------------------------------------------------
library(lme4)
library(ggplot2)
library(topicmodels)
library(tm)
library(slam)
## ----message=FALSE------------------------------------------------------------
library(conversim)
## ----load_data----------------------------------------------------------------
data_path <- system.file("extdata", "dyad_example_data.Rdata", package = "conversim")
load(data_path)
# Display the first few rows and structure of the data
head(dyad_example_data)
str(dyad_example_data)
## ----echo=FALSE---------------------------------------------------------------
preprocess_dyads <- function(conversations) {
  conversations$processed_text <- sapply(conversations$text, function(text) {
    text <- tolower(text)
    text <- gsub("[[:punct:]]", "", text)
    text <- gsub("[[:digit:]]", "", text)
    text <- gsub("\\s+", " ", trimws(text))
    return(text)
  })
  # Remove empty processed texts
  conversations <- conversations[nchar(conversations$processed_text) > 0, ]
  return(conversations)
}
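## ----preprocess_demo-----------------------------------------------------------
# A minimal sketch of what preprocess_dyads() does to a single turn
# (lowercasing, stripping punctuation and digits, collapsing whitespace);
# the one-row data frame below is hypothetical, not vignette data.
demo_turn <- data.frame(dyad_id = 1, speaker = "A",
                        text = "Well, THAT went badly... all 3 times!",
                        stringsAsFactors = FALSE)
unname(preprocess_dyads(demo_turn)$processed_text)
#> [1] "well that went badly all times"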
## ----preprocess---------------------------------------------------------------
processed_convs <- preprocess_dyads(dyad_example_data)
head(processed_convs)
## ----echo=FALSE---------------------------------------------------------------
topic_sim_dyads <- function(conversations, method = "lda", num_topics = 2, window_size = 3) {
  if (!requireNamespace("lme4", quietly = TRUE)) {
    stop("Package 'lme4' is required for this function. Please install it.")
  }
  if (!requireNamespace("topicmodels", quietly = TRUE)) {
    stop("Package 'topicmodels' is required for this function. Please install it.")
  }
  if (!requireNamespace("tm", quietly = TRUE)) {
    stop("Package 'tm' is required for this function. Please install it.")
  }
  if (!requireNamespace("slam", quietly = TRUE)) {
    stop("Package 'slam' is required for this function. Please install it.")
  }
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  # Integer split of the window into two halves (fractional indices would
  # silently truncate); with an odd window size the final turn is unused
  half <- floor(window_size / 2)
  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()
    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window <- dyad_conv$processed_text[i:(i + window_size - 1)]
      # Create a document-term matrix
      corpus <- tm::Corpus(tm::VectorSource(window))
      dtm <- tm::DocumentTermMatrix(corpus)
      # Skip windows whose document-term matrix is entirely empty
      if (sum(slam::col_sums(dtm) > 0) == 0) {
        similarities <- c(similarities, NA)
        next
      }
      # Remove empty documents using slam
      dtm <- dtm[slam::row_sums(dtm) > 0, ]
      if (method == "lda") {
        # tryCatch must *return* the similarity; assigning inside the error
        # handler would not propagate to this scope
        sim <- tryCatch({
          lda_model <- topicmodels::LDA(dtm, k = num_topics, control = list(seed = 1234))
          topics <- topicmodels::topics(lda_model)
          # Agreement between topic assignments in the two halves of the window
          sum(topics[1:half] == topics[(half + 1):(2 * half)]) / half
        }, error = function(e) NA)
      } else {
        stop("Unsupported method. Only 'lda' is currently implemented.")
      }
      similarities <- c(similarities, sim)
    }
    all_similarities[[as.character(dyad)]] <- similarities
  }
  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )
  # Remove NA values
  model_data <- model_data[!is.na(model_data$similarity), ]
  # Fit multilevel model
  model <- lme4::lmer(similarity ~ 1 + (1|dyad_id), data = model_data)
  # Extract overall average similarity accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]
  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
## ----topic_similarity, message=FALSE------------------------------------------
topic_sim <- topic_sim_dyads(processed_convs, method = "lda", num_topics = 5, window_size = 3)
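## ----topic_similarity_results--------------------------------------------------
# The list returned above pairs a per-dyad vector of windowed similarities
# with a pooled average (the multilevel model's fixed-effect intercept);
# a quick look at both components:
str(head(topic_sim$similarities_by_dyad, 2))
topic_sim$overall_average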
## ----echo=FALSE---------------------------------------------------------------
lexical_sim_dyads <- function(conversations, window_size = 3) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  # Split each window into two halves; with an odd window size the second
  # half gets the extra turn (fractional indices would silently truncate)
  half <- floor(window_size / 2)
  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()
    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window1 <- paste(dyad_conv$processed_text[i:(i + half - 1)], collapse = " ")
      window2 <- paste(dyad_conv$processed_text[(i + half):(i + window_size - 1)], collapse = " ")
      sim <- conversim::lexical_similarity(window1, window2)
      similarities <- c(similarities, sim)
    }
    all_similarities[[as.character(dyad)]] <- similarities
  }
  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )
  # Fit multilevel model
  model <- lme4::lmer(similarity ~ 1 + (1|dyad_id), data = model_data)
  # Extract overall average similarity accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]
  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
## ----lexical_similarity, message=FALSE----------------------------------------
lexical_sim <- lexical_sim_dyads(processed_convs, window_size = 3)
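## ----lexical_similarity_results-------------------------------------------------
# As with the topic measure, the pooled average is the intercept of the
# multilevel model, i.e. the mean similarity after accounting for
# dyad-level variation:
round(lexical_sim$overall_average, 3)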
## ----echo=FALSE---------------------------------------------------------------
semantic_similarity <- function(conversation1, conversation2, method = "tfidf", model_path = NULL, dim = 100, window = 5, iter = 5) {
  # Internal function to calculate cosine similarity
  cosine_similarity <- function(a, b) {
    if (length(a) == 0 || length(b) == 0) return(0)
    sim <- sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
    # Rescale from [-1, 1] to [0, 1]
    return((sim + 1) / 2)
  }
  # Internal function to load pre-trained GloVe embeddings
  load_glove <- function(file_path) {
    tryCatch({
      conn <- file(file_path, "r")
      lines <- readLines(conn)
      close(conn)
      split_lines <- strsplit(lines, " ")
      words <- sapply(split_lines, `[`, 1)
      vectors <- t(sapply(split_lines, function(x) as.numeric(x[-1])))
      rownames(vectors) <- words
      return(vectors)
    }, error = function(e) {
      stop(paste("Error loading GloVe file:", e$message))
    })
  }
  # Internal function to calculate a sentence embedding as the mean of its
  # word vectors
  sentence_embedding <- function(sentence, word_vectors) {
    tokens <- unlist(strsplit(sentence, "\\s+"))
    valid_tokens <- tokens[tokens %in% rownames(word_vectors)]
    if (length(valid_tokens) == 0) {
      return(rep(0, ncol(word_vectors)))
    }
    embeddings <- word_vectors[valid_tokens, , drop = FALSE]
    if (nrow(embeddings) == 0) return(rep(0, ncol(word_vectors)))
    return(colMeans(embeddings))
  }
  if (method == "tfidf") {
    # TF-IDF approach
    corpus <- c(conversation1, conversation2)
    dtm <- tm::DocumentTermMatrix(tm::Corpus(tm::VectorSource(corpus)))
    tfidf <- tm::weightTfIdf(dtm)
    m <- as.matrix(tfidf)
    # Issue a warning for short conversations or little vocabulary overlap
    if (nchar(conversation1) < 50 || nchar(conversation2) < 50 || ncol(m) < 5) {
      warning("The 'tfidf' method may not provide highly meaningful results for short conversations or those with little vocabulary overlap. Consider using 'word2vec' or 'glove' methods for more robust results.")
    }
    # If the conversations are identical, return 1
    if (identical(conversation1, conversation2)) {
      return(1)
    }
    # If there are no terms at all, return 0
    if (ncol(m) == 0) {
      return(0)
    }
    # Calculate cosine similarity
    similarity <- cosine_similarity(m[1, ], m[2, ])
  } else if (method == "word2vec" || method == "glove") {
    # Word2Vec or GloVe approach
    if (method == "word2vec") {
      if (!requireNamespace("word2vec", quietly = TRUE)) {
        stop("Package 'word2vec' is required for method = 'word2vec'. Please install it.")
      }
      # Train a Word2Vec model on the two conversations
      all_text <- c(conversation1, conversation2)
      model <- word2vec::word2vec(x = all_text, dim = dim, iter = iter, window = window, min_count = 1)
      word_vectors <- as.matrix(model)
    } else { # method == "glove"
      if (is.null(model_path)) {
        stop("Please provide a path to the pre-trained GloVe file.")
      }
      # Load pre-trained GloVe vectors
      word_vectors <- load_glove(model_path)
    }
    # Calculate embeddings for each conversation
    embedding1 <- sentence_embedding(conversation1, word_vectors)
    embedding2 <- sentence_embedding(conversation2, word_vectors)
    # Calculate cosine similarity
    similarity <- cosine_similarity(embedding1, embedding2)
  } else {
    stop("Invalid method. Choose 'tfidf', 'word2vec', or 'glove'.")
  }
  return(similarity)
}
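## ----semantic_similarity_demo, warning=FALSE------------------------------------
# A small illustration of semantic_similarity() with the "tfidf" method (the
# only method exercised in this vignette); the two snippets are made up and
# deliberately short, which is why the short-text warning is suppressed here.
semantic_similarity("the weather is nice today and the sun is out",
                    "the weather looks nice and the sun feels warm",
                    method = "tfidf")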
semantic_sim_dyads <- function(conversations, method = "tfidf", window_size = 3, ...) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  # Integer split of the window into two halves
  half <- floor(window_size / 2)
  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()
    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window1 <- paste(dyad_conv$processed_text[i:(i + half - 1)], collapse = " ")
      window2 <- paste(dyad_conv$processed_text[(i + half):(i + window_size - 1)], collapse = " ")
      sim <- semantic_similarity(window1, window2, method, ...)
      similarities <- c(similarities, sim)
    }
    all_similarities[[as.character(dyad)]] <- similarities
  }
  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )
  # Fit multilevel model
  model <- lme4::lmer(similarity ~ 1 + (1|dyad_id), data = model_data)
  # Extract overall average similarity accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]
  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
## ----semantic_similarity, message=FALSE, warning=FALSE, results='hide'---------
semantic_sim <- semantic_sim_dyads(processed_convs, method = "tfidf", window_size = 3)
## ----echo=FALSE---------------------------------------------------------------
structural_sim_dyads <- function(conversations) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    # Only one conversation per dyad, so length similarity is trivially 1
    length_sim <- 1
    # Consistency of turn lengths (1 = perfectly uniform)
    turn_lengths <- nchar(dyad_conv$processed_text)
    turn_length_sim <- 1 - stats::sd(turn_lengths) / mean(turn_lengths)
    # Deviation of the number of speaker changes from half the number of turns
    speaker_changes <- sum(dyad_conv$speaker[-1] != dyad_conv$speaker[-nrow(dyad_conv)])
    speaker_change_sim <- 1 - abs(speaker_changes - (nrow(dyad_conv) / 2)) / (nrow(dyad_conv) / 2)
    similarity <- mean(c(length_sim, turn_length_sim, speaker_change_sim))
    all_similarities[[as.character(dyad)]] <- similarity
  }
  # Calculate overall average using simple mean
  overall_average <- mean(unlist(all_similarities))
  # Warn that multilevel modeling is not applicable here
  warning("Only one observation per dyad. Using simple mean for overall average instead of multilevel modeling.")
  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
## ----structural_similarity, message=FALSE, warning=FALSE----------------------
structural_sim <- structural_sim_dyads(processed_convs)
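## ----structural_similarity_results-----------------------------------------------
# One structural score per dyad, so there is nothing to model hierarchically;
# the per-dyad values and their simple mean can be inspected directly:
round(unlist(structural_sim$similarities_by_dyad), 3)
structural_sim$overall_average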
## ----echo=FALSE---------------------------------------------------------------
stylistic_similarity <- function(text1, text2) {
  # Helper function to calculate features for a single text
  calculate_features <- function(text) {
    words <- strsplit(text, " ")[[1]]
    sentences <- strsplit(text, "\\. ")[[1]]
    # Type-token ratio
    ttr <- length(unique(words)) / length(words)
    avg_sentence_length <- mean(sapply(sentences, function(s) length(strsplit(s, " ")[[1]])))
    # Crude syllable estimate: vowel count, with at least one per word
    syllables <- sum(sapply(words, function(w) max(1, nchar(gsub("[^aeiouAEIOU]", "", w)))))
    # Flesch-Kincaid grade level
    fk_grade <- 0.39 * (length(words) / length(sentences)) + 11.8 * (syllables / length(words)) - 15.59
    c(ttr = ttr, avg_sentence_length = avg_sentence_length, fk_grade = fk_grade)
  }
  features1 <- calculate_features(text1)
  features2 <- calculate_features(text2)
  feature_diff <- abs(features1 - features2)
  overall_similarity <- 1 - mean(feature_diff / pmax(features1, features2))
  # Cosine similarity of the standardized feature vectors
  normalized1 <- (features1 - mean(features1)) / sd(features1)
  normalized2 <- (features2 - mean(features2)) / sd(features2)
  cosine_similarity <- sum(normalized1 * normalized2) /
    (sqrt(sum(normalized1^2)) * sqrt(sum(normalized2^2)))
  list(
    text1_features = features1,
    text2_features = features2,
    feature_differences = feature_diff,
    overall_similarity = overall_similarity,
    cosine_similarity = cosine_similarity
  )
}
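## ----stylistic_similarity_demo---------------------------------------------------
# A toy call on two hypothetical sentences (not vignette data) showing the
# full output: per-text features (type-token ratio, average sentence length,
# Flesch-Kincaid grade) alongside the two summary scores.
stylistic_similarity("The cat sat on the mat. It purred softly.",
                     "A dog ran in the park. It barked loudly.")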
stylistic_sim_dyads <- function(conversations, window_size = 3) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  # Integer split of the window into two halves
  half <- floor(window_size / 2)
  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()
    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window1 <- paste(dyad_conv$processed_text[i:(i + half - 1)], collapse = " ")
      window2 <- paste(dyad_conv$processed_text[(i + half):(i + window_size - 1)], collapse = " ")
      sim <- stylistic_similarity(window1, window2)$overall_similarity
      similarities <- c(similarities, sim)
    }
    all_similarities[[as.character(dyad)]] <- similarities
  }
  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )
  # Fit multilevel model
  model <- lme4::lmer(similarity ~ 1 + (1|dyad_id), data = model_data)
  # Extract overall average similarity accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]
  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
## ----stylistic_similarity, message=FALSE--------------------------------------
stylistic_sim <- stylistic_sim_dyads(processed_convs, window_size = 3)
## ----echo=FALSE---------------------------------------------------------------
sentiment_sim_dyads <- function(conversations, window_size = 3) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  # Integer split of the window into two halves
  half <- floor(window_size / 2)
  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    similarities <- c()
    for (i in 1:(nrow(dyad_conv) - window_size + 1)) {
      window1 <- paste(dyad_conv$processed_text[i:(i + half - 1)], collapse = " ")
      window2 <- paste(dyad_conv$processed_text[(i + half):(i + window_size - 1)], collapse = " ")
      sim <- conversim::sentiment_similarity(window1, window2)
      similarities <- c(similarities, sim)
    }
    all_similarities[[as.character(dyad)]] <- similarities
  }
  # Prepare data for multilevel modeling
  model_data <- data.frame(
    dyad_id = rep(dyads, sapply(all_similarities, length)),
    similarity = unlist(all_similarities)
  )
  # Fit multilevel model
  model <- lme4::lmer(similarity ~ 1 + (1|dyad_id), data = model_data)
  # Extract overall average similarity accounting for dyad-level variation
  overall_average <- lme4::fixef(model)[1]
  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
## ----sentiment_similarity, message=FALSE--------------------------------------
sentiment_sim <- sentiment_sim_dyads(processed_convs, window_size = 3)
## ----echo=FALSE---------------------------------------------------------------
participant_sim_dyads <- function(conversations) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    speakers <- table(dyad_conv$speaker) / nrow(dyad_conv)
    # Calculate entropy as a measure of speaker balance
    entropy <- -sum(speakers * log(speakers))
    max_entropy <- -log(1 / length(speakers))
    # Normalize entropy to [0, 1] range
    similarity <- entropy / max_entropy
    all_similarities[[as.character(dyad)]] <- similarity
  }
  # Calculate overall average using simple mean
  overall_average <- mean(unlist(all_similarities))
  # Warn that multilevel modeling is not applicable here
  warning("Only one observation per dyad. Using simple mean for overall average instead of multilevel modeling.")
  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
## ----participant_similarity, message=FALSE, warning=FALSE---------------------
participant_sim <- participant_sim_dyads(processed_convs)
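## ----participant_similarity_check------------------------------------------------
# Sanity check of the entropy normalization used above: a perfectly balanced
# two-speaker dyad (proportions 0.5 and 0.5) should score exactly 1.
p <- c(0.5, 0.5)
(-sum(p * log(p))) / (-log(1 / length(p)))
#> [1] 1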
## ----echo=FALSE---------------------------------------------------------------
timing_sim_dyads <- function(conversations) {
  dyads <- unique(conversations$dyad_id)
  all_similarities <- list()
  for (dyad in dyads) {
    dyad_conv <- conversations[conversations$dyad_id == dyad, ]
    # Consistency of turn lengths (1 = perfectly uniform)
    turn_lengths <- nchar(dyad_conv$processed_text)
    length_sim <- 1 - stats::sd(turn_lengths) / mean(turn_lengths)
    # Calculate rhythm similarity based on turn-to-turn length differences
    rhythm_diffs <- diff(turn_lengths)
    rhythm_sim <- 1 - stats::sd(rhythm_diffs) / mean(abs(rhythm_diffs))
    similarity <- mean(c(length_sim, rhythm_sim))
    all_similarities[[as.character(dyad)]] <- similarity
  }
  # Calculate overall average using simple mean
  overall_average <- mean(unlist(all_similarities))
  # Warn that multilevel modeling is not applicable here
  warning("Only one observation per dyad. Using simple mean for overall average instead of multilevel modeling.")
  return(list(similarities_by_dyad = all_similarities, overall_average = overall_average))
}
## ----timing_similarity, message=FALSE-----------------------------------------
timing_sim <- timing_sim_dyads(processed_convs)
## ----visualization, fig.show='hide'-------------------------------------------
topic_sim_df <- data.frame(
  dyad = rep(names(topic_sim$similarities_by_dyad),
             sapply(topic_sim$similarities_by_dyad, length)),
  similarity = unlist(topic_sim$similarities_by_dyad),
  index = unlist(lapply(topic_sim$similarities_by_dyad, seq_along))
)
ggplot(topic_sim_df, aes(x = index, y = similarity, color = dyad)) +
  geom_line() +
  geom_point() +
  facet_wrap(~dyad, ncol = 2) +
  labs(title = "Topic Similarity Across Dyads",
       x = "Conversation Sequence",
       y = "Similarity Score") +
  theme_minimal() +
  theme(legend.position = "none")
## ----echo=FALSE, out.height='2000px', out.width='800px'-----------------------
knitr::include_graphics("../man/figures/dyadconv_plot.jpeg")
## ----comparison, fig.width=10, fig.height=6-----------------------------------
comparison_df <- data.frame(
  dyad = names(topic_sim$similarities_by_dyad),
  # na.rm: topic windows with empty document-term matrices yield NA
  topic = sapply(topic_sim$similarities_by_dyad, mean, na.rm = TRUE),
  lexical = sapply(lexical_sim$similarities_by_dyad, mean),
  semantic = sapply(semantic_sim$similarities_by_dyad, mean),
  structural = unlist(structural_sim$similarities_by_dyad),
  stylistic = sapply(stylistic_sim$similarities_by_dyad, mean),
  sentiment = sapply(sentiment_sim$similarities_by_dyad, mean),
  participant = unlist(participant_sim$similarities_by_dyad),
  timing = unlist(timing_sim$similarities_by_dyad)
)
measure_cols <- names(comparison_df)[names(comparison_df) != "dyad"]
comparison_long <- reshape(comparison_df,
                           varying = list(measure_cols),
                           v.names = "similarity",
                           timevar = "measure",
                           times = measure_cols,
                           new.row.names = seq_len(nrow(comparison_df) * length(measure_cols)),
                           direction = "long")
ggplot(comparison_long, aes(x = measure, y = similarity, fill = measure)) +
  geom_boxplot() +
  labs(title = "Comparison of Similarity Measures Across Dyads",
       x = "Similarity Measure",
       y = "Similarity Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
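## ----overall_averages-----------------------------------------------------------
# Collect the pooled averages from each analysis above into one named vector
# for a side-by-side summary (values depend on the example data):
overall <- sapply(list(topic = topic_sim, lexical = lexical_sim,
                       semantic = semantic_sim, structural = structural_sim,
                       stylistic = stylistic_sim, sentiment = sentiment_sim,
                       participant = participant_sim, timing = timing_sim),
                  function(res) unname(res$overall_average))
round(overall, 3)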