Nothing
## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
library(tm)
library(topicmodels)
library(lsa)
library(word2vec)
library(sentimentr)
## ----load-data----------------------------------------------------------------
data_path <- system.file("extdata", "speeches_data.Rdata", package = "conversim")
load(data_path)
# Print a summary of the speeches data
print(summary(speeches_data))
## ----include=FALSE------------------------------------------------------------
preprocess_text <- function(text) {
text <- tolower(text)
text <- gsub("[[:punct:]]", "", text)
text <- gsub("[[:digit:]]", "", text)
text <- gsub("\\s+", " ", trimws(text))
return(text)
}
## -----------------------------------------------------------------------------
# Example usage with our data
original_text <- substr(speeches_data$text[1], 1, 200) # First 200 characters of speech A
preprocessed_text <- preprocess_text(original_text)
print(paste("Original:", original_text))
print(paste("Preprocessed:", preprocessed_text))
## ----include=FALSE------------------------------------------------------------
topic_similarity <- function(conv1, conv2, method = "lda", num_topics = 2) {
corpus <- c(conv1, conv2)
dtm <- DocumentTermMatrix(Corpus(VectorSource(corpus)))
dtm_matrix <- as.matrix(dtm)
if (method == "lda") {
lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))
topic_dist <- posterior(lda_model)$topics
js_divergence <- function(p, q) {
m <- 0.5 * (p + q)
0.5 * sum(p * log(p / m)) + 0.5 * sum(q * log(q / m))
}
similarity <- 1 - sqrt(js_divergence(topic_dist[1,], topic_dist[2,]))
} else if (method == "lsa") {
if (nrow(dtm_matrix) < num_topics) {
num_topics <- nrow(dtm_matrix)
}
lsa_space <- lsa(dtm_matrix, dims = num_topics)
doc_lsa <- lsa_space$dk
similarity <- cosine(doc_lsa[1,], doc_lsa[2,])
similarity <- (similarity + 1) / 2
} else {
stop("Invalid method. Choose 'lda' or 'lsa'.")
}
return(similarity)
}
## -----------------------------------------------------------------------------
# Example usage with our speeches data
lda_similarity <- topic_similarity(speeches_data$text[1], speeches_data$text[2], method = "lda", num_topics = 5)
lsa_similarity <- topic_similarity(speeches_data$text[1], speeches_data$text[2], method = "lsa", num_topics = 5)
print(paste("LDA Similarity:", lda_similarity))
print(paste("LSA Similarity:", lsa_similarity))
## ----include=FALSE------------------------------------------------------------
lexical_similarity <- function(conv1, conv2) {
words1 <- unique(unlist(strsplit(conv1, " ")))
words2 <- unique(unlist(strsplit(conv2, " ")))
intersection <- length(intersect(words1, words2))
union <- length(union(words1, words2))
return(intersection / union)
}
## -----------------------------------------------------------------------------
# Example usage with our speeches data
lex_similarity <- lexical_similarity(speeches_data$text[1], speeches_data$text[2])
print(paste("Lexical Similarity:", lex_similarity))
## ----include=FALSE------------------------------------------------------------
semantic_similarity <- function(conversation1, conversation2, method = "tfidf",
model_path = NULL, dim = 100, window = 5, iter = 5) {
# Internal function to calculate cosine similarity
cosine_similarity <- function(a, b) {
if (length(a) == 0 || length(b) == 0) return(0)
sim <- sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
# Ensure the result is between 0 and 1
return((sim + 1) / 2)
}
# Internal function to load pre-trained GloVe embeddings
load_glove <- function(file_path) {
tryCatch({
conn <- file(file_path, "r")
lines <- readLines(conn)
close(conn)
split_lines <- strsplit(lines, " ")
words <- sapply(split_lines, `[`, 1)
vectors <- t(sapply(split_lines, function(x) as.numeric(x[-1])))
rownames(vectors) <- words
return(vectors)
}, error = function(e) {
stop(paste("Error loading GloVe file:", e$message))
})
}
# Internal function to calculate sentence embedding
sentence_embedding <- function(sentence, word_vectors) {
tokens <- unlist(strsplit(sentence, "\\s+"))
valid_tokens <- tokens[tokens %in% rownames(word_vectors)]
if (length(valid_tokens) == 0) {
return(rep(0, ncol(word_vectors)))
}
embeddings <- word_vectors[valid_tokens, , drop = FALSE]
if (nrow(embeddings) == 0) return(rep(0, ncol(word_vectors)))
return(colMeans(embeddings))
}
if (method == "tfidf") {
# TF-IDF approach
corpus <- c(conversation1, conversation2)
dtm <- DocumentTermMatrix(Corpus(VectorSource(corpus)))
tfidf <- weightTfIdf(dtm)
m <- as.matrix(tfidf)
# Issue a warning for short conversations or little vocabulary overlap
if (nchar(conversation1) < 50 || nchar(conversation2) < 50 || ncol(m) < 5) {
warning("The 'tfidf' method may not provide highly meaningful results for short conversations or those with little vocabulary overlap. Consider using 'word2vec' or 'glove' methods for more robust results.")
}
# If the conversations are identical, return 1
if (identical(conversation1, conversation2)) {
return(1)
}
# Ensure we have at least one term in common
if (ncol(m) == 0) {
return(0)
}
# Calculate cosine similarity
similarity <- cosine_similarity(m[1,], m[2,])
} else if (method == "word2vec" || method == "glove") {
# Word2Vec or GloVe approach
if (method == "word2vec") {
# Train Word2Vec model
all_text <- c(conversation1, conversation2)
model <- word2vec(x = all_text, dim = dim, iter = iter, window = window, min_count = 1)
word_vectors <- as.matrix(model)
} else { # method == "glove"
if (is.null(model_path)) {
stop("Please provide a path to the pre-trained GloVe file.")
}
# Load pre-trained GloVe vectors
word_vectors <- load_glove(model_path)
}
# Calculate embeddings for each conversation
embedding1 <- sentence_embedding(conversation1, word_vectors)
embedding2 <- sentence_embedding(conversation2, word_vectors)
# Calculate cosine similarity
similarity <- cosine_similarity(embedding1, embedding2)
} else {
stop("Invalid method. Choose 'tfidf', 'word2vec', or 'glove'.")
}
return(similarity)
}
## -----------------------------------------------------------------------------
# Example usage with our speeches data
tfidf_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2], method = "tfidf")
word2vec_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2], method = "word2vec")
print(paste("TF-IDF Similarity:", tfidf_similarity))
print(paste("Word2Vec Similarity:", word2vec_similarity))
# Note: For GloVe method, you need to provide a path to pre-trained GloVe vectors
# glove_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2], method = "glove", model_path = "path/to/glove/vectors.txt")
## ----include=FALSE------------------------------------------------------------
structural_similarity <- function(conv1, conv2) {
length_sim <- 1 - abs(length(conv1) - length(conv2)) / max(length(conv1), length(conv2))
avg_turn_length1 <- mean(nchar(conv1))
avg_turn_length2 <- mean(nchar(conv2))
turn_length_sim <- 1 - abs(avg_turn_length1 - avg_turn_length2) / max(avg_turn_length1, avg_turn_length2)
return(mean(c(length_sim, turn_length_sim)))
}
## -----------------------------------------------------------------------------
# Example usage with our speeches data
struct_similarity <- structural_similarity(strsplit(speeches_data$text[1], "\n")[[1]],
strsplit(speeches_data$text[2], "\n")[[1]])
print(paste("Structural Similarity:", struct_similarity))
## ----include=FALSE------------------------------------------------------------
stylistic_similarity <- function(text1, text2) {
# Helper function to calculate features for a single text
calculate_features <- function(text) {
words <- strsplit(text, " ")[[1]]
sentences <- strsplit(text, "\\. ")[[1]]
ttr <- length(unique(words)) / length(words)
avg_sentence_length <- mean(sapply(sentences, function(s) length(strsplit(s, " ")[[1]])))
syllables <- sum(sapply(words, function(w) max(1, nchar(gsub("[^aeiouAEIOU]", "", w)))))
fk_grade <- 0.39 * (length(words) / length(sentences)) + 11.8 * (syllables / length(words)) - 15.59
c(ttr = ttr, avg_sentence_length = avg_sentence_length, fk_grade = fk_grade)
}
features1 <- calculate_features(text1)
features2 <- calculate_features(text2)
feature_diff <- abs(features1 - features2)
overall_similarity <- 1 - mean(feature_diff / pmax(features1, features2))
normalized1 <- (features1 - mean(features1)) / sd(features1)
normalized2 <- (features2 - mean(features2)) / sd(features2)
cosine_similarity <- sum(normalized1 * normalized2) /
(sqrt(sum(normalized1^2)) * sqrt(sum(normalized2^2)))
list(
text1_features = features1,
text2_features = features2,
feature_differences = feature_diff,
overall_similarity = overall_similarity,
cosine_similarity = cosine_similarity
)
}
## -----------------------------------------------------------------------------
# Example usage with our speeches data
style_similarity <- stylistic_similarity(speeches_data$text[1], speeches_data$text[2])
print("Stylistic Similarity Results:")
print(style_similarity)
## ----include=FALSE------------------------------------------------------------
sentiment_similarity <- function(conv1, conv2) {
sent1 <- sentiment_by(conv1)$ave_sentiment
sent2 <- sentiment_by(conv2)$ave_sentiment
return(1 - abs(sent1 - sent2) / 2)
}
## -----------------------------------------------------------------------------
# Example usage with our speeches data
sent_similarity <- sentiment_similarity(speeches_data$text[1], speeches_data$text[2])
print(paste("Sentiment Similarity:", sent_similarity))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.