```r
knitr::opts_chunk$set(echo = TRUE)

library(tm)
library(topicmodels)
library(lsa)
library(word2vec)
library(sentimentr)
```
This vignette demonstrates the usage of various similarity functions for analyzing speeches. We'll use the example dataset `speeches_data`, stored in `inst/extdata`, to showcase these functions.
First, let's load the example data:
```r
data_path <- system.file("extdata", "speeches_data.Rdata", package = "conversim")
load(data_path)

# Print a summary of the speeches data
print(summary(speeches_data))
```
Before we turn to the similarity functions, let's look at the `preprocess_text` function:
```r
preprocess_text <- function(text) {
  text <- tolower(text)                    # lowercase
  text <- gsub("[[:punct:]]", "", text)    # remove punctuation
  text <- gsub("[[:digit:]]", "", text)    # remove digits
  text <- gsub("\\s+", " ", trimws(text))  # collapse whitespace
  return(text)
}
```
```r
# Example usage with our data
original_text <- substr(speeches_data$text[1], 1, 200)  # first 200 characters of speech A
preprocessed_text <- preprocess_text(original_text)
print(paste("Original:", original_text))
print(paste("Preprocessed:", preprocessed_text))
```
The `topic_similarity` function calculates the similarity between two speeches based on their topics:
```r
topic_similarity <- function(conv1, conv2, method = "lda", num_topics = 2) {
  corpus <- c(conv1, conv2)
  dtm <- DocumentTermMatrix(Corpus(VectorSource(corpus)))
  dtm_matrix <- as.matrix(dtm)

  if (method == "lda") {
    lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))
    topic_dist <- posterior(lda_model)$topics

    # Jensen-Shannon divergence between the two topic distributions
    js_divergence <- function(p, q) {
      m <- 0.5 * (p + q)
      0.5 * sum(p * log(p / m)) + 0.5 * sum(q * log(q / m))
    }

    similarity <- 1 - sqrt(js_divergence(topic_dist[1, ], topic_dist[2, ]))
  } else if (method == "lsa") {
    if (nrow(dtm_matrix) < num_topics) {
      num_topics <- nrow(dtm_matrix)
    }
    lsa_space <- lsa(dtm_matrix, dims = num_topics)
    doc_lsa <- lsa_space$dk

    # Rescale cosine similarity from [-1, 1] to [0, 1]
    similarity <- cosine(doc_lsa[1, ], doc_lsa[2, ])
    similarity <- (similarity + 1) / 2
  } else {
    stop("Invalid method. Choose 'lda' or 'lsa'.")
  }

  return(similarity)
}
```
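For the LDA method, the score is computed as $1 - \sqrt{\mathrm{JSD}(p, q)}$, where $p$ and $q$ are the two documents' topic distributions and

$$\mathrm{JSD}(p, q) = \tfrac{1}{2}\,\mathrm{KL}(p \parallel m) + \tfrac{1}{2}\,\mathrm{KL}(q \parallel m), \qquad m = \tfrac{1}{2}(p + q).$$

Since the code uses natural logarithms, $\mathrm{JSD}$ is at most $\ln 2$, so the score ranges from $1 - \sqrt{\ln 2} \approx 0.167$ (near-disjoint topic distributions) to $1$ (identical distributions).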
```r
# Example usage with our speeches data
lda_similarity <- topic_similarity(speeches_data$text[1], speeches_data$text[2],
                                   method = "lda", num_topics = 5)
lsa_similarity <- topic_similarity(speeches_data$text[1], speeches_data$text[2],
                                   method = "lsa", num_topics = 5)
print(paste("LDA Similarity:", lda_similarity))
print(paste("LSA Similarity:", lsa_similarity))
```
Note: the difference between the LDA (Latent Dirichlet Allocation) topic similarity (0.1694, close to the theoretical floor derived above) and the LSA (Latent Semantic Analysis) topic similarity (1) can be attributed to several factors:

- LDA and LSA use fundamentally different approaches: LDA is a probabilistic model that infers discrete topic distributions, while LSA applies singular value decomposition to the term matrix to find latent semantic dimensions.
- Both methods are sensitive to their input parameters, especially the number of topics. The code above used five topics for both methods, which may have been more appropriate for LDA than for LSA in this particular case; the sweep below illustrates this sensitivity.
- Although both speeches are about climate change, they focus on different aspects of the topic. LDA may be better suited to capture these nuanced differences in topic distribution, whereas LSA can oversimplify the comparison when the overall theme and vocabulary are shared.
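To see this parameter sensitivity directly, a quick sweep over `num_topics` can be run (a minimal sketch using the function and data already loaded above; the LDA result is reproducible only because of the fixed seed inside `topic_similarity`):

```r
# Sketch: how the choice of num_topics affects each method.
# Note that the LSA branch caps num_topics at the number of documents (here 2).
for (k in 2:6) {
  lda_k <- topic_similarity(speeches_data$text[1], speeches_data$text[2],
                            method = "lda", num_topics = k)
  lsa_k <- topic_similarity(speeches_data$text[1], speeches_data$text[2],
                            method = "lsa", num_topics = k)
  print(sprintf("k = %d: LDA similarity = %.4f, LSA similarity = %.4f",
                k, lda_k, lsa_k))
}
```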
The `lexical_similarity` function calculates the similarity between two speeches as the Jaccard index of their unique word sets (shared unique words divided by total unique words):
```r
lexical_similarity <- function(conv1, conv2) {
  words1 <- unique(unlist(strsplit(conv1, " ")))
  words2 <- unique(unlist(strsplit(conv2, " ")))

  # Jaccard index: shared unique words over total unique words
  intersection <- length(intersect(words1, words2))
  union <- length(union(words1, words2))
  return(intersection / union)
}
```
```r
# Example usage with our speeches data
lex_similarity <- lexical_similarity(speeches_data$text[1], speeches_data$text[2])
print(paste("Lexical Similarity:", lex_similarity))
```
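Note that `lexical_similarity` splits on single spaces and is case-sensitive, so punctuation and capitalization differences inflate the union of "unique words". Applying `preprocess_text` first usually gives a cleaner estimate; a minimal sketch:

```r
# Sketch: raw vs. preprocessed lexical similarity.
lex_raw <- lexical_similarity(speeches_data$text[1], speeches_data$text[2])
lex_clean <- lexical_similarity(preprocess_text(speeches_data$text[1]),
                                preprocess_text(speeches_data$text[2]))
print(paste("Raw:", lex_raw, "| Preprocessed:", lex_clean))
```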
The `semantic_similarity` function calculates the semantic similarity between two speeches using one of three methods: TF-IDF, Word2Vec, or pre-trained GloVe embeddings:
```r
semantic_similarity <- function(conversation1, conversation2, method = "tfidf",
                                model_path = NULL, dim = 100, window = 5, iter = 5) {
  # Internal function to calculate cosine similarity, rescaled to [0, 1]
  cosine_similarity <- function(a, b) {
    if (length(a) == 0 || length(b) == 0) return(0)
    sim <- sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
    return((sim + 1) / 2)
  }

  # Internal function to load pre-trained GloVe embeddings from a text file
  load_glove <- function(file_path) {
    tryCatch({
      conn <- file(file_path, "r")
      lines <- readLines(conn)
      close(conn)
      split_lines <- strsplit(lines, " ")
      words <- sapply(split_lines, `[`, 1)
      vectors <- t(sapply(split_lines, function(x) as.numeric(x[-1])))
      rownames(vectors) <- words
      return(vectors)
    }, error = function(e) {
      stop(paste("Error loading GloVe file:", e$message))
    })
  }

  # Internal function: sentence embedding as the mean of its word vectors
  sentence_embedding <- function(sentence, word_vectors) {
    tokens <- unlist(strsplit(sentence, "\\s+"))
    valid_tokens <- tokens[tokens %in% rownames(word_vectors)]
    if (length(valid_tokens) == 0) {
      return(rep(0, ncol(word_vectors)))
    }
    embeddings <- word_vectors[valid_tokens, , drop = FALSE]
    if (nrow(embeddings) == 0) return(rep(0, ncol(word_vectors)))
    return(colMeans(embeddings))
  }

  if (method == "tfidf") {
    # TF-IDF approach
    corpus <- c(conversation1, conversation2)
    dtm <- DocumentTermMatrix(Corpus(VectorSource(corpus)))
    tfidf <- weightTfIdf(dtm)
    m <- as.matrix(tfidf)

    # Issue a warning for short conversations or little vocabulary overlap
    if (nchar(conversation1) < 50 || nchar(conversation2) < 50 || ncol(m) < 5) {
      warning("The 'tfidf' method may not provide highly meaningful results for short conversations or those with little vocabulary overlap. Consider using 'word2vec' or 'glove' methods for more robust results.")
    }

    # If the conversations are identical, return 1
    if (identical(conversation1, conversation2)) {
      return(1)
    }

    # Ensure we have at least one term in common
    if (ncol(m) == 0) {
      return(0)
    }

    # Calculate cosine similarity
    similarity <- cosine_similarity(m[1, ], m[2, ])
  } else if (method == "word2vec" || method == "glove") {
    if (method == "word2vec") {
      # Train a Word2Vec model on the two conversations
      all_text <- c(conversation1, conversation2)
      model <- word2vec(x = all_text, dim = dim, iter = iter,
                        window = window, min_count = 1)
      word_vectors <- as.matrix(model)
    } else { # method == "glove"
      if (is.null(model_path)) {
        stop("Please provide a path to the pre-trained GloVe file.")
      }
      # Load pre-trained GloVe vectors
      word_vectors <- load_glove(model_path)
    }

    # Calculate embeddings for each conversation and compare them
    embedding1 <- sentence_embedding(conversation1, word_vectors)
    embedding2 <- sentence_embedding(conversation2, word_vectors)
    similarity <- cosine_similarity(embedding1, embedding2)
  } else {
    stop("Invalid method. Choose 'tfidf', 'word2vec', or 'glove'.")
  }

  return(similarity)
}
```
```r
# Example usage with our speeches data
tfidf_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2],
                                        method = "tfidf")
word2vec_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2],
                                           method = "word2vec")
print(paste("TF-IDF Similarity:", tfidf_similarity))
print(paste("Word2Vec Similarity:", word2vec_similarity))

# Note: for the GloVe method, you need to provide a path to pre-trained GloVe vectors
# glove_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2],
#                                         method = "glove",
#                                         model_path = "path/to/glove/vectors.txt")
```
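One caveat: the `word2vec` method trains a fresh model on each call, and that training typically involves random initialization, so the returned score can differ slightly from run to run. A small sketch that averages a few runs for a more stable estimate:

```r
# Sketch: average several word2vec runs to smooth out training randomness.
runs <- replicate(5, semantic_similarity(speeches_data$text[1],
                                         speeches_data$text[2],
                                         method = "word2vec"))
print(paste("Mean Word2Vec similarity over 5 runs:", round(mean(runs), 4)))
```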
The `structural_similarity` function calculates the similarity between two speeches based on their structure, comparing the number of turns and the average turn length:
```r
structural_similarity <- function(conv1, conv2) {
  # Similarity of the number of turns
  length_sim <- 1 - abs(length(conv1) - length(conv2)) /
    max(length(conv1), length(conv2))

  # Similarity of the average turn length (in characters)
  avg_turn_length1 <- mean(nchar(conv1))
  avg_turn_length2 <- mean(nchar(conv2))
  turn_length_sim <- 1 - abs(avg_turn_length1 - avg_turn_length2) /
    max(avg_turn_length1, avg_turn_length2)

  return(mean(c(length_sim, turn_length_sim)))
}
```
```r
# Example usage with our speeches data (splitting each speech into lines as "turns")
struct_similarity <- structural_similarity(strsplit(speeches_data$text[1], "\n")[[1]],
                                           strsplit(speeches_data$text[2], "\n")[[1]])
print(paste("Structural Similarity:", struct_similarity))
```
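To make the computation concrete, here is a toy sketch with two hypothetical "conversations" represented as character vectors of turns:

```r
# Toy sketch: conv_a has 3 turns, conv_b has 4 turns,
# so length_sim = 1 - |3 - 4| / 4 = 0.75.
# Average turn lengths are 10 and 6.25 characters,
# so turn_length_sim = 1 - 3.75 / 10 = 0.625, giving an overall score of 0.6875.
conv_a <- c("Hello there", "How are you?", "Goodbye")
conv_b <- c("Hi", "Fine, thanks", "And you?", "Bye")
print(structural_similarity(conv_a, conv_b))
```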
The `stylistic_similarity` function calculates several stylistic features (type-token ratio, average sentence length, and Flesch-Kincaid grade level) and their similarity between two speeches:
```r
stylistic_similarity <- function(text1, text2) {
  # Helper function to calculate features for a single text
  calculate_features <- function(text) {
    words <- strsplit(text, " ")[[1]]
    sentences <- strsplit(text, "\\. ")[[1]]

    # Type-token ratio (lexical diversity)
    ttr <- length(unique(words)) / length(words)
    avg_sentence_length <- mean(sapply(sentences,
                                       function(s) length(strsplit(s, " ")[[1]])))

    # Crude syllable estimate: vowel count, at least 1 per word
    syllables <- sum(sapply(words,
                            function(w) max(1, nchar(gsub("[^aeiouAEIOU]", "", w)))))

    # Flesch-Kincaid grade level
    fk_grade <- 0.39 * (length(words) / length(sentences)) +
      11.8 * (syllables / length(words)) - 15.59

    c(ttr = ttr, avg_sentence_length = avg_sentence_length, fk_grade = fk_grade)
  }

  features1 <- calculate_features(text1)
  features2 <- calculate_features(text2)

  # Overall similarity from relative feature differences
  feature_diff <- abs(features1 - features2)
  overall_similarity <- 1 - mean(feature_diff / pmax(features1, features2))

  # Cosine similarity of the z-normalized feature vectors
  normalized1 <- (features1 - mean(features1)) / sd(features1)
  normalized2 <- (features2 - mean(features2)) / sd(features2)
  cosine_similarity <- sum(normalized1 * normalized2) /
    (sqrt(sum(normalized1^2)) * sqrt(sum(normalized2^2)))

  list(
    text1_features = features1,
    text2_features = features2,
    feature_differences = feature_diff,
    overall_similarity = overall_similarity,
    cosine_similarity = cosine_similarity
  )
}
```
```r
# Example usage with our speeches data
style_similarity <- stylistic_similarity(speeches_data$text[1], speeches_data$text[2])
print("Stylistic Similarity Results:")
print(style_similarity)
```
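Because the function returns a list, individual components can also be inspected directly:

```r
# Inspect single components of the result
print(style_similarity$feature_differences)
print(style_similarity$overall_similarity)
```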
The `sentiment_similarity` function calculates the sentiment similarity between two speeches:
```r
sentiment_similarity <- function(conv1, conv2) {
  sent1 <- sentiment_by(conv1)$ave_sentiment
  sent2 <- sentiment_by(conv2)$ave_sentiment

  # Map the absolute difference in average sentiment to a [0, 1] similarity
  # (assumes the sentiment scores fall roughly in [-1, 1])
  return(1 - abs(sent1 - sent2) / 2)
}
```
```r
# Example usage with our speeches data
sent_similarity <- sentiment_similarity(speeches_data$text[1], speeches_data$text[2])
print(paste("Sentiment Similarity:", sent_similarity))
```
This vignette has demonstrated the usage of various similarity functions for analyzing speeches using the provided `speeches_data.Rdata`. These functions can be used individually or combined to create a comprehensive similarity analysis between different speeches in your dataset, as the sketch below shows.
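For example, a simple composite score can be built from the values computed above (a sketch; the equal weighting is an arbitrary choice for illustration and should be adapted to your application):

```r
# Sketch: combine the individual similarity scores into one composite value.
scores <- c(topic      = lda_similarity,
            lexical    = lex_similarity,
            semantic   = tfidf_similarity,
            structural = struct_similarity,
            stylistic  = style_similarity$overall_similarity,
            sentiment  = sent_similarity)
print(round(scores, 4))
print(paste("Composite similarity:", round(mean(scores), 4)))
```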