inst/doc/Analyzing_Similarities_between_Two_Long_Speeches.R

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
library(tm)
library(topicmodels)
library(lsa)
library(word2vec)
library(sentimentr)

## ----load-data----------------------------------------------------------------
data_path <- system.file("extdata", "speeches_data.Rdata", package = "conversim")
load(data_path)

# Print a summary of the speeches data
print(summary(speeches_data))

## ----include=FALSE------------------------------------------------------------
preprocess_text <- function(text) {
  text <- tolower(text)
  text <- gsub("[[:punct:]]", "", text)
  text <- gsub("[[:digit:]]", "", text)
  text <- gsub("\\s+", " ", trimws(text))
  return(text)
}

## -----------------------------------------------------------------------------
# Example usage with our speeches data
original_text <- substr(speeches_data$text[1], 1, 200)  # First 200 characters of speech A
preprocessed_text <- preprocess_text(original_text)
print(paste("Original:", original_text))
print(paste("Preprocessed:", preprocessed_text))

## ----include=FALSE------------------------------------------------------------
topic_similarity <- function(conv1, conv2, method = "lda", num_topics = 2) {
  corpus <- c(conv1, conv2)
  dtm <- DocumentTermMatrix(Corpus(VectorSource(corpus)))
  dtm_matrix <- as.matrix(dtm)

  if (method == "lda") {
    lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))
    topic_dist <- posterior(lda_model)$topics

    # Jensen-Shannon divergence between the two topic distributions;
    # sqrt(JSD) is the Jensen-Shannon distance, so 1 - distance is a similarity
    js_divergence <- function(p, q) {
      m <- 0.5 * (p + q)
      0.5 * sum(p * log(p / m)) + 0.5 * sum(q * log(q / m))
    }

    similarity <- 1 - sqrt(js_divergence(topic_dist[1,], topic_dist[2,]))

  } else if (method == "lsa") {
    if (nrow(dtm_matrix) < num_topics) {
      num_topics <- nrow(dtm_matrix)
    }
    lsa_space <- lsa(dtm_matrix, dims = num_topics)
    doc_lsa <- lsa_space$dk

    similarity <- cosine(doc_lsa[1,], doc_lsa[2,])
    similarity <- (similarity + 1) / 2

  } else {
    stop("Invalid method. Choose 'lda' or 'lsa'.")
  }

  return(similarity)
}

## -----------------------------------------------------------------------------
# Example usage with our speeches data
lda_similarity <- topic_similarity(speeches_data$text[1], speeches_data$text[2], method = "lda", num_topics = 5)
lsa_similarity <- topic_similarity(speeches_data$text[1], speeches_data$text[2], method = "lsa", num_topics = 5)

print(paste("LDA Similarity:", lda_similarity))
print(paste("LSA Similarity:", lsa_similarity))

## ----include=FALSE------------------------------------------------------------
lexical_similarity <- function(conv1, conv2) {
  # Jaccard similarity over the sets of unique words in each conversation
  words1 <- unique(unlist(strsplit(conv1, " ")))
  words2 <- unique(unlist(strsplit(conv2, " ")))

  intersection <- length(intersect(words1, words2))
  union <- length(union(words1, words2))

  return(intersection / union)
}

## -----------------------------------------------------------------------------
# Example usage with our speeches data
lex_similarity <- lexical_similarity(speeches_data$text[1], speeches_data$text[2])
print(paste("Lexical Similarity:", lex_similarity))

## ----include=FALSE------------------------------------------------------------
semantic_similarity <- function(conversation1, conversation2, method = "tfidf",
                                model_path = NULL, dim = 100, window = 5, iter = 5) {
  # Internal function to calculate cosine similarity, rescaled to [0, 1]
  cosine_similarity <- function(a, b) {
    if (length(a) == 0 || length(b) == 0) return(0)
    denom <- sqrt(sum(a^2)) * sqrt(sum(b^2))
    # Guard against all-zero vectors (e.g., no tokens matched the vocabulary)
    if (denom == 0) return(0)
    sim <- sum(a * b) / denom
    # Shift from [-1, 1] to [0, 1]
    return((sim + 1) / 2)
  }

  # Internal function to load pre-trained GloVe embeddings
  load_glove <- function(file_path) {
    tryCatch({
      conn <- file(file_path, "r")
      lines <- readLines(conn)
      close(conn)
      split_lines <- strsplit(lines, " ")
      words <- sapply(split_lines, `[`, 1)
      vectors <- t(sapply(split_lines, function(x) as.numeric(x[-1])))
      rownames(vectors) <- words
      return(vectors)
    }, error = function(e) {
      stop(paste("Error loading GloVe file:", e$message))
    })
  }

  # Internal function to calculate sentence embedding
  sentence_embedding <- function(sentence, word_vectors) {
    tokens <- unlist(strsplit(sentence, "\\s+"))
    valid_tokens <- tokens[tokens %in% rownames(word_vectors)]
    if (length(valid_tokens) == 0) {
      return(rep(0, ncol(word_vectors)))
    }
    embeddings <- word_vectors[valid_tokens, , drop = FALSE]
    if (nrow(embeddings) == 0) return(rep(0, ncol(word_vectors)))
    return(colMeans(embeddings))
  }

  if (method == "tfidf") {
    # TF-IDF approach
    corpus <- c(conversation1, conversation2)
    dtm <- DocumentTermMatrix(Corpus(VectorSource(corpus)))
    tfidf <- weightTfIdf(dtm)
    m <- as.matrix(tfidf)

    # Issue a warning for short conversations or little vocabulary overlap
    if (nchar(conversation1) < 50 || nchar(conversation2) < 50 || ncol(m) < 5) {
      warning("The 'tfidf' method may not provide highly meaningful results for short conversations or those with little vocabulary overlap. Consider using 'word2vec' or 'glove' methods for more robust results.")
    }

    # If the conversations are identical, return 1
    if (identical(conversation1, conversation2)) {
      return(1)
    }
    # Ensure we have at least one term in common
    if (ncol(m) == 0) {
      return(0)
    }
    # Calculate cosine similarity
    similarity <- cosine_similarity(m[1,], m[2,])

  } else if (method == "word2vec" || method == "glove") {
    # Word2Vec or GloVe approach
    if (method == "word2vec") {
      # Train Word2Vec model
      all_text <- c(conversation1, conversation2)
      model <- word2vec(x = all_text, dim = dim, iter = iter, window = window, min_count = 1)
      word_vectors <- as.matrix(model)
    } else { # method == "glove"
      if (is.null(model_path)) {
        stop("Please provide a path to the pre-trained GloVe file.")
      }
      # Load pre-trained GloVe vectors
      word_vectors <- load_glove(model_path)
    }

    # Calculate embeddings for each conversation
    embedding1 <- sentence_embedding(conversation1, word_vectors)
    embedding2 <- sentence_embedding(conversation2, word_vectors)

    # Calculate cosine similarity
    similarity <- cosine_similarity(embedding1, embedding2)
  } else {
    stop("Invalid method. Choose 'tfidf', 'word2vec', or 'glove'.")
  }

  return(similarity)
}

## -----------------------------------------------------------------------------
# Example usage with our speeches data
tfidf_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2], method = "tfidf")
word2vec_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2], method = "word2vec")

print(paste("TF-IDF Similarity:", tfidf_similarity))
print(paste("Word2Vec Similarity:", word2vec_similarity))

# Note: For GloVe method, you need to provide a path to pre-trained GloVe vectors
# glove_similarity <- semantic_similarity(speeches_data$text[1], speeches_data$text[2], method = "glove", model_path = "path/to/glove/vectors.txt")

## ----include=FALSE------------------------------------------------------------
structural_similarity <- function(conv1, conv2) {
  # Similarity in the number of turns (elements of each conversation vector)
  length_sim <- 1 - abs(length(conv1) - length(conv2)) / max(length(conv1), length(conv2))

  # Similarity in average turn length, measured in characters
  avg_turn_length1 <- mean(nchar(conv1))
  avg_turn_length2 <- mean(nchar(conv2))
  turn_length_sim <- 1 - abs(avg_turn_length1 - avg_turn_length2) / max(avg_turn_length1, avg_turn_length2)

  return(mean(c(length_sim, turn_length_sim)))
}

## -----------------------------------------------------------------------------
# Example usage with our speeches data
struct_similarity <- structural_similarity(strsplit(speeches_data$text[1], "\n")[[1]], 
                                           strsplit(speeches_data$text[2], "\n")[[1]])
print(paste("Structural Similarity:", struct_similarity))

## ----include=FALSE------------------------------------------------------------
stylistic_similarity <- function(text1, text2) {
  # Helper function to calculate features for a single text
  calculate_features <- function(text) {
    words <- strsplit(text, " ")[[1]]
    sentences <- strsplit(text, "\\. ")[[1]]

    # Type-token ratio: share of unique words (lexical diversity)
    ttr <- length(unique(words)) / length(words)
    avg_sentence_length <- mean(sapply(sentences, function(s) length(strsplit(s, " ")[[1]])))
    # Crude syllable estimate: count vowel letters, with a floor of one per word
    syllables <- sum(sapply(words, function(w) max(1, nchar(gsub("[^aeiouAEIOU]", "", w)))))
    # Flesch-Kincaid grade level
    fk_grade <- 0.39 * (length(words) / length(sentences)) + 11.8 * (syllables / length(words)) - 15.59

    c(ttr = ttr, avg_sentence_length = avg_sentence_length, fk_grade = fk_grade)
  }

  features1 <- calculate_features(text1)
  features2 <- calculate_features(text2)
  feature_diff <- abs(features1 - features2)
  overall_similarity <- 1 - mean(feature_diff / pmax(features1, features2))

  # Cosine similarity of the z-scored feature vectors
  normalized1 <- (features1 - mean(features1)) / sd(features1)
  normalized2 <- (features2 - mean(features2)) / sd(features2)
  cosine_similarity <- sum(normalized1 * normalized2) /
    (sqrt(sum(normalized1^2)) * sqrt(sum(normalized2^2)))

  list(
    text1_features = features1,
    text2_features = features2,
    feature_differences = feature_diff,
    overall_similarity = overall_similarity,
    cosine_similarity = cosine_similarity
  )
}

## -----------------------------------------------------------------------------
# Example usage with our speeches data
style_similarity <- stylistic_similarity(speeches_data$text[1], speeches_data$text[2])
print("Stylistic Similarity Results:")
print(style_similarity)
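
# For example, the gap in Flesch-Kincaid grade level between the two speeches:
style_similarity$feature_differences["fk_grade"]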

## ----include=FALSE------------------------------------------------------------
sentiment_similarity <- function(conv1, conv2) {
  # Average sentiment per conversation via sentimentr; dividing the gap by 2
  # assumes average sentiment scores fall roughly within [-1, 1]
  sent1 <- sentiment_by(conv1)$ave_sentiment
  sent2 <- sentiment_by(conv2)$ave_sentiment

  return(1 - abs(sent1 - sent2) / 2)
}

## -----------------------------------------------------------------------------
# Example usage with our speeches data
sent_similarity <- sentiment_similarity(speeches_data$text[1], speeches_data$text[2])
print(paste("Sentiment Similarity:", sent_similarity))
