# tests/testthat/test-clustering_similarity.R

# Create sample test data
create_test_data <- function(n = 10) {
  # Create sample abstracts with known patterns for testing
  topics <- list(
    computer_science = c("algorithm", "computer", "data", "programming", "software"),
    biology = c("cell", "dna", "genome", "protein", "biology"),
    physics = c("energy", "quantum", "particle", "relativity", "physics")
  )

  set.seed(123) # For reproducibility

  # Function to generate a random abstract based on a topic
  generate_abstract <- function(topic) {
    topic_words <- topics[[topic]]
    other_words <- c("the", "and", "in", "of", "with", "to", "for", "a", "is", "on")

    # Generate a random number of sentences (3-5)
    num_sentences <- sample(3:5, 1)

    sentences <- vector("character", num_sentences)
    for (i in 1:num_sentences) {
      # For each sentence, include 1-3 topic words and 3-6 other words
      num_topic_words <- sample(1:3, 1)
      num_other_words <- sample(3:6, 1)

      topic_selection <- sample(topic_words, num_topic_words, replace = TRUE)
      other_selection <- sample(other_words, num_other_words, replace = TRUE)

      all_words <- c(topic_selection, other_selection)
      # Shuffle words
      all_words <- sample(all_words)

      # Create a sentence
      sentences[i] <- paste0(paste(all_words, collapse = " "), ".")
    }

    # Combine sentences into an abstract
    abstract <- paste(sentences, collapse = " ")
    return(abstract)
  }

  # Assign topics to articles
  topics_vec <- sample(names(topics), n, replace = TRUE)

  # Generate abstracts
  abstracts <- vapply(topics_vec, generate_abstract, character(1), USE.NAMES = FALSE)

  # Create a data frame
  data.frame(
    doc_id = 1:n,
    title = paste("Article", 1:n),
    publication_year = sample(2010:2023, n, replace = TRUE),
    abstract = abstracts,
    topic = topics_vec,
    stringsAsFactors = FALSE
  )
}
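
# Sanity check on the generator itself: generate_abstract() injects 1-3
# topic keywords into every sentence, so each abstract should contain at
# least one keyword from its assigned topic. The keyword vectors below
# mirror the `topics` list defined inside create_test_data().
test_that("create_test_data produces topic-consistent abstracts", {
  topic_keywords <- list(
    computer_science = c("algorithm", "computer", "data", "programming", "software"),
    biology = c("cell", "dna", "genome", "protein", "biology"),
    physics = c("energy", "quantum", "particle", "relativity", "physics")
  )

  test_data <- create_test_data(6)
  expect_s3_class(test_data, "data.frame")
  expect_equal(nrow(test_data), 6)

  for (i in seq_len(nrow(test_data))) {
    pattern <- paste(topic_keywords[[test_data$topic[i]]], collapse = "|")
    expect_true(grepl(pattern, test_data$abstract[i]))
  }
})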

# Test for vec_preprocess function
test_that("vec_preprocess handles text data correctly", {
  # Create test data
  test_data <- create_test_data(5)

  # Test preprocessing
  result <- vec_preprocess(test_data, text_column = "abstract")

  # Check that the function returns the expected structure
  expect_s3_class(result, "data.frame")
  expect_true("terms" %in% colnames(result))
  expect_equal(nrow(result), nrow(test_data))

  # Check that terms are properly extracted
  expect_true(all(vapply(result$terms, is.data.frame, logical(1))))

  # Check handling of missing values
  test_data_na <- test_data
  test_data_na$abstract[1] <- NA
  result_na <- vec_preprocess(test_data_na, text_column = "abstract")
  expect_equal(nrow(result_na), nrow(test_data) - 1)

  # Check stopword removal
  result_with_stopwords <- vec_preprocess(test_data, text_column = "abstract",
                                          remove_stopwords = FALSE)
  result_without_stopwords <- vec_preprocess(test_data, text_column = "abstract",
                                             remove_stopwords = TRUE)

  # Including stopwords should yield at least as many unique terms
  total_unique_terms_with <- length(unique(unlist(lapply(result_with_stopwords$terms, function(df) df$word))))
  total_unique_terms_without <- length(unique(unlist(lapply(result_without_stopwords$terms, function(df) df$word))))

  expect_true(total_unique_terms_with >= total_unique_terms_without)

  # Test custom stopwords
  custom_stopword_list <- c("algorithm", "computer", "data")
  result_custom_stopwords <- vec_preprocess(test_data, text_column = "abstract",
                                            custom_stopwords = custom_stopword_list)

  # None of the custom stopwords should appear in the processed terms
  all_words <- unique(unlist(lapply(result_custom_stopwords$terms, function(df) df$word)))
  expect_true(!any(custom_stopword_list %in% all_words))

  # Test word length filtering
  result_min_length <- vec_preprocess(test_data, text_column = "abstract",
                                      min_word_length = 5)
  all_words_min_length <- unique(unlist(lapply(result_min_length$terms, function(df) df$word)))
  expect_true(all(nchar(all_words_min_length) >= 5))
})
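
# Reference sketch: cosine similarity over term-count vectors, one plausible
# basis for calc_doc_sim() (an assumption for illustration, not taken from
# the package). The mini-test demonstrates the properties the real
# similarity matrix is expected to satisfy in the test that follows.
test_that("cosine similarity sketch satisfies the properties tested below", {
  cosine_sim <- function(a, b) sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))

  doc_a <- c(algorithm = 2, data = 1, cell = 0)
  doc_b <- c(algorithm = 1, data = 2, cell = 0)
  doc_c <- c(algorithm = 0, data = 0, cell = 3)

  expect_equal(cosine_sim(doc_a, doc_a), 1)                         # self-similarity is 1
  expect_equal(cosine_sim(doc_a, doc_b), cosine_sim(doc_b, doc_a))  # symmetric
  expect_equal(cosine_sim(doc_a, doc_c), 0)                         # no shared terms
  expect_true(cosine_sim(doc_a, doc_b) > cosine_sim(doc_a, doc_c))  # shared topic scores higher
})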

# Test for calc_doc_sim function
test_that("calc_doc_sim calculates document similarity correctly", {
  # Create test data
  test_data <- create_test_data(8)

  # Calculate similarity matrix
  sim_matrix <- calc_doc_sim(test_data, text_column = "abstract")

  # Check that the function returns a matrix
  expect_type(sim_matrix, "double")
  expect_true(is.matrix(sim_matrix))

  # Check matrix dimensions
  expect_equal(nrow(sim_matrix), nrow(test_data))
  expect_equal(ncol(sim_matrix), nrow(test_data))

  # Check that diagonal values are 1 (document is identical to itself)
  diag_values <- diag(sim_matrix)
  expect_true(all(abs(diag_values - 1) < 1e-10))

  # Check that all values are in [0, 1]
  expect_true(all(sim_matrix >= 0 & sim_matrix <= 1))

  # Check that matrix is symmetric
  expect_true(isSymmetric(sim_matrix))

  # Check that documents with the same topic have higher similarity
  topic_groups <- split(1:nrow(test_data), test_data$topic)

  # Only check if there are at least two documents with the same topic
  for (group in topic_groups) {
    if (length(group) >= 2) {
      # Calculate average similarity within the group
      within_block <- sim_matrix[group, group]
      within_sim <- mean(within_block[lower.tri(within_block)])

      # Calculate average similarity between this group and other documents
      other_docs <- setdiff(1:nrow(test_data), group)
      if (length(other_docs) > 0) {
        between_sim <- mean(sim_matrix[group, other_docs])

        # Documents in the same topic should be more similar on average
        expect_true(within_sim >= between_sim)
      }
    }
  }
})

# Test for cluster_docs function
test_that("cluster_docs clusters documents correctly", {
  # Create test data with clear topic separation
  test_data <- create_test_data(15)

  # Set number of clusters to match number of topics
  n_clusters <- length(unique(test_data$topic))

  # Cluster documents
  clustered_data <- cluster_docs(test_data, text_column = "abstract",
                                 n_clusters = n_clusters,
                                 random_seed = 42)

  # Check that the function returns a data frame
  expect_s3_class(clustered_data, "data.frame")

  # Check that cluster assignments are added
  expect_true("cluster" %in% colnames(clustered_data))

  # Check that all documents are assigned to a cluster
  expect_true(all(!is.na(clustered_data$cluster)))

  # Check that cluster numbers are in the expected range
  expect_true(all(clustered_data$cluster >= 1 & clustered_data$cluster <= n_clusters))

  # Check that cluster_terms attribute is present
  expect_true(!is.null(attr(clustered_data, "cluster_terms")))

  # Check that cluster_summaries attribute is present
  expect_true(!is.null(attr(clustered_data, "cluster_summaries")))

  # Check that all clusters have assigned documents
  for (i in 1:n_clusters) {
    expect_true(any(clustered_data$cluster == i))
  }

  # Optional: Evaluate cluster quality using topic labels as ground truth
  # This assumes that the algorithm should ideally group documents by topic

  # For each cluster, find the dominant topic
  cluster_topic_counts <- lapply(1:n_clusters, function(cluster_id) {
    table(clustered_data$topic[clustered_data$cluster == cluster_id])
  })

  # Calculate purity: the fraction of documents that fall in their
  # cluster's dominant topic (guard against an empty cluster, where
  # max() on an empty table would return -Inf)
  correct_assignments <- sum(sapply(cluster_topic_counts,
                                    function(tab) if (length(tab) > 0) max(tab) else 0))
  purity <- correct_assignments / nrow(clustered_data)

  # We expect purity to be better than random assignment
  expect_true(purity > 1/n_clusters)
})
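
# Worked example of the purity metric used above, via a hypothetical
# stand-alone helper that mirrors the calculation in the test: for each
# cluster, count the documents carrying its dominant true label, sum
# those counts, and divide by the total number of documents.
test_that("purity calculation behaves as expected on known assignments", {
  purity <- function(clusters, labels) {
    sum(sapply(split(labels, clusters), function(x) max(table(x)))) /
      length(labels)
  }

  # Cluster 1 is pure; cluster 2 has a 2/3 majority of "b", so purity is 5/6
  expect_equal(purity(c(1, 1, 1, 2, 2, 2), c("a", "a", "a", "b", "b", "a")),
               5 / 6)

  # A clustering that exactly recovers the labels has purity 1
  expect_equal(purity(c(1, 1, 2, 2), c("a", "a", "b", "b")), 1)
})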

# Test for find_similar_docs function
test_that("find_similar_docs finds similar documents correctly", {
  # Create test data
  test_data <- create_test_data(10)

  # Find similar documents for the first document
  target_id <- 1
  n_similar <- 3
  similar_docs <- find_similar_docs(test_data, doc_id = target_id,
                                    text_column = "abstract",
                                    n_similar = n_similar)

  # Check that the function returns a data frame
  expect_s3_class(similar_docs, "data.frame")

  # Check that the number of returned documents is correct
  expect_equal(nrow(similar_docs), n_similar)

  # Check that the target document is not in the results
  expect_true(!target_id %in% similar_docs$doc_id)

  # Check that similarity scores are in [0, 1]
  expect_true(all(similar_docs$similarity_score >= 0 &
                    similar_docs$similarity_score <= 1))

  # Check that the results are sorted by similarity score in descending order
  expect_true(all(diff(similar_docs$similarity_score) <= 0))

  # Check that additional metadata is included if available
  expect_true(all(c("title", "publication_year") %in% colnames(similar_docs)))

  # Test behavior with non-existent document ID
  expect_error(find_similar_docs(test_data, doc_id = 999))

  # Test behavior with documents having the same topic
  # Documents with the same topic should be more similar
  topic_of_target <- test_data$topic[test_data$doc_id == target_id]

  # Get topics of similar documents using base R instead of dplyr
  similar_doc_indices <- match(similar_docs$doc_id, test_data$doc_id)
  similar_topics <- test_data$topic[similar_doc_indices]

  # Count how many similar documents have the same topic
  same_topic_count <- sum(similar_topics == topic_of_target)

  # We would expect at least one of the similar documents to share the target's topic.
  # Note: this check depends on the fixed seed in create_test_data(); with
  # other seeds it could occasionally fail.
  expect_true(same_topic_count > 0)
})
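
# Reference sketch of top-n retrieval from a similarity matrix, one
# plausible way find_similar_docs() could rank its results (an assumption
# for illustration, not taken from the package): mask the target's own
# entry, then order the remaining scores in descending order.
test_that("top-n retrieval sketch excludes the target and sorts descending", {
  top_n_similar <- function(sim_matrix, target, n) {
    scores <- sim_matrix[target, ]
    scores[target] <- -Inf  # exclude the target document itself
    ranked <- order(scores, decreasing = TRUE)[seq_len(n)]
    data.frame(doc = ranked, similarity_score = scores[ranked])
  }

  sim <- matrix(c(1.0, 0.8, 0.2,
                  0.8, 1.0, 0.5,
                  0.2, 0.5, 1.0), nrow = 3, byrow = TRUE)
  result <- top_n_similar(sim, target = 1, n = 2)

  expect_equal(result$doc, c(2L, 3L))                    # doc 2 (0.8) outranks doc 3 (0.2)
  expect_false(1 %in% result$doc)                        # target excluded
  expect_true(all(diff(result$similarity_score) <= 0))   # descending scores
})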
