Nothing
# Create sample test data
create_test_data <- function(n = 10) {
# Create sample abstracts with known patterns for testing
topics <- list(
computer_science = c("algorithm", "computer", "data", "programming", "software"),
biology = c("cell", "dna", "genome", "protein", "biology"),
physics = c("energy", "quantum", "particle", "relativity", "physics")
)
set.seed(123) # For reproducibility
# Function to generate a random abstract based on a topic
generate_abstract <- function(topic) {
topic_words <- topics[[topic]]
other_words <- c("the", "and", "in", "of", "with", "to", "for", "a", "is", "on")
# Generate a random number of sentences (3-5)
num_sentences <- sample(3:5, 1)
sentences <- vector("character", num_sentences)
for (i in 1:num_sentences) {
# For each sentence, include 1-3 topic words and 3-6 other words
num_topic_words <- sample(1:3, 1)
num_other_words <- sample(3:6, 1)
topic_selection <- sample(topic_words, num_topic_words, replace = TRUE)
other_selection <- sample(other_words, num_other_words, replace = TRUE)
all_words <- c(topic_selection, other_selection)
# Shuffle words
all_words <- sample(all_words)
# Create a sentence
sentences[i] <- paste0(paste(all_words, collapse = " "), ".")
}
# Combine sentences into an abstract
abstract <- paste(sentences, collapse = " ")
return(abstract)
}
# Assign topics to articles
topics_vec <- sample(names(topics), n, replace = TRUE)
# Generate abstracts
abstracts <- sapply(topics_vec, generate_abstract)
# Create a data frame
data.frame(
doc_id = 1:n,
title = paste("Article", 1:n),
publication_year = sample(2010:2023, n, replace = TRUE),
abstract = abstracts,
topic = topics_vec,
stringsAsFactors = FALSE
)
}
# Test for vec_preprocess function
test_that("vec_preprocess handles text data correctly", {
# Create test data
test_data <- create_test_data(5)
# Test preprocessing
result <- vec_preprocess(test_data, text_column = "abstract")
# Check that the function returns the expected structure
expect_s3_class(result, "data.frame")
expect_true("terms" %in% colnames(result))
expect_equal(nrow(result), nrow(test_data))
# Check that terms are properly extracted
expect_true(all(sapply(result$terms, function(df) is.data.frame(df))))
# Check handling of missing values
test_data_na <- test_data
test_data_na$abstract[1] <- NA
result_na <- vec_preprocess(test_data_na, text_column = "abstract")
expect_equal(nrow(result_na), nrow(test_data) - 1)
# Check stopword removal
result_with_stopwords <- vec_preprocess(test_data, remove_stopwords = FALSE)
result_without_stopwords <- vec_preprocess(test_data, remove_stopwords = TRUE)
# We would expect more unique terms with stopwords included
total_unique_terms_with <- length(unique(unlist(lapply(result_with_stopwords$terms, function(df) df$word))))
total_unique_terms_without <- length(unique(unlist(lapply(result_without_stopwords$terms, function(df) df$word))))
expect_true(total_unique_terms_with >= total_unique_terms_without)
# Test custom stopwords
custom_stopword_list <- c("algorithm", "computer", "data")
result_custom_stopwords <- vec_preprocess(test_data, custom_stopwords = custom_stopword_list)
# None of the custom stopwords should appear in the processed terms
all_words <- unique(unlist(lapply(result_custom_stopwords$terms, function(df) df$word)))
expect_true(!any(custom_stopword_list %in% all_words))
# Test word length filtering
result_min_length <- vec_preprocess(test_data, min_word_length = 5)
all_words_min_length <- unique(unlist(lapply(result_min_length$terms, function(df) df$word)))
expect_true(all(nchar(all_words_min_length) >= 5))
})
# Test for calc_doc_sim function
test_that("calc_doc_sim calculates document similarity correctly", {
# Create test data
test_data <- create_test_data(8)
# Calculate similarity matrix
sim_matrix <- calc_doc_sim(test_data, text_column = "abstract")
# Check that the function returns a matrix
expect_type(sim_matrix, "double")
expect_true(is.matrix(sim_matrix))
# Check matrix dimensions
expect_equal(nrow(sim_matrix), nrow(test_data))
expect_equal(ncol(sim_matrix), nrow(test_data))
# Check that diagonal values are 1 (document is identical to itself)
diag_values <- diag(sim_matrix)
expect_true(all(abs(diag_values - 1) < 1e-10))
# Check that all values are in [0, 1]
expect_true(all(sim_matrix >= 0 & sim_matrix <= 1))
# Check that matrix is symmetric
expect_true(isSymmetric(sim_matrix))
# Check that documents with the same topic have higher similarity
topic_groups <- split(1:nrow(test_data), test_data$topic)
# Only check if there are at least two documents with the same topic
for (group in topic_groups) {
if (length(group) >= 2) {
# Calculate average similarity within the group
within_sim <- mean(sim_matrix[group, group][lower.tri(sim_matrix[group, group])])
# Calculate average similarity between this group and other documents
other_docs <- setdiff(1:nrow(test_data), group)
if (length(other_docs) > 0) {
between_sim <- mean(sim_matrix[group, other_docs])
# Documents in the same topic should be more similar on average
expect_true(within_sim >= between_sim)
}
}
}
})
# Test for cluster_docs function
test_that("cluster_docs clusters documents correctly", {
# Create test data with clear topic separation
test_data <- create_test_data(15)
# Set number of clusters to match number of topics
n_clusters <- length(unique(test_data$topic))
# Cluster documents
clustered_data <- cluster_docs(test_data, text_column = "abstract",
n_clusters = n_clusters,
random_seed = 42)
# Check that the function returns a data frame
expect_s3_class(clustered_data, "data.frame")
# Check that cluster assignments are added
expect_true("cluster" %in% colnames(clustered_data))
# Check that all documents are assigned to a cluster
expect_true(all(!is.na(clustered_data$cluster)))
# Check that cluster numbers are in the expected range
expect_true(all(clustered_data$cluster >= 1 & clustered_data$cluster <= n_clusters))
# Check that cluster_terms attribute is present
expect_true(!is.null(attr(clustered_data, "cluster_terms")))
# Check that cluster_summaries attribute is present
expect_true(!is.null(attr(clustered_data, "cluster_summaries")))
# Check that all clusters have assigned documents
for (i in 1:n_clusters) {
expect_true(any(clustered_data$cluster == i))
}
# Optional: Evaluate cluster quality using topic labels as ground truth
# This assumes that the algorithm should ideally group documents by topic
# For each cluster, find the dominant topic
cluster_topic_counts <- lapply(1:n_clusters, function(cluster_id) {
table(clustered_data$topic[clustered_data$cluster == cluster_id])
})
# Calculate purity (percentage of correctly clustered documents)
correct_assignments <- sum(sapply(cluster_topic_counts, max))
purity <- correct_assignments / nrow(clustered_data)
# We expect purity to be better than random assignment
expect_true(purity > 1/n_clusters)
})
# Test for find_similar_docs function
test_that("find_similar_docs finds similar documents correctly", {
# Create test data
test_data <- create_test_data(10)
# Find similar documents for the first document
target_id <- 1
n_similar <- 3
similar_docs <- find_similar_docs(test_data, doc_id = target_id,
text_column = "abstract",
n_similar = n_similar)
# Check that the function returns a data frame
expect_s3_class(similar_docs, "data.frame")
# Check that the number of returned documents is correct
expect_equal(nrow(similar_docs), n_similar)
# Check that the target document is not in the results
expect_true(!target_id %in% similar_docs$doc_id)
# Check that similarity scores are in [0, 1]
expect_true(all(similar_docs$similarity_score >= 0 &
similar_docs$similarity_score <= 1))
# Check that the results are sorted by similarity score in descending order
expect_true(all(diff(similar_docs$similarity_score) <= 0))
# Check that additional metadata is included if available
expect_true(all(c("title", "publication_year") %in% colnames(similar_docs)))
# Test behavior with non-existent document ID
expect_error(find_similar_docs(test_data, doc_id = 999))
# Test behavior with documents having the same topic
# Documents with the same topic should be more similar
topic_of_target <- test_data$topic[test_data$doc_id == target_id]
# Get topics of similar documents using base R instead of dplyr
similar_doc_indices <- match(similar_docs$doc_id, test_data$doc_id)
similar_topics <- test_data$topic[similar_doc_indices]
# Count how many similar documents have the same topic
same_topic_count <- sum(similar_topics == topic_of_target)
# We would expect at least one of the similar documents to have the same topic
# Note: This is a probabilistic test and might fail occasionally
expect_true(same_topic_count > 0)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.