# Function implementations for the text_preprocessing module
#' Environment to store dictionary cache data
#' @keywords internal
.dict_cache_env <- new.env(parent = emptyenv())
#' Get dictionary cache environment
#' @return The environment containing cached dictionary data
#' @export
get_dict_cache <- function() {
.dict_cache_env
}
#' Preprocess article text
#'
#' This function preprocesses article text for further analysis.
#'
#' @param text_data A data frame containing article text data (title, abstract, etc.).
#' @param text_column Name of the column containing text to process.
#' @param remove_stopwords Logical. If TRUE, removes stopwords.
#' @param custom_stopwords Character vector of additional stopwords to remove.
#' @param stem_words Logical. If TRUE, applies stemming to words.
#' @param min_word_length Minimum word length to keep.
#' @param max_word_length Maximum word length to keep.
#'
#' @return A data frame with processed text and extracted terms.
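#'
#' @examples
#' # Minimal illustrative sketch using made-up article text (not package data)
#' articles <- data.frame(
#'   abstract = c("Aspirin may reduce the risk of migraine attacks.",
#'                "ACE2 expression was elevated in lung tissue samples."),
#'   stringsAsFactors = FALSE
#' )
#' processed <- preprocess_text(articles, text_column = "abstract")
#' # Each row gains a 'terms' data frame of word counts
#' processed$terms[[1]]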
#' @export
preprocess_text <- function(text_data, text_column = "abstract",
remove_stopwords = TRUE,
custom_stopwords = NULL,
stem_words = FALSE,
min_word_length = 3,
max_word_length = 50) {
# Add ID column if not present
if (!"doc_id" %in% colnames(text_data)) {
text_data$doc_id <- seq_len(nrow(text_data))
}
# Check if text column exists
if (!text_column %in% colnames(text_data)) {
stop("Text column '", text_column, "' not found in the data")
}
# Create a copy of the data
processed_data <- text_data
# Ensure text is character
processed_data[[text_column]] <- as.character(processed_data[[text_column]])
# Remove missing values
processed_data <- processed_data[!is.na(processed_data[[text_column]]), ]
# Load standard English stopwords if needed
stopword_list <- character(0)
if (remove_stopwords) {
    # Use a built-in set of basic English stopwords (no external packages required)
stopword_list <- c(
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "from", "had",
"has", "have", "he", "her", "his", "i", "in", "is", "it", "its", "of", "on",
"or", "that", "the", "this", "to", "was", "were", "which", "with", "you"
)
# Add custom stopwords if provided
if (!is.null(custom_stopwords)) {
stopword_list <- c(stopword_list, custom_stopwords)
}
}
# Function to tokenize text
tokenize_text <- function(text) {
# Convert to lowercase
text <- tolower(text)
# Replace non-alphanumeric characters with spaces
text <- gsub("[^a-zA-Z0-9]", " ", text)
# Split by whitespace
words <- unlist(strsplit(text, "\\s+"))
# Remove empty strings
words <- words[words != ""]
# Apply length filtering
words <- words[nchar(words) >= min_word_length & nchar(words) <= max_word_length]
# Remove stopwords if requested
if (remove_stopwords) {
words <- words[!words %in% stopword_list]
}
# Apply stemming if requested
if (stem_words) {
if (!requireNamespace("SnowballC", quietly = TRUE)) {
stop("The SnowballC package is required for stemming. Install it with: install.packages('SnowballC')")
}
words <- SnowballC::wordStem(words, language = "english")
}
# Return the tokenized words
return(words)
}
message("Tokenizing text...")
# Process each document
terms_list <- list()
for (i in seq_len(nrow(processed_data))) {
text <- processed_data[[text_column]][i]
if (!is.na(text) && text != "") {
# Tokenize the text
words <- tokenize_text(text)
# Count word frequencies
word_counts <- table(words)
# Convert to data frame
if (length(word_counts) > 0) {
terms_df <- data.frame(
word = names(word_counts),
count = as.numeric(word_counts),
stringsAsFactors = FALSE
)
terms_list[[i]] <- terms_df
} else {
terms_list[[i]] <- data.frame(word = character(0), count = numeric(0), stringsAsFactors = FALSE)
}
} else {
terms_list[[i]] <- data.frame(word = character(0), count = numeric(0), stringsAsFactors = FALSE)
}
}
# Add terms to the processed data
processed_data$terms <- terms_list
return(processed_data)
}
#' Extract and classify entities from text with multi-domain types
#'
#' This function extracts entities from text and optionally assigns them to specific
#' semantic categories based on dictionaries.
#'
#' @param text_data A data frame containing article text data.
#' @param text_column Name of the column containing text to process.
#' @param dictionary Combined dictionary or list of dictionaries for entity extraction.
#' @param case_sensitive Logical. If TRUE, matching is case-sensitive.
#' @param overlap_strategy How to handle terms that match multiple dictionaries: "priority", "all", or "longest".
#' @param sanitize_dict Logical. If TRUE, sanitizes the dictionary before extraction.
#'
#' @return A data frame with extracted entities, their types, and positions.
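#'
#' @examples
#' # Illustrative sketch with a tiny inline dictionary (made-up data);
#' # sanitize_dict = FALSE keeps the example self-contained
#' articles <- data.frame(
#'   abstract = c("Aspirin may reduce the risk of migraine.",
#'                "Fever and cough are common symptoms."),
#'   stringsAsFactors = FALSE
#' )
#' dict <- data.frame(
#'   term = c("aspirin", "migraine", "fever", "cough"),
#'   type = c("drug", "disease", "disease", "disease"),
#'   stringsAsFactors = FALSE
#' )
#' entities <- extract_entities(articles, text_column = "abstract",
#'                              dictionary = dict, sanitize_dict = FALSE)
#' head(entities)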
#' @export
extract_entities <- function(text_data, text_column = "abstract",
dictionary = NULL,
case_sensitive = FALSE,
overlap_strategy = c("priority", "all", "longest"),
sanitize_dict = TRUE) {
# Match overlap_strategy argument
overlap_strategy <- match.arg(overlap_strategy)
# Check inputs
if (is.null(dictionary)) {
stop("Dictionary must be provided")
}
# Check if text column exists
if (!text_column %in% colnames(text_data)) {
stop("Text column '", text_column, "' not found in the data")
}
# Sanitize dictionary if requested
if (sanitize_dict) {
original_count <- nrow(dictionary)
dictionary <- sanitize_dictionary(dictionary, term_column = "term")
if (nrow(dictionary) < original_count) {
message("Dictionary sanitized: ", nrow(dictionary), " of ", original_count, " terms retained")
}
# If dictionary is empty after sanitization, stop
if (nrow(dictionary) == 0) {
stop("No terms remain in the dictionary after sanitization")
}
}
# Add ID column if not present
if (!"doc_id" %in% colnames(text_data)) {
text_data$doc_id <- seq_len(nrow(text_data))
}
# Initialize results data frame
results <- data.frame(
doc_id = integer(),
entity = character(),
entity_type = character(),
start_pos = integer(),
end_pos = integer(),
sentence = character(),
stringsAsFactors = FALSE
)
# Function to split text into sentences for better context
split_into_sentences <- function(text) {
# Simple sentence splitting - can be enhanced with more rules
sentences <- unlist(strsplit(text, "(?<=[.!?])\\s+", perl = TRUE))
# Remove empty sentences
sentences <- sentences[sentences != ""]
# Calculate start and end positions for each sentence
positions <- list()
start_pos <- 1
for (i in seq_along(sentences)) {
end_pos <- start_pos + nchar(sentences[i]) - 1
positions[[i]] <- c(start_pos, end_pos)
      start_pos <- end_pos + 2 # move past the sentence-final character and the single separating space
}
return(list(sentences = sentences, positions = positions))
}
# Process each document
message("Extracting entities from ", nrow(text_data), " documents...")
for (i in seq_len(nrow(text_data))) {
if (i %% 100 == 0) {
message("Processing document ", i, " of ", nrow(text_data))
}
doc_id <- text_data$doc_id[i]
text <- text_data[[text_column]][i]
# Skip empty or NA text
if (is.na(text) || text == "") {
next
}
# Split text into sentences for better context
text_sentences <- split_into_sentences(text)
sentences <- text_sentences$sentences
sentence_positions <- text_sentences$positions
# Search for entities in each sentence
for (s in seq_along(sentences)) {
sentence <- sentences[s]
sentence_start <- sentence_positions[[s]][1]
# Text to search (handle case sensitivity)
text_to_search <- if (case_sensitive) sentence else tolower(sentence)
# Track all matches to handle overlaps
all_matches <- list()
# Process each term in the dictionary
for (j in seq_len(nrow(dictionary))) {
term <- dictionary$term[j]
term_type <- dictionary$type[j]
# Skip empty terms
if (is.na(term) || term == "") {
next
}
# For case-insensitive matching, convert term to lowercase
search_term <- if (case_sensitive) term else tolower(term)
# Create pattern with word boundaries
pattern <- paste0("\\b", search_term, "\\b")
# Find all matches
matches <- gregexpr(pattern, text_to_search, fixed = FALSE, perl = TRUE)
# If matches found
if (matches[[1]][1] != -1) {
match_starts <- matches[[1]]
match_ends <- match_starts + attr(matches[[1]], "match.length") - 1
for (m in seq_along(match_starts)) {
# Calculate positions in the original text
start_pos <- sentence_start + match_starts[m] - 1
end_pos <- sentence_start + match_ends[m] - 1
# Add to all matches
all_matches[[length(all_matches) + 1]] <- list(
term = term,
type = term_type,
priority = j, # Lower index = higher priority
start_pos = start_pos,
end_pos = end_pos,
length = end_pos - start_pos + 1,
sentence = sentence
)
}
}
}
# Handle overlapping matches based on strategy
final_matches <- list()
if (length(all_matches) > 0) {
# Sort matches by start position
all_matches <- all_matches[order(sapply(all_matches, function(x) x$start_pos))]
if (overlap_strategy == "all") {
# Keep all matches, including overlapping ones
final_matches <- all_matches
} else {
# Check for overlaps
current_end <- 0
for (m in seq_along(all_matches)) {
match <- all_matches[[m]]
if (match$start_pos > current_end) {
# No overlap with previous matches
final_matches[[length(final_matches) + 1]] <- match
current_end <- match$end_pos
} else {
# Overlap - apply strategy
if (overlap_strategy == "longest") {
# Check if current match is longer than the last accepted match
if (match$length > (final_matches[[length(final_matches)]]$length)) {
# Replace last match with this longer one
final_matches[[length(final_matches)]] <- match
current_end <- match$end_pos
}
} else if (overlap_strategy == "priority") {
# Check if current match has higher priority (lower priority number)
if (match$priority < final_matches[[length(final_matches)]]$priority) {
# Replace last match with higher priority one
final_matches[[length(final_matches)]] <- match
current_end <- match$end_pos
}
}
}
}
}
}
# Add final matches to results
for (match in final_matches) {
results <- rbind(results, data.frame(
doc_id = doc_id,
entity = match$term,
entity_type = match$type,
start_pos = match$start_pos,
end_pos = match$end_pos,
sentence = match$sentence,
stringsAsFactors = FALSE
))
}
}
}
# If no entities found, return empty data frame with correct structure
if (nrow(results) == 0) {
message("No entities found")
return(data.frame(
doc_id = integer(),
entity = character(),
entity_type = character(),
start_pos = integer(),
end_pos = integer(),
sentence = character(),
stringsAsFactors = FALSE
))
}
# Count entity occurrences by type
entity_counts <- table(results$entity_type)
message("Extracted ", nrow(results), " entity mentions:")
for (type in names(entity_counts)) {
message(" ", type, ": ", entity_counts[type])
}
# Add frequency count as a column
entity_frequency <- as.data.frame(table(results$entity, results$entity_type))
colnames(entity_frequency) <- c("entity", "entity_type", "frequency")
# Merge frequency back into results
results <- merge(results, entity_frequency, by = c("entity", "entity_type"))
# Sort by frequency and doc_id
results <- results[order(-results$frequency, results$doc_id), ]
return(results)
}
#' Load biomedical dictionaries with improved error handling
#'
#' This function loads pre-defined biomedical dictionaries or fetches terms from MeSH/UMLS.
#'
#' @param dictionary_type Type of dictionary to load. For local dictionaries, limited to "disease", "drug", "gene".
#' For MeSH and UMLS, expanded to include more semantic categories.
#' @param custom_path Optional path to a custom dictionary file.
#' @param source The source to fetch terms from: "local", "mesh", or "umls".
#' @param api_key UMLS API key for authentication (required if source = "umls").
#' @param n_terms Number of terms to fetch.
#' @param mesh_query Additional query to filter MeSH terms (only if source = "mesh").
#' @param semantic_type_filter Filter by semantic type (used mainly with UMLS).
#' @param sanitize Logical. If TRUE, sanitizes the dictionary terms.
#' @param extended_mesh Logical. If TRUE and source is "mesh", uses PubMed search for additional terms.
#' @param mesh_queries Named list of MeSH queries for different categories (only if extended_mesh = TRUE).
#'
#' @return A data frame containing the dictionary.
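#'
#' @examples
#' \dontrun{
#' # Illustrative sketches; these may need network access or fall back to the
#' # bundled example dictionaries depending on the installation
#' disease_dict <- load_dictionary("disease", source = "local")
#' drug_dict <- load_dictionary("drug", source = "mesh", n_terms = 50)
#' }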
#' @export
load_dictionary <- function(dictionary_type = NULL,
custom_path = NULL,
source = c("local", "mesh", "umls"),
api_key = NULL,
n_terms = 200,
mesh_query = NULL,
semantic_type_filter = NULL,
sanitize = TRUE,
extended_mesh = FALSE,
mesh_queries = NULL) {
source <- match.arg(source)
# Define allowed dictionary types for local source (limited to original three)
local_dict_types <- c("disease", "drug", "gene")
# Define expanded dictionary types for MeSH and UMLS
mesh_dict_types <- c(
"disease", "drug", "gene", "protein", "chemical", "pathway",
"anatomy", "organism", "biological_process", "cell", "tissue",
"symptom", "diagnostic_procedure", "therapeutic_procedure",
"phenotype", "molecular_function"
)
umls_dict_types <- c(
"disease", "drug", "gene", "protein", "chemical", "pathway",
"anatomy", "organism", "biological_process", "cellular_component",
"molecular_function", "diagnostic_procedure", "therapeutic_procedure",
"phenotype", "symptom", "cell", "tissue", "mental_process",
"physiologic_function", "laboratory_procedure"
)
# Validate dictionary_type based on source
if (!is.null(dictionary_type)) {
if (source == "local" && !(dictionary_type %in% local_dict_types)) {
# Instead of stopping, switch to mesh source for expanded types
message("Type '", dictionary_type, "' not supported for local source. Switching to MeSH source.")
source <- "mesh"
} else if (source == "mesh" && !(dictionary_type %in% mesh_dict_types)) {
stop("For MeSH dictionaries, dictionary_type must be one of: ", paste(mesh_dict_types, collapse = ", "))
} else if (source == "umls" && !(dictionary_type %in% umls_dict_types)) {
stop("For UMLS dictionaries, dictionary_type must be one of: ", paste(umls_dict_types, collapse = ", "))
}
}
# If custom path is provided, use it
if (!is.null(custom_path)) {
if (!file.exists(custom_path)) {
stop("Custom dictionary file not found: ", custom_path)
}
ext <- tools::file_ext(custom_path)
if (ext == "csv") {
dict <- utils::read.csv(custom_path, stringsAsFactors = FALSE)
} else if (ext == "rds") {
dict <- readRDS(custom_path)
} else {
stop("Unsupported file format: ", ext, ". Supported formats: csv, rds")
}
# Check if the dictionary has the required columns
required_cols <- c("term", "type")
if (!all(required_cols %in% colnames(dict))) {
stop("Dictionary must have columns: ", paste(required_cols, collapse = ", "))
}
# Apply sanitization if requested
if (sanitize) {
dict <- sanitize_dictionary(dict)
}
return(dict)
}
# Define semantic types based on dictionary_type if not provided
if (is.null(semantic_type_filter)) {
if (source == "umls") {
semantic_type_filter <- get_umls_semantic_types(dictionary_type)
}
}
# Fetch terms from different sources
if (source == "mesh") {
# Regular MeSH loading
base_dict <- load_from_mesh(dictionary_type, n_terms, mesh_query)
# If extended_mesh is requested, add terms from PubMed/MeSH searches
if (extended_mesh && !is.null(mesh_queries)) {
pubmed_dict <- load_mesh_terms_from_pubmed(
mesh_queries = mesh_queries,
max_results = n_terms / 2, # Split the max results between direct and extended
sanitize = FALSE # We'll sanitize the combined dictionary later
)
if (nrow(pubmed_dict) > 0) {
# Combine dictionaries
base_dict <- rbind(base_dict, pubmed_dict)
# Remove duplicates based on term
base_dict <- base_dict[!duplicated(base_dict$term), ]
}
}
# Apply sanitization if requested
if (sanitize) {
base_dict <- sanitize_dictionary(base_dict)
}
return(base_dict)
} else if (source == "umls") {
if (is.null(api_key)) {
message("UMLS API key is required for fetching terms from UMLS. Switching to MeSH.")
# Recursively call with mesh source
return(load_dictionary(dictionary_type, custom_path = custom_path,
source = "mesh", n_terms = n_terms,
mesh_query = mesh_query, sanitize = sanitize))
}
umls_dict <- load_from_umls(dictionary_type, api_key, n_terms, semantic_type_filter)
# Apply sanitization if requested
if (sanitize) {
umls_dict <- sanitize_dictionary(umls_dict)
}
return(umls_dict)
} else {
# Local source
# Check if dictionary type is supported by local source
if (!is.null(dictionary_type) && !(dictionary_type %in% local_dict_types)) {
message("Dictionary type '", dictionary_type, "' not supported by local source. Trying MeSH source.")
# Recursively call with mesh source
return(load_dictionary(dictionary_type, custom_path = custom_path,
source = "mesh", n_terms = n_terms,
mesh_query = mesh_query, sanitize = sanitize))
}
# Try to load from package data
pkg_path <- system.file("extdata", "dictionaries", package = "LBDiscover")
if (pkg_path == "") {
# If package is not installed, use example dictionaries for development
message("Package not installed or dictionary not found. Using example dictionaries.")
local_dict <- create_dummy_dictionary(dictionary_type)
} else {
# Load dictionary from package
dict_path <- file.path(pkg_path, paste0(dictionary_type, "_dictionary.rds"))
if (!file.exists(dict_path)) {
message("Dictionary not found: ", dict_path, ". Using example dictionary.")
local_dict <- create_dummy_dictionary(dictionary_type)
} else {
local_dict <- readRDS(dict_path)
}
}
# Apply sanitization if requested
if (sanitize) {
local_dict <- sanitize_dictionary(local_dict)
}
return(local_dict)
}
}
#' Load terms from MeSH using rentrez with improved error handling
#'
#' This function uses the rentrez package to retrieve terms from MeSH database.
#'
#' @param dictionary_type Type of dictionary to load (e.g., "disease", "drug", "gene").
#' @param n_terms Maximum number of terms to fetch.
#' @param query Additional query to filter MeSH terms.
#'
#' @return A data frame containing the MeSH terms.
#' @keywords internal
load_from_mesh <- function(dictionary_type, n_terms = 200, query = NULL) {
# Check if rentrez is available
if (!requireNamespace("rentrez", quietly = TRUE)) {
stop("The rentrez package is required for MeSH queries. Install it with: install.packages('rentrez')")
}
if (!requireNamespace("xml2", quietly = TRUE)) {
stop("The xml2 package is required for parsing MeSH data. Install it with: install.packages('xml2')")
}
# Build query based on dictionary type - expanded for more categories
mesh_query_map <- list(
"disease" = "disease[MeSH]",
"drug" = "pharmaceutical preparations[MeSH]",
"gene" = "genes[MeSH]",
"protein" = "proteins[MeSH]",
"chemical" = "chemicals[MeSH]",
"pathway" = "metabolic networks and pathways[MeSH]",
"anatomy" = "anatomy[MeSH]",
"organism" = "organisms[MeSH]",
"biological_process" = "biological phenomena[MeSH]",
"cell" = "cells[MeSH]",
"tissue" = "tissues[MeSH]",
"symptom" = "signs and symptoms[MeSH]",
"diagnostic_procedure" = "diagnostic techniques and procedures[MeSH]",
"therapeutic_procedure" = "therapeutics[MeSH]",
"phenotype" = "phenotype[MeSH]",
"molecular_function" = "molecular biology[MeSH]"
)
if (is.null(dictionary_type) || !dictionary_type %in% names(mesh_query_map)) {
# If dictionary_type is not specified or not in our mapping, use a general search
base_query <- "mesh[sb]"
} else {
base_query <- mesh_query_map[[dictionary_type]]
}
# Add user-provided query if available
if (!is.null(query)) {
# If the query already has MeSH qualifier, use it directly
if (grepl("\\[MeSH\\]", query)) {
search_query <- query
} else {
# Otherwise, treat it as a keyword to search within MeSH terms
search_query <- paste0("(", query, "[MeSH Terms]) AND ", base_query)
}
} else {
search_query <- base_query
}
message("Searching MeSH database for: ", search_query)
# Try to fetch MeSH IDs with more robust error handling
mesh_ids <- NULL
# Try searching PubMed with MeSH terms first
pubmed_search <- tryCatch({
rentrez::entrez_search(
db = "pubmed",
term = search_query,
use_history = TRUE,
retmax = 0 # We only need the count
)
}, error = function(e) {
message("PubMed search error: ", e$message)
return(NULL)
})
if (!is.null(pubmed_search) && pubmed_search$count > 0) {
message("Found ", pubmed_search$count, " PubMed records with matching MeSH terms")
# Get most common MeSH terms from these results
mesh_count <- tryCatch({
rentrez::entrez_link(
dbfrom = "pubmed",
db = "mesh",
cmd = "neighbor_score",
        web_history = pubmed_search$web_history
)
}, error = function(e) {
message("Error getting MeSH links: ", e$message)
return(NULL)
})
if (!is.null(mesh_count) && !is.null(mesh_count$links) &&
!is.null(mesh_count$links$pubmed_mesh) &&
length(mesh_count$links$pubmed_mesh) > 0) {
# Limit the number of MeSH IDs to retrieve
max_ids <- min(n_terms, length(mesh_count$links$pubmed_mesh))
mesh_ids <- mesh_count$links$pubmed_mesh[1:max_ids]
}
}
# If we didn't get MeSH IDs from PubMed, try direct MeSH search
if (is.null(mesh_ids)) {
message("Trying direct MeSH database search...")
# For direct MeSH search, simplify the query
if (!is.null(query)) {
direct_query <- query
} else if (!is.null(dictionary_type) && dictionary_type %in% names(mesh_query_map)) {
# Extract keyword from MeSH query (removing the [MeSH] qualifier)
direct_query <- gsub("\\[MeSH\\]", "", mesh_query_map[[dictionary_type]])
} else {
direct_query <- "" # Empty query will return all MeSH terms
}
# Use shorter term limit for direct MeSH search to avoid large results
limited_n_terms <- min(n_terms, 100)
mesh_search <- tryCatch({
rentrez::entrez_search(
db = "mesh",
term = direct_query,
retmax = limited_n_terms
)
}, error = function(e) {
message("MeSH search error: ", e$message)
return(NULL)
})
if (!is.null(mesh_search) && length(mesh_search$ids) > 0) {
mesh_ids <- mesh_search$ids
} else {
warning("No results found in MeSH for query: ", direct_query)
return(create_dummy_dictionary(dictionary_type))
}
}
# If we still don't have MeSH IDs, return dummy dictionary
if (is.null(mesh_ids) || length(mesh_ids) == 0) {
warning("Could not retrieve any MeSH IDs")
return(create_dummy_dictionary(dictionary_type))
}
# Fetch MeSH records in smaller batches to avoid memory issues
batch_size <- 20
num_batches <- ceiling(length(mesh_ids) / batch_size)
# Initialize combined dictionary
combined_dict <- data.frame(
term = character(),
id = character(),
type = character(),
source = character(),
stringsAsFactors = FALSE
)
for (batch in 1:num_batches) {
message("Processing batch ", batch, " of ", num_batches)
# Get batch of IDs
start_idx <- (batch - 1) * batch_size + 1
end_idx <- min(batch * batch_size, length(mesh_ids))
batch_ids <- mesh_ids[start_idx:end_idx]
# Fetch MeSH record data for this batch
mesh_records <- tryCatch({
rentrez::entrez_fetch(
db = "mesh",
id = batch_ids,
rettype = "full",
retmode = "xml"
)
}, error = function(e) {
message("Error fetching MeSH records for batch ", batch, ": ", e$message)
return(NULL)
})
if (!is.null(mesh_records)) {
# Process the batch
batch_dict <- process_mesh_xml(mesh_records, dictionary_type)
# Add to combined dictionary
if (nrow(batch_dict) > 0 && !inherits(batch_dict, "try-error")) {
combined_dict <- rbind(combined_dict, batch_dict)
}
}
}
# Check if we got any terms
if (nrow(combined_dict) == 0) {
warning("No terms extracted from MeSH records")
return(create_dummy_dictionary(dictionary_type))
}
# Remove duplicates
combined_dict <- combined_dict[!duplicated(combined_dict$term), ]
message("Retrieved ", nrow(combined_dict), " unique terms from MeSH")
return(combined_dict)
}
#' Process MeSH XML data with improved error handling
#'
#' Helper function to process MeSH XML data and extract terms
#'
#' @param mesh_records XML data from MeSH database
#' @param dictionary_type Type of dictionary
#'
#' @return A data frame with MeSH terms
#' @keywords internal
process_mesh_xml <- function(mesh_records, dictionary_type) {
# Check if the input is empty or invalid
if (is.null(mesh_records) || length(mesh_records) == 0) {
warning("Empty MeSH records provided")
return(create_dummy_dictionary(dictionary_type))
}
# Check for excessively large input to avoid memory issues
if (nchar(mesh_records) > 1e7) { # 10MB limit
warning("MeSH records too large to process at once, using chunked processing")
return(process_mesh_chunks(mesh_records, dictionary_type))
}
# Try to parse the XML with improved error handling
tryCatch({
# First, check if the data looks like valid XML
if (!grepl("<\\?xml|<DescriptorRecord|<Concept", substr(mesh_records, 1, 1000))) {
# If not, it might be text data instead of XML - try to extract useful information
return(extract_mesh_from_text(mesh_records, dictionary_type))
}
# Try to parse as XML
mesh_xml <- xml2::read_xml(mesh_records)
# Extract terms and IDs - handle both DescriptorRecord and Concept formats
mesh_nodes <- xml2::xml_find_all(mesh_xml, "//DescriptorRecord|//Concept")
if (length(mesh_nodes) == 0) {
warning("No MeSH descriptor records found in XML")
return(create_dummy_dictionary(dictionary_type))
}
# Initialize vectors to store extracted data
terms <- character()
ids <- character()
# Process each node
for (node in mesh_nodes) {
node_name <- xml2::xml_name(node)
if (node_name == "DescriptorRecord") {
# Get descriptor UI (ID)
descriptor_ui_node <- xml2::xml_find_first(node, ".//DescriptorUI")
if (!is.na(descriptor_ui_node)) {
descriptor_ui <- xml2::xml_text(descriptor_ui_node)
# Get descriptor name (term)
descriptor_name_node <- xml2::xml_find_first(node, ".//DescriptorName/String")
if (!is.na(descriptor_name_node)) {
descriptor_name <- xml2::xml_text(descriptor_name_node)
# Add to vectors
terms <- c(terms, descriptor_name)
ids <- c(ids, descriptor_ui)
# Also get entry terms (synonyms)
entry_terms <- xml2::xml_find_all(node, ".//ConceptList//Term/String")
if (length(entry_terms) > 0) {
entry_terms_text <- xml2::xml_text(entry_terms)
terms <- c(terms, entry_terms_text)
ids <- c(ids, rep(descriptor_ui, length(entry_terms_text)))
}
}
}
} else if (node_name == "Concept") {
# For Concept records
concept_ui_node <- xml2::xml_find_first(node, ".//ConceptUI")
if (!is.na(concept_ui_node)) {
concept_ui <- xml2::xml_text(concept_ui_node)
concept_name_node <- xml2::xml_find_first(node, ".//ConceptName/String")
if (!is.na(concept_name_node)) {
concept_name <- xml2::xml_text(concept_name_node)
# Add to vectors
terms <- c(terms, concept_name)
ids <- c(ids, concept_ui)
# Also get terms
term_nodes <- xml2::xml_find_all(node, ".//TermList/Term/String")
if (length(term_nodes) > 0) {
term_texts <- xml2::xml_text(term_nodes)
terms <- c(terms, term_texts)
ids <- c(ids, rep(concept_ui, length(term_texts)))
}
}
}
}
}
if (length(terms) == 0) {
warning("No terms extracted from MeSH XML")
return(create_dummy_dictionary(dictionary_type))
}
# Create data frame
dict <- data.frame(
term = terms,
id = ids,
type = rep(dictionary_type, length(terms)),
source = rep("mesh", length(terms)),
stringsAsFactors = FALSE
)
# Remove duplicates
dict <- dict[!duplicated(dict$term), ]
message("Retrieved ", nrow(dict), " unique terms from MeSH")
return(dict)
}, error = function(e) {
warning("Error parsing MeSH XML: ", e$message)
return(create_dummy_dictionary(dictionary_type))
})
}
#' Process MeSH data in chunks to avoid memory issues
#'
#' @param mesh_records Large MeSH records data
#' @param dictionary_type Type of dictionary
#'
#' @return A data frame with MeSH terms
#' @keywords internal
process_mesh_chunks <- function(mesh_records, dictionary_type) {
# Extract records by splitting on record boundaries
chunk_size <- 1e6 # 1MB chunks
num_chunks <- ceiling(nchar(mesh_records) / chunk_size)
# Initialize data frame for results
combined_dict <- data.frame(
term = character(),
id = character(),
type = character(),
source = character(),
stringsAsFactors = FALSE
)
# Process in chunks
for (i in 1:num_chunks) {
start_pos <- (i-1) * chunk_size + 1
end_pos <- min(i * chunk_size, nchar(mesh_records))
# Extract chunk
chunk <- substr(mesh_records, start_pos, end_pos)
# Find complete records in this chunk
# Look for record start/end patterns
start_tags <- gregexpr("<DescriptorRecord>|<Concept>", chunk)[[1]]
end_tags <- gregexpr("</DescriptorRecord>|</Concept>", chunk)[[1]]
# If no complete records in this chunk, combine with next chunk
if (length(start_tags) == 0 || length(end_tags) == 0 ||
start_tags[1] == -1 || end_tags[1] == -1) {
next
}
# Process each complete record
for (j in 1:length(start_tags)) {
# Find matching end tag
matched_end_idx <- which(end_tags > start_tags[j])[1]
if (!is.na(matched_end_idx)) {
record <- substr(chunk, start_tags[j], end_tags[matched_end_idx] +
attr(end_tags, "match.length")[matched_end_idx] - 1)
# Try to parse this record
tryCatch({
# Wrap in root element for XML parsing
record_xml <- paste0("<root>", record, "</root>")
dict_chunk <- process_mesh_xml(record_xml, dictionary_type)
# Add to combined dictionary
if (nrow(dict_chunk) > 0 && !inherits(dict_chunk, "try-error")) {
combined_dict <- rbind(combined_dict, dict_chunk)
}
}, error = function(e) {
# Skip this record on error
})
}
}
}
# Remove duplicates from combined dictionary
if (nrow(combined_dict) > 0) {
combined_dict <- combined_dict[!duplicated(combined_dict$term), ]
message("Retrieved ", nrow(combined_dict), " unique terms from chunked MeSH processing")
return(combined_dict)
} else {
return(create_dummy_dictionary(dictionary_type))
}
}
#' Extract MeSH terms from text format instead of XML
#'
#' @param mesh_text Text containing MeSH data in non-XML format
#' @param dictionary_type Type of dictionary
#'
#' @return A data frame with MeSH terms
#' @keywords internal
extract_mesh_from_text <- function(mesh_text, dictionary_type) {
# Initialize vectors to store terms and IDs
terms <- character()
ids <- character()
# Try to identify terms and their IDs from the text
# Look for patterns like "1: Term Name" or "Tree Number(s): D25.651"
# Extract terms - look for patterns like "1: Term Name" or just capitalized phrases
term_matches <- gregexpr("\\d+:\\s*([A-Z][^,\\n\\r]+)|(^[A-Z][^,\\n\\r]+)", mesh_text, perl = TRUE)
if (term_matches[[1]][1] != -1) {
term_starts <- term_matches[[1]]
term_lengths <- attr(term_matches[[1]], "match.length")
for (i in 1:length(term_starts)) {
term_text <- substr(mesh_text, term_starts[i], term_starts[i] + term_lengths[i] - 1)
# Clean up the term
term_text <- gsub("^\\d+:\\s*", "", term_text)
term_text <- trimws(term_text)
# Add to our vectors
if (nchar(term_text) > 0) {
terms <- c(terms, term_text)
# Generate a placeholder ID if we can't extract it
ids <- c(ids, paste0("MESH_", i))
}
}
}
# Look for Entry Terms section
entry_term_section <- regexpr("Entry Terms?:([^\\n]+)\\n", mesh_text)
if (entry_term_section[1] != -1) {
entry_text <- substr(mesh_text,
entry_term_section[1],
entry_term_section[1] + attr(entry_term_section, "match.length")[1] - 1)
# Extract entry terms
entry_terms <- strsplit(gsub("Entry Terms?:", "", entry_text), ",")[[1]]
entry_terms <- trimws(entry_terms)
# Add to our vectors
if (length(entry_terms) > 0) {
terms <- c(terms, entry_terms)
# Use the same ID for all entry terms from this section
ids <- c(ids, rep(paste0("MESH_ENTRY_", length(ids) + 1), length(entry_terms)))
}
}
# Extract Tree Numbers - these might help us generate better IDs
tree_matches <- regexpr("Tree Number\\(s\\):\\s*([^\\n]+)", mesh_text)
tree_numbers <- character()
if (tree_matches[1] != -1) {
tree_text <- substr(mesh_text,
tree_matches[1],
tree_matches[1] + attr(tree_matches, "match.length")[1] - 1)
# Extract tree numbers
tree_numbers <- strsplit(gsub("Tree Number\\(s\\):\\s*", "", tree_text), ",")[[1]]
tree_numbers <- trimws(tree_numbers)
# If we have tree numbers, use them to improve our IDs
if (length(tree_numbers) > 0 && length(ids) > 0) {
# Update the first ID with a tree number
ids[1] <- gsub("\\.", "", tree_numbers[1])
}
}
# If we failed to extract terms, return dummy dictionary
if (length(terms) == 0) {
warning("No terms extracted from MeSH text")
return(create_dummy_dictionary(dictionary_type))
}
# Create data frame
dict <- data.frame(
term = terms,
id = ids,
type = rep(dictionary_type, length(terms)),
source = rep("mesh_text", length(terms)),
stringsAsFactors = FALSE
)
# Remove duplicates
dict <- dict[!duplicated(dict$term), ]
message("Extracted ", nrow(dict), " unique terms from MeSH text format")
return(dict)
}
#' Load terms from UMLS API
#'
#' This function retrieves terms from UMLS using the REST API.
#'
#' @param dictionary_type Type of dictionary to load (e.g., "disease", "drug", "gene").
#' @param api_key UMLS API key for authentication.
#' @param n_terms Maximum number of terms to fetch.
#' @param semantic_types Vector of semantic type identifiers to filter by.
#'
#' @return A data frame containing the UMLS terms.
#' @keywords internal
load_from_umls <- function(dictionary_type, api_key, n_terms = 200, semantic_types = NULL) {
if (!requireNamespace("httr", quietly = TRUE)) {
stop("The httr package is required for UMLS API calls. Install it with: install.packages('httr')")
}
if (!requireNamespace("jsonlite", quietly = TRUE)) {
stop("The jsonlite package is required for parsing UMLS API responses. Install it with: install.packages('jsonlite')")
}
if (is.null(semantic_types)) {
warning("No semantic types specified for UMLS query. Using default types for: ", dictionary_type)
semantic_types <- switch(dictionary_type,
"disease" = c("T047", "T048", "T019", "T046"), # Disease and Syndrome, Mental Disorder, Congenital Abnormality, Pathologic Function
"drug" = c("T116", "T121", "T195", "T200"), # Amino Acid, Lipid, Antibiotic, Clinical Drug
"gene" = c("T028", "T087", "T123"), # Gene, Amino Acid Sequence, Nucleotide Sequence
stop("Unsupported dictionary type for UMLS: ", dictionary_type))
}
message("Authenticating with UMLS...")
# Authenticate with UMLS to get TGT
tgt_url <- authenticate_umls(api_key)
if (is.null(tgt_url)) {
warning("Failed to authenticate with UMLS. Using dummy dictionary instead.")
return(create_dummy_dictionary(dictionary_type))
}
message("Fetching terms from UMLS for semantic types: ", paste(semantic_types, collapse = ", "))
  # Searching by semantic type directly has proven unreliable, so we search with a
  # keyword based on the dictionary type and then filter the results by semantic type
search_string <- switch(dictionary_type,
"disease" = "disease",
"drug" = "drug",
"gene" = "gene",
"")
# Get service ticket for this request
service_ticket <- get_service_ticket(tgt_url)
if (is.null(service_ticket)) {
warning("Failed to get service ticket")
return(create_dummy_dictionary(dictionary_type))
}
# Perform search with keyword
search_url <- "https://uts-ws.nlm.nih.gov/rest/search/current"
# Search query parameters
query_params <- list(
ticket = service_ticket,
string = search_string,
sabs = "SNOMEDCT_US,ICD10CM,MSH", # Limit to these vocabularies
searchType = "words",
returnIdType = "concept",
pageSize = n_terms
)
response <- httr::GET(url = search_url, query = query_params)
if (httr::status_code(response) != 200) {
warning("Failed to search UMLS: ", httr::http_status(response)$message)
return(create_dummy_dictionary(dictionary_type))
}
# Parse response
content <- httr::content(response, "text", encoding = "UTF-8")
result <- jsonlite::fromJSON(content)
# Check if results exist
if (length(result$result$results) == 0) {
warning("No results found for search: ", search_string)
return(create_dummy_dictionary(dictionary_type))
}
# Get CUIs from the search results
cuis <- result$result$results$ui
# Now for each CUI, get its semantic types and filter
all_terms <- list()
for (cui in cuis) {
# Get a fresh service ticket for each request to avoid expiration
service_ticket <- get_service_ticket(tgt_url)
if (is.null(service_ticket)) {
next
}
# Get concept info to check its semantic type
concept_url <- paste0("https://uts-ws.nlm.nih.gov/rest/content/current/CUI/", cui)
response <- httr::GET(
url = concept_url,
query = list(ticket = service_ticket)
)
if (httr::status_code(response) != 200) {
next
}
content <- httr::content(response, "text", encoding = "UTF-8")
concept_info <- jsonlite::fromJSON(content)
# Get another service ticket for semantic types
service_ticket <- get_service_ticket(tgt_url)
if (is.null(service_ticket)) {
next
}
# Get semantic types for this concept
sem_types_url <- concept_info$result$semanticTypes
# Check if semanticTypes is a URL or just a nested JSON element
if (is.null(sem_types_url) || !is.character(sem_types_url) || length(sem_types_url) == 0) {
# Try to extract semantic types directly from concept_info
if (!is.null(concept_info$result$semanticTypes) && is.list(concept_info$result$semanticTypes)) {
sem_types_info <- list(result = concept_info$result$semanticTypes)
} else {
next # Skip if we can't find semantic types
}
} else {
# Ensure we have a single valid URL
if (length(sem_types_url) > 1) {
sem_types_url <- sem_types_url[1] # Take just the first URL if multiple are returned
}
# Make sure it's a valid URL
if (!grepl("^https?://", sem_types_url)) {
next # Skip if not a valid URL
}
# Get semantic types data
response <- httr::GET(
url = sem_types_url,
query = list(ticket = service_ticket, pageSize = 100)
)
if (httr::status_code(response) != 200) {
next
}
content <- httr::content(response, "text", encoding = "UTF-8")
sem_types_info <- jsonlite::fromJSON(content)
}
# Check if this concept has one of our target semantic types
if (length(sem_types_info$result) == 0) {
next
}
# Extract semantic type IDs, handling different response formats
concept_sem_types <- tryCatch({
if (is.data.frame(sem_types_info$result)) {
# If result is a data frame
if ("uri" %in% names(sem_types_info$result)) {
# Extract TUIs from URIs
uris <- sem_types_info$result$uri
sapply(uris, function(uri) {
tui_match <- regmatches(uri, regexpr("T[0-9]+", uri))
if (length(tui_match) > 0) return(tui_match[1])
else return(NA)
})
} else if ("semanticType" %in% names(sem_types_info$result)) {
sem_types_info$result$semanticType
} else {
NA
}
} else if (is.list(sem_types_info$result)) {
# If result is a list of semantic types
sem_types <- vector("character")
# Process each element in the result list
for (i in seq_along(sem_types_info$result)) {
st <- sem_types_info$result[[i]]
if (is.list(st)) {
# Handle list element
if (!is.null(st$semanticType)) {
sem_types <- c(sem_types, st$semanticType)
} else if (!is.null(st$uri)) {
# Extract TUI from URI if available
tui_match <- regmatches(st$uri, regexpr("T[0-9]+", st$uri))
if (length(tui_match) > 0) {
sem_types <- c(sem_types, tui_match[1])
}
}
} else if (is.character(st)) {
# Handle character element (might be a direct TUI)
if (grepl("^T[0-9]+$", st)) {
sem_types <- c(sem_types, st)
}
}
}
if (length(sem_types) > 0) {
sem_types
} else {
NA
}
} else {
NA
}
}, error = function(e) {
message("Error extracting semantic types: ", e$message)
return(NA)
})
concept_sem_types <- concept_sem_types[!is.na(concept_sem_types)]
# Check if any of the concept's semantic types match our filter
if (any(concept_sem_types %in% semantic_types)) {
matching_sem_types <- concept_sem_types[concept_sem_types %in% semantic_types]
for (sem_type in matching_sem_types) {
# Store concept with its matching semantic type
if (is.null(all_terms[[sem_type]])) {
all_terms[[sem_type]] <- list()
}
all_terms[[sem_type]][[length(all_terms[[sem_type]]) + 1]] <- list(
ui = concept_info$result$ui,
name = concept_info$result$name,
semantic_type = sem_type
)
}
}
}
# Process all collected terms
if (length(all_terms) == 0) {
warning("No terms found with matching semantic types. Using dummy dictionary instead.")
return(create_dummy_dictionary(dictionary_type))
}
# Create data frame with all terms
terms_list <- list()
for (sem_type in names(all_terms)) {
sem_type_terms <- all_terms[[sem_type]]
if (length(sem_type_terms) > 0) {
df <- data.frame(
term = sapply(sem_type_terms, function(x) x$name),
id = sapply(sem_type_terms, function(x) x$ui),
type = rep(dictionary_type, length(sem_type_terms)),
source = rep("umls", length(sem_type_terms)),
semantic_type = rep(sem_type, length(sem_type_terms)),
stringsAsFactors = FALSE
)
terms_list[[sem_type]] <- df
}
}
if (length(terms_list) == 0) {
warning("Failed to process UMLS terms. Using dummy dictionary instead.")
return(create_dummy_dictionary(dictionary_type))
}
# Combine all data frames
combined_dict <- do.call(rbind, terms_list)
# Remove duplicates based on term
deduped_dict <- combined_dict[!duplicated(combined_dict$term), ]
message("Retrieved ", nrow(deduped_dict), " unique terms from UMLS")
return(deduped_dict)
}
#' Authenticate with UMLS
#'
#' This function authenticates with UMLS and returns a TGT URL.
#'
#' @param api_key UMLS API key
#'
#' @return Character string with TGT URL or NULL if authentication fails
#' @keywords internal
authenticate_umls <- function(api_key) {
# Base URL for UTS REST API authentication
auth_url <- "https://utslogin.nlm.nih.gov/cas/v1/api-key"
tryCatch({
# Get Ticket Granting Ticket (TGT)
response <- httr::POST(
url = auth_url,
body = list(apikey = api_key),
encode = "form"
)
if (httr::status_code(response) != 201) {
warning("Failed to authenticate with UMLS: ", httr::content(response, "text"))
return(NULL)
}
# Extract TGT URL from response
tgt_url <- httr::headers(response)$location
if (is.null(tgt_url)) {
warning("No TGT URL found in response headers")
return(NULL)
}
return(tgt_url)
}, error = function(e) {
warning("Error authenticating with UMLS: ", e$message)
return(NULL)
})
}
#' Get a service ticket from a TGT URL
#'
#' This function gets a service ticket for a specific service using the TGT URL.
#'
#' @param tgt_url Ticket Granting Ticket URL
#'
#' @return Character string with service ticket or NULL if it fails
#' @keywords internal
get_service_ticket <- function(tgt_url) {
tryCatch({
# Get Service Ticket
service_ticket_url <- tgt_url
response <- httr::POST(
url = service_ticket_url,
body = list(service = "http://umlsks.nlm.nih.gov"),
encode = "form"
)
if (httr::status_code(response) != 200) {
warning("Failed to get service ticket: ", httr::content(response, "text"))
return(NULL)
}
service_ticket <- httr::content(response, "text")
return(service_ticket)
}, error = function(e) {
warning("Error getting service ticket: ", e$message)
return(NULL)
})
}
#' Validate a UMLS API key
#'
#' This function validates a UMLS API key using the validation endpoint.
#'
#' @param api_key UMLS API key to validate
#' @param validator_api_key Your application's UMLS API key (for third-party validation)
#'
#' @return Logical indicating if the API key is valid
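#'
#' @examples
#' \dontrun{
#' # Illustrative only: replace the placeholder with a real UMLS API key
#' is_valid <- validate_umls_key(api_key = "YOUR-UMLS-API-KEY")
#' }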
#' @export
validate_umls_key <- function(api_key, validator_api_key = NULL) {
if (!requireNamespace("httr", quietly = TRUE)) {
stop("The httr package is required for UMLS API calls. Install it with: install.packages('httr')")
}
if (is.null(validator_api_key)) {
# Use the API key itself for validation by trying to authenticate
result <- !is.null(authenticate_umls(api_key))
return(result)
} else {
# Use third-party validation endpoint
validate_url <- "https://utslogin.nlm.nih.gov/validateUser"
response <- httr::GET(
url = validate_url,
query = list(
validatorApiKey = validator_api_key,
apiKey = api_key
)
)
# Check the response
if (httr::status_code(response) != 200) {
warning("Failed to validate UMLS API key: ", httr::content(response, "text"))
return(FALSE)
}
# Parse the response content
content <- httr::content(response, "text")
# The endpoint returns "true" or "false"
return(tolower(content) == "true")
}
}
#' Helper function to create dummy dictionaries
#' @keywords internal
create_dummy_dictionary <- function(dictionary_type) {
message("Creating dummy dictionary for ", dictionary_type)
if (dictionary_type == "disease") {
dict <- data.frame(
term = c("covid-19", "sars-cov-2", "coronavirus", "pneumonia", "fever",
"cough", "dyspnea", "migraine", "headache", "hypertension",
"diabetes", "asthma", "cancer", "obesity", "myocarditis",
"thrombosis", "stroke", "anosmia", "fatigue", "hypoxemia"),
id = paste0("D", sprintf("%06d", 1:20)),
type = rep("disease", 20),
source = rep("dummy", 20),
stringsAsFactors = FALSE
)
} else if (dictionary_type == "drug") {
dict <- data.frame(
term = c("remdesivir", "dexamethasone", "hydroxychloroquine", "azithromycin",
"favipiravir", "tocilizumab", "heparin", "aspirin", "ibuprofen",
"paracetamol", "acetaminophen", "statins", "metformin", "insulin",
"warfarin", "enoxaparin", "ivermectin", "lopinavir", "ritonavir", "oseltamivir"),
id = paste0("D", sprintf("%06d", 21:40)),
type = rep("drug", 20),
source = rep("dummy", 20),
stringsAsFactors = FALSE
)
} else if (dictionary_type == "gene") {
dict <- data.frame(
term = c("ACE2", "TMPRSS2", "IL6", "IL1B", "TNF", "IFNG", "CXCL10",
"CCL2", "TNFRSF1A", "NFKB1", "TLR3", "TLR7", "MYD88", "IRF3",
"IRF7", "STAT1", "STAT3", "JAK1", "JAK2", "MAPK1"),
id = paste0("G", sprintf("%06d", 1:20)),
type = rep("gene", 20),
source = rep("dummy", 20),
stringsAsFactors = FALSE
)
} else {
dict <- data.frame(
term = character(0),
id = character(0),
type = character(0),
source = character(0),
stringsAsFactors = FALSE
)
}
return(dict)
}
#' Search UMLS by semantic type
#'
#' Helper function to search UMLS for concepts with a given semantic type.
#'
#' @param tgt_url Ticket Granting Ticket URL
#' @param semantic_type UMLS semantic type identifier (e.g., "T047")
#' @param max_results Maximum number of results to return
#'
#' @return A list of concepts, each with its UI, name, and semantic type
#' @keywords internal
search_umls_by_semantic_type <- function(tgt_url, semantic_type, max_results = 100) {
# Get Service Ticket
service_ticket_url <- paste0(tgt_url, "?service=http://umlsks.nlm.nih.gov")
response <- httr::POST(service_ticket_url)
service_ticket <- httr::content(response, "text")
# Base URL for UMLS REST API
base_url <- "https://uts-ws.nlm.nih.gov/rest"
# Search by semantic type
search_url <- paste0(base_url, "/search/current")
response <- httr::GET(
url = search_url,
query = list(
string = "",
sabs = "SNOMEDCT_US,ICD10CM,MSH",
searchType = "exact",
inputType = "sourceUi",
rootSource = "SEMANTIC_TYPE",
pageSize = max_results,
ticket = service_ticket
)
)
if (httr::status_code(response) != 200) {
warning("Failed to search UMLS: ", httr::content(response, "text"))
return(list())
}
# Extract results
search_results <- httr::content(response)
# If no results, return empty list
if (length(search_results$result$results) == 0) {
return(list())
}
# Process results
terms <- list()
for (result in search_results$result$results) {
terms[[length(terms) + 1]] <- list(
ui = result$ui,
name = result$name,
semantic_type = semantic_type
)
}
return(terms)
}
#' Detect language of text
#'
#' This function attempts to detect the language of a text string.
#' It uses a simple common-word profile approach that doesn't require
#' additional packages.
#'
#' @param text Text string to analyze
#' @param sample_size Maximum number of characters to sample for language detection
#'
#' @return Character string containing the ISO 639-1 language code, or "unknown" if no language can be identified
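#'
#' @examples
#' # Short illustration; longer texts give more reliable results
#' detect_lang("This is a short example of the language detection function.")
#' # Expected to return "en"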
#' @export
detect_lang <- function(text, sample_size = 1000) {
# Simple language detection based on common words in different languages
# This is a basic implementation - more sophisticated methods would use proper language models
# Sample the text to avoid processing very large texts
if (nchar(text) > sample_size) {
text <- substr(text, 1, sample_size)
}
# Convert to lowercase
text <- tolower(text)
# Define common words for major languages
language_profiles <- list(
en = c("the", "and", "in", "of", "to", "a", "is", "that", "for", "it", "with", "as", "was", "on"),
es = c("el", "la", "de", "que", "y", "en", "un", "por", "con", "no", "una", "su", "para", "es"),
fr = c("le", "la", "de", "et", "les", "des", "en", "un", "du", "une", "que", "est", "dans", "qui"),
de = c("der", "die", "und", "in", "den", "von", "zu", "das", "mit", "sich", "des", "auf", "f\u00fcr", "ist"),
it = c("il", "di", "che", "e", "la", "in", "un", "a", "per", "\u00e8", "una", "con", "sono", "su"),
pt = c("de", "a", "o", "que", "e", "do", "da", "em", "um", "para", "com", "n\u00e3o", "uma", "os"),
nl = c("de", "en", "van", "het", "een", "in", "is", "dat", "op", "te", "zijn", "met", "voor", "niet"),
ru = c("\u0438", "\u0432", "\u043d\u0435", "\u043d\u0430", "\u044f", "\u0447\u0442\u043e", "\u0441", "\u043f\u043e", "\u044d\u0442\u043e", "\u043a", "\u0430", "\u043d\u043e", "\u043a\u0430\u043a", "\u0438\u0437")
)
# Count occurrences of words in each language profile
scores <- sapply(language_profiles, function(profile) {
# Extract words from text
words <- unlist(strsplit(gsub("[^a-z\u0430-\u044f ]", " ", text), "\\s+"))
words <- words[words != ""]
# Count matches
matches <- sum(words %in% profile)
return(matches / length(profile))
})
# Return the language with the highest score
if (max(scores) > 0.1) { # Threshold to avoid misidentification of very short texts
return(names(which.max(scores)))
} else {
return("unknown")
}
}
#' Extract n-grams from text
#'
#' This function extracts n-grams (sequences of n words) from text.
#'
#' @param text Character vector of texts to process
#' @param n Integer specifying the n-gram size (1 for unigrams, 2 for bigrams, etc.)
#' @param min_freq Minimum frequency to include an n-gram
#'
#' @return A data frame containing n-grams and their frequencies
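#'
#' @examples
#' # Bigram sketch with toy sentences (made-up data)
#' txt <- c("drug induced liver injury", "drug induced kidney injury")
#' extract_ngrams(txt, n = 2, min_freq = 2)  # only "drug induced" occurs twice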
#' @export
extract_ngrams <- function(text, n = 1, min_freq = 2) {
# Ensure text is character
text <- as.character(text)
# Remove missing values
text <- text[!is.na(text)]
# Function to extract n-grams from a single text
extract_text_ngrams <- function(single_text, n) {
# Remove non-alphanumeric characters and convert to lowercase
clean_text <- tolower(gsub("[^a-zA-Z0-9 ]", " ", single_text))
# Split into words
words <- unlist(strsplit(clean_text, "\\s+"))
words <- words[words != ""]
# If fewer words than n, return empty result
if (length(words) < n) {
return(character(0))
}
# Create n-grams
ngrams <- character(length(words) - n + 1)
for (i in 1:(length(words) - n + 1)) {
ngrams[i] <- paste(words[i:(i + n - 1)], collapse = " ")
}
return(ngrams)
}
# Apply to all texts and combine results
all_ngrams <- unlist(lapply(text, extract_text_ngrams, n = n))
# Count frequencies
ngram_freq <- table(all_ngrams)
# Convert to data frame and filter by minimum frequency
result <- data.frame(
ngram = names(ngram_freq),
frequency = as.numeric(ngram_freq),
stringsAsFactors = FALSE
)
# Filter by minimum frequency
result <- result[result$frequency >= min_freq, ]
# Sort by frequency
result <- result[order(result$frequency, decreasing = TRUE), ]
return(result)
}
#' Perform named entity recognition on text
#'
#' This function performs a simple dictionary-based named entity recognition.
#' For more advanced NER, consider using external tools via reticulate.
#'
#' @param text Character vector of texts to process
#' @param entity_types Character vector of entity types to recognize
#' @param custom_dictionaries List of custom dictionaries (named by entity type)
#'
#' @return A data frame containing found entities, their types, and positions
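#'
#' @examples
#' # Sketch with a small custom dictionary so no built-in dictionaries are needed
#' docs <- c("Patients with migraine often report fever.",
#'           "No adverse events were observed.")
#' disease_dict <- data.frame(term = c("migraine", "fever"),
#'                            type = "disease", stringsAsFactors = FALSE)
#' extract_ner(docs, entity_types = "disease",
#'             custom_dictionaries = list(disease = disease_dict))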
#' @export
extract_ner <- function(text, entity_types = c("disease", "drug", "gene"),
custom_dictionaries = NULL) {
# Load dictionaries for requested entity types
dictionaries <- list()
for (type in entity_types) {
if (!is.null(custom_dictionaries) && type %in% names(custom_dictionaries)) {
dictionaries[[type]] <- custom_dictionaries[[type]]
} else {
# Try to load built-in dictionary
dict <- try(load_dictionary(type), silent = TRUE)
if (!inherits(dict, "try-error")) {
dictionaries[[type]] <- dict
} else {
warning("Dictionary for entity type '", type, "' not found")
}
}
}
# Check if any dictionaries were loaded
if (length(dictionaries) == 0) {
stop("No valid dictionaries found for the requested entity types")
}
# Initialize results
results <- data.frame(
text_id = integer(),
entity = character(),
entity_type = character(),
start_pos = integer(),
end_pos = integer(),
stringsAsFactors = FALSE
)
# Process each text
for (i in seq_along(text)) {
if (i %% 10 == 0) {
message("Processing text ", i, " of ", length(text))
}
current_text <- text[i]
# Skip missing values
if (is.na(current_text) || current_text == "") {
next
}
# Scan for entities from each dictionary
for (type in names(dictionaries)) {
dict <- dictionaries[[type]]
      for (j in seq_len(nrow(dict))) {
term <- dict$term[j]
# Create a regex pattern with word boundaries
pattern <- paste0("\\b", term, "\\b")
# Find all matches
matches <- gregexpr(pattern, current_text, ignore.case = TRUE)
# Extract match positions
if (matches[[1]][1] != -1) { # -1 indicates no match
start_positions <- matches[[1]]
end_positions <- start_positions + attr(matches[[1]], "match.length") - 1
# Add to results
for (k in seq_along(start_positions)) {
results <- rbind(results, data.frame(
text_id = i,
entity = term,
entity_type = type,
start_pos = start_positions[k],
end_pos = end_positions[k],
stringsAsFactors = FALSE
))
}
}
}
}
}
# Return empty data frame if no entities found
if (nrow(results) == 0) {
message("No entities found")
return(data.frame(
text_id = integer(),
entity = character(),
entity_type = character(),
start_pos = integer(),
end_pos = integer(),
stringsAsFactors = FALSE
))
}
return(results)
}
#' Perform sentence segmentation on text
#'
#' This function splits text into sentences.
#'
#' @param text Character vector of texts to process
#'
#' @return A list where each element contains a character vector of sentences
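#'
#' @examples
#' # Abbreviations such as "Dr." are protected from being treated as sentence ends
#' segment_sentences("Dr. Smith examined the patient. The fever resolved after treatment.")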
#' @export
segment_sentences <- function(text) {
# Ensure text is character
text <- as.character(text)
# Remove missing values
text <- text[!is.na(text)]
# Function to split a single text into sentences
split_text <- function(single_text) {
# Handle common abbreviations to avoid splitting at them
abbrevs <- c("Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Inc.", "Ltd.", "Co.", "Fig.", "e.g.", "i.e.", "etc.")
# Temporarily replace periods in abbreviations
for (abbr in abbrevs) {
      single_text <- gsub(abbr, gsub(".", "<<DOT>>", abbr, fixed = TRUE), single_text, fixed = TRUE)
}
# Split on sentence boundaries
sentences <- unlist(strsplit(single_text, "(?<=[.!?])\\s+", perl = TRUE))
# Restore periods in abbreviations
sentences <- gsub("<<DOT>>", ".", sentences)
# Remove empty sentences
sentences <- sentences[sentences != ""]
return(sentences)
}
# Apply to all texts
result <- lapply(text, split_text)
return(result)
}
#' Map terms to biomedical ontologies
#'
#' This function maps terms to standard biomedical ontologies like MeSH or UMLS.
#'
#' @param terms Character vector of terms to map
#' @param ontology Character string. The ontology to use: "mesh" or "umls"
#' @param api_key UMLS API key (required if ontology = "umls")
#' @param fuzzy_match Logical. If TRUE, allows fuzzy matching of terms
#' @param similarity_threshold Numeric between 0 and 1. Minimum similarity for fuzzy matching
#' @param mesh_query Additional query to filter MeSH terms (only if ontology = "mesh")
#' @param semantic_types Vector of semantic types to filter UMLS results
#' @param dictionary_type Type of dictionary to use ("disease", "drug", "gene", etc.)
#'
#' @return A data frame with mapped terms and ontology identifiers
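#'
#' @examples
#' \dontrun{
#' # Illustrative sketch; building the MeSH dictionary requires network access
#' mapped <- map_ontology(c("asthma", "diabetes"), ontology = "mesh",
#'                        dictionary_type = "disease")
#' }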
#' @export
map_ontology <- function(terms, ontology = c("mesh", "umls"),
api_key = NULL,
fuzzy_match = FALSE,
similarity_threshold = 0.8,
mesh_query = NULL,
semantic_types = NULL,
dictionary_type = "disease") {
# Match ontology argument
ontology <- match.arg(ontology)
# Check if terms is empty
if (length(terms) == 0) {
return(data.frame(
term = character(),
ontology_id = character(),
ontology_term = character(),
match_type = character(),
stringsAsFactors = FALSE
))
}
# Check if API key is provided when using UMLS
if (ontology == "umls" && is.null(api_key)) {
stop("API key is required for UMLS mapping")
}
# Load ontology dictionary using the improved functions
if (ontology == "mesh") {
ontology_dict <- load_dictionary(dictionary_type,
source = "mesh",
mesh_query = mesh_query)
} else {
ontology_dict <- load_dictionary(dictionary_type,
source = "umls",
api_key = api_key,
semantic_type_filter = semantic_types)
}
# Check if we have a valid dictionary
if (nrow(ontology_dict) == 0) {
warning("No terms found in the ", ontology, " dictionary")
return(data.frame(
term = character(),
ontology_id = character(),
ontology_term = character(),
match_type = character(),
stringsAsFactors = FALSE
))
}
# Initialize results
results <- data.frame(
term = character(),
ontology_id = character(),
ontology_term = character(),
match_type = character(),
stringsAsFactors = FALSE
)
# Function to calculate string similarity for fuzzy matching
string_similarity <- function(a, b) {
# Simple implementation of Levenshtein distance-based similarity
a <- tolower(a)
b <- tolower(b)
distance <- adist(a, b)[1]
max_length <- max(nchar(a), nchar(b))
similarity <- 1 - (distance / max_length)
return(similarity)
}
# Process each term
for (term in terms) {
# Exact match - case insensitive
exact_matches <- ontology_dict[tolower(ontology_dict$term) == tolower(term), ]
if (nrow(exact_matches) > 0) {
for (i in 1:nrow(exact_matches)) {
results <- rbind(results, data.frame(
term = term,
ontology_id = exact_matches$id[i],
ontology_term = exact_matches$term[i],
match_type = "exact",
stringsAsFactors = FALSE
))
}
} else if (fuzzy_match) {
# Fuzzy matching
# Calculate similarity with all terms in the dictionary
all_terms <- ontology_dict$term
similarities <- numeric(length(all_terms))
for (i in seq_along(all_terms)) {
similarities[i] <- string_similarity(term, all_terms[i])
}
# Get matches above threshold
matches <- which(similarities >= similarity_threshold)
if (length(matches) > 0) {
for (i in matches) {
results <- rbind(results, data.frame(
term = term,
ontology_id = ontology_dict$id[i],
ontology_term = ontology_dict$term[i],
match_type = "fuzzy",
stringsAsFactors = FALSE
))
}
}
}
}
# If no matches found
if (nrow(results) == 0) {
message("No matches found for the input terms in the ", ontology, " ontology")
}
return(results)
}
#' Apply topic modeling to a corpus
#'
#' This function implements a simple non-negative matrix factorization (NMF)
#' approach to topic modeling, without requiring additional packages.
#'
#' @param text_data A data frame containing the text data
#' @param text_column Name of the column containing the text
#' @param n_topics Number of topics to extract
#' @param max_terms Maximum number of terms per topic to return
#' @param n_iterations Number of iterations for the NMF algorithm
#' @param seed Optional seed for reproducibility. If NULL, no seed is set.
#'
#' @return A list containing topic-term and document-topic matrices
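#'
#' @examples
#' \dontrun{
#' # Minimal sketch with toy data (not from the package's test suite)
#' articles <- data.frame(
#'   doc_id = 1:3,
#'   abstract = c("Migraine is treated with sumatriptan and other triptans.",
#'                "CGRP receptor antagonists reduce migraine frequency.",
#'                "Beta blockers such as propranolol are used for prevention."),
#'   stringsAsFactors = FALSE
#' )
#' topics <- extract_topics(articles, text_column = "abstract",
#'                          n_topics = 2, max_terms = 5, seed = 42)
#' topics$topics
#' }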
#' @export
extract_topics <- function(text_data, text_column = "abstract", n_topics = 5,
max_terms = 10, n_iterations = 50, seed = NULL) {
# Check if text column exists
if (!text_column %in% colnames(text_data)) {
stop("Text column '", text_column, "' not found in the data")
}
# Set seed if provided
if (!is.null(seed)) {
set.seed(seed)
}
# Preprocess text data
preprocessed_data <- preprocess_text(text_data, text_column = text_column,
remove_stopwords = TRUE)
# Extract terms and create document-term matrix
all_terms <- list()
for (i in 1:nrow(preprocessed_data)) {
terms_df <- preprocessed_data$terms[[i]]
if (nrow(terms_df) > 0) {
all_terms[[i]] <- terms_df
}
}
# Get unique terms
unique_terms <- unique(unlist(lapply(all_terms, function(df) df$word)))
# Create document-term matrix
dtm <- matrix(0, nrow = nrow(preprocessed_data), ncol = length(unique_terms))
colnames(dtm) <- unique_terms
for (i in 1:nrow(preprocessed_data)) {
if (i <= length(all_terms) && !is.null(all_terms[[i]])) {
terms_df <- all_terms[[i]]
if (nrow(terms_df) > 0) {
term_indices <- match(terms_df$word, unique_terms)
dtm[i, term_indices] <- terms_df$count
}
}
}
# Simple implementation of NMF (Non-negative Matrix Factorization)
# Initialize random matrices W and H
# Note: for reproducible results, set the seed argument or call set.seed() before calling this function
W <- matrix(runif(nrow(dtm) * n_topics), nrow = nrow(dtm))
H <- matrix(runif(n_topics * ncol(dtm)), nrow = n_topics)
# Iterative update of W and H
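# Multiplicative update rules (Lee & Seung), with V approximated by W %*% H:
#   H <- H * (t(W) %*% V) / (t(W) %*% W %*% H)
#   W <- W * (V %*% t(H)) / (W %*% H %*% t(H))
# Here V is the document-term matrix (dtm); the small constant 1e-10 keeps
# the denominators away from zero.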
for (iter in 1:n_iterations) {
# Update H
H <- H * (t(W) %*% dtm) / (t(W) %*% W %*% H + 1e-10)
# Update W
W <- W * (dtm %*% t(H)) / (W %*% H %*% t(H) + 1e-10)
# Normalize columns of W
W <- sweep(W, 2, colSums(W) + 1e-10, "/")
}
# Extract top terms for each topic
top_terms <- list()
for (i in 1:n_topics) {
term_weights <- H[i, ]
top_indices <- order(term_weights, decreasing = TRUE)[1:min(max_terms, length(term_weights))]
top_terms[[i]] <- data.frame(
term = unique_terms[top_indices],
weight = term_weights[top_indices],
stringsAsFactors = FALSE
)
}
# Assign topic labels
topic_labels <- paste("Topic", 1:n_topics)
names(top_terms) <- topic_labels
# Document-topic assignments
doc_topics <- W
colnames(doc_topics) <- topic_labels
# Return results
return(list(
topics = top_terms,
document_topics = doc_topics,
dtm = dtm
))
}
#' Get UMLS semantic types for a given dictionary type
#'
#' Helper function to map dictionary types to UMLS semantic type identifiers
#'
#' @param dictionary_type The type of dictionary to get semantic types for
#' @return Vector of UMLS semantic type identifiers
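#'
#' @examples
#' \dontrun{
#' # Internal helper; returns the codes from the mapping defined below
#' get_umls_semantic_types("disease")  # c("T047", "T048", "T019", "T046")
#' get_umls_semantic_types("unknown")  # NULL
#' }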
#' @keywords internal
get_umls_semantic_types <- function(dictionary_type) {
# Create a comprehensive mapping of dictionary types to UMLS semantic types
semantic_types <- list(
# Original mappings
"disease" = c("T047", "T048", "T019", "T046"), # Disease and Syndrome, Mental Disorder, Congenital Abnormality, Pathologic Function
"drug" = c("T116", "T121", "T195", "T200"), # Amino Acid, Lipid, Antibiotic, Clinical Drug
"gene" = c("T028", "T087", "T123"), # Gene, Amino Acid Sequence, Nucleotide Sequence
# New expanded mappings
"protein" = c("T116", "T126", "T125"), # Amino Acid, Peptide/Protein, Enzyme
"chemical" = c("T103", "T104", "T109", "T196"), # Chemical, Chemical Viewed Structurally, Organic Chemical, Element, Ion, or Isotope
"pathway" = c("T044", "T042", "T045"), # Molecular Function, Organ or Tissue Function, Genetic Function
"anatomy" = c("T017", "T018", "T023", "T024"), # Anatomical Structure, Embryonic Structure, Body Part, Organ, or Organ Component, Tissue
"organism" = c("T001", "T002", "T004", "T005", "T007"), # Organism, Plant, Fungus, Virus, Bacterium
"biological_process" = c("T038", "T039", "T040", "T041"), # Biologic Function, Physiologic Function, Organism Function, Mental Process
"cellular_component" = c("T026", "T025", "T029"), # Cell Component, Cell, Body Location or Region
"molecular_function" = c("T044", "T045", "T118"), # Molecular Function, Genetic Function, Carbohydrate
"diagnostic_procedure" = c("T059", "T060"), # Laboratory Procedure, Diagnostic Procedure
"therapeutic_procedure" = c("T061", "T058"), # Therapeutic or Preventive Procedure, Health Care Activity
"phenotype" = c("T046", "T047", "T048", "T020"), # Pathologic Function, Disease or Syndrome, Mental or Behavioral Dysfunction, Acquired Abnormality
"symptom" = c("T184", "T046", "T048"), # Sign or Symptom, Pathologic Function, Mental or Behavioral Dysfunction
"cell" = c("T025"), # Cell
"tissue" = c("T024"), # Tissue
"mental_process" = c("T041", "T048"), # Mental Process, Mental or Behavioral Dysfunction
"physiologic_function" = c("T039", "T040"), # Physiologic Function, Organism Function
"laboratory_procedure" = c("T059") # Laboratory Procedure
)
if (!is.null(dictionary_type) && dictionary_type %in% names(semantic_types)) {
return(semantic_types[[dictionary_type]])
} else {
return(NULL)
}
}
#' Enhanced sanitize dictionary function
#'
#' This function sanitizes dictionary terms to ensure they're valid for entity extraction.
#'
#' @param dictionary A data frame containing dictionary terms.
#' @param term_column The name of the column containing the terms to sanitize.
#' @param type_column The name of the column containing entity types.
#' @param validate_types Logical. If TRUE, validates terms against their claimed type.
#' @param verbose Logical. If TRUE, prints information about the filtering process.
#'
#' @return A data frame with sanitized terms.
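#'
#' @examples
#' \dontrun{
#' # Small illustrative dictionary; under the rules below, "however" (common
#' # adverb) and "5-ht(1b)" (regex special characters) are expected to be
#' # dropped, while "migraine" and "propranolol" should survive type validation.
#' dict <- data.frame(
#'   term = c("migraine", "propranolol", "however", "5-ht(1b)"),
#'   type = c("disease", "drug", "disease", "drug"),
#'   stringsAsFactors = FALSE
#' )
#' sanitize_dictionary(dict, verbose = FALSE)$term
#' }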
#' @export
sanitize_dictionary <- function(dictionary, term_column = "term", type_column = "type",
validate_types = TRUE, verbose = TRUE) {
# Check input
if (is.null(dictionary) || nrow(dictionary) == 0) {
warning("Empty dictionary provided for sanitization")
return(dictionary)
}
if (!term_column %in% colnames(dictionary)) {
stop("Term column '", term_column, "' not found in the dictionary")
}
original_count <- nrow(dictionary)
if (verbose) {
message("Sanitizing dictionary with ", original_count, " terms...")
}
# Check for NA terms
na_terms <- is.na(dictionary[[term_column]])
if (any(na_terms)) {
if (verbose) {
message(" Removing ", sum(na_terms), " NA terms")
}
dictionary <- dictionary[!na_terms, ]
}
# Check for empty terms
empty_terms <- dictionary[[term_column]] == ""
if (any(empty_terms)) {
if (verbose) {
message(" Removing ", sum(empty_terms), " empty terms")
}
dictionary <- dictionary[!empty_terms, ]
}
# Step 1: Remove terms with regex special characters that could cause issues
# perl = TRUE so the backslash escapes inside the character class are interpreted as intended
safe_dict <- dictionary[!grepl('[\\[\\]\\(\\)\\{\\}\\^\\$\\*\\+\\?\\|\\\\/]', dictionary[[term_column]], perl = TRUE), ]
if (verbose && nrow(safe_dict) < nrow(dictionary)) {
message(" Removed ", nrow(dictionary) - nrow(safe_dict), " terms with regex special characters")
}
# If the dictionary is now empty, early return to avoid further processing
if (nrow(safe_dict) == 0) {
warning("No terms remain after removing terms with regex special characters")
return(safe_dict)
}
# Step 2: Remove terms with numbers followed by special characters
original_count <- nrow(safe_dict)
safe_dict <- safe_dict[!grepl('\\d\\s*[^a-zA-Z0-9\\s]', safe_dict[[term_column]], perl = TRUE), ]
if (verbose && nrow(safe_dict) < original_count) {
message(" Removed ", original_count - nrow(safe_dict), " terms with numbers followed by special characters")
}
# If the dictionary is now empty, early return
if (nrow(safe_dict) == 0) {
warning("No terms remain after removing terms with numbers followed by special characters")
return(safe_dict)
}
# Step 3: Check for specific problematic patterns and remove them
problem_terms <- grep('\\d\\s*\\[', safe_dict[[term_column]], value = TRUE)
if (length(problem_terms) > 0) {
if (verbose) {
message(" Warning: Found potentially problematic terms like: ",
paste(head(problem_terms, 5), collapse = ", "))
}
original_count <- nrow(safe_dict)
safe_dict <- safe_dict[!grepl('\\d\\s*\\[', safe_dict[[term_column]]), ]
if (verbose && nrow(safe_dict) < original_count) {
message(" Removed ", original_count - nrow(safe_dict), " problematic terms with patterns like '68 [1'")
}
}
# If the dictionary is now empty, early return
if (nrow(safe_dict) == 0) {
warning("No terms remain after removing problematic terms")
return(safe_dict)
}
# Step 4: Remove common conjunctive adverbs and non-medical terms
original_count <- nrow(safe_dict)
# Expanded list of common non-medical terms and general terminology
common_adverbs <- c(
"however", "therefore", "furthermore", "moreover", "consequently",
"nevertheless", "accordingly", "thus", "hence", "meanwhile", "subsequently",
"conversely", "indeed", "instead", "likewise", "namely", "regardless",
"similarly", "specifically", "undoubtedly", "whereas", "annotation", "annotated",
"nonetheless", "meanwhile", "absolutely", "accordingly", "additionally",
"afterwards", "albeit", "although", "altogether", "anyway", "certainly",
"clearly", "considering", "despite", "earlier", "especially", "essentially",
"eventually", "exactly", "evidently", "finally", "further", "generally",
"granted", "fortunately", "hopefully", "importantly", "incidentally",
"initially", "ironically", "literally", "merely", "naturally", "obviously",
"occasionally", "otherwise", "particularly", "perhaps", "possibly",
"presumably", "previously", "primarily", "probably", "rather",
"regardless", "relatively", "reportedly", "seemingly", "seriously",
"significantly", "simultaneously", "specifically", "strangely", "surprisingly",
"ultimately", "unfortunately", "whereby"
)
# List of common abbreviations and academic terms
common_abbrevs <- c(
"et al", "fig", "etc", "ie", "eg", "viz", "vs", "ref", "refs",
"approx", "ca", "cf", "cp", "ed", "eds", "eq", "eqn", "eqns", "intro",
"min", "max", "nb", "no", "non", "vol", "vols", "yr", "yrs",
"abstract", "table", "figure", "chart", "graph", "analysis", "summary",
"introduction", "conclusion", "discussion", "result", "results", "method",
"methods", "protocol", "review", "sample", "case", "cases", "control",
"controls", "subject", "subjects", "participant", "participants",
"pubmed", "embase", "medline", "cochrane", "meta", "database", "databases"
)
# Common non-specific terms that are incorrectly appearing in our results
common_nonspecific <- c(
"effect", "effects", "affect", "affects", "influence", "influences",
"impact", "impacts", "association", "associated", "associations",
"correlation", "correlations", "relationship", "relationships",
"connection", "connections", "link", "links", "cause", "causes",
"causing", "caused", "increase", "increases", "increasing",
"decrease", "decreases", "decreasing", "demonstrate", "demonstrates",
"demonstrated", "show", "shows", "shown", "find", "finds", "found",
"observe", "observes", "observed", "report", "reports", "reported",
"reveal", "reveals", "revealed", "suggest", "suggests", "suggested",
"indicates", "indicated", "document", "documented", "value", "values",
"level", "levels", "measure", "measures", "measured", "significant",
"significance", "compare", "compared", "comparing", "determine",
"determined", "determining", "identify", "identified", "identifying",
"estimate", "estimated", "estimating", "total", "high", "higher",
"highest", "low", "lower", "lowest", "new", "novel", "recent", "previous",
"older", "younger", "early", "earlier", "late", "later", "frequently",
"frequently", "rarely", "common", "commonly", "rare", "rarely", "severe",
"severity", "mild", "moderate", "normal", "abnormal", "improvement",
"improved", "improving", "worsening", "worsened", "detection", "detected",
"detecting", "prevention", "prevented", "preventing", "assessment",
"assessed", "assessing", "evaluation", "evaluated", "evaluating",
"investigation", "investigated", "investigating", "calculation",
"calculated", "calculating", "classification", "classified", "classifying",
"stat", "stats", "clinical", "treatment", "patient", "patients", "outcome",
"outcomes", "intervention", "interventions", "agent", "agents", "order",
"orders", "ample", "amples", "population", "play", "negative", "accuracy",
"leading", "baseline", "evidence", "efficacy", "effectiveness", "placebo",
"follow", "following", "design", "trial", "trials", "quality", "recommendation",
"recommendations", "response", "responses", "safe", "safety", "adverse",
"profile", "profiles", "interaction", "interactions", "combination",
"combinations", "standard", "standards", "guideline", "guidelines",
"first", "second", "third", "fourth", "fifth", "primary", "secondary",
"tertiary", "initial", "final", "potential", "beneficial", "beneficial",
"strategy", "strategies", "approach", "approaches", "method", "methods",
"technique", "techniques", "procedure", "procedures", "process", "processes",
"long", "short", "term", "single", "double", "multiple", "simple", "complex",
"positive", "subsequent", "beneficial", "harmful", "randomized", "randomised",
"controlled", "substantial", "significant", "consistent", "considerable",
"relevant", "important", "interesting", "promising", "similar", "different",
"distinct", "specific", "particular", "major", "minor", "key", "main",
"essential", "necessary", "sufficient", "adequate", "proper", "appropriate",
"suitable", "consecutive", "simultaneous", "various", "variable", "concurrent",
"concomitant", "overall", "entire", "whole", "optimum", "optimal", "ideal",
"better", "best", "worse", "worst", "efficacious", "limited", "extensive",
"intensive", "widespread", "reliable", "reproducible", "repeatable",
"comparable", "varied", "useful", "valuable", "successful", "unsuccessful",
"effective", "ineffective", "extensive", "intensive", "minimum", "maximum"
)
# New: Medical jargon that's too general to be useful biomedical entities
medical_jargon <- c(
"acute", "chronic", "condition", "conditions", "diagnosis", "diagnoses",
"diagnostic", "diagnostics", "disorder", "disorders", "dose", "doses",
"dosage", "dosages", "progressive", "regressive", "regimen", "regimens",
"symptom", "symptoms", "syndrome", "syndromes", "therapeutic", "therapy",
"therapies", "onset", "remission", "recurrent", "relapse", "relapses",
"treatment", "treatments", "manageable", "unmanageable", "intensive",
"supportive", "palliative", "preventive", "preventative", "prophylactic",
"prophylaxis", "maintenance", "induction", "consolidation", "adjuvant",
"neoadjuvant", "first-line", "second-line", "rescue", "salvage",
"empiric", "empirical", "targeted", "systemic", "topical", "local",
"regional", "general", "medical", "surgical", "pharmacological",
"behavioral", "behavioural", "psychological", "physical", "occupational",
"alternative", "complementary", "conventional", "standard", "novel",
"experimental", "investigational", "established", "emerging", "conventional"
)
# Combine all terms to filter out
terms_to_filter <- unique(c(common_adverbs, common_abbrevs, common_nonspecific, medical_jargon))
# Make term filtering case-insensitive
safe_dict <- safe_dict[!tolower(safe_dict[[term_column]]) %in% terms_to_filter, ]
if (verbose && nrow(safe_dict) < original_count) {
message(" Removed ", original_count - nrow(safe_dict),
" common non-medical terms, conjunctive adverbs, and general terms")
}
# If the dictionary is now empty, early return
if (nrow(safe_dict) == 0) {
warning("No terms remain after removing common non-medical terms")
return(safe_dict)
}
# Step 5: Apply term-type corrections for commonly misclassified terms
if (validate_types && type_column %in% colnames(safe_dict)) {
# Additional term-type corrections for common misclassified terms
term_type_corrections <- list(
# Analytical techniques/methods that are often misclassified as chemicals
"faers" = "method", # FDA Adverse Event Reporting System
"bcpnn" = "method", # Bayesian Confidence Propagation Neural Network
"uplc" = "method", # Ultra Performance Liquid Chromatography
"frap" = "method", # Fluorescence Recovery After Photobleaching
"hplc" = "method", # High Performance Liquid Chromatography
"lc-ms" = "method", # Liquid Chromatography-Mass Spectrometry
"gc-ms" = "method", # Gas Chromatography-Mass Spectrometry
"maldi" = "method", # Matrix-Assisted Laser Desorption/Ionization
"elisa" = "method", # Enzyme-Linked Immunosorbent Assay
"ft-ir" = "method", # Fourier Transform Infrared Spectroscopy
"nmr" = "method", # Nuclear Magnetic Resonance
"pcr" = "method", # Polymerase Chain Reaction
"sem" = "method", # Scanning Electron Microscopy
"tem" = "method", # Transmission Electron Microscopy
"xrd" = "method", # X-Ray Diffraction
"saxs" = "method", # Small-Angle X-ray Scattering
"uv-vis" = "method", # Ultraviolet-Visible Spectroscopy
"ms" = "method", # Mass Spectrometry
"ms/ms" = "method", # Tandem Mass Spectrometry
"lc" = "method", # Liquid Chromatography
"gc" = "method", # Gas Chromatography
"tga" = "method", # Thermogravimetric Analysis
"dsc" = "method", # Differential Scanning Calorimetry
"uv" = "method", # Ultraviolet
"ir" = "method", # Infrared
"rna-seq" = "method", # RNA Sequencing
"qtof" = "method", # Quadrupole Time-of-Flight
# Common biostatistical methods incorrectly classified as chemicals
"anova" = "method", # Analysis of Variance
"ancova" = "method", # Analysis of Covariance
"manova" = "method", # Multivariate Analysis of Variance
"pca" = "method", # Principal Component Analysis
"sem" = "method", # Structural Equation Modeling
"glm" = "method", # Generalized Linear Model
"lda" = "method", # Linear Discriminant Analysis
"svm" = "method", # Support Vector Machine
"ann" = "method", # Artificial Neural Network
"kmeans" = "method", # K-means clustering
"roc" = "method", # Receiver Operating Characteristic
"auc" = "method", # Area Under the Curve
# Symptoms that are often misclassified
"pain" = "symptom",
"headache" = "symptom",
"migraine" = "disease",
"nausea" = "symptom",
"vomiting" = "symptom",
"dizziness" = "symptom",
"fatigue" = "symptom",
"weakness" = "symptom",
"aura" = "symptom",
"photophobia" = "symptom",
"phonophobia" = "symptom",
# Proteins and receptors
"receptor" = "protein",
"receptors" = "protein",
"channel" = "protein",
"channels" = "protein",
"transporter" = "protein",
"transporters" = "protein",
# Biological processes
"inflammation" = "biological_process",
"signaling" = "biological_process",
"activation" = "biological_process",
"inhibition" = "biological_process",
"regulation" = "biological_process",
"phosphorylation" = "biological_process",
# Diseases
"migraine" = "disease",
"malformation" = "disease",
"disorder" = "disease",
"syndrome" = "disease"
)
# Apply corrections to dictionary
corrections_made <- 0
for (i in 1:nrow(safe_dict)) {
term_lower <- tolower(safe_dict[[term_column]][i])
if (term_lower %in% names(term_type_corrections)) {
# Get the correct type
correct_type <- term_type_corrections[[term_lower]]
# If current type is different from the correct type
if (safe_dict[[type_column]][i] != correct_type) {
# Update the type to the correct one
if (verbose) {
message(" Correcting type for '", safe_dict[[term_column]][i],
"' from '", safe_dict[[type_column]][i],
"' to '", correct_type, "'")
}
safe_dict[[type_column]][i] <- correct_type
corrections_made <- corrections_made + 1
}
}
}
if (verbose && corrections_made > 0) {
message(" Applied ", corrections_made, " type corrections for commonly misclassified terms")
}
}
# Step 6: Type validation - ensure terms match their claimed entity types
if (validate_types && type_column %in% colnames(safe_dict)) {
original_count <- nrow(safe_dict)
# Rules for validating types - define characteristics for each entity type
is_valid_type <- function(term, claimed_type) {
# Check for capitalized acronyms (common for genes) before lowercasing,
# otherwise the uppercase pattern below could never match
is_acronym <- grepl("^[A-Z0-9]{2,}$", term)
term <- tolower(term)
claimed_type <- tolower(claimed_type)
# Single character or very short terms are likely not valid entities
if (nchar(term) < 3) {
return(FALSE)
}
# Term validation rules by type
if (claimed_type == "gene") {
# Genes are often acronyms or have numbers
known_gene_patterns <- c("gene", "receptor", "factor", "kinase", "protein", "enzyme",
"domain", "binding", "transcription", "transporter", "channel")
pattern_match <- any(sapply(known_gene_patterns, function(p) grepl(p, term)))
return(is_acronym || pattern_match)
}
else if (claimed_type == "protein") {
# Proteins often end with -in or contain protein-related terms
known_protein_patterns <- c("protein", "enzyme", "antibody", "receptor", "factor",
"kinase", "ase$", "in$", "globulin", "albumin", "peptide")
pattern_match <- any(sapply(known_protein_patterns, function(p) grepl(p, term)))
# Special case for receptor/receptors
if (term == "receptor" || term == "receptors") {
return(TRUE)
}
return(pattern_match)
}
else if (claimed_type == "drug") {
# Drugs often end with specific suffixes
known_drug_patterns <- c("caine$", "mycin$", "oxacin$", "dronate$", "olol$", "pril$",
"sartan$", "mab$", "nib$", "gliptin$", "prazole$", "vastatin$",
"dine$", "zosin$", "parin$", "ide$", "ane$", "ene$")
pattern_match <- any(sapply(known_drug_patterns, function(p) grepl(p, term)))
# Known drug classes or categories
drug_classes <- c("antibiotic", "inhibitor", "antagonist", "agonist", "blocker",
"vaccine", "antidepressant", "antipsychotic", "antiepileptic",
"sedative", "stimulant", "antihistamine", "analgesic", "hormone")
class_match <- any(sapply(drug_classes, function(c) grepl(c, term)))
return(pattern_match || class_match)
}
else if (claimed_type == "disease") {
# Diseases often contain specific terms
known_disease_patterns <- c("disease$", "disorder$", "syndrome$", "itis$", "emia$",
"pathy$", "oma$", "osis$", "iasis$", "itis$", "algia$",
"cancer", "tumor", "infection", "deficiency", "failure")
pattern_match <- any(sapply(known_disease_patterns, function(p) grepl(p, term)))
# Special case for "migraine"
if (term == "migraine") {
return(TRUE)
}
return(pattern_match)
}
else if (claimed_type == "pathway") {
# Pathways often contain these terms
known_pathway_patterns <- c("pathway", "signaling", "cascade", "metabolism",
"biosynthesis", "cycle", "regulation", "transport")
pattern_match <- any(sapply(known_pathway_patterns, function(p) grepl(p, term)))
return(pattern_match)
}
else if (claimed_type == "symptom") {
# Symptoms often contain these terms
known_symptom_patterns <- c("pain", "ache", "fatigue", "nausea", "vomiting",
"headache", "fever", "cough", "dizziness", "vertigo",
"weakness", "numbness", "tingling", "photophobia",
"phonophobia", "aura")
# Direct check for common symptoms
if (term %in% known_symptom_patterns) {
return(TRUE)
}
pattern_match <- any(sapply(known_symptom_patterns, function(p) grepl(p, term)))
return(pattern_match)
}
else if (claimed_type == "biological_process") {
# Biological processes often contain these terms
known_bioprocess_patterns <- c("process", "signaling", "activation", "inhibition",
"regulation", "expression", "secretion", "transcription",
"phosphorylation", "metabolism", "inflammation")
# Direct check for common biological processes
if (term %in% c("inflammation", "signaling", "activation", "inhibition", "regulation")) {
return(TRUE)
}
pattern_match <- any(sapply(known_bioprocess_patterns, function(p) grepl(p, term)))
return(pattern_match)
}
else if (claimed_type == "method") {
# Check if term is a known analytical method
analytical_methods <- c(
"faers", "bcpnn", "uplc", "frap", "hplc", "lc-ms", "gc-ms", "maldi",
"elisa", "ft-ir", "nmr", "pcr", "sem", "tem", "xrd", "saxs", "uv-vis",
"ms", "ms/ms", "lc", "gc", "tga", "dsc", "uv", "ir", "rna-seq", "qtof",
"mri", "ct", "pet", "spect", "ecg", "eeg", "emg", "fmri", "qsar", "qspr",
"anova", "ancova", "manova", "pca", "sem", "glm", "lda", "svm", "ann",
"roc", "auc"
)
if (term %in% analytical_methods) {
return(TRUE)
}
# Check for method terms
method_patterns <- c("method", "technique", "assay", "analysis", "procedure",
"protocol", "algorithm", "approach")
pattern_match <- any(sapply(method_patterns, function(p) grepl(p, term)))
return(pattern_match)
}
# If no specific rules for this type, allow it to pass
return(TRUE)
}
# Apply type validation
valid_rows <- rep(TRUE, nrow(safe_dict))
for (i in seq_len(nrow(safe_dict))) {
term <- safe_dict[[term_column]][i]
claimed_type <- safe_dict[[type_column]][i]
# Check if term matches its claimed type
if (!is.na(claimed_type) && !is_valid_type(term, claimed_type)) {
valid_rows[i] <- FALSE
}
}
# Keep only rows with valid type assignments
safe_dict <- safe_dict[valid_rows, ]
if (verbose && nrow(safe_dict) < original_count) {
message(" Removed ", original_count - nrow(safe_dict),
" terms that did not match their claimed entity types")
}
}
# Final step: Remove terms consisting solely of numbers
original_count <- nrow(safe_dict)
safe_dict <- safe_dict[!grepl("^[0-9]+$", safe_dict[[term_column]]), ]
if (verbose && nrow(safe_dict) < original_count) {
message(" Removed ", original_count - nrow(safe_dict),
" terms consisting solely of numbers")
}
# Final cleanup - only include terms with alphanumeric characters and spaces
original_count <- nrow(safe_dict)
safe_dict <- safe_dict[grepl('^[a-zA-Z0-9\\s]+$', safe_dict[[term_column]], perl = TRUE), ]
if (verbose && nrow(safe_dict) < original_count) {
message(" Removed ", original_count - nrow(safe_dict),
" terms with non-alphanumeric characters (final cleanup)")
}
if (verbose) {
message("Sanitization complete. ", nrow(safe_dict), " terms remaining (",
round(nrow(safe_dict) / nrow(dictionary) * 100, 1), "% of original)")
}
return(safe_dict)
}
#' Load terms from MeSH using PubMed search
#'
#' This function enhances the MeSH dictionary by extracting additional terms
#' from PubMed search results using MeSH queries.
#'
#' @param mesh_queries A named list of MeSH queries for different categories.
#' @param max_results Maximum number of results to retrieve per query.
#' @param min_term_length Minimum length of terms to include.
#' @param sanitize Logical. If TRUE, sanitizes the extracted terms.
#'
#' @return A data frame containing the combined dictionary with extracted terms.
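#'
#' @examples
#' \dontrun{
#' # Internal helper; this sketch assumes pubmed_search() is available and
#' # that PubMed can be reached over the network. The example queries are
#' # illustrative only.
#' queries <- list(disease = "migraine disorders[MeSH]",
#'                 drug = "serotonin 5-HT1 receptor agonists[MeSH]")
#' dict <- load_mesh_terms_from_pubmed(queries, max_results = 20)
#' head(dict)
#' }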
#' @keywords internal
load_mesh_terms_from_pubmed <- function(mesh_queries, max_results = 50, min_term_length = 3, sanitize = TRUE) {
# Check if pubmed_search function is available
if (!exists("pubmed_search", mode = "function")) {
stop("pubmed_search function not available. Make sure required dependencies are loaded.")
}
# Initialize combined dictionary
combined_dict <- data.frame(
term = character(),
type = character(),
id = character(),
source = character(),
stringsAsFactors = FALSE
)
# Process each query
for (category in names(mesh_queries)) {
message("Getting ", category, " terms from PubMed/MeSH...")
# Use PubMed search to get relevant MeSH terms
search_result <- tryCatch({
pubmed_search(
query = mesh_queries[[category]],
max_results = max_results,
use_mesh = TRUE
)
}, error = function(e) {
message("Error searching for ", category, " terms: ", e$message)
return(NULL)
})
if (!is.null(search_result) && nrow(search_result) > 0) {
# Extract potential terms from titles and abstracts
terms_text <- paste(search_result$title, search_result$abstract, sep = " ")
# Simple term extraction - extract words and phrases
term_candidates <- unique(unlist(strsplit(terms_text, "[\\s,.;:()/]+|and\\s|or\\s", perl = TRUE)))
term_candidates <- term_candidates[nchar(term_candidates) > min_term_length] # Filter out short terms
# Filter out terms with ANY regex special characters completely
clean_terms <- character(0)
for (term in term_candidates) {
if (!grepl("[\\[\\]\\(\\)\\{\\}\\^\\$\\*\\+\\?\\|\\\\/]", term)) {
clean_terms <- c(clean_terms, term)
}
}
# If we have clean terms, create a dictionary
if (length(clean_terms) > 0) {
mini_dict <- data.frame(
term = clean_terms,
type = category,
id = paste0(category, "_", 1:length(clean_terms)),
source = "pubmed_mesh_search",
stringsAsFactors = FALSE
)
# Add to combined dictionary
message("Added ", nrow(mini_dict), " potential ", category, " terms")
combined_dict <- rbind(combined_dict, mini_dict)
} else {
message("No valid terms found for category: ", category)
}
} else {
message("No search results found for category: ", category)
}
}
message("Created combined dictionary with ", nrow(combined_dict), " terms")
# Sanitize the dictionary if requested
if (sanitize && nrow(combined_dict) > 0) {
combined_dict <- sanitize_dictionary(combined_dict, term_column = "term")
}
return(combined_dict)
}
#' Extract entities from text with improved efficiency using only base R
#'
#' This function provides a complete workflow for extracting entities from text
#' using dictionaries from multiple sources, with improved performance and robust error handling.
#'
#' @param text_data A data frame containing article text data.
#' @param text_column Name of the column containing text to process.
#' @param entity_types Character vector of entity types to include.
#' @param dictionary_sources Character vector of sources for entity dictionaries.
#' @param additional_mesh_queries Named list of additional MeSH queries.
#' @param sanitize Logical. If TRUE, sanitizes dictionaries before extraction.
#' @param api_key API key for UMLS access (if "umls" is in dictionary_sources).
#' @param custom_dictionary A data frame containing custom dictionary entries to
#' incorporate into the entity extraction process.
#' @param max_terms_per_type Maximum number of terms to fetch per entity type. Default is 200.
#' @param verbose Logical. If TRUE, prints detailed progress information.
#' @param batch_size Number of documents to process in a single batch. Default is 500.
#' @param parallel Logical. If TRUE, uses parallel processing when available. Default is FALSE.
#' @param num_cores Number of cores to use for parallel processing. Default is 2.
#' @param cache_dictionaries Logical. If TRUE, caches dictionaries for faster reuse. Default is TRUE.
#'
#' @return A data frame with extracted entities, their types, and positions.
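#'
#' @examples
#' \dontrun{
#' # Illustrative sketch: MeSH lookups require network access, so this is not
#' # run during checks; toy abstracts stand in for real search results.
#' articles <- data.frame(
#'   doc_id = 1:2,
#'   abstract = c("Sumatriptan is effective for acute migraine attacks.",
#'                "Propranolol reduces migraine frequency in many patients."),
#'   stringsAsFactors = FALSE
#' )
#' entities <- extract_entities_workflow(articles,
#'                                       text_column = "abstract",
#'                                       entity_types = c("disease", "drug"),
#'                                       dictionary_sources = "mesh")
#' head(entities)
#' }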
#' @export
extract_entities_workflow <- function(text_data, text_column = "abstract",
entity_types = c("disease", "drug", "gene"),
dictionary_sources = c("local", "mesh", "umls"),
additional_mesh_queries = NULL,
sanitize = TRUE,
api_key = NULL,
custom_dictionary = NULL,
max_terms_per_type = 200,
verbose = TRUE,
batch_size = 500,
parallel = FALSE,
num_cores = 2,
cache_dictionaries = TRUE) {
# Check if running in R CMD check environment
is_check <- !interactive() &&
Sys.getenv("R_CHECK_RUNNING") == "true"
# More robust check for testing environment: Sys.getenv() returns "" (never
# NULL) for unset variables, so test for a non-empty value instead
if (!is_check && nzchar(Sys.getenv("_R_CHECK_LIMIT_CORES_"))) {
is_check <- TRUE
}
# Adjust parameters if in check environment
if (is_check) {
parallel <- FALSE
num_cores <- 1
if (verbose) message("Running in R CMD check environment. Disabling parallel processing.")
}
# Ensure num_cores doesn't exceed system capabilities or reasonable limits
if (parallel) {
# Get available cores
available_cores <- parallel::detectCores()
# Limit to reasonable maximum (e.g., available - 1, not exceeding 8)
max_cores <- min(available_cores - 1, 8)
# Ensure at least 1 core
num_cores <- max(1, min(num_cores, max_cores))
} else {
num_cores <- 1
}
# Start timing the function for performance reporting
start_time <- Sys.time()
# Initialize dictionary cache environment (used only when caching is enabled,
# but always defined so it can safely be exported to parallel workers)
dict_cache <- get_dict_cache()
# Function to create a unique cache key
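# (e.g. create_cache_key("disease", "mesh") returns "disease_mesh")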
create_cache_key <- function(entity_type, source) {
paste(entity_type, source, sep = "_")
}
# Validate inputs
if (!text_column %in% colnames(text_data)) {
stop("Text column '", text_column, "' not found in the data")
}
# Setup parallel processing if requested
use_parallel <- FALSE
cl <- NULL
if (parallel) {
tryCatch({
if (!requireNamespace("parallel", quietly = TRUE)) {
warning("The 'parallel' package is required for parallel processing. Falling back to sequential processing.")
} else {
# Detect available cores if not specified
if (is.null(num_cores) || num_cores < 1) {
num_cores <- parallel::detectCores() - 1
num_cores <- max(1, num_cores) # Ensure at least 1 core
}
if (num_cores > 1) {
use_parallel <- TRUE
if (verbose) message("Setting up parallel processing with ", num_cores, " cores...")
} else {
if (verbose) message("Only 1 core available, using sequential processing.")
}
}
}, error = function(e) {
warning("Error setting up parallel processing: ", e$message, ". Falling back to sequential processing.")
})
}
# Validate dictionary sources
valid_sources <- c("local", "mesh", "umls", "pubmed")
invalid_sources <- setdiff(dictionary_sources, valid_sources)
if (length(invalid_sources) > 0) {
warning("Invalid dictionary sources: ", paste(invalid_sources, collapse = ", "),
". Valid sources are: ", paste(valid_sources, collapse = ", "))
dictionary_sources <- intersect(dictionary_sources, valid_sources)
}
# Check for UMLS API key if needed
if ("umls" %in% dictionary_sources && is.null(api_key)) {
warning("UMLS source requested but no API key provided. Skipping UMLS.")
dictionary_sources <- setdiff(dictionary_sources, "umls")
}
# Define which entity types are supported by which sources
local_dict_types <- c("disease", "drug", "gene")
mesh_dict_types <- c(
"disease", "drug", "gene", "protein", "chemical", "pathway",
"anatomy", "organism", "biological_process", "cell", "tissue",
"symptom", "diagnostic_procedure", "therapeutic_procedure",
"phenotype", "molecular_function"
)
# For each entity type, determine which sources can provide it
source_map <- list()
for (etype in entity_types) {
if (etype %in% local_dict_types) {
source_map[[etype]] <- dictionary_sources
} else {
# For expanded types, remove 'local' from sources
source_map[[etype]] <- setdiff(dictionary_sources, "local")
if (length(source_map[[etype]]) == 0 && "local" %in% dictionary_sources) {
# If only local was specified but not supported for this type, try mesh
source_map[[etype]] <- c("mesh")
if (verbose) {
message("Entity type '", etype, "' not supported by local source. Using MeSH instead.")
}
}
}
}
# Preprocess the text data if needed
if (!"doc_id" %in% colnames(text_data)) {
if (verbose) message("Adding document IDs to text data...")
text_data$doc_id <- seq_len(nrow(text_data))
}
# Create combined dictionary
if (verbose) message("Creating dictionaries for entity extraction...")
combined_dict <- NULL
# Add custom dictionary first if provided
if (!is.null(custom_dictionary) && nrow(custom_dictionary) > 0) {
if (verbose) message("Adding ", nrow(custom_dictionary), " terms from custom dictionary")
# Validate custom dictionary format
required_cols <- c("term", "type")
if (!all(required_cols %in% colnames(custom_dictionary))) {
stop("Custom dictionary must have at least columns: ", paste(required_cols, collapse = ", "))
}
# Ensure 'source' column exists
if (!"source" %in% colnames(custom_dictionary)) {
custom_dictionary$source <- "custom"
}
# Ensure 'id' column exists
if (!"id" %in% colnames(custom_dictionary)) {
custom_dictionary$id <- paste0("custom_", seq_len(nrow(custom_dictionary)))
}
combined_dict <- custom_dictionary
}
# Function to load dictionary for a single entity type
load_dict_single <- function(entity_type) {
tryCatch({
# Get the sources that support this entity type
valid_sources <- source_map[[entity_type]]
if (length(valid_sources) == 0) {
return(NULL)
}
# Try each source in order until successful
dict <- NULL
for (source in valid_sources) {
# Check cache first if enabled
if (cache_dictionaries) {
cache_key <- create_cache_key(entity_type, source)
if (exists(cache_key, envir = dict_cache)) {
dict <- get(cache_key, envir = dict_cache)
if (!is.null(dict) && nrow(dict) > 0) {
if (verbose) message(" Using cached dictionary for ", entity_type, " (", source, ")")
return(dict)
}
}
}
# Skip local source for expanded types
if (source == "local" && !(entity_type %in% local_dict_types)) {
next
}
# Check if we should use extended MeSH
extended_mesh <- (source == "mesh" && !is.null(additional_mesh_queries))
# Extract appropriate MeSH queries for this type
mesh_queries_for_type <- NULL
if (extended_mesh && entity_type %in% names(additional_mesh_queries)) {
mesh_queries_for_type <- list()
mesh_queries_for_type[[entity_type]] <- additional_mesh_queries[[entity_type]]
}
# Try to load the dictionary with error handling
tryCatch({
dict <- load_dictionary(
dictionary_type = entity_type,
source = source,
api_key = api_key,
n_terms = max_terms_per_type,
extended_mesh = (source == "mesh" && !is.null(mesh_queries_for_type)),
mesh_queries = mesh_queries_for_type,
sanitize = FALSE # We'll sanitize the combined dictionary later
)
# Store in cache if successful
if (cache_dictionaries && !is.null(dict) && nrow(dict) > 0) {
assign(cache_key, dict, envir = dict_cache)
}
# If successful, break the loop
if (!is.null(dict) && nrow(dict) > 0) {
if (verbose) message(" Added ", nrow(dict), " terms from ", entity_type, " (", source, ")")
return(dict)
}
}, error = function(e) {
if (verbose) message(" Error loading ", entity_type, " dictionary from ", source, ": ", e$message)
return(NULL)
})
}
# Return NULL if no dictionary was loaded
return(dict)
}, error = function(e) {
if (verbose) message(" Error processing ", entity_type, ": ", e$message)
return(NULL)
})
}
# Load dictionaries (parallel or sequential)
dict_list <- list()
if (use_parallel) {
if (verbose) message("Loading dictionaries in parallel...")
# Create a parallel cluster
cl <- parallel::makeCluster(num_cores)
# Export necessary functions and data to the cluster
parallel::clusterExport(cl, varlist = c("source_map", "local_dict_types",
"additional_mesh_queries", "api_key",
"max_terms_per_type", "verbose",
"cache_dictionaries", "dict_cache"),
envir = environment())
# Make sure load_dictionary is available in the cluster
dict_list <- tryCatch({
parallel::clusterEvalQ(cl, {
# Make load_dictionary available on each worker if it is not already present
if (!exists("load_dictionary", mode = "function") &&
requireNamespace("LBDiscover", quietly = TRUE)) {
load_dictionary <- LBDiscover::load_dictionary
}
NULL
})
# Use parLapply for cross-platform parallel execution; the result becomes the
# value of this tryCatch(), so the sequential fallback below is not lost
parallel::parLapply(cl, entity_types, load_dict_single)
}, error = function(e) {
warning("Error in parallel dictionary loading: ", e$message,
". Falling back to sequential processing.")
lapply(entity_types, load_dict_single)
}, finally = {
# Always stop the cluster
parallel::stopCluster(cl)
cl <- NULL
})
} else {
# Sequential dictionary loading
if (verbose) message("Loading dictionaries sequentially...")
dict_list <- lapply(entity_types, load_dict_single)
}
# Combine dictionaries using base R
for (dict in dict_list) {
if (!is.null(dict) && nrow(dict) > 0) {
if (is.null(combined_dict)) {
combined_dict <- dict
} else {
# Use base R rbind for combining
combined_dict <- rbind(combined_dict, dict)
}
}
}
# Check if combined dictionary was created
if (is.null(combined_dict) || nrow(combined_dict) == 0) {
warning("No valid dictionary terms found. Creating fallback dictionary.")
# Create a minimal fallback dictionary
combined_dict <- data.frame(
term = c("disease", "drug", "gene", "protein", "chemical", "pathway"),
id = paste0("FALLBACK_", 1:6),
type = c("disease", "drug", "gene", "protein", "chemical", "pathway"),
source = rep("fallback", 6),
stringsAsFactors = FALSE
)
}
# Remove duplicates with prioritization for custom entries
if (!is.null(custom_dictionary) && nrow(custom_dictionary) > 0) {
# Add is_custom column using base R
combined_dict$is_custom <- combined_dict$source == "custom"
# Sort the data frame to prioritize custom entries (TRUE first)
combined_dict <- combined_dict[order(combined_dict$is_custom, decreasing = TRUE), ]
# Keep first occurrence of each term (prioritizing custom entries)
combined_dict <- combined_dict[!duplicated(combined_dict$term), ]
# Remove the temporary is_custom column
combined_dict$is_custom <- NULL
} else {
# Simple deduplication with base R
combined_dict <- combined_dict[!duplicated(combined_dict$term), ]
}
if (verbose) message("Created combined dictionary with ", nrow(combined_dict), " unique terms")
# Sanitize the dictionary if requested, but preserve custom dictionary entries
if (sanitize) {
# OPTIMIZATION: Only sanitize if more than 10,000 terms to avoid overhead for small dictionaries
if (nrow(combined_dict) > 10000) {
if (verbose) message("Large dictionary detected, applying optimized sanitization...")
if (!is.null(custom_dictionary) && nrow(custom_dictionary) > 0) {
# Separate custom entries from other entries
custom_entries <- combined_dict[combined_dict$source == "custom", ]
other_entries <- combined_dict[combined_dict$source != "custom", ]
# Apply sanitization in chunks for large dictionaries
if (nrow(other_entries) > 10000) {
chunk_size <- 5000
num_chunks <- ceiling(nrow(other_entries) / chunk_size)
if (verbose) message("Processing in ", num_chunks, " chunks for efficient sanitization")
sanitized_chunks <- list()
for (i in 1:num_chunks) {
start_idx <- (i - 1) * chunk_size + 1
end_idx <- min(i * chunk_size, nrow(other_entries))
if (verbose && i %% 5 == 0) message(" Sanitizing chunk ", i, "/", num_chunks)
chunk <- other_entries[start_idx:end_idx, ]
sanitized_chunks[[i]] <- sanitize_dictionary(chunk, verbose = FALSE)
}
# Combine sanitized chunks
other_entries <- do.call(rbind, sanitized_chunks)
} else {
# Sanitize in one go for smaller dictionaries
other_entries <- sanitize_dictionary(other_entries, verbose = verbose)
}
# Recombine
combined_dict <- rbind(custom_entries, other_entries)
} else {
# No custom entries, sanitize everything
combined_dict <- sanitize_dictionary(combined_dict, verbose = verbose)
}
} else {
# Standard sanitization for reasonable-sized dictionaries
if (!is.null(custom_dictionary) && nrow(custom_dictionary) > 0) {
# Separate custom entries from other entries
custom_entries <- combined_dict[combined_dict$source == "custom", ]
other_entries <- combined_dict[combined_dict$source != "custom", ]
# Sanitize only non-custom entries
if (nrow(other_entries) > 0) {
other_entries <- sanitize_dictionary(other_entries, verbose = verbose)
}
# Recombine
combined_dict <- rbind(custom_entries, other_entries)
} else {
# No custom entries, sanitize everything
combined_dict <- sanitize_dictionary(combined_dict, verbose = verbose)
}
}
}
# Set up indexing for large dictionaries using base R
dict_size <- nrow(combined_dict)
if (dict_size > 10000) {
# Create a named vector for fast lookups
if (verbose) message("Creating index for large dictionary with ", dict_size, " terms...")
term_index <- seq_len(nrow(combined_dict))
names(term_index) <- combined_dict$term
}
# Extract entities with proper error handling
if (verbose) message("Extracting entities from ", nrow(text_data), " documents...")
# Create smaller batches to avoid memory issues
# Use dynamic batch sizing based on the number of documents and dictionary size
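# (e.g. a 20,000-term dictionary caps batches at ceiling(1e7 / 20100) = 498 documents,
# before the lower bound of 100 and the document count are applied)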
adjusted_batch_size <- min(batch_size, ceiling(10000000 / (nrow(combined_dict) + 100)))
final_batch_size <- max(100, min(adjusted_batch_size, nrow(text_data)))
num_batches <- ceiling(nrow(text_data) / final_batch_size)
if (verbose && num_batches > 1) {
message("Processing in ", num_batches, " batches (", final_batch_size, " documents per batch) to optimize memory usage")
}
# Define process_batch function for both sequential and parallel processing
process_batch <- function(batch_idx) {
tryCatch({
start_idx <- (batch_idx - 1) * final_batch_size + 1
end_idx <- min(batch_idx * final_batch_size, nrow(text_data))
# Extract entities for this batch
batch_data <- text_data[start_idx:end_idx, ]
# Extract entities
batch_entities <- extract_entities(
text_data = batch_data,
text_column = text_column,
dictionary = combined_dict,
case_sensitive = FALSE,
overlap_strategy = "priority",
sanitize_dict = FALSE # Already sanitized above
)
return(batch_entities)
}, error = function(e) {
warning("Error in entity extraction for batch ", batch_idx, ": ", e$message)
# Return empty data frame with correct structure
return(data.frame(
doc_id = integer(),
entity = character(),
entity_type = character(),
start_pos = integer(),
end_pos = integer(),
sentence = character(),
frequency = integer(),
stringsAsFactors = FALSE
))
})
}
# Process batches (parallel or sequential)
all_entities <- data.frame()
if (use_parallel && num_batches > 1) {
# Create a parallel cluster for batch processing
if (verbose) message("Processing document batches in parallel with ", num_cores, " cores...")
cl <- parallel::makeCluster(num_cores)
# Export necessary functions and data to the cluster
parallel::clusterExport(cl, varlist = c("text_data", "text_column", "combined_dict",
"final_batch_size", "extract_entities"),
envir = environment())
# Process batches in parallel
batch_results <- tryCatch({
parallel::parLapply(cl, 1:num_batches, process_batch)
}, error = function(e) {
warning("Error in parallel batch processing: ", e$message,
". Falling back to sequential processing.")
lapply(1:num_batches, process_batch)
}, finally = {
# Always stop the cluster
parallel::stopCluster(cl)
cl <- NULL
})
# Combine results
# Use do.call with rbind for base R approach
non_empty_batches <- which(sapply(batch_results, nrow) > 0)
if (length(non_empty_batches) > 0) {
all_entities <- do.call(rbind, batch_results[non_empty_batches])
} else {
all_entities <- data.frame()
}
} else {
# Process batches sequentially
for (batch_idx in 1:num_batches) {
if (verbose && (batch_idx %% 5 == 0 || batch_idx == 1 || batch_idx == num_batches)) {
message("Processing batch ", batch_idx, "/", num_batches)
}
# Process this batch
batch_entities <- process_batch(batch_idx)
# Combine with previous results
if (nrow(batch_entities) > 0) {
if (nrow(all_entities) == 0) {
all_entities <- batch_entities
} else {
# Use rbind for combining
all_entities <- rbind(all_entities, batch_entities)
}
}
# Clean up memory
rm(batch_entities)
gc(verbose = FALSE)
}
}
# Clean up any lingering cluster
if (!is.null(cl)) {
tryCatch({
parallel::stopCluster(cl)
}, error = function(e) {
# Ignore errors when stopping cluster
})
}
# Return the results
if (nrow(all_entities) == 0) {
warning("No entities found in the text")
} else {
if (verbose) {
end_time <- Sys.time()
processing_time <- difftime(end_time, start_time, units = "mins")
message("Extracted ", nrow(all_entities), " entity mentions in ",
round(processing_time, 2), " minutes")
# Count by type
entity_types_count <- table(all_entities$entity_type)
for (type in names(entity_types_count)) {
message(" ", type, ": ", entity_types_count[type])
}
}
}
return(all_entities)
}
#' Create a term-document matrix from preprocessed text
#'
#' This function creates a term-document matrix from preprocessed text data.
#' It's a simplified version of create_tdm() for direct use in models.
#'
#' @param preprocessed_data A data frame with preprocessed text data.
#' @param min_df Minimum document frequency for a term to be included.
#' @param max_df Maximum document frequency (as a proportion) for a term to be included.
#'
#' @return A term-document matrix.
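#'
#' @examples
#' \dontrun{
#' # Minimal sketch: build the matrix from preprocess_text() output
#' articles <- data.frame(
#'   doc_id = 1:2,
#'   abstract = c("Migraine with aura and photophobia.",
#'                "Sumatriptan relieves acute migraine pain."),
#'   stringsAsFactors = FALSE
#' )
#' prep <- preprocess_text(articles, text_column = "abstract")
#' tdm <- create_term_document_matrix(prep, min_df = 1, max_df = 1)
#' dim(tdm)
#' }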
#' @export
create_term_document_matrix <- function(preprocessed_data, min_df = 2, max_df = 0.9) {
# Check if terms column exists
if (!"terms" %in% colnames(preprocessed_data)) {
stop("Terms column not found in preprocessed data")
}
# Extract all unique terms
all_terms <- unique(unlist(lapply(preprocessed_data$terms, function(df) {
if (is.data.frame(df) && nrow(df) > 0) {
return(df$word)
} else {
return(character(0))
}
})))
# Create term-document matrix
tdm <- matrix(0, nrow = length(all_terms), ncol = nrow(preprocessed_data))
rownames(tdm) <- all_terms
# Fill the term-document matrix
for (i in 1:nrow(preprocessed_data)) {
terms_df <- preprocessed_data$terms[[i]]
if (is.data.frame(terms_df) && nrow(terms_df) > 0) {
for (j in 1:nrow(terms_df)) {
term <- terms_df$word[j]
count <- terms_df$count[j]
tdm[term, i] <- count
}
}
}
# Calculate document frequency
doc_freq <- rowSums(tdm > 0)
# Filter terms by document frequency
min_doc_count <- min_df
max_doc_count <- max_df * ncol(tdm)
valid_terms <- which(doc_freq >= min_doc_count & doc_freq <= max_doc_count)
if (length(valid_terms) == 0) {
stop("No terms remain after filtering by document frequency")
}
# Subset matrix to include only valid terms
tdm <- tdm[valid_terms, , drop = FALSE]
return(tdm)
}