R/alternative_models.R

#' ANC model for literature-based discovery with biomedical term filtering
#'
#' This function implements an ANC (A-B-C) discovery model that restricts both
#' the intermediate B terms and the candidate C terms to biomedical entities.
#'
#' @param co_matrix A co-occurrence matrix produced by create_cooccurrence_matrix().
#' @param a_term Character string, the source term (A).
#' @param n_b_terms Number of intermediate B terms to consider.
#' @param c_type Character string, the entity type for C terms. If NULL, all types are considered.
#' @param min_score Minimum score threshold for results.
#' @param n_results Maximum number of results to return.
#' @param enforce_biomedical_terms Logical. If TRUE, enforces strict biomedical term filtering.
#' @param b_term_types Character vector of entity types allowed for B terms.
#' @param validation_function Function to validate biomedical terms.
#'
#' @return A data frame with ranked discovery results.
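#'
#' @examples
#' \dontrun{
#' # A hypothetical usage sketch: assumes `co_matrix` was built with
#' # create_cooccurrence_matrix() and that its rownames contain "migraine".
#' res <- anc_model(co_matrix, a_term = "migraine", n_b_terms = 5,
#'                  c_type = "drug")
#' head(res)
#' }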
#' @export
anc_model <- function(co_matrix, a_term, n_b_terms = 3,
                      c_type = NULL, min_score = 0.1, n_results = 100,
                      enforce_biomedical_terms = TRUE,
                      b_term_types = c("protein", "gene", "chemical", "pathway",
                                       "drug", "disease", "biological_process"),
                      validation_function = is_valid_biomedical_entity) {

  # Check if the matrix has entity types
  has_entity_types <- !is.null(attr(co_matrix, "entity_types"))

  # Extract entity types if available
  entity_types <- if (has_entity_types) attr(co_matrix, "entity_types") else NULL

  # Check if A term exists in the matrix
  if (!a_term %in% rownames(co_matrix)) {
    stop("A-term '", a_term, "' not found in the co-occurrence matrix")
  }

  # Extract A-B associations
  a_associations <- co_matrix[a_term, ]

  # Filter B terms by removing terms with low association to A
  b_terms <- names(a_associations[a_associations > min_score])

  # Remove A term from B terms if present
  b_terms <- b_terms[b_terms != a_term]

  # Define explicit blacklist of problematic terms
  blacklisted_terms <- c(
    # Common language terms that aren't biomedical
    "the", "of", "and", "in", "to", "a", "is", "that", "for", "it", "with", "as", "was",
    "on", "be", "by", "this", "an", "we", "our", "these", "those", "which", "or", "if",
    "have", "has", "had", "are", "were", "can", "could", "would", "should", "may", "might",
    "will", "must", "also", "only", "very", "such", "so", "but", "than", "when", "where",
    "how", "what", "who", "whom", "whose", "why", "not", "all", "any", "each", "every",
    "some", "many", "few", "most", "more", "less", "other", "another", "same", "different",
    "CENTRAL", "remain", "attention", "Delay", "highlight", "highlights", "highlighting",
    "indicate", "suggests", "demonstrated", "show", "shown", "shows", "reveal", "revealed",
    "further", "findings", "results", "find", "found", "into", "across", "both", "several",
    "provide", "provides", "provided", "crucial", "critical", "important", "notably",
    "particularly", "specific", "associated", "related", "linked", "while", "however",
    "advanced", "advancing", "development", "potential", "potentially", "novel", "insights",

    # Generic research terms
    "method", "approach", "analysis", "assessment", "evaluation", "procedure", "technique",
    "protocol", "intervention", "treatment", "outcome", "result", "effect", "impact",
    "value", "study", "trial", "research", "experiment", "observation", "publication"
  )

  # Apply strict biomedical term filtering if requested
  if (enforce_biomedical_terms) {
    # Remove blacklisted terms
    b_terms <- b_terms[!tolower(b_terms) %in% tolower(blacklisted_terms)]

    # Filter B terms by entity type if available and types specified
    if (has_entity_types && !is.null(b_term_types)) {
      b_term_type_filter <- function(term) {
        if (term %in% names(entity_types)) {
          return(entity_types[term] %in% b_term_types)
        }
        return(FALSE)
      }

      b_terms <- b_terms[vapply(b_terms, b_term_type_filter, logical(1))]
    }

    # Additional biomedical entity validation using provided function
    message("Validating biomedical relevance of B terms...")
    valid_b_terms <- character(0)

    for (term in b_terms) {
      term_type <- if (has_entity_types && term %in% names(entity_types)) entity_types[term] else NULL

      # Apply validation function
      if (validation_function(term, term_type)) {
        valid_b_terms <- c(valid_b_terms, term)
      }
    }

    # Update b_terms with validated terms
    b_terms <- valid_b_terms

    message("Retained ", length(b_terms), " biomedically relevant B terms after filtering")
  }

  # If no B terms found, return empty result
  if (length(b_terms) == 0) {
    message("No suitable B terms found with association score > ", min_score)
    return(data.frame(
      a_term = character(),
      b_terms = character(),
      c_term = character(),
      a_b_scores = character(),
      b_c_scores = character(),
      anc_score = numeric(),
      stringsAsFactors = FALSE
    ))
  }

  # Sort B terms by association strength and take top n_b_terms
  sorted_b_terms <- b_terms[order(-a_associations[b_terms])]
  if (length(sorted_b_terms) > n_b_terms) {
    sorted_b_terms <- sorted_b_terms[1:n_b_terms]
  }

  # Get all terms except A and B terms
  all_terms <- rownames(co_matrix)
  potential_c_terms <- setdiff(all_terms, c(a_term, sorted_b_terms))

  # Filter by c_type if provided and entity types are available
  if (!is.null(c_type) && has_entity_types) {
    c_type_terms <- names(entity_types[entity_types == c_type])
    potential_c_terms <- intersect(potential_c_terms, c_type_terms)
  }

  # Apply the same biomedical term filtering to C terms
  if (enforce_biomedical_terms) {
    # Remove blacklisted terms from potential C terms
    potential_c_terms <- potential_c_terms[!tolower(potential_c_terms) %in% tolower(blacklisted_terms)]

    # Additional biomedical entity validation for C terms
    message("Validating biomedical relevance of C terms...")
    valid_c_terms <- character(0)

    for (term in potential_c_terms) {
      term_type <- if (has_entity_types && term %in% names(entity_types)) entity_types[term] else NULL

      # Apply validation function
      if (validation_function(term, term_type)) {
        valid_c_terms <- c(valid_c_terms, term)
      }
    }

    # Update potential_c_terms with validated terms
    potential_c_terms <- valid_c_terms

    message("Retained ", length(potential_c_terms), " biomedically relevant C terms after filtering")
  }

  # Initialize results
  results <- data.frame(
    a_term = character(),
    b_terms = character(),
    c_term = character(),
    a_b_scores = character(),
    b_c_scores = character(),
    anc_score = numeric(),
    stringsAsFactors = FALSE
  )

  # For each potential C term
  message("Analyzing ", length(potential_c_terms), " potential C terms...")
  pb <- utils::txtProgressBar(min = 0, max = length(potential_c_terms), style = 3)

  for (c_idx in seq_along(potential_c_terms)) {
    utils::setTxtProgressBar(pb, c_idx)

    c_term <- potential_c_terms[c_idx]

    # Check B-C associations for all B terms
    b_c_scores <- co_matrix[sorted_b_terms, c_term]

    # Filter for potential C terms with sufficient association
    valid_b_indices <- which(b_c_scores > min_score)

    if (length(valid_b_indices) >= 2) {  # Require at least 2 B terms
      valid_b_terms <- sorted_b_terms[valid_b_indices]
      valid_b_c_scores <- b_c_scores[valid_b_indices]
      valid_a_b_scores <- a_associations[valid_b_terms]

      # Calculate AnC score
      # Geometric mean of A-B scores and B-C scores
      anc_score <- sqrt(mean(valid_a_b_scores) * mean(valid_b_c_scores)) * length(valid_b_indices) / n_b_terms
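      # The length(valid_b_indices) / n_b_terms factor rewards C terms that
      # are reached through more of the top-ranked B intermediaries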

      # Create result entry
      results <- rbind(results, data.frame(
        a_term = a_term,
        b_terms = paste(valid_b_terms, collapse = ", "),
        c_term = c_term,
        a_b_scores = paste(round(valid_a_b_scores, 3), collapse = ", "),
        b_c_scores = paste(round(valid_b_c_scores, 3), collapse = ", "),
        anc_score = anc_score,
        stringsAsFactors = FALSE
      ))
    }
  }

  close(pb)

  # If no results found, return empty data frame
  if (nrow(results) == 0) {
    message("No AnC connections found")
    return(results)
  }

  # Sort by AnC score and limit to n_results
  results <- results[order(-results$anc_score), ]
  if (nrow(results) > n_results) {
    results <- results[1:n_results, ]
  }

  # Add entity type information if available
  if (has_entity_types) {
    results$a_type <- entity_types[results$a_term]
    results$c_type <- entity_types[results$c_term]
  }

  return(results)
}

#' Apply BITOLA-style discovery model
#'
#' This function implements a BITOLA-style discovery model based on MeSH term
#' co-occurrence and semantic type filtering.
#'
#' @param co_matrix A co-occurrence matrix produced by create_cooccurrence_matrix().
#' @param a_term Character string, the source term (A).
#' @param a_semantic_type Character string, the semantic type for A term.
#' @param c_semantic_type Character string, the semantic type for C terms.
#' @param min_score Minimum score threshold for results.
#' @param n_results Maximum number of results to return.
#'
#' @return A data frame with ranked discovery results.
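#'
#' @examples
#' \dontrun{
#' # A hypothetical usage sketch: assumes `co_matrix` carries an
#' # "entity_types" attribute and that "migraine" is typed as "disease".
#' res <- bitola_model(co_matrix, a_term = "migraine",
#'                     a_semantic_type = "disease",
#'                     c_semantic_type = "gene")
#' head(res)
#' }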
#' @export
bitola_model <- function(co_matrix, a_term, a_semantic_type = NULL,
                         c_semantic_type = NULL, min_score = 0.1,
                         n_results = 100) {

  # Check if semantic types are provided
  if (is.null(a_semantic_type) || is.null(c_semantic_type)) {
    stop("Both A and C semantic types must be provided for BITOLA model")
  }

  # Check if the matrix has entity types
  has_entity_types <- !is.null(attr(co_matrix, "entity_types"))

  if (!has_entity_types) {
    stop("Entity types must be available in the co-occurrence matrix for BITOLA model")
  }

  # Check if A term exists in the matrix
  if (!a_term %in% rownames(co_matrix)) {
    stop("A-term '", a_term, "' not found in the co-occurrence matrix")
  }

  # Get entity types
  entity_types <- attr(co_matrix, "entity_types")

  # Check that the A term has a recorded type matching the specified semantic type
  if (!a_term %in% names(entity_types) || entity_types[a_term] != a_semantic_type) {
    stop("A-term '", a_term, "' is not of semantic type '", a_semantic_type, "'")
  }

  # Extract A-B associations
  a_associations <- co_matrix[a_term, ]

  # Filter B terms by removing terms with low association to A
  b_terms <- names(a_associations[a_associations > min_score])

  # Remove A term from B terms if present
  b_terms <- b_terms[b_terms != a_term]

  # If no B terms found, return empty result
  if (length(b_terms) == 0) {
    message("No B terms found with association score > ", min_score)
    return(data.frame(
      a_term = character(),
      a_type = character(),
      b_term = character(),
      b_type = character(),
      c_term = character(),
      c_type = character(),
      a_b_score = numeric(),
      b_c_score = numeric(),
      bitola_score = numeric(),
      support = integer(),
      stringsAsFactors = FALSE
    ))
  }

  # Get potential C terms of the specified semantic type
  c_type_terms <- names(entity_types[entity_types == c_semantic_type])

  # Initialize results
  results <- data.frame(
    a_term = character(),
    a_type = character(),
    b_term = character(),
    b_type = character(),
    c_term = character(),
    c_type = character(),
    a_b_score = numeric(),
    b_c_score = numeric(),
    bitola_score = numeric(),
    support = integer(),
    stringsAsFactors = FALSE
  )

  # For each B term
  message("Analyzing ", length(b_terms), " B terms...")
  pb <- utils::txtProgressBar(min = 0, max = length(b_terms), style = 3)

  for (b_idx in seq_along(b_terms)) {
    utils::setTxtProgressBar(pb, b_idx)

    b_term <- b_terms[b_idx]
    b_type <- entity_types[b_term]

    # Get B-C associations
    b_associations <- co_matrix[b_term, ]

    # Filter C terms with sufficient connection
    potential_c_for_b <- names(b_associations[b_associations > min_score])

    # Intersect with C terms of the specified semantic type
    potential_c_for_b <- intersect(potential_c_for_b, c_type_terms)

    if (length(potential_c_for_b) > 0) {
      for (c_term in potential_c_for_b) {
        # Get scores
        a_b_score <- a_associations[b_term]
        b_c_score <- b_associations[c_term]

        # Calculate BITOLA score
        # Modified formula based on BITOLA paper
        bitola_score <- (a_b_score * b_c_score)^2

        # Create result entry
        results <- rbind(results, data.frame(
          a_term = a_term,
          a_type = a_semantic_type,
          b_term = b_term,
          b_type = b_type,
          c_term = c_term,
          c_type = c_semantic_type,
          a_b_score = a_b_score,
          b_c_score = b_c_score,
          bitola_score = bitola_score,
          support = 1,  # Will be summed later
          stringsAsFactors = FALSE
        ))
      }
    }
  }

  close(pb)

  # If no results found, return empty data frame
  if (nrow(results) == 0) {
    message("No BITOLA connections found")
    return(results)
  }

  # Aggregate results by C term (support count)
  aggregated <- aggregate(
    cbind(support, bitola_score) ~ a_term + a_type + c_term + c_type,
    data = results,
    FUN = function(x) c(sum = sum(x), max = max(x))
  )
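
  # aggregate() with a multi-value FUN stores each summary as a two-column
  # matrix (sum, max) inside the result, hence the matrix indexing below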

  # Restructure the aggregated results
  final_results <- data.frame(
    a_term = aggregated$a_term,
    a_type = aggregated$a_type,
    c_term = aggregated$c_term,
    c_type = aggregated$c_type,
    support = aggregated$support[, "sum"],
    bitola_score = aggregated$bitola_score[, "max"],
    stringsAsFactors = FALSE
  )

  # Add intermediate B terms for each A-C pair
  final_results$b_terms <- sapply(seq_len(nrow(final_results)), function(i) {
    ac_results <- results[results$a_term == final_results$a_term[i] &
                            results$c_term == final_results$c_term[i], ]

    # Sort B terms by score
    sorted_indices <- order(-ac_results$bitola_score)
    sorted_b_terms <- ac_results$b_term[sorted_indices]

    # Return comma-separated list of B terms
    paste(sorted_b_terms, collapse = ", ")
  })

  # Calculate new ranking score based on both support and BITOLA score
  final_results$ranking_score <- final_results$support * final_results$bitola_score

  # Sort by ranking score and limit to n_results
  final_results <- final_results[order(-final_results$ranking_score), ]
  if (nrow(final_results) > n_results) {
    final_results <- final_results[1:n_results, ]
  }

  return(final_results)
}

#' LSI model with enhanced biomedical term filtering and NLP verification
#'
#' This function implements a latent semantic indexing (LSI) model that
#' rigorously filters non-biomedical terms from the results to ensure clinical
#' relevance, with optional NLP-based validation as an additional filtering layer.
#'
#' @param term_doc_matrix A term-document matrix.
#' @param a_term Character string, the source term (A).
#' @param n_factors Number of factors to use in LSI.
#' @param n_results Maximum number of results to return.
#' @param enforce_biomedical_terms Logical. If TRUE, enforces strict biomedical term filtering.
#' @param c_term_types Character vector of entity types allowed for C terms.
#' @param entity_types Named vector of entity types (if NULL, will try to detect).
#' @param validation_function Function to validate biomedical terms.
#' @param min_word_length Minimum word length to include.
#' @param use_nlp Logical. If TRUE, uses NLP-based validation for biomedical terms.
#' @param nlp_threshold Numeric between 0 and 1. Minimum confidence for NLP validation.
#'
#' @return A data frame with ranked discovery results.
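#'
#' @examples
#' \dontrun{
#' # A hypothetical usage sketch: assumes `preprocessed_data` is suitable
#' # for create_tdm() and that the resulting matrix contains "migraine".
#' tdm <- create_tdm(preprocessed_data)
#' res <- lsi_model(tdm, a_term = "migraine", n_factors = 50,
#'                  enforce_biomedical_terms = FALSE)
#' head(res)
#' }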
#' @export
lsi_model <- function(term_doc_matrix, a_term, n_factors = 100, n_results = 100,
                      enforce_biomedical_terms = TRUE,
                      c_term_types = NULL,
                      entity_types = NULL,
                      validation_function = is_valid_biomedical_entity,
                      min_word_length = 3,
                      use_nlp = TRUE,
                      nlp_threshold = 0.7) {

  # Check if SVD package is available
  if (!requireNamespace("irlba", quietly = TRUE)) {
    stop("The irlba package is required for LSI. Install it with: install.packages('irlba')")
  }

  # Check if A term exists in the matrix
  if (!a_term %in% rownames(term_doc_matrix)) {
    stop("A-term '", a_term, "' not found in the term-document matrix")
  }

  # Number of factors should not exceed rank limits
  n_factors <- min(n_factors, min(nrow(term_doc_matrix), ncol(term_doc_matrix)) - 1)

  # Perform TF-IDF weighting
  # Term frequency (already in the matrix)
  # Document frequency
  doc_freq <- rowSums(term_doc_matrix > 0)

  # Inverse document frequency
  idf <- log(ncol(term_doc_matrix) / doc_freq)

  # Apply IDF weighting
  tfidf_matrix <- term_doc_matrix * idf
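
  # (multiplying a matrix by a vector of length nrow() recycles down the
  # columns, so each row i, i.e. term i, is scaled by idf[i])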

  # Perform SVD using irlba for efficiency with large matrices
  message("Performing SVD with ", n_factors, " factors...")
  svd_result <- irlba::irlba(tfidf_matrix, nv = n_factors)

  # Calculate term vectors in latent space (the nrow argument guards the
  # single-factor case, where diag(x) would otherwise build an identity matrix)
  term_vectors <- svd_result$u %*% diag(svd_result$d, nrow = length(svd_result$d))

  # Get index of A term
  a_idx <- which(rownames(term_doc_matrix) == a_term)

  # Get the A term vector in latent space
  a_vector <- term_vectors[a_idx, ]

  # Calculate cosine similarity between A term and all other terms
  # Normalize vectors
  term_vectors_norm <- term_vectors / sqrt(rowSums(term_vectors^2))
  a_vector_norm <- a_vector / sqrt(sum(a_vector^2))

  # Calculate similarities
  similarities <- term_vectors_norm %*% a_vector_norm
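
  # `similarities` is an n_terms x 1 matrix; single-bracket indexing below
  # treats it as a plain numeric vector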

  # Exclude the A term from the rankings by forcing its similarity below any
  # real cosine value
  similarities[a_idx] <- -1

  # Define an expanded blacklist of problematic terms, including common
  # academic and research-paper terminology
  blacklisted_terms <- c(
    # Common language terms that aren't biomedical
    "the", "of", "and", "in", "to", "a", "is", "that", "for", "it", "with", "as", "was",
    "on", "be", "by", "this", "an", "we", "our", "these", "those", "which", "or", "if",
    "have", "has", "had", "are", "were", "can", "could", "would", "should", "may", "might",
    "will", "must", "also", "only", "very", "such", "so", "but", "than", "when", "where",
    "how", "what", "who", "whom", "whose", "why", "not", "all", "any", "each", "every",
    "some", "many", "few", "most", "more", "less", "other", "another", "same", "different",
    "remain", "attention", "delay", "highlight", "highlights", "highlighting",
    "indicate", "suggests", "demonstrated", "show", "shown", "shows", "reveal", "revealed",
    "further", "findings", "results", "find", "found", "into", "across", "both", "several",
    "provide", "provides", "provided", "crucial", "critical", "important", "notably",
    "particularly", "specific", "associated", "related", "linked", "while", "however",
    "advanced", "advancing", "development", "potential", "potentially", "novel", "insights",
    "thus", "well", "early", "ad", "notably", "remains", "particularly", "specific",

    # Academic language and research-paper terminology
    "demonstrate", "within", "alongside", "investigate", "explain", "integrate", "elucidate",
    "certain", "debate", "state", "recapitulate", "phase", "translate", "modulate",
    "ultimate", "whole", "varied", "role", "speculate", "side", "academia",
    "considerable", "consistent", "substantial", "significant", "relevant", "important",
    "interesting", "promising", "similar", "different", "distinct", "specific", "particular",
    "major", "minor", "key", "main", "essential", "necessary", "sufficient", "adequate",
    "proper", "appropriate", "suitable", "consecutive", "simultaneous", "various",
    "variable", "concurrent", "concomitant", "overall", "entire", "whole", "optimum",
    "optimal", "ideal", "better", "best", "worse", "worst", "efficacious", "limited",
    "extensive", "intensive", "widespread", "reliable", "reproducible", "repeatable",
    "comparable", "varied", "useful", "valuable", "successful", "unsuccessful",
    "effective", "ineffective", "minimum", "maximum", "exhibit", "display", "present",
    "observe", "notice", "examine", "investigate", "explore", "clarify", "elucidate",
    "interpret", "understand", "establish", "confirm", "validate", "verify", "question",
    "challenge", "refute", "contradict", "support", "substantiate", "corroborate",
    "reinforce", "review", "summarize", "analyze", "solve", "resolve", "address",
    "tackle", "overcome", "approach", "strategy", "plan", "method", "way", "means",
    "technique", "perspective", "viewpoint", "standpoint", "opinion", "belief", "view",
    "understanding", "knowledge", "insight", "wisdom", "expertise", "mechanism",
    "process", "pathway", "proceed", "progress", "proceed", "move", "advance", "achieve",
    "accomplish", "complete", "conclude", "finalize", "implement", "execute", "identify",
    "recognition", "indication", "evidence", "proof", "report", "account", "description",
    "example", "instance", "case", "account", "description", "statement", "proposition",
    "theory", "hypothesis", "postulate", "assumption", "premise", "thesis", "conclusion",
    "deduction", "inference", "idea", "concept", "notion", "thought", "point", "matter",
    "issue", "subject", "topic", "aspect", "dimension", "element", "component", "part",
    "section", "segment", "fragment", "piece", "portion", "sample", "individual", "single",
    "separate", "isolated", "distributed", "allocated", "assigned", "documented", "reported",
    "reported", "described", "defined", "characterized", "identified", "recognized",
    "detected", "observed", "monitored", "tracked", "followed", "traced", "explored",
    "investigated", "studied", "analyzed", "evaluated", "assessed", "measured", "quantified",
    "calculated", "computed", "estimated", "approximated", "improved", "enhanced",
    "boosted", "augmented", "increased", "decreased", "reduced", "lowered", "inhibited",
    "suppressed", "attenuated", "abolished", "eliminated", "removed", "cleared", "purged",
    "figure", "table", "chart", "graph", "diagram", "schematic", "illustration", "image",
    "picture", "photo", "photograph", "drawing", "sketch", "outline", "draft", "plan",
    "design", "layout", "structure", "arrangement", "organization", "composition",
    "construction", "configuration", "formation", "constitution", "build", "composition",
    "makeup", "preparation", "formulation", "fabrication", "synthesis", "creation",
    "production", "generation", "formation", "emergence", "appearance", "manifestation",
    "expression", "articulation", "formulation", "presentation", "proposal", "suggestion",
    "recommendation", "advice", "guidance", "direction", "instruction", "indication",
    "specification", "stipulation", "requirement", "prerequisite", "precondition",
    "qualification", "condition", "standard", "criterion", "benchmark",

    # Additional problematic academic terms observed in LSI results
    "generate", "incorporate", "intricate", "facilitate", "mitigate", "implicate",
    "ensure", "enable", "enhance", "expand", "extend", "maintain", "preserve",
    "promote", "leverage", "utilize", "employ", "deploy", "apply", "implement",
    "integrate", "combine", "coordinate", "align", "adapt", "adjust", "modify",
    "tailor", "customize", "personalize", "standardize", "normalize", "conceptualize",
    "formulate", "develop", "evolve", "emerge", "derive", "originate", "construct",
    "reconstruct", "deconstruct", "synthesize", "compile", "assemble", "gather",
    "collect", "accumulate", "aggregate", "designate", "allocate", "distribute",
    "disseminate", "propagate", "widespread", "comprehensive", "thorough", "detailed",
    "precise", "accurate", "exact", "robust", "rigorous", "systematic", "methodical",
    "innovative", "creative", "novel", "unique", "diverse", "heterogeneous", "homogeneous",
    "uniform", "equivalent", "comparative", "relative", "absolute", "objective", "subjective",
    "empirical", "theoretical", "conceptual", "abstract", "concrete", "tangible", "intangible",
    "explicit", "implicit", "inherent", "intrinsic", "extrinsic", "underlying", "fundamental",
    "pioneering", "groundbreaking", "transformative", "revolutionary", "evolutionary",
    "progressive", "regressive", "retrospective", "prospective", "longitudinal", "cross-sectional",
    "temporal", "spatial", "multidimensional", "interdisciplinary", "multidisciplinary",
    "intersectional", "collaborative", "cooperative", "continuous", "discrete", "incremental",
    "gradual", "sudden", "abrupt", "dramatic", "pronounced", "subtle", "nuanced", "complex",
    "complicated", "elaborate", "intricate", "sophisticated", "advanced", "elementary",
    "rudimentary", "basic", "primary", "secondary", "tertiary", "complementary", "supplementary",
    "auxiliary", "adjunct", "peripheral", "central", "pivotal", "crucial", "definitive",
    "widespread", "selectively", "readily", "primarily", "predominantly", "frequently",
    "commonly", "typically", "generally", "usually", "always", "never", "rarely", "occasionally",
    "sometimes", "often", "continuously", "persistently", "consistently", "intermittently",
    "subsequently", "consecutively", "concurrently", "simultaneously", "consequently",
    "sequentially", "ultimately", "eventually", "finally", "traditionally", "conventionally",
    "classically", "recently", "evidently", "apparently", "supposedly", "presumably",
    "purportedly", "allegedly", "arguably", "conceivably", "seemingly", "strikingly",
    "remarkably", "notably", "markedly", "considerably", "substantially", "significantly",
    "moderately", "marginally", "slightly", "minimally", "negligibly", "profoundly",
    "deeply", "thoroughly", "entirely", "completely", "fully", "partially", "partly",
    "incompletely", "inadequately", "insufficiently", "excessively", "disproportionately",
    "outweigh", "underscore", "emphasize", "highlight", "accentuate", "exacerbate",
    "ameliorate", "worsen", "diminish", "heighten", "strengthen", "weaken", "intensify",
    "attenuate", "modulate", "regulate", "mediate", "influence", "impact", "affect",
    "effect", "induce", "trigger", "stimulate", "activate", "initiate", "commence",
    "begin", "end", "terminate", "cease", "discontinue", "continue", "persist", "endure",
    "prevail", "predominate", "recur", "recurrence", "relapse", "remission", "resolution",
    "reversal", "inversion", "conversion", "transition", "transformation", "alteration",
    "modification", "deviation", "variation", "fluctuation", "oscillation",

    # Generic research terms
    "method", "approach", "analysis", "assessment", "evaluation", "procedure", "technique",
    "protocol", "intervention", "treatment", "outcome", "result", "effect", "impact",
    "value", "study", "trial", "research", "experiment", "observation", "publication",
    "test", "measure", "detection", "identification", "classification", "characterization",
    "determination", "calculation", "examination", "investigation", "exploration",
    "screening", "monitoring", "surveillance", "survey", "review", "overview", "summary",
    "score", "grade", "rating", "ranking", "stratification", "categorization", "grouping",
    "framework", "structure", "process", "system", "model", "theory", "concept",
    "principle", "paradigm", "perspective", "viewpoint", "approach", "strategy",
    "direction", "scheme", "plan", "proposal", "project", "program", "objective",
    "aim", "goal", "target", "purpose", "intention", "motivation", "reason", "rationale",
    "justification", "significance", "importance", "relevance", "value", "benefit",
    "advantage", "merit", "worth", "quality", "attribute", "characteristic", "feature",

    # Statistical terms
    "average", "mean", "median", "mode", "range", "variance", "standard", "deviation",
    "correlation", "regression", "association", "relationship", "factor", "variable",
    "parameter", "statistic", "significance", "confidence", "interval", "margin",
    "error", "probability", "likelihood", "chance", "odds", "ratio", "proportion",
    "percentage", "fraction", "decimal", "number", "quantity", "amount", "level",
    "degree", "extent", "magnitude", "size", "dimension", "area", "volume", "duration",
    "frequency", "incidence", "prevalence", "rate", "speed", "velocity", "acceleration",
    "force", "pressure", "temperature", "dose", "concentration"
  )

  # Get all term indices
  all_indices <- seq_along(similarities)

  # Apply biomedical term filtering if requested
  if (enforce_biomedical_terms) {
    # Filter out terms with length less than min_word_length
    short_term_indices <- which(nchar(rownames(term_doc_matrix)) < min_word_length)

    # Filter out blacklisted terms (case-insensitive)
    blacklist_indices <- which(tolower(rownames(term_doc_matrix)) %in% tolower(blacklisted_terms))

    # Combine all indices to exclude
    exclude_indices <- unique(c(a_idx, short_term_indices, blacklist_indices))

    # Get remaining indices
    candidate_indices <- setdiff(all_indices, exclude_indices)

    # Validate biomedical relevance of remaining terms
    if (!is.null(validation_function)) {
      message("Validating biomedical relevance of terms...")

      valid_indices <- integer(0)
      candidate_terms <- rownames(term_doc_matrix)[candidate_indices]

      # Score each candidate term for biomedical relevance
      biomedical_scores <- numeric(length(candidate_indices))

      for (i in seq_along(candidate_indices)) {
        term <- candidate_terms[i]

        # Get entity type if available
        term_type <- NULL
        if (!is.null(entity_types) && term %in% names(entity_types)) {
          term_type <- entity_types[term]
        }

        # Apply base validation using provided function (boolean)
        base_valid <- validation_function(term, term_type)

        # NLP validation if requested
        nlp_valid <- FALSE
        if (use_nlp) {
          nlp_valid <- tryCatch({
            # Try to apply NLP-based entity recognition
            # This could be a call to spaCy, NLTK, or other NLP tools
            # For now, use the existing validate_entity_with_nlp() function
            if (exists("validate_entity_with_nlp", mode="function")) {
              validate_entity_with_nlp(term, term_type)
            } else {
              # Fallback to base validation if NLP function doesn't exist
              base_valid
            }
          }, error = function(e) {
            message("NLP validation failed for term: ", term, ". Error: ", e$message)
            return(FALSE)
          })
        }

        # Calculate biomedical relevance score based on multiple criteria
        term_score <- 0

        # Add points for passing base validation
        if (base_valid) term_score <- term_score + 0.5

        # Add points for passing NLP validation
        if (nlp_valid) term_score <- term_score + 0.5
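
        # With the default nlp_threshold of 0.7, a term clears the cutoff if
        # it passes both validators (0.5 + 0.5), one validator plus at least
        # one pattern-category match below (0.5 + 0.25), or three of the four
        # pattern categories on its own (3 * 0.25)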

        # Check for known biomedical entity patterns
        term_lower <- tolower(term)

        # Biomedical entity patterns
        biomed_patterns <- list(
          # Disease/pathology patterns
          "disease" = c("itis$", "emia$", "oma$", "pathy$", "osis$", "algia$", "disease", "disorder",
                        "syndrome", "infection", "injury", "lesion", "trauma", "cancer", "tumor",
                        "carcinoma", "sarcoma", "lymphoma", "melanoma", "leukemia"),

          # Anatomical/physiological patterns
          "anatomy" = c("nerve", "neural", "neuron", "brain", "spinal", "cardiac", "heart", "liver",
                        "renal", "kidney", "pulmonary", "lung", "vascular", "artery", "vein", "blood",
                        "tissue", "muscle", "bone", "joint", "cell", "plasma", "serum"),

          # Molecular/biochemical patterns
          "molecular" = c("protein", "peptide", "enzyme", "gene", "receptor", "antibody", "hormone",
                          "cytokine", "kinase", "ase$", "factor", "growth", "channel", "transporter",
                          "signaling", "pathway"),

          # Pharmaceutical/therapeutic patterns
          "pharma" = c("drug", "therapy", "treatment", "medication", "dose", "clinical", "therapeutic",
                       "inhibitor", "agonist", "antagonist", "blocker", "vaccine", "antibiotics",
                       "mab$", "statin$", "pril$", "sone$", "olol$")
        )

        # Check for pattern matches
        for (category in names(biomed_patterns)) {
          for (pattern in biomed_patterns[[category]]) {
            if (grepl(pattern, term_lower)) {
              term_score <- term_score + 0.25  # Add points for pattern matches
              break  # Only count one match per category
            }
          }
        }

        # Store the score
        biomedical_scores[i] <- term_score
      }

      # Select terms scoring at or above the threshold
      valid_indices <- candidate_indices[biomedical_scores >= nlp_threshold]

      message("Retained ", length(valid_indices), " biomedically relevant terms after validation")
    }

    # Update indices for ranking; if no validation function was supplied,
    # fall back to the blacklist- and length-filtered candidate set
    all_indices <- if (!is.null(validation_function)) valid_indices else candidate_indices
  }

  # Get top similar terms from filtered set
  top_indices <- all_indices[order(similarities[all_indices], decreasing = TRUE)]
  top_indices <- head(top_indices, min(n_results, length(top_indices)))

  # Extract final results
  top_terms <- rownames(term_doc_matrix)[top_indices]
  top_scores <- similarities[top_indices]

  # Create result data frame
  results <- data.frame(
    a_term = rep(a_term, length(top_terms)),
    c_term = top_terms,
    lsi_similarity = top_scores,
    stringsAsFactors = FALSE
  )

  # Add entity type information if available
  if (!is.null(entity_types)) {
    results$c_type <- vapply(results$c_term, function(term) {
      if (term %in% names(entity_types)) entity_types[[term]] else NA_character_
    }, character(1))

    # If c_term_types is specified, filter results accordingly
    if (!is.null(c_term_types) && length(c_term_types) > 0) {
      results <- results[results$c_type %in% c_term_types | is.na(results$c_type), ]
    }
  }

  return(results)
}

#' Create a term-document matrix from preprocessed text
#'
#' This function creates a term-document matrix from preprocessed text data.
#'
#' @param preprocessed_data A data frame with preprocessed text data.
#' @param min_df Minimum document frequency for a term to be included.
#' @param max_df Maximum document frequency (as a proportion) for a term to be included.
#'
#' @return A term-document matrix.
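#'
#' @examples
#' # A minimal self-contained sketch: `preprocessed_data` is assumed to have
#' # one row per document and a list-column `terms` of data frames with
#' # `word` and `count` columns
#' docs <- data.frame(doc_id = 1:2)
#' docs$terms <- list(
#'   data.frame(word = c("migraine", "serotonin"), count = c(3, 1)),
#'   data.frame(word = c("serotonin", "receptor"), count = c(2, 4))
#' )
#' tdm <- create_tdm(docs, min_df = 1, max_df = 1.0)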
#' @export
create_tdm <- function(preprocessed_data, min_df = 2, max_df = 0.9) {

  # Check if terms column exists
  if (!"terms" %in% colnames(preprocessed_data)) {
    stop("Terms column not found in preprocessed data")
  }

  # Extract all unique terms
  all_terms <- unique(unlist(lapply(preprocessed_data$terms, function(df) {
    if (is.data.frame(df) && nrow(df) > 0) {
      return(df$word)
    } else {
      return(character(0))
    }
  })))

  # Create term-document matrix
  tdm <- matrix(0, nrow = length(all_terms), ncol = nrow(preprocessed_data))
  rownames(tdm) <- all_terms
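
  # Note: this builds a dense base-R matrix; for large corpora a sparse
  # representation (e.g. Matrix::sparseMatrix()) would be more memory-efficient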

  # Fill the term-document matrix
  for (i in seq_len(nrow(preprocessed_data))) {
    terms_df <- preprocessed_data$terms[[i]]
    if (is.data.frame(terms_df) && nrow(terms_df) > 0) {
      for (j in seq_len(nrow(terms_df))) {
        term <- terms_df$word[j]
        count <- terms_df$count[j]
        tdm[term, i] <- count
      }
    }
  }

  # Calculate document frequency
  doc_freq <- rowSums(tdm > 0)

  # Filter terms by document frequency
  min_doc_count <- min_df
  max_doc_count <- max_df * ncol(tdm)

  valid_terms <- which(doc_freq >= min_doc_count & doc_freq <= max_doc_count)

  if (length(valid_terms) == 0) {
    stop("No terms remain after filtering by document frequency")
  }

  # Subset matrix to include only valid terms
  tdm <- tdm[valid_terms, , drop = FALSE]

  return(tdm)
}
