# R/abc_model.R

#' Create co-occurrence matrix without explicit entity type constraints
#'
#' This function creates a co-occurrence matrix from entity data
#' while preserving entity type information as an attribute without
#' enforcing type constraints.
#'
#' @param entity_data A data frame with document IDs and entities.
#' @param doc_id_col Name of the column containing document IDs.
#' @param entity_col Name of the column containing entity names.
#' @param count_col Name of the column containing entity counts (optional).
#' @param type_col Name of the column containing entity types (optional).
#' @param normalize Logical. If TRUE, normalizes the co-occurrence matrix.
#' @param normalization_method Method for normalization ("cosine", "jaccard", or "dice").
#'
#' @return A co-occurrence matrix with entity types stored as an attribute.
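#'
#' @examples
#' # A minimal sketch on toy data; column names follow the defaults above
#' \dontrun{
#' toy <- data.frame(
#'   doc_id = c("d1", "d1", "d2", "d2"),
#'   entity = c("migraine", "serotonin", "migraine", "CGRP"),
#'   entity_type = c("disease", "chemical", "disease", "protein")
#' )
#' cm <- create_comat(toy, normalize = TRUE, normalization_method = "cosine")
#' attr(cm, "entity_types")
#' }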
#' @export
create_comat <- function(entity_data,
                         doc_id_col = "doc_id",
                         entity_col = "entity",
                         count_col = NULL,
                         type_col = "entity_type",
                         normalize = TRUE,
                         normalization_method = c("cosine", "jaccard", "dice")) {

  # Match normalization method
  normalization_method <- match.arg(normalization_method)

  # Check if required columns exist
  required_cols <- c(doc_id_col, entity_col)
  if (!all(required_cols %in% colnames(entity_data))) {
    stop("Required columns not found in the data: ",
         paste(required_cols[!required_cols %in% colnames(entity_data)], collapse = ", "))
  }

  # Check if type column exists and store this information
  has_types <- type_col %in% colnames(entity_data)
  if (!has_types) {
    message("Entity type column '", type_col, "' not found. Co-occurrence matrix will have no type information.")
  }

  # Filter out any rows with NA in required columns
  valid_rows <- !is.na(entity_data[[doc_id_col]]) & !is.na(entity_data[[entity_col]])
  if (has_types) {
    valid_rows <- valid_rows & !is.na(entity_data[[type_col]])
  }

  # If count_col is provided, also filter rows with NA counts
  if (!is.null(count_col) && count_col %in% colnames(entity_data)) {
    valid_rows <- valid_rows & !is.na(entity_data[[count_col]])
  }

  # Apply filtering
  entity_data <- entity_data[valid_rows, ]

  # Check for empty data after filtering
  if (nrow(entity_data) == 0) {
    stop("No valid data after filtering NA values")
  }

  # Extract unique documents and entities
  docs <- unique(entity_data[[doc_id_col]])
  entities <- unique(entity_data[[entity_col]])

  # Ensure docs and entities are character vectors
  docs <- as.character(docs)
  entities <- as.character(entities)

  # Extract entity types if available
  if (has_types) {
    # Create a data frame to handle multiple types per entity
    entity_type_df <- unique(entity_data[, c(entity_col, type_col)])

    # If there are multiple types for the same entity, keep them all
    entity_types <- tapply(entity_type_df[[type_col]], entity_type_df[[entity_col]],
                           function(x) paste(sort(unique(x)), collapse = "|"))
  }

  # Check if Matrix package is available for sparse matrix
  if (!requireNamespace("Matrix", quietly = TRUE)) {
    stop("The Matrix package is required. Install it with: install.packages('Matrix')")
  }

  # Create document-term incidence matrix (binary or weighted)
  message("Building entity-document matrix...")
  dtm <- Matrix::sparseMatrix(
    i = match(entity_data[[doc_id_col]], docs),
    j = match(entity_data[[entity_col]], entities),
    x = if (!is.null(count_col) && count_col %in% colnames(entity_data)) {
      entity_data[[count_col]]
    } else {
      rep(1, nrow(entity_data))
    },
    dims = c(length(docs), length(entities)),
    dimnames = list(docs, entities)
  )

  # Calculate co-occurrence matrix using matrix multiplication
  message("Calculating co-occurrence matrix...")
  co_matrix <- Matrix::t(dtm) %*% dtm

  # Record each entity's frequency from the diagonal before it is cleared;
  # it is needed for normalization and is stored as an attribute below
  entity_freq <- Matrix::diag(co_matrix)

  # Set diagonal to zero (entities don't co-occur with themselves)
  diag(co_matrix) <- 0

  # Normalize if requested
  if (normalize) {
    message("Normalizing co-occurrence matrix using ", normalization_method, " method...")

    if (normalization_method == "cosine") {
      # Cosine normalization: co_ij / sqrt(freq_i * freq_j)
      norm_matrix <- sqrt(outer(entity_freq, entity_freq))
      # Avoid division by zero
      norm_matrix[norm_matrix == 0] <- 1
      # Normalize
      co_matrix <- co_matrix / norm_matrix

    } else if (normalization_method == "jaccard") {
      # Jaccard coefficient: co_ij / (freq_i + freq_j - co_ij)
      for (i in seq_len(nrow(co_matrix))) {
        for (j in seq_len(ncol(co_matrix))) {
          if (i != j && co_matrix[i, j] > 0) {
            co_matrix[i, j] <- co_matrix[i, j] /
              (entity_freq[i] + entity_freq[j] - co_matrix[i, j])
          }
        }
      }

    } else if (normalization_method == "dice") {
      # Dice coefficient: 2 * co_ij / (freq_i + freq_j)
      for (i in seq_len(nrow(co_matrix))) {
        for (j in seq_len(ncol(co_matrix))) {
          if (i != j && co_matrix[i, j] > 0) {
            co_matrix[i, j] <- (2 * co_matrix[i, j]) /
              (entity_freq[i] + entity_freq[j])
          }
        }
      }
    }
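
    # Worked example (sketch): if entities i and j each occur in 4 documents
    # and co-occur in 2, then cosine = 2/sqrt(4*4) = 0.5,
    # jaccard = 2/(4+4-2) = 1/3, and dice = (2*2)/(4+4) = 0.5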
  }

  # Add entity types as attribute if available
  if (has_types) {
    attr(co_matrix, "entity_types") <- entity_types
  }

  # Add entity frequencies as attribute
  attr(co_matrix, "entity_freq") <- entity_freq

  # Add metadata
  attr(co_matrix, "metadata") <- list(
    n_docs = length(docs),
    n_entities = length(entities),
    has_types = has_types,
    normalization = if (normalize) normalization_method else "none"
  )

  return(co_matrix)
}

#' Determine if a term is likely a specific biomedical entity with improved accuracy
#'
#' @param term Character string, the term to check
#' @param claimed_type Character string, the claimed entity type of the term
#' @return Logical, TRUE if the term is likely a valid biomedical entity, FALSE otherwise
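#'
#' @examples
#' # Illustrative calls; the expected results follow the rules encoded below
#' is_valid_biomedical_entity("HPLC", "chemical")    # FALSE: known method acronym
#' is_valid_biomedical_entity("HPLC", "method")      # TRUE
#' is_valid_biomedical_entity("migraine", "disease") # TRUE
#' is_valid_biomedical_entity("europe")              # FALSE: geographic location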
#' @export
is_valid_biomedical_entity <- function(term, claimed_type = NULL) {
  # Skip empty terms
  if (is.null(term) || is.na(term) || term == "") {
    return(FALSE)
  }

  # Convert to lowercase for case-insensitive matching
  term_lower <- tolower(term)
  if (!is.null(claimed_type)) {
    claimed_type <- tolower(claimed_type)
  }

  # Dictionary of specific problematic acronyms and their correct types
  acronym_corrections <- list(
    # Analytical techniques/methods that are often misclassified as chemicals
    "faers" = "method",              # FDA Adverse Event Reporting System
    "bcpnn" = "method",              # Bayesian Confidence Propagation Neural Network
    "uplc" = "method",               # Ultra Performance Liquid Chromatography
    "frap" = "method",               # Fluorescence Recovery After Photobleaching
    "hplc" = "method",               # High Performance Liquid Chromatography
    "lc-ms" = "method",              # Liquid Chromatography-Mass Spectrometry
    "gc-ms" = "method",              # Gas Chromatography-Mass Spectrometry
    "maldi" = "method",              # Matrix-Assisted Laser Desorption/Ionization
    "elisa" = "method",              # Enzyme-Linked Immunosorbent Assay
    "ft-ir" = "method",              # Fourier Transform Infrared Spectroscopy
    "nmr" = "method",                # Nuclear Magnetic Resonance
    "pcr" = "method",                # Polymerase Chain Reaction
    "sem" = "method",                # Scanning Electron Microscopy
    "tem" = "method",                # Transmission Electron Microscopy
    "xrd" = "method",                # X-Ray Diffraction
    "saxs" = "method",               # Small-Angle X-ray Scattering
    "uv-vis" = "method",             # Ultraviolet-Visible Spectroscopy
    "ms" = "method",                 # Mass Spectrometry
    "ms/ms" = "method",              # Tandem Mass Spectrometry
    "lc" = "method",                 # Liquid Chromatography
    "gc" = "method",                 # Gas Chromatography
    "tga" = "method",                # Thermogravimetric Analysis
    "dsc" = "method",                # Differential Scanning Calorimetry
    "uv" = "method",                 # Ultraviolet
    "ir" = "method",                 # Infrared
    "rna-seq" = "method",            # RNA Sequencing
    "qtof" = "method",               # Quadrupole Time-of-Flight
    "mri" = "method",                # Magnetic Resonance Imaging
    "ct" = "method",                 # Computed Tomography
    "pet" = "method",                # Positron Emission Tomography
    "spect" = "method",              # Single-Photon Emission Computed Tomography
    "ecg" = "method",                # Electrocardiogram
    "eeg" = "method",                # Electroencephalogram
    "emg" = "method",                # Electromyography
    "fmri" = "method",               # Functional Magnetic Resonance Imaging
    "qsar" = "method",               # Quantitative Structure-Activity Relationship
    "qspr" = "method",               # Quantitative Structure-Property Relationship

    # Common biostatistical methods incorrectly classified as chemicals
    "anova" = "method",              # Analysis of Variance
    "ancova" = "method",             # Analysis of Covariance
    "manova" = "method",             # Multivariate Analysis of Variance
    "pca" = "method",                # Principal Component Analysis
    "sem" = "method",                # Structural Equation Modeling
    "glm" = "method",                # Generalized Linear Model
    "lda" = "method",                # Linear Discriminant Analysis
    "svm" = "method",                # Support Vector Machine
    "ann" = "method",                # Artificial Neural Network
    "kmeans" = "method",             # K-means clustering
    "roc" = "method",                # Receiver Operating Characteristic
    "auc" = "method",                # Area Under the Curve

    # Database and algorithm acronyms
    "kegg" = "database",             # Kyoto Encyclopedia of Genes and Genomes
    "smiles" = "method",             # Simplified Molecular-Input Line-Entry System
    "blast" = "method",              # Basic Local Alignment Search Tool
    "mace" = "method",               # Major Adverse Cardiac Events

    # Correct classifications for symptoms
    "pain" = "symptom",
    "headache" = "symptom",
    "migraine" = "disease",
    "nausea" = "symptom",
    "vomiting" = "symptom",
    "dizziness" = "symptom",
    "fatigue" = "symptom",
    "weakness" = "symptom",
    "aura" = "symptom",
    "photophobia" = "symptom",
    "phonophobia" = "symptom",

    # Proteins and receptors
    "receptor" = "protein",
    "receptors" = "protein",
    "channel" = "protein",
    "channels" = "protein",
    "transporter" = "protein",
    "transporters" = "protein",

    # Biological processes
    "inflammation" = "biological_process",
    "signaling" = "biological_process",
    "activation" = "biological_process",
    "inhibition" = "biological_process",
    "regulation" = "biological_process",
    "phosphorylation" = "biological_process"
  )

  # If the term is an acronym in our dictionary, check against claimed_type
  if (term_lower %in% names(acronym_corrections)) {
    correct_type <- acronym_corrections[[term_lower]]

    # If a claimed type is provided, check if it matches our correction
    if (!is.null(claimed_type)) {
      # Return TRUE if it matches, FALSE otherwise
      return(claimed_type == correct_type ||
               # Also allow method to be diagnostic_procedure or therapeutic_procedure
               (correct_type == "method" &&
                  (claimed_type == "diagnostic_procedure" || claimed_type == "therapeutic_procedure")))
    } else {
      # If no claimed type provided, return TRUE (it is a valid biomedical term)
      return(TRUE)
    }
  }

  # STAGE 1: Basic text pattern disqualifiers

  # Very short terms are rarely valid biomedical entities unless they're acronyms
  if (nchar(term) < 3 && !grepl("^[A-Z0-9]{2}$", term)) {
    return(FALSE)
  }

  # Terms consisting solely of numbers
  if (grepl("^[0-9]+$", term)) {
    return(FALSE)
  }

  # List of geographic locations that should never be biomedical entities
  geographic_locations <- c(
    "africa", "america", "asia", "australia", "europe", "north america", "south america",
    "central america", "western europe", "eastern europe", "northern europe", "southern europe",
    "middle east", "southeast asia", "east asia", "central asia", "south asia", "north africa",
    "sub-saharan africa", "oceania", "antarctica", "arctic", "caribbean", "mediterranean",
    "scandinavia", "benelux", "balkans", "pacific", "atlantic", "central europe",
    "usa", "china", "japan", "germany", "uk", "france", "italy", "spain", "russia",
    "brazil", "india", "canada", "mexico", "australia", "switzerland", "sweden", "norway"
  )

  # Immediately reject if the term is a geographic location
  if (term_lower %in% geographic_locations) {
    return(FALSE)
  }

  # Specific terms that should be rejected based on our examples
  problematic_specific_terms <- c(
    "europe", "vehicle", "optimization", "retention", "malformation" # Terms mentioned in the issues
  )

  if (term_lower %in% problematic_specific_terms) {
    # Special exception for malformation if it's being claimed as a disease
    if (term_lower == "malformation" && !is.null(claimed_type) && claimed_type == "disease") {
      # Allow "malformation" when it's claimed to be a disease
      return(TRUE)
    }
    return(FALSE)
  }

  # Very common English words and non-scientific terms (expanded list)
  common_words <- c(
    # Original list
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not", "on",
    "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from", "they", "we",
    "say", "her", "she", "or", "an", "will", "my", "one", "all", "would", "there", "their",
    "what", "so", "up", "out", "if", "about", "who", "get", "which", "go", "me", "when", "make",
    "can", "like", "time", "no", "just", "him", "know", "take", "people", "into", "year", "your",
    "good", "some", "could", "them", "see", "other", "than", "then", "now", "look", "only",
    "come", "its", "over", "think", "also", "back", "after", "use", "two", "how", "our", "work",
    "first", "well", "way", "even", "new", "want", "because", "any", "these", "give", "day",
    "most", "us", "very", "although", "much", "should", "still", "something", "find", "many",

    # Extended list of non-scientific terms
    "through", "more", "before", "those", "between", "same", "another", "around", "while",
    "however", "therefore", "furthermore", "moreover", "consequently", "nevertheless",
    "accordingly", "thus", "hence", "meanwhile", "subsequently", "indeed", "instead", "likewise",
    "namely", "regardless", "similarly", "specifically", "undoubtedly", "whereas", "mean",
    "analysis", "result", "method", "find", "show", "increase", "decrease", "effect", "study",
    "research", "data", "information", "measure", "value", "level", "report", "test", "change",
    "control", "development", "management", "system", "process", "model", "determine", "identify",
    "observed", "recorded", "analyzed", "evaluated", "assessed", "examined", "investigated",
    "considered", "described", "presented", "demonstrated", "indicated", "suggested", "revealed",

    # Demographic/socioeconomic terms
    "sociodemographic", "demographic", "social", "economic", "education", "income", "status",
    "cultural", "ethical", "society", "community", "population", "questionnaire", "survey",
    "interview", "assessment", "scale", "score", "index", "measurement", "evaluation",
    "analysis", "nationality", "ethnicity", "race", "gender", "sex", "age", "occupation",
    "employment", "marital", "household", "residence", "urban", "rural", "metropolitan",
    "suburban", "literacy", "socioeconomic",

    # Statistical terms
    "significant", "correlation", "regression", "association", "relationship", "analysis",
    "statistical", "clinically", "percentage", "proportion", "ratio", "factor", "variable",
    "parameter", "confidence", "interval", "probability", "likelihood", "odds", "risk",
    "hazard", "prevalence", "incidence", "rate", "frequency", "distribution", "sample",
    "population", "cohort", "group", "control", "case", "participant", "subject", "patient",
    "individual", "person", "people", "characteristic", "feature", "aspect", "element",
    "component", "quality", "quantity", "measure", "metric", "indicator", "predictor",
    "outcome", "result", "finding", "evidence",

    # Additional problematic terms from the example
    "vehicle", "optimization", "retention", "europe", "africa", "asia", "america", "australia"
  )

  if (term_lower %in% common_words) {
    return(FALSE)
  }

  # STAGE 2: Type-specific validation (positive identification)
  if (!is.null(claimed_type)) {
    # Check if term matches characteristics of claimed type
    # This is a positive identification approach - we check if the term has
    # characteristics typical of its claimed type

    # Is an acronym (common for genes, proteins, etc.)
    is_acronym <- grepl("^[A-Z0-9]{2,}$", term)

    # Check for commonly misclassified acronyms (analytical methods, etc.)
    if (is_acronym && claimed_type == "chemical") {
      # List of commonly misclassified analytical method acronyms
      method_acronyms <- c(
        "FAERS", "BCPNN", "UPLC", "FRAP", "HPLC", "LCMS", "GCMS", "MALDI",
        "ELISA", "FTIR", "NMR", "PCR", "SEM", "TEM", "XRD", "SAXS", "UVVIS",
        "MS", "MSMS", "LC", "GC", "TGA", "DSC", "UV", "IR", "RNASEQ", "QTOF",
        "MRI", "CT", "PET", "SPECT", "ECG", "EEG", "EMG", "FMRI", "QSAR", "QSPR",
        "ANOVA", "ANCOVA", "MANOVA", "PCA", "GLM", "LDA", "SVM", "ANN"
      )

      # Check if the acronym is in our list of analytical methods
      if (toupper(term) %in% method_acronyms) {
        return(FALSE)  # This is not a chemical, but an analytical method
      }
    }

    # Type-specific checks with expanded patterns
    if (claimed_type == "gene") {
      # Genes often match these patterns
      gene_patterns <- c(
        "gene", "receptor", "factor", "kinase", "transcription", "regulatory",
        "promoter", "repressor", "activator", "enhancer", "suppressor", "oncogene",
        "family", "homolog", "ortholog", "paralog", "allele", "mutant", "variant",
        "dna", "rna", "nucleotide", "sequence", "locus", "chromosome", "genome",
        "exon", "intron", "codon", "amplification", "deletion", "insertion",
        "duplication", "polymorphism", "snp", "mutation", "translocation", "inversion"
      )

      # Check for common gene naming patterns (e.g., BRCA1, TP53)
      has_gene_pattern <- any(sapply(gene_patterns, function(p) grepl(p, term_lower)))

      # Many genes are 2-5 letter acronyms followed by numbers
      is_typical_gene_format <- grepl("^[A-Z]{2,5}[0-9]*$", term)

      return(is_acronym || has_gene_pattern || is_typical_gene_format)
    }
    else if (claimed_type == "protein") {
      # Proteins often match these patterns
      protein_patterns <- c(
        "protein", "enzyme", "receptor", "channel", "transporter", "carrier", "hormone",
        "cytokine", "chemokine", "antibody", "immunoglobulin", "kinase", "phosphatase",
        "protease", "ligase", "ase$", "in$", "globulin", "albumin", "peptide", "factor",
        "subunit", "domain", "chain", "complex", "binding", "helicase", "reductase",
        "transferase", "polymerase", "dehydrogenase", "oxidase", "integrin", "fibrinogen",
        "collagen", "elastin", "myosin", "actin", "globin", "hemoglobin", "thrombin",
        "trypsin", "pepsin", "lipase", "amylase", "catalase", "lactase", "synthetase"
      )

      has_protein_pattern <- any(sapply(protein_patterns, function(p) grepl(p, term_lower)))

      # Special case: "receptor" and "receptors" should always be proteins
      if (term_lower == "receptor" || term_lower == "receptors") {
        return(TRUE)
      }

      return(is_acronym || has_protein_pattern)
    }
    else if (claimed_type == "drug") {
      # Common drug name suffixes
      drug_suffixes <- c(
        "caine$", "mycin$", "oxacin$", "dronate$", "olol$", "pril$", "sartan$", "mab$",
        "nib$", "gliptin$", "prazole$", "vastatin$", "dine$", "zosin$", "parin$", "ide$",
        "ane$", "ene$", "azole$", "azepam$", "idine$", "dipine$", "tadine$", "rubicin$",
        "citabine$", "mustine$", "phylline$", "racil$", "profen$", "sulfa$", "micin$",
        "fungin$", "nacin$", "bicin$", "trexate$", "pamide$", "semide$", "setron$",
        "ridone$", "tidine$", "afil$", "lukast$", "xaban$", "orphan$", "tretin$",
        "stigmine$", "curium$", "parib$", "tinib$", "cycline$", "tinel$", "cereb$",
        "navir$", "stat$", "thiazide$", "fibrate$", "glumide$", "glitazone$"
      )

      # Common drug classes
      drug_classes <- c(
        "antibiotic", "inhibitor", "antagonist", "agonist", "blocker", "vaccine",
        "antidepressant", "antipsychotic", "antiepileptic", "sedative", "stimulant",
        "antihistamine", "analgesic", "hormone", "antiviral", "anticancer",
        "antihypertensive", "antiinflammatory", "antidiabetic", "anticoagulant",
        "antithrombotic", "antiemetic", "anticonvulsant", "antiarrhythmic",
        "medication", "medicine", "drug", "tablet", "capsule", "solution", "injection",
        "infusion", "suspension", "syrup", "elixir", "tincture", "suppository",
        "ointment", "cream", "lotion", "gel", "patch", "implant", "spray", "inhaler",
        "antibacterial", "antifungal", "antimalarial", "antiparasitic", "antitussive",
        "bronchodilator", "decongestant", "expectorant", "laxative", "diuretic",
        "antispasmodic", "antiseptic", "anesthetic", "anxiolytic", "hypnotic",
        "antihypertensive", "cardiotonic", "vasodilator", "vasoconstrictor", "statin",
        "cytotoxic", "immunosuppressant", "immunomodulator", "antiretroviral",
        "antiepileptic", "antiemetic", "antimigraine", "muscle relaxant"
      )

      has_drug_suffix <- any(sapply(drug_suffixes, function(s) grepl(s, term_lower)))
      has_drug_class <- any(sapply(drug_classes, function(cls) grepl(cls, term_lower)))

      return(has_drug_suffix || has_drug_class)
    }
    else if (claimed_type == "disease") {
      # Common disease patterns
      disease_patterns <- c(
        "disease$", "disorder$", "syndrome$", "itis$", "emia$", "pathy$", "oma$", "osis$",
        "iasis$", "itis$", "algia$", "cancer", "tumor", "tumour", "infection", "deficiency",
        "failure", "dysfunction", "lesion", "malformation", "abnormality", "poisoning",
        "injury", "trauma", "stroke", "attack", "seizure", "allergy", "addiction",
        "sclerosis", "palsy", "dystrophy", "atrophy", "hypertrophy", "hyperplasia",
        "hypoplasia", "dysplasia", "neoplasia", "carcinoma", "sarcoma", "leukemia",
        "lymphoma", "melanoma", "adenoma", "hepatoma", "nephroma", "retinopathy",
        "neuropathy", "myopathy", "encephalopathy", "vasculopathy", "arthropathy",
        "gastritis", "colitis", "hepatitis", "nephritis", "bronchitis", "sinusitis",
        "dermatitis", "meningitis", "encephalitis", "myocarditis", "pancreatitis",
        "anemia", "leukemia", "thrombocytopenia", "neutropenia", "lymphopenia",
        "hyperglycemia", "hypoglycemia", "hyperkalemia", "hypokalemia", "hypernatremia",
        "hyponatremia", "hypercalcemia", "hypocalcemia", "acidosis", "alkalosis",
        "fever", "hypertension", "hypotension", "tachycardia", "bradycardia", "arrhythmia"
      )

      has_disease_pattern <- any(sapply(disease_patterns, function(p) grepl(p, term_lower)))

      # Special case: reject if term is "receptor" or "receptors" claimed as disease
      if (term_lower == "receptor" || term_lower == "receptors") {
        return(FALSE)
      }

      # Special case: "malformation" should be allowed as a disease
      if (term_lower == "malformation") {
        return(TRUE)
      }

      # Special case: "migraine" is a disease
      if (term_lower == "migraine") {
        return(TRUE)
      }

      return(has_disease_pattern)
    }
    else if (claimed_type == "chemical") {
      # Common chemical patterns
      chemical_patterns <- c(
        "acid", "oxide", "chloride", "bromide", "iodide", "fluoride", "hydroxide",
        "carbonate", "sulfate", "nitrate", "phosphate", "acetate", "citrate", "sulfide",
        "amine", "amide", "ester", "ether", "alcohol", "phenol", "ketone", "aldehyde",
        "hydrocarbon", "lipid", "carbohydrate", "steroid", "alkaloid", "glycoside",
        "amino acid", "nucleotide", "element", "compound", "metal", "ion", "isotope",
        "molecule", "solvent", "reagent", "catalyst", "polymer", "monomer", "dimer",
        "anhydride", "anion", "cation", "salt", "base", "gas", "solution", "suspension",
        "emulsion", "colloid", "crystal", "precipitate", "solid", "liquid", "vapor",
        "distillate", "extract", "benzene", "methane", "ethane", "propane", "butane",
        "pentane", "hexane", "heptane", "octane", "nonane", "decane", "methanol",
        "ethanol", "propanol", "butanol", "glucose", "fructose", "galactose", "mannose",
        "sucrose", "lactose", "maltose", "cellulose", "starch", "glycogen", "protein",
        "peptide", "amino", "glycine", "alanine", "valine", "leucine", "isoleucine",
        "proline", "phenylalanine", "tyrosine", "tryptophan", "serine", "threonine",
        "cysteine", "methionine", "asparagine", "glutamine", "aspartate", "glutamate",
        "lysine", "arginine", "histidine", "cholesterol", "testosterone", "estrogen",
        "progesterone", "cortisol", "aldosterone", "adrenaline", "noradrenaline",
        "dopamine", "serotonin", "histamine", "acetylcholine", "adenosine", "guanosine",
        "cytidine", "thymidine", "uridine", "atp", "adp", "amp", "gtp", "gdp", "gmp"
      )

      has_chemical_pattern <- any(sapply(chemical_patterns, function(p) grepl(p, term_lower)))

      # Chemical formulas (e.g., H2O, CO2, NaCl): element symbols with optional counts
      is_chemical_formula <- grepl("^([A-Z][a-z]?[0-9]*)+$", term)

      # Special case: reject terms known to be misclassified as chemicals
      if (term_lower %in% c("optimization", "retention", "vehicle", "malformation")) {
        return(FALSE)
      }

      # Special case: reject analytical method acronyms that are often misclassified as chemicals
      if (is_acronym) {
        analytical_acronyms <- c(
          "FAERS", "BCPNN", "UPLC", "FRAP", "HPLC", "LCMS", "GCMS", "MALDI",
          "ELISA", "FTIR", "NMR", "PCR", "SEM", "TEM", "XRD", "SAXS", "UV", "IR",
          "MS", "LC", "GC", "CT", "MRI", "PET", "ROC", "AUC", "ANOVA", "PCA"
        )

        if (toupper(term) %in% analytical_acronyms) {
          return(FALSE)
        }
      }

      return(has_chemical_pattern || is_chemical_formula || is_acronym)
    }
    else if (claimed_type == "pathway") {
      # Common pathway patterns
      pathway_patterns <- c(
        "pathway", "signaling", "cascade", "cycle", "biosynthesis", "metabolism",
        "degradation", "synthesis", "catabolism", "anabolism", "oxidation", "reduction",
        "phosphorylation", "glycolysis", "gluconeogenesis", "respiration", "transport",
        "signalling", "transduction", "activation", "inhibition", "regulation",
        "homeostasis", "feedback", "response", "mechanism", "process", "circuit",
        "network", "cross-talk", "interplay", "interaction", "communication",
        "transmission", "conductance", "induction", "repression", "amplification",
        "attenuation", "potentiation", "oscillation", "cycling", "recycling",
        "metabolism", "anabolism", "catabolism", "glycolysis", "gluconeogenesis",
        "glycogenolysis", "glycogenesis", "proteolysis", "proteogenesis", "lipolysis",
        "lipogenesis", "ketogenesis", "ketolysis", "thermogenesis", "hematopoiesis",
        "erythropoiesis", "leukopoiesis", "lymphopoiesis", "myelopoiesis", "thrombopoiesis",
        "apoptosis", "necroptosis", "autophagy", "pyroptosis", "ferroptosis", "senescence",
        "differentiation", "proliferation", "maturation", "migration", "chemotaxis",
        "phagocytosis", "endocytosis", "exocytosis", "transcytosis", "pinocytosis",
        "secretion", "absorption", "diffusion", "filtration", "osmosis", "reabsorption"
      )

      has_pathway_pattern <- any(sapply(pathway_patterns, function(p) grepl(p, term_lower)))

      return(has_pathway_pattern)
    }
    else if (claimed_type == "biological_process") {
      # Common biological process patterns
      bioprocess_patterns <- c(
        "process", "regulation", "activation", "inhibition", "induction", "suppression",
        "proliferation", "differentiation", "apoptosis", "necrosis", "autophagy", "growth",
        "development", "maturation", "aging", "senescence", "inflammation", "fibrosis",
        "angiogenesis", "healing", "repair", "regeneration", "immunity", "response",
        "secretion", "expression", "translation", "transcription", "replication", "binding",
        "signaling", "transduction", "transmission", "recognition", "adhesion", "migration",
        "biogenesis", "morphogenesis", "organogenesis", "embryogenesis", "hematopoiesis",
        "neurogenesis", "vasculogenesis", "myogenesis", "osteogenesis", "chondrogenesis",
        "adipogenesis", "lymphopoiesis", "erythropoiesis", "myelopoiesis", "granulopoiesis",
        "monocytopoiesis", "thrombopoiesis", "spermatogenesis", "oogenesis", "fertilization",
        "implantation", "gastrulation", "neurulation", "placentation", "parturition",
        "lactation", "respiration", "circulation", "digestion", "absorption", "assimilation",
        "excretion", "homeostasis", "thermoregulation", "osmoregulation", "metabolism",
        "catabolism", "anabolism", "glycolysis", "gluconeogenesis", "glycogenolysis",
        "glycogenesis", "lipolysis", "lipogenesis", "proteolysis", "proteogenesis",
        "detoxification", "biotransformation", "conjugation", "elimination", "oxidation",
        "reduction", "phosphorylation", "dephosphorylation", "methylation", "demethylation",
        "acetylation", "deacetylation", "ubiquitination", "deubiquitination", "glycosylation"
      )

      has_bioprocess_pattern <- any(sapply(bioprocess_patterns, function(p) grepl(p, term_lower)))

      return(has_bioprocess_pattern)
    }
    else if (claimed_type == "molecular_function") {
      # Common molecular function patterns
      molfunction_patterns <- c(
        "binding", "activity", "function", "catalysis", "hydrolysis", "synthesis",
        "polymerization", "depolymerization", "phosphorylation", "dephosphorylation",
        "methylation", "demethylation", "acetylation", "deacetylation", "ubiquitination",
        "sumoylation", "glycosylation", "transport", "uptake", "release", "secretion",
        "import", "export", "recognition", "interaction", "regulation", "activation",
        "inhibition", "transduction", "transmission", "receptor", "ligand", "cofactor",
        "coenzyme", "prosthetic", "modulator", "effector", "mediator", "transmitter",
        "antagonist", "agonist", "blocker", "inhibitor", "activator", "stimulator",
        "repressor", "inducer", "catalyst", "enzyme", "isomerase", "transferase",
        "hydrolase", "lyase", "oxidoreductase", "ligase", "kinase", "phosphatase",
        "protease", "nuclease", "glycosidase", "lipase", "transporter", "carrier",
        "channel", "pump", "exchanger", "symporter", "antiporter", "uniporter",
        "permease", "porin", "translocase", "translocator", "transferrin", "ferritin",
        "globin", "hemoglobin", "myoglobin", "albumin", "immunoglobulin", "antibody",
        "antigen", "complement", "cytokine", "chemokine", "interferon", "interleukin",
        "growth factor", "hormone", "neurotransmitter", "neuromodulator", "peptide",
        "neuropeptide", "endorphin", "enkephalin", "dynorphin", "substance"
      )

      has_molfunction_pattern <- any(sapply(molfunction_patterns, function(p) grepl(p, term_lower)))

      return(has_molfunction_pattern)
    }
    else if (claimed_type == "cellular_component" || claimed_type == "cell") {
      # Common cellular component patterns
      cell_patterns <- c(
        "cell", "membrane", "cytoplasm", "nucleus", "organelle", "mitochondrion",
        "mitochondria", "endoplasmic", "golgi", "lysosome", "vesicle", "vacuole",
        "peroxisome", "ribosome", "cytoskeleton", "microtubule", "microfilament",
        "chromosome", "chromatin", "nucleolus", "centrosome", "centriole", "cilium",
        "flagellum", "axoneme", "spindle", "cortex", "matrix", "lumen", "junction",
        "desmosome", "phagosome", "endosome", "exosome", "soma", "dendrite", "axon",
        "synapse", "neuron", "fibroblast", "macrophage", "lymphocyte", "erythrocyte",
        "platelet", "epithelial", "endothelial", "muscle", "nuclear", "nucleoplasm",
        "nucleoid", "nucleoplasm", "nucleopore", "nuclear envelope", "nuclear matrix",
        "nuclear lamina", "nuclear pore", "nuclear receptor", "chromatin", "chromosome",
        "chromatid", "centromere", "telomere", "kinetochore", "cytoplasmic", "cytosol",
        "cytoskeleton", "microfilament", "microtubule", "intermediate filament",
        "actin", "myosin", "tubulin", "kinesin", "dynein", "spectrin", "ankyrin",
        "integrin", "cadherin", "selectin", "immunoglobulin", "fibronectin", "laminin",
        "collagen", "elastin", "proteoglycan", "glycoprotein", "lipoprotein", "receptor",
        "channel", "pump", "transporter", "carrier", "anchoring", "scaffold", "matrix",
        "basal lamina", "basement membrane", "extracellular matrix", "tight junction",
        "gap junction", "adherens junction", "desmosome", "hemidesmosome", "focal adhesion",
        "zonula adherens", "zonula occludens", "macula adherens", "neuromuscular junction",
        "neuron", "neurite", "dendrite", "axon", "synapse", "presynaptic", "postsynaptic",
        "synaptic vesicle", "synaptic cleft", "neurotransmitter", "neuroreceptor"
      )

      has_cell_pattern <- any(sapply(cell_patterns, function(p) grepl(p, term_lower)))

      return(has_cell_pattern)
    }
    else if (claimed_type == "symptom") {
      # Common symptom patterns - significantly expanded
      symptom_patterns <- c(
        "pain", "ache", "fever", "cough", "rash", "swelling", "inflammation", "fatigue",
        "weakness", "dizziness", "vertigo", "nausea", "vomiting", "diarrhea", "constipation",
        "bleeding", "discharge", "lesion", "loss", "deficit", "malaise", "discomfort",
        "distress", "dyspnea", "shortness", "tachycardia", "bradycardia", "hypertension",
        "hypotension", "hyperglycemia", "hypoglycemia", "seizure", "paralysis", "spasm",
        "tremor", "headache", "anorexia", "insomnia", "anxiety", "depression", "symptom",
        "sign", "manifestation", "complaint", "condition", "syndrome", "presentation",
        "syndrome", "disorder", "palpitation", "arrhythmia", "dysphagia", "dysphonia",
        "dysarthria", "aphasia", "amnesia", "paresthesia", "dysesthesia", "anesthesia",
        "hyperesthesia", "hypoesthesia", "ataxia", "apraxia", "agnosia", "alexia",
        "agraphia", "acalculia", "hemianopia", "hemianopsia", "scotoma", "diplopia",
        "amblyopia", "amaurosis", "photophobia", "phonophobia", "hyperacusis", "tinnitus",
        "vertigo", "syncope", "presyncope", "diaphoresis", "hyperhidrosis", "anhidrosis",
        "pruritus", "urticaria", "erythema", "petechiae", "purpura", "ecchymosis",
        "jaundice", "cyanosis", "pallor", "flushing", "edema", "ascites", "pleural effusion",
        "pericardial effusion", "hemoptysis", "hematemesis", "melena", "hematochezia",
        "hematuria", "dysuria", "polyuria", "oliguria", "anuria", "polydipsia", "polyphagia",
        "dyspepsia", "bloating", "flatulence", "tenesmus", "steatorrhea", "pyrosis", "heartburn",
        "migraine", "aura", "osmophobia", "allodynia", "hypersensitivity", "paresthesia"
      )

      # Explicit checks for common symptoms that are frequently misclassified
      if (term_lower %in% c("pain", "headache", "migraine", "nausea", "vomiting",
                            "dizziness", "fatigue", "weakness", "photophobia")) {
        return(TRUE)
      }

      # General check for symptom patterns
      has_symptom_pattern <- any(sapply(symptom_patterns, function(p) grepl(paste0("\\b", p, "\\b"), term_lower)))

      return(has_symptom_pattern)
    }
    else if (claimed_type == "organism") {
      # Common organism patterns
      organism_patterns <- c(
        "bacteria", "virus", "fungus", "parasite", "microbe", "pathogen", "species", "strain",
        "genus", "family", "order", "class", "phylum", "kingdom", "domain", "organism",
        "animal", "plant", "vertebrate", "invertebrate", "mammal", "bird", "reptile",
        "amphibian", "fish", "insect", "arachnid", "crustacean", "mollusk", "worm",
        "protozoa", "algae", "archaea", "prokaryote", "eukaryote", "bacteria", "bacterium",
        "bacillus", "cocci", "coccus", "spirochete", "spirillum", "rickettsia", "mycoplasma",
        "chlamydia", "escherichia", "salmonella", "shigella", "klebsiella", "proteus",
        "pseudomonas", "acinetobacter", "enterobacter", "haemophilus", "neisseria",
        "staphylococcus", "streptococcus", "enterococcus", "pneumococcus", "mycobacterium",
        "clostridium", "bacillus", "listeria", "corynebacterium", "actinomyces",
        "nocardia", "legionella", "bordetella", "brucella", "campylobacter", "helicobacter",
        "vibrio", "yersinia", "pasteurella", "francisella", "bartonella", "virus", "viral",
        "virology", "virion", "capsid", "envelope", "glycoprotein", "adenovirus", "herpesvirus",
        "papillomavirus", "polyomavirus", "poxvirus", "hepadnavirus", "retrovirus",
        "lentivirus", "rhabdovirus", "filovirus", "paramyxovirus", "orthomyxovirus",
        "bunyavirus", "arenavirus", "reovirus", "togavirus", "flavivirus", "picornavirus",
        "coronavirus", "hepatitis", "influenza", "rhinovirus", "rotavirus", "hiv", "fungus",
        "fungi", "yeast", "mold", "mould", "mushroom", "candida", "aspergillus", "cryptococcus",
        "histoplasma", "blastomyces", "coccidioides", "paracoccidioides", "sporothrix",
        "zygomycetes", "mucor", "rhizopus", "absidia", "basidiomycetes", "ascomycetes"
      )

      has_organism_pattern <- any(sapply(organism_patterns, function(p) grepl(p, term_lower)))

      return(has_organism_pattern)
    }
    else if (claimed_type == "tissue") {
      # Common tissue patterns
      tissue_patterns <- c(
        "tissue", "epithelium", "endothelium", "mesothelium", "mesenchyme", "stroma",
        "parenchyma", "connective", "muscle", "nerve", "neural", "vasculature", "vascular",
        "gland", "glandular", "mucous", "mucosa", "submucosa", "serosa", "adventitia",
        "periosteum", "perichondrium", "synovium", "bone", "cartilage", "tendon", "ligament",
        "fascia", "adipose", "blood", "lymph", "marrow", "skin", "dermal", "epidermal",
        "cutaneous", "subcutaneous", "mucosal", "membrane", "meninges", "dura", "arachnoid",
        "pia", "pleura", "peritoneum", "pericardium", "endocardium", "myocardium", "epicardium",
        "endometrium", "myometrium", "perimetrium", "glomerulus", "tubule", "nephron",
        "hepatocyte", "bile duct", "islet", "acinus", "alveolus", "bronchus", "bronchiole",
        "trachea", "esophagus", "stomach", "intestine", "duodenum", "jejunum", "ileum",
        "colon", "rectum", "bladder", "urethra", "ureter", "prostate", "testis", "ovary",
        "uterus", "fallopian", "cervix", "vagina", "breast", "thyroid", "parathyroid",
        "adrenal", "pituitary", "thymus", "spleen", "lymph node", "tonsil", "cerebrum",
        "cerebellum", "brain stem", "spinal cord", "ganglion", "nerve", "neuron", "retina",
        "cornea", "sclera", "choroid", "iris", "lens", "vitreous", "aqueous", "cochlea",
        "vestibule", "labyrinth", "muscle", "skeletal", "cardiac", "smooth", "tendon",
        "ligament", "cartilage", "joint", "synovium", "bone", "cortical", "trabecular",
        "marrow", "osteoid", "callus", "granulation", "scar", "fibrosis", "granuloma"
      )

      has_tissue_pattern <- any(sapply(tissue_patterns, function(p) grepl(p, term_lower)))

      return(has_tissue_pattern)
    }
    else if (claimed_type == "anatomy") {
      # Common anatomy patterns
      anatomy_patterns <- c(
        "head", "neck", "thorax", "chest", "abdomen", "pelvis", "back", "extremity", "arm",
        "forearm", "wrist", "hand", "finger", "thumb", "leg", "thigh", "knee", "calf", "ankle",
        "foot", "toe", "shoulder", "elbow", "hip", "joint", "skull", "cranium", "vertebra",
        "spine", "rib", "sternum", "clavicle", "scapula", "humerus", "radius", "ulna", "carpal",
        "metacarpal", "phalanx", "pelvis", "femur", "patella", "tibia", "fibula", "tarsal",
        "metatarsal", "brain", "cerebrum", "cerebellum", "brainstem", "thalamus", "hypothalamus",
        "pituitary", "pineal", "basal ganglia", "hippocampus", "amygdala", "fornix", "corpus callosum",
        "ventricle", "meninges", "dura", "arachnoid", "pia", "spinal cord", "nerve", "ganglion",
        "plexus", "heart", "atrium", "ventricle", "septum", "valve", "mitral", "tricuspid",
        "pulmonary", "aortic", "aorta", "artery", "arteriole", "capillary", "venule", "vein",
        "vena cava", "pulmonary", "coronary", "carotid", "jugular", "subclavian", "axillary",
        "brachial", "radial", "ulnar", "femoral", "popliteal", "tibial", "lung", "bronchus",
        "bronchiole", "alveolus", "pleura", "diaphragm", "trachea", "larynx", "pharynx",
        "esophagus", "stomach", "duodenum", "jejunum", "ileum", "cecum", "colon", "appendix",
        "rectum", "anus", "liver", "gallbladder", "bile duct", "pancreas", "spleen", "kidney",
        "ureter", "bladder", "urethra", "prostate", "testis", "epididymis", "vas deferens",
        "seminal vesicle", "penis", "urethra", "ovary", "fallopian", "uterus", "cervix",
        "vagina", "vulva", "mammary", "thyroid", "parathyroid", "adrenal", "thymus", "lymph node",
        "tonsil", "eye", "orbit", "eyelid", "conjunctiva", "cornea", "sclera", "choroid",
        "retina", "lens", "iris", "pupil", "ciliary", "ear", "pinna", "external", "middle",
        "inner", "tympanic", "ossicle", "cochlea", "vestibule", "labyrinth", "nose", "nasal",
        "paranasal", "sinus", "mouth", "lip", "tongue", "teeth", "gingiva", "palate", "uvula",
        "skin", "epidermis", "dermis", "subcutaneous", "hair", "nail", "sebaceous", "sweat",
        "ligament", "tendon", "fascia", "bursa", "synovium", "cartilage", "meniscus", "disc"
      )

      has_anatomy_pattern <- any(sapply(anatomy_patterns, function(p) grepl(p, term_lower)))

      return(has_anatomy_pattern)
    }
    else if (claimed_type == "diagnostic_procedure") {
      # Common diagnostic procedure patterns
      diagnostic_procedure_patterns <- c(
        "test", "scan", "imaging", "radiograph", "ultrasound", "sonogram", "tomography",
        "resonance", "endoscopy", "biopsy", "aspiration", "culture", "assay", "analysis",
        "measurement", "examination", "evaluation", "assessment", "screening", "monitoring",
        "probe", "detection", "quantification", "identification", "diagnosis", "diagnostic",
        "radiography", "xray", "x-ray", "radiogram", "fluoroscopy", "angiography", "venography",
        "lymphangiography", "myelography", "arthrography", "mammography", "tomography",
        "ct", "cat scan", "computed tomography", "mri", "magnetic resonance", "fmri",
        "functional mri", "pet", "positron emission", "spect", "single photon", "ultrasound",
        "sonography", "doppler", "echocardiography", "electrocardiography", "ecg", "ekg",
        "electroencephalography", "eeg", "electromyography", "emg", "electroneuronography",
        "nerve conduction", "electrophysiology", "endoscopy", "colonoscopy", "sigmoidoscopy",
        "proctoscopy", "anoscopy", "esophagogastroduodenoscopy", "egds", "bronchoscopy",
        "laryngoscopy", "rhinoscopy", "otoscopy", "cystoscopy", "ureteroscopy", "nephroscopy",
        "laparoscopy", "arthroscopy", "colposcopy", "hysteroscopy", "biopsy", "aspiration",
        "culture", "gram stain", "sensitivity", "pcr", "polymerase chain reaction", "elisa",
        "western blot", "southern blot", "northern blot", "immunoassay", "immunohistochemistry",
        "serology", "titer", "flow cytometry", "karyotype", "cytogenetics", "genetic",
        "sequencing", "microarray", "histopathology", "cytology", "hematology", "chemistry",
        "metabolic", "electrolytes", "glucose", "glycemia", "lipid profile", "cholesterol",
        "triglycerides", "liver function", "renal function", "creatinine", "bun", "urea",
        "hemoglobin", "hematocrit", "complete blood count", "cbc", "coagulation", "urinalysis",
        "urine", "stool", "cerebrospinal", "fluid", "synovial", "pleural", "pericardial",
        "peritoneal", "bronchial", "lavage", "sputum", "spectrum", "spectra", "spectroscopy",
        "spectrophotometry", "chromatography", "mass spectrometry", "electrophoresis"
      )

      # Special case for analytical techniques often misclassified
      analytical_techniques <- c(
        "faers", "bcpnn", "uplc", "frap", "hplc", "lc-ms", "gc-ms", "maldi",
        "elisa", "ft-ir", "nmr", "pcr", "sem", "tem", "xrd", "saxs", "uv-vis",
        "ms", "ms/ms", "lc", "gc", "tga", "dsc", "uv", "ir", "rna-seq", "qtof",
        "mri", "ct", "pet", "spect", "ecg", "eeg", "emg", "fmri", "qsar", "qspr",
        "anova", "ancova", "manova", "pca", "sem", "glm", "lda", "svm", "ann"
      )

      # Check if term is an analytical technique
      if (term_lower %in% analytical_techniques) {
        return(TRUE)
      }

      # Check if it's a diagnostic procedure
      has_diagnostic_procedure_pattern <- any(sapply(diagnostic_procedure_patterns, function(p) grepl(p, term_lower)))

      return(has_diagnostic_procedure_pattern)
    }
    else if (claimed_type == "therapeutic_procedure") {
      # Common therapeutic procedure patterns
      therapeutic_procedure_patterns <- c(
        "therapy", "treatment", "intervention", "management", "procedure", "surgery", "operation",
        "excision", "resection", "removal", "transplantation", "implantation", "replacement",
        "repair", "reconstruction", "restoration", "augmentation", "reduction", "extraction",
        "amputation", "anastomosis", "bypass", "graft", "flap", "catheterization", "intubation",
        "cannulation", "injection", "infusion", "transfusion", "dialysis", "filtration",
        "transplant", "prosthesis", "fixation", "manipulation", "reposition", "reduction",
        "traction", "immobilization", "stimulation", "radiation", "irradiation", "ablation",
        "cauterization", "coagulation", "cryotherapy", "hyperthermia", "phototherapy", "laser",
        "ultrasound", "lithotripsy", "dilatation", "dilation", "stenting", "drainage", "aspiration",
        "lavage", "debridement", "curettage", "anesthesia", "sedation", "analgesia", "ventilation",
        "oxygenation", "suction", "suctioning", "tracheostomy", "gastrostomy", "jejunostomy",
        "ileostomy", "colostomy", "cystostomy", "nephrostomy", "thoracostomy", "thoracentesis",
        "paracentesis", "arthrocentesis", "lumbar puncture", "spinal tap", "amniocentesis",
        "chorionic villus sampling", "cordocentesis", "biopsy", "excisional", "incisional",
        "needle", "aspiration", "incision", "excision", "dissection", "ligation", "suture",
        "stapling", "amputation", "disarticulation", "arthrodesis", "fusion", "arthroplasty",
        "replacement", "osteotomy", "tenorrhaphy", "tenolysis", "tenotomy", "myotomy", "myorrhaphy",
        "neurorrhaphy", "neurolysis", "neurectomy", "sympathectomy", "vagotomy", "rhizotomy",
        "cordotomy", "tractotomy", "lobotomy", "craniotomy", "craniectomy", "cranioplasty",
        "ventriculostomy", "shunt", "laminectomy", "discectomy", "foraminotomy", "vertebroplasty",
        "kyphoplasty", "thoracotomy", "thoracoplasty", "pneumonectomy", "lobectomy", "segmentectomy",
        "wedge resection", "pleurodesis", "pleurectomy", "decortication", "pericardiectomy",
        "pericardiostomy", "valvuloplasty", "valvulotomy", "commissurotomy", "annuloplasty",
        "ventriculoplasty", "bypass", "endarterectomy", "thrombectomy", "embolectomy",
        "aneurysmectomy", "aneurysmorrhaphy", "varicectomy", "sclerotherapy", "gastrectomy",
        "vagotomy", "gastroenterostomy", "gastropexy", "fundoplication", "pyloromyotomy",
        "pyloroplasty", "esophagectomy", "esophagostomy", "esophagoplasty", "colectomy",
        "colostomy", "ileostomy", "appendectomy", "proctectomy", "hemorrhoidectomy", "hepatectomy",
        "hepatorrhaphy", "cholecystectomy", "choledochostomy", "splenectomy", "pancreatectomy",
        "nephrectomy", "nephrostomy", "nephropexy", "pyeloplasty", "ureteroplasty", "cystectomy",
        "cystostomy", "cystoplasty", "prostatectomy", "vasectomy", "vasovasostomy", "orchidectomy",
        "orchiectomy", "orchiopexy", "hydrocelectomy", "varicocelectomy", "hysterectomy", "oophorectomy"
      )

      has_therapeutic_procedure_pattern <- any(sapply(therapeutic_procedure_patterns, function(p) grepl(p, term_lower)))

      return(has_therapeutic_procedure_pattern)
    }
    else if (claimed_type == "method") {
      # Common method patterns - especially for technical terms often misclassified as chemicals
      method_patterns <- c(
        "method", "technique", "assay", "analysis", "procedure", "protocol", "algorithm",
        "approach", "workflow", "process", "measurement", "detection", "quantification",
        "identification", "determination", "evaluation", "assessment", "test", "screen",
        "monitor", "examine", "characterize", "validate", "verify", "qualify", "quantify",
        "calculate", "estimate", "predict", "model", "simulate", "standardize", "optimize",
        "chromatography", "spectroscopy", "microscopy", "spectrometry", "electrophoresis",
        "sequencing", "immunoassay", "radiography", "tomography", "ultrasound", "imaging"
      )

      # List of specific analytical methods and their acronyms
      analytical_methods <- c(
        "faers", "fda adverse event reporting system", "bcpnn", "bayesian confidence propagation neural network",
        "uplc", "ultra performance liquid chromatography", "frap", "fluorescence recovery after photobleaching",
        "hplc", "high performance liquid chromatography", "lc-ms", "liquid chromatography-mass spectrometry",
        "gc-ms", "gas chromatography-mass spectrometry", "maldi", "matrix-assisted laser desorption/ionization",
        "elisa", "enzyme-linked immunosorbent assay", "ft-ir", "fourier transform infrared spectroscopy",
        "nmr", "nuclear magnetic resonance", "pcr", "polymerase chain reaction", "sem", "scanning electron microscopy",
        "tem", "transmission electron microscopy", "xrd", "x-ray diffraction", "saxs", "small-angle x-ray scattering",
        "uv-vis", "ultraviolet-visible spectroscopy", "ms", "mass spectrometry", "ms/ms", "tandem mass spectrometry",
        "lc", "liquid chromatography", "gc", "gas chromatography", "tga", "thermogravimetric analysis",
        "dsc", "differential scanning calorimetry", "uv", "ultraviolet", "ir", "infrared", "rna-seq", "rna sequencing",
        "qtof", "quadrupole time-of-flight", "mri", "magnetic resonance imaging", "ct", "computed tomography",
        "pet", "positron emission tomography", "spect", "single-photon emission computed tomography",
        "ecg", "electrocardiogram", "eeg", "electroencephalogram", "emg", "electromyography",
        "fmri", "functional magnetic resonance imaging", "qsar", "quantitative structure-activity relationship",
        "qspr", "quantitative structure-property relationship", "anova", "analysis of variance",
        "ancova", "analysis of covariance", "manova", "multivariate analysis of variance",
        "pca", "principal component analysis", "sem", "structural equation modeling",
        "glm", "generalized linear model", "lda", "linear discriminant analysis",
        "svm", "support vector machine", "ann", "artificial neural network",
        "roc", "receiver operating characteristic", "auc", "area under the curve"
      )

      # If term is an analytical technique acronym, it's a method
      if (term_lower %in% analytical_methods || is_acronym) {
        return(TRUE)
      }

      # Check for method patterns
      has_method_pattern <- any(sapply(method_patterns, function(p) grepl(p, term_lower)))

      return(has_method_pattern)
    }

    # If no specific rules for this type, allow it to pass
    return(TRUE)
  }

  # STAGE 3: Check for general biomedical term characteristics
  # (reached only when claimed_type is NULL, since Stage 2 always returns)

  # Check common biomedical characteristics that span multiple types
  # If a term has any of these characteristics, it's likely a biomedical entity

  # Is a recognized acronym pattern common in biomedicine (2-6 uppercase letters, maybe with numbers)
  is_biomedical_acronym <- grepl("^[A-Z]{2,6}[0-9]*$", term)

  # Has Latin or Greek roots common in medical terminology
  has_latin_greek_roots <- grepl("(itis|osis|emia|pathy|trophy|plasia|poiesis|genesis|lysis|ectomy|otomy|ostomy|plasty|pexy|rhaphy|graphy|scopy|metry)", term_lower)

  # Contains numbers and chemical elements (common in chemical formulas)
  is_chemical_formula <- grepl("[A-Z][a-z]?[0-9]+", term)

  # Common biomedical term endings
  biomedical_suffixes <- c("in$", "ase$", "gen$", "one$", "ide$", "ate$", "ene$", "ane$", "ole$",
                           "itis$", "osis$", "emia$", "pathy$", "trophy$", "plasia$", "poiesis$",
                           "genesis$", "lysis$", "ectomy$", "otomy$", "ostomy$", "plasty$", "pexy$",
                           "rhaphy$", "graphy$", "scopy$", "metry$", "algia$", "dynia$", "oma$",
                           "iasis$", "ismus$", "uria$", "pnea$", "emesis$", "pepsia$", "phagia$",
                           "rrhea$", "rrhage$", "sthenia$", "phobia$", "lexia$", "praxia$", "gnosis$",
                           "penia$", "cytosis$", "esthesia$", "kinesia$", "phasia$", "plegia$", "paresis$")

  has_biomedical_suffix <- any(sapply(biomedical_suffixes, function(s) grepl(s, term_lower)))
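
  # e.g., "hepatitis" matches "itis$" and "nephropathy" matches "pathy$"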

  # Check for compound terms that contain recognizable biomedical components
  biomedical_components <- c("neuro", "cardio", "gastro", "hepato", "nephro", "dermato",
                             "hemato", "immuno", "onco", "osteo", "arthro", "myelo", "cyto",
                             "histo", "patho", "pharmaco", "psycho", "toxo", "vas", "angio")

  has_biomedical_component <- any(sapply(biomedical_components, function(comp) grepl(comp, term_lower)))

  # Additional check for terms that should NEVER be considered biomedical entities
  never_biomedical <- c(
    # Geographic regions and locations
    "europe", "asia", "africa", "america", "australia", "us", "uk", "usa",
    # General abstract concepts
    "vehicle", "optimization", "retention",
    # Problematic general terms from example
    "malformation", "receptor", "receptors"
  )

  # Special-case exceptions: these terms are valid only with specific type claims
  special_exceptions <- list(
    "malformation" = "disease",   # Malformation is valid as a disease
    "receptor" = "protein",       # Receptor is valid as a protein
    "receptors" = "protein"       # Receptors is valid as a protein
  )

  # Check if the term is in our never_biomedical list
  if (term_lower %in% never_biomedical) {
    # If it's in special_exceptions, check claimed type
    if (term_lower %in% names(special_exceptions)) {
      # Only allow if claimed type matches the exception type
      if (!is.null(claimed_type) && claimed_type == special_exceptions[[term_lower]]) {
        return(TRUE)
      } else {
        return(FALSE)
      }
    } else {
      # Not in exceptions, so reject
      return(FALSE)
    }
  }

  # Special check for analytical method acronyms that are often misclassified as chemicals
  if (is_biomedical_acronym) {
    analytical_acronyms <- c(
      "FAERS", "BCPNN", "UPLC", "FRAP", "HPLC", "LCMS", "GCMS", "MALDI",
      "ELISA", "FTIR", "NMR", "PCR", "SEM", "TEM", "XRD", "SAXS", "UV", "IR",
      "MS", "LC", "GC", "CT", "MRI", "PET", "ROC", "AUC", "ANOVA", "PCA"
    )

    if (toupper(term) %in% analytical_acronyms) {
      # If claimed as chemical, it's invalid
      if (!is.null(claimed_type) && claimed_type == "chemical") {
        return(FALSE)
      }
      # But it's valid as a method or diagnostic_procedure
      else if (!is.null(claimed_type) &&
               (claimed_type == "method" ||
                claimed_type == "diagnostic_procedure" ||
                claimed_type == "therapeutic_procedure")) {
        return(TRUE)
      }
      # If no claimed type, return TRUE as it's a valid biomedical term
      else if (is.null(claimed_type)) {
        return(TRUE)
      }
    }
  }

  # Return TRUE if term shows any of these biomedical characteristics
  if (is_biomedical_acronym || has_latin_greek_roots || is_chemical_formula ||
      has_biomedical_suffix || has_biomedical_component) {
    return(TRUE)
  }

  # Explicitly reject sociodemographic and study-design terms that are
  # sometimes mislabeled as biomedical entities
  problematic_terms <- c("sociodemographic", "demographic", "social", "economic", "education",
                         "income", "status", "cultural", "ethical", "society", "community",
                         "population", "questionnaire", "survey", "interview", "assessment")

  if (term_lower %in% problematic_terms) {
    return(FALSE)
  }

  # No positive biomedical pattern matched, so reject by default.
  # A future version could implement a machine learning classifier for more accuracy.
  return(FALSE)
}
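
# Illustrative behavior for untyped terms (comments only, so nothing runs at
# load time; assumes evaluation reaches the stage 3 rules above, i.e. no
# earlier stage matched a claimed type):
#   is_valid_biomedical_entity("nephritis", NULL)   # TRUE  - Latin/Greek suffix "itis"
#   is_valid_biomedical_entity("TNF2", NULL)        # TRUE  - acronym pattern ^[A-Z]{2,6}[0-9]*$
#   is_valid_biomedical_entity("europe", NULL)      # FALSE - never_biomedical list
#   is_valid_biomedical_entity("assessment", NULL)  # FALSE - no biomedical pattern matched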


# Internal helper: keep only terms that pass biomedical entity validation
filter_terms_for_abc_model <- function(terms, entity_types = NULL) {
  valid_terms <- character(0)

  # Process each term
  for (term in terms) {
    # Get entity type if available
    entity_type <- NULL
    if (!is.null(entity_types) && term %in% names(entity_types)) {
      entity_type <- entity_types[[term]]
    }

    # Check if the term is a valid biomedical entity
    if (is_valid_biomedical_entity(term, entity_type)) {
      valid_terms <- c(valid_terms, term)
    }
  }

  return(valid_terms)
}
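
# Example (illustrative, comments only): with no type information the
# validator falls back to the pattern rules above, so
#   filter_terms_for_abc_model(c("nephritis", "europe"))
# would keep "nephritis" and drop "europe".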

#' Apply the ABC model for literature-based discovery with improved filtering
#'
#' This function implements the ABC model for literature-based discovery with
#' enhanced term filtering and validation.
#'
#' @param co_matrix A co-occurrence matrix produced by create_comat().
#' @param a_term Character string, the source term (A).
#' @param c_term Character string, the target term (C). If NULL, all potential C terms will be evaluated.
#' @param min_score Minimum score threshold for results.
#' @param n_results Maximum number of results to return.
#' @param scoring_method Method to use for scoring.
#' @param b_term_types Character vector of entity types allowed for B terms.
#' @param c_term_types Character vector of entity types allowed for C terms.
#' @param exclude_general_terms Logical. If TRUE, excludes common general terms.
#' @param filter_similar_terms Logical. If TRUE, filters out B-terms that are too similar to A-term.
#' @param similarity_threshold Numeric. Maximum allowed string similarity between A and B terms.
#' @param enforce_strict_typing Logical. If TRUE, enforces stricter entity type validation.
#' @param validation_method Character. Method to use for entity validation: "pattern", "nlp", "api", or "comprehensive".
#'
#' @return A data frame with ranked discovery results.
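#'
#' @examples
#' \dontrun{
#' # A minimal sketch; 'entity_df' is a hypothetical data frame with
#' # doc_id, entity, and entity_type columns.
#' co <- create_comat(entity_df)
#' res <- abc_model(co,
#'                  a_term = "migraine",
#'                  min_score = 0.1,
#'                  n_results = 50,
#'                  scoring_method = "combined",
#'                  c_term_types = "protein")
#' head(res)
#' }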
#' @export
abc_model <- function(co_matrix, a_term, c_term = NULL,
                      min_score = 0.1, n_results = 100,
                      scoring_method = c("multiplication", "average", "combined", "jaccard"),
                      b_term_types = NULL, c_term_types = NULL,
                      exclude_general_terms = TRUE,
                      filter_similar_terms = TRUE,
                      similarity_threshold = 0.8,
                      enforce_strict_typing = TRUE,
                      validation_method = "pattern") {

  # Match scoring_method argument
  scoring_method <- match.arg(scoring_method)

  # Set up validation function based on method
  validator <- switch(validation_method,
                      "pattern" = is_valid_biomedical_entity,
                      "nlp" = function(term, type) {
                        tryCatch({
                          validate_entity_with_nlp(term, type)
                        }, error = function(e) {
                          message("NLP validation failed, falling back to pattern-based: ", e$message)
                          is_valid_biomedical_entity(term, type)
                        })
                      },
                      "api" = function(term, type) {
                        tryCatch({
                          query_external_api(term, type)
                        }, error = function(e) {
                          message("API validation failed, falling back to pattern-based: ", e$message)
                          is_valid_biomedical_entity(term, type)
                        })
                      },
                      "comprehensive" = function(term, type) {
                        tryCatch({
                          validate_entity_comprehensive(term, type)
                        }, error = function(e) {
                          message("Comprehensive validation failed, falling back to pattern-based: ", e$message)
                          is_valid_biomedical_entity(term, type)
                        })
                      },
                      is_valid_biomedical_entity)  # Default to pattern-based

  # Check if the matrix has entity types and store in a local variable
  has_entity_types <- !is.null(attr(co_matrix, "entity_types"))
  entity_types_dict <- if (has_entity_types) attr(co_matrix, "entity_types") else NULL

  # Validate type constraints can be applied
  if ((!is.null(b_term_types) || !is.null(c_term_types)) && !has_entity_types) {
    warning("Entity type constraints specified but no entity types found in co-occurrence matrix. Constraints will be ignored.")
    b_term_types <- NULL
    c_term_types <- NULL
  }

  # Check if A term exists in the matrix
  if (!a_term %in% rownames(co_matrix)) {
    stop("A-term '", a_term, "' not found in the co-occurrence matrix")
  }

  # Check if C term exists (if provided)
  if (!is.null(c_term) && !c_term %in% rownames(co_matrix)) {
    stop("C-term '", c_term, "' not found in the co-occurrence matrix")
  }

  # Define explicit blacklist of problematic terms
  blacklisted_terms <- c(
    # Geographic locations
    "africa", "america", "asia", "australia", "europe", "north america", "south america",
    "central america", "western europe", "eastern europe", "northern europe", "southern europe",
    "usa", "uk", "us", "china", "japan", "germany", "france", "italy", "spain", "russia",

    # General terms that should never be B terms
    "method", "approach", "analysis", "assessment", "evaluation", "procedure", "technique",
    "protocol", "intervention", "treatment", "outcome", "result", "effect", "impact",
    "value", "study", "trial", "research", "experiment", "observation", "publication",
    "test", "measure", "detection", "identification", "classification", "characterization",
    "determination", "calculation", "examination", "investigation", "exploration",
    "screening", "monitoring", "surveillance", "survey", "review", "overview", "summary",
    "score", "grade", "rating", "ranking", "stratification", "categorization", "grouping",
    "vehicle", "optimization", "retention"
  )

  # Enhanced term-type mapping for common terms that are frequently misclassified
  term_type_mappings <- list(
    # Analytical techniques/methods that are often misclassified as chemicals
    "faers" = "method",              # FDA Adverse Event Reporting System
    "bcpnn" = "method",              # Bayesian Confidence Propagation Neural Network
    "uplc" = "method",               # Ultra Performance Liquid Chromatography
    "frap" = "method",               # Fluorescence Recovery After Photobleaching
    "hplc" = "method",               # High Performance Liquid Chromatography
    "lc-ms" = "method",              # Liquid Chromatography-Mass Spectrometry
    "gc-ms" = "method",              # Gas Chromatography-Mass Spectrometry
    "maldi" = "method",              # Matrix-Assisted Laser Desorption/Ionization
    "elisa" = "method",              # Enzyme-Linked Immunosorbent Assay
    "ft-ir" = "method",              # Fourier Transform Infrared Spectroscopy
    "nmr" = "method",                # Nuclear Magnetic Resonance
    "pcr" = "method",                # Polymerase Chain Reaction
    "sem" = "method",                # Scanning Electron Microscopy
    "tem" = "method",                # Transmission Electron Microscopy
    "xrd" = "method",                # X-Ray Diffraction
    "saxs" = "method",               # Small-Angle X-ray Scattering
    "uv-vis" = "method",             # Ultraviolet-Visible Spectroscopy
    "ms" = "method",                 # Mass Spectrometry
    "ms/ms" = "method",              # Tandem Mass Spectrometry
    "lc" = "method",                 # Liquid Chromatography
    "gc" = "method",                 # Gas Chromatography
    "tga" = "method",                # Thermogravimetric Analysis
    "dsc" = "method",                # Differential Scanning Calorimetry
    "uv" = "method",                 # Ultraviolet
    "ir" = "method",                 # Infrared
    "rna-seq" = "method",            # RNA Sequencing
    "qtof" = "method",               # Quadrupole Time-of-Flight
    "mri" = "method",                # Magnetic Resonance Imaging
    "ct" = "method",                 # Computed Tomography
    "pet" = "method",                # Positron Emission Tomography
    "spect" = "method",              # Single-Photon Emission Computed Tomography
    "ecg" = "method",                # Electrocardiogram
    "eeg" = "method",                # Electroencephalogram
    "emg" = "method",                # Electromyography
    "fmri" = "method",               # Functional Magnetic Resonance Imaging
    "qsar" = "method",               # Quantitative Structure-Activity Relationship
    "qspr" = "method",               # Quantitative Structure-Property Relationship

    # Common biostatistical methods incorrectly classified as chemicals
    "anova" = "method",              # Analysis of Variance
    "ancova" = "method",             # Analysis of Covariance
    "manova" = "method",             # Multivariate Analysis of Variance
    "pca" = "method",                # Principal Component Analysis
    "sem" = "method",                # Structural Equation Modeling
    "glm" = "method",                # Generalized Linear Model
    "lda" = "method",                # Linear Discriminant Analysis
    "svm" = "method",                # Support Vector Machine
    "ann" = "method",                # Artificial Neural Network
    "kmeans" = "method",             # K-means clustering
    "roc" = "method",                # Receiver Operating Characteristic
    "auc" = "method",                # Area Under the Curve

    # Database and algorithm acronyms
    "kegg" = "database",             # Kyoto Encyclopedia of Genes and Genomes
    "smiles" = "method",             # Simplified Molecular-Input Line-Entry System
    "blast" = "method",              # Basic Local Alignment Search Tool
    "mace" = "method",               # Major Adverse Cardiac Events

    # Proteins and receptors
    "receptor" = "protein",
    "receptors" = "protein",
    "channel" = "protein",
    "channels" = "protein",
    "transporter" = "protein",
    "transporters" = "protein",

    # Symptoms and clinical manifestations
    "pain" = "symptom",
    "headache" = "symptom",
    "migraine" = "disease",
    "nausea" = "symptom",
    "vomiting" = "symptom",
    "dizziness" = "symptom",
    "aura" = "symptom",
    "photophobia" = "symptom",
    "phonophobia" = "symptom",

    # Biological processes
    "inflammation" = "biological_process",
    "signaling" = "biological_process",
    "activation" = "biological_process",
    "inhibition" = "biological_process",
    "regulation" = "biological_process",
    "phosphorylation" = "biological_process",
    "oxidation" = "biological_process",

    # Diseases and disorders
    "malformation" = "disease",
    "disorder" = "disease",
    "syndrome" = "disease"
  )

  # Extract A-B associations
  a_associations <- co_matrix[a_term, ]

  # Filter B terms by removing terms with low association to A
  b_terms <- names(a_associations[a_associations > min_score])

  # Remove A term from potential B terms
  b_terms <- b_terms[b_terms != a_term]

  # If C term is specified, also remove it from B terms to prevent redundancy
  if (!is.null(c_term)) {
    b_terms <- b_terms[b_terms != c_term]
  }

  # Remove blacklisted terms from potential B terms
  b_terms <- b_terms[!tolower(b_terms) %in% blacklisted_terms]

  # Enforce term-type mappings
  if (has_entity_types) {
    # Apply corrections to entity types dictionary
    for (term_lower in names(term_type_mappings)) {
      # Find exact matches in the co-occurrence matrix (case-insensitive;
      # direct comparison avoids regex pitfalls with keys like "ms/ms")
      term_matches <- which(tolower(rownames(co_matrix)) == term_lower)
      if (length(term_matches) > 0) {
        matched_terms <- rownames(co_matrix)[term_matches]
        for (matched_term in matched_terms) {
          entity_types_dict[matched_term] <- term_type_mappings[[term_lower]]
        }
      }
    }

    # Update entity types in the co-occurrence matrix
    attr(co_matrix, "entity_types") <- entity_types_dict
  }

  # Apply enhanced biomedical entity filtering to B terms if requested
  if (exclude_general_terms) {
    original_b_count <- length(b_terms)

    # Apply improved filtering that ensures B terms are valid biomedical entities
    valid_b_terms <- character(0)

    for (b_term in b_terms) {
      # Check if term is in our mapping and get the correct type
      b_type <- NULL
      term_lower <- tolower(b_term)

      if (term_lower %in% names(term_type_mappings)) {
        b_type <- term_type_mappings[[term_lower]]
      } else if (has_entity_types && b_term %in% names(entity_types_dict)) {
        b_type <- entity_types_dict[b_term]
      }

      # Apply validation using the selected validator
      if (validator(b_term, b_type)) {
        valid_b_terms <- c(valid_b_terms, b_term)
      }
    }

    # Update b_terms with filtered list
    b_terms <- valid_b_terms

    filtered_count <- original_b_count - length(b_terms)
    percent_filtered <- if (original_b_count > 0) {
      round((filtered_count / original_b_count) * 100, 1)
    } else {
      0
    }

    if (filtered_count > 0) {
      message("Filtered ", filtered_count, " B terms (", percent_filtered, "%) that weren't valid biomedical entities")
    }

    # If filtering is too aggressive, provide a warning
    if (length(b_terms) < 0.1 * original_b_count && original_b_count > 10) {
      message("Warning: Term filtering removed most B terms (", percent_filtered, "% filtered). Results may be limited.")
    }
  }

  # Apply B-term type constraints if specified
  if (!is.null(b_term_types) && has_entity_types) {
    original_b_count <- length(b_terms)
    valid_b_terms <- character(0)

    # Get B terms with matching types
    for (b_term in b_terms) {
      b_type <- NULL
      term_lower <- tolower(b_term)

      # Check if term is in our mapping first
      if (term_lower %in% names(term_type_mappings)) {
        b_type <- term_type_mappings[[term_lower]]
      }
      # Otherwise check in entity_types_dict if available
      else if (has_entity_types && b_term %in% names(entity_types_dict)) {
        b_type <- entity_types_dict[b_term]
      }

      # If we found a type, validate it
      if (!is.null(b_type)) {
        # Additional type validation for problematic terms
        if (enforce_strict_typing) {
          # Check if b_term with claimed type is valid using the selected validator
          if (!validator(b_term, b_type)) {
            next
          }
        }

        if (b_type %in% b_term_types) {
          valid_b_terms <- c(valid_b_terms, b_term)
        }
      }
    }

    # Replace b_terms with filtered list
    b_terms <- valid_b_terms

    filtered_count <- original_b_count - length(b_terms)
    if (filtered_count > 0) {
      message("Filtered ", filtered_count, " B terms that didn't match specified entity types: ",
              paste(b_term_types, collapse = ", "))
    }
  }

  # Filter out B terms that are too similar to A term if requested
  if (filter_similar_terms && length(b_terms) > 0) {
    # Function to calculate string similarity
    string_similarity <- function(a, b) {
      # Calculate Levenshtein distance
      a_lower <- tolower(a)
      b_lower <- tolower(b)

      # Check for pluralization by removing trailing 's'
      a_singular <- sub("s$", "", a_lower)
      b_singular <- sub("s$", "", b_lower)

      # Check for stemming similarity
      stem_sim <- a_singular == b_singular

      # Calculate basic string similarity ratio
      if (nchar(a_lower) == 0 || nchar(b_lower) == 0) {
        basic_sim <- 0
      } else {
        lev_dist <- adist(a_lower, b_lower)[1,1]
        max_len <- max(nchar(a_lower), nchar(b_lower))
        basic_sim <- 1 - (lev_dist / max_len)
      }

      # Return maximum similarity from the two methods
      return(max(basic_sim, as.numeric(stem_sim)))
    }
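
    # Worked examples (illustrative):
    #   string_similarity("receptor", "receptors") # 1.00 - singular forms match
    #   string_similarity("migraine", "headache")  # 0.25 - Levenshtein 6 over length 8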

    # Calculate similarity for each B term
    original_b_count <- length(b_terms)
    similarities <- sapply(b_terms, function(b) string_similarity(a_term, b))

    # Filter out terms that are too similar to A term
    dissimilar_indices <- which(similarities < similarity_threshold)

    if (length(dissimilar_indices) > 0) {
      b_terms <- b_terms[dissimilar_indices]
      message("Filtered out ", original_b_count - length(b_terms),
              " B terms that were too similar to A term (similarity threshold: ",
              similarity_threshold, ")")
    } else if (original_b_count > 0) {
      # As a fallback, keep the least similar half of the B terms
      message("Warning: All B terms were filtered due to high similarity to A term. ",
              "Keeping the least similar half as a fallback.")

      # Sort by similarity and keep the lower half
      sorted_indices <- order(similarities)
      keep_count <- max(1, floor(length(sorted_indices) / 2))
      b_terms <- b_terms[sorted_indices[1:keep_count]]
    }
  }

  # Additional filter to remove single-character and very short B terms
  b_terms <- b_terms[nchar(b_terms) >= 3]

  # If no B terms found, return empty result
  if (length(b_terms) == 0) {
    message("No suitable B terms found with association score > ", min_score, " after filtering")
    return(data.frame(
      a_term = character(),
      b_term = character(),
      c_term = character(),
      a_b_score = numeric(),
      b_c_score = numeric(),
      abc_score = numeric(),
      stringsAsFactors = FALSE
    ))
  }

  # Initialize results
  results <- data.frame(
    a_term = character(),
    b_term = character(),
    c_term = character(),
    a_b_score = numeric(),
    b_c_score = numeric(),
    abc_score = numeric(),
    stringsAsFactors = FALSE
  )

  # If a specific C term is provided
  if (!is.null(c_term)) {
    # Check C term type constraint if specified
    if (!is.null(c_term_types) && has_entity_types) {
      c_type <- NULL
      term_lower <- tolower(c_term)

      # Check if c_term is in our mapping
      if (term_lower %in% names(term_type_mappings)) {
        c_type <- term_type_mappings[[term_lower]]
      }
      # Otherwise check in entity_types_dict if available
      else if (c_term %in% names(entity_types_dict)) {
        c_type <- entity_types_dict[c_term]
      }

      # Validate C term against its claimed type using the selected validator
      if (!is.null(c_type)) {
        if (enforce_strict_typing && !validator(c_term, c_type)) {
          message("C term '", c_term, "' does not appear to be a valid ", c_type)
        }

        if (!(c_type %in% c_term_types)) {
          message("Specified C term '", c_term, "' does not match required entity types: ",
                  paste(c_term_types, collapse = ", "))
          return(results)
        }
      }
    }

    # Get B-C associations for the specific C term
    for (b_term in b_terms) {
      b_c_score <- co_matrix[b_term, c_term]

      # Only include if B-C association exists and meets threshold
      if (b_c_score > min_score) {
        a_b_score <- a_associations[b_term]

        # Calculate ABC score based on scoring method
        abc_score <- calculate_score(a_b_score, b_c_score, scoring_method)

        results <- rbind(results, data.frame(
          a_term = a_term,
          b_term = b_term,
          c_term = c_term,
          a_b_score = a_b_score,
          b_c_score = b_c_score,
          abc_score = abc_score,
          stringsAsFactors = FALSE
        ))
      }
    }
  } else {
    # Explore all potential C terms
    # Get all terms except A term and filtered B terms
    all_terms <- rownames(co_matrix)
    potential_c_terms <- setdiff(all_terms, c(a_term, b_terms))

    # Remove blacklisted terms from potential C terms
    potential_c_terms <- potential_c_terms[!tolower(potential_c_terms) %in% blacklisted_terms]

    # Apply filtering to potential C terms if requested
    if (exclude_general_terms) {
      original_c_count <- length(potential_c_terms)

      # Apply the same improved filtering for C terms using the selected validator
      valid_c_terms <- character(0)

      for (c_term_candidate in potential_c_terms) {
        c_type <- NULL
        term_lower <- tolower(c_term_candidate)

        # Check if term is in our mapping
        if (term_lower %in% names(term_type_mappings)) {
          c_type <- term_type_mappings[[term_lower]]
        }
        # Otherwise check in entity_types_dict if available
        else if (has_entity_types && c_term_candidate %in% names(entity_types_dict)) {
          c_type <- entity_types_dict[c_term_candidate]
        }

        # Apply validation using the selected validator
        if (validator(c_term_candidate, c_type)) {
          valid_c_terms <- c(valid_c_terms, c_term_candidate)
        }
      }

      # Update potential_c_terms with filtered list
      potential_c_terms <- valid_c_terms

      filtered_count <- original_c_count - length(potential_c_terms)
      if (filtered_count > 0) {
        message("Filtered ", filtered_count, " potential C terms that weren't valid biomedical entities")
      }
    }

    # Apply C-term type constraints if specified
    if (!is.null(c_term_types) && has_entity_types) {
      original_c_count <- length(potential_c_terms)
      valid_c_terms <- character(0)

      for (c_term_candidate in potential_c_terms) {
        c_type <- NULL
        term_lower <- tolower(c_term_candidate)

        # Check if term is in our mapping
        if (term_lower %in% names(term_type_mappings)) {
          c_type <- term_type_mappings[[term_lower]]
        }
        # Otherwise check in entity_types_dict if available
        else if (c_term_candidate %in% names(entity_types_dict)) {
          c_type <- entity_types_dict[c_term_candidate]
        }

        # Additional validation for C terms using the selected validator
        if (!is.null(c_type)) {
          if (enforce_strict_typing) {
            # Skip if the term with claimed type is not valid
            if (!validator(c_term_candidate, c_type)) {
              next
            }
          }

          if (c_type %in% c_term_types) {
            valid_c_terms <- c(valid_c_terms, c_term_candidate)
          }
        }
      }

      potential_c_terms <- valid_c_terms

      if (length(potential_c_terms) == 0) {
        message("No potential C terms found matching the specified entity types: ",
                paste(c_term_types, collapse = ", "))
        return(results)
      }
    }

    # For each B term, find potential C terms
    message("Identifying potential C terms via ", length(b_terms), " B terms...")
    pb <- utils::txtProgressBar(min = 0, max = length(b_terms), style = 3)

    for (i in seq_along(b_terms)) {
      utils::setTxtProgressBar(pb, i)

      b_term <- b_terms[i]

      # Get all B-C associations
      b_associations <- co_matrix[b_term, ]

      # Filter for potential C terms with sufficient association
      potential_c_for_b <- names(b_associations[b_associations > min_score])

      # Filter by potential C terms
      potential_c_for_b <- intersect(potential_c_for_b, potential_c_terms)

      # Also filter out C terms that are too similar to A term if requested
      if (filter_similar_terms && length(potential_c_for_b) > 0) {
        # Calculate similarity for each potential C term
        c_similarities <- sapply(potential_c_for_b, function(c) string_similarity(a_term, c))

        # Filter out terms that are too similar to A term
        potential_c_for_b <- potential_c_for_b[c_similarities < similarity_threshold]
      }

      # Additional filter to remove single-character and very short C terms
      potential_c_for_b <- potential_c_for_b[nchar(potential_c_for_b) >= 3]

      # For each potential C term
      for (c_term_candidate in potential_c_for_b) {
        b_c_score <- b_associations[c_term_candidate]
        a_b_score <- a_associations[b_term]

        # Calculate ABC score based on scoring method
        abc_score <- calculate_score(a_b_score, b_c_score, scoring_method)

        results <- rbind(results, data.frame(
          a_term = a_term,
          b_term = b_term,
          c_term = c_term_candidate,
          a_b_score = a_b_score,
          b_c_score = b_c_score,
          abc_score = abc_score,
          stringsAsFactors = FALSE
        ))
      }
    }

    close(pb)
  }

  # If no results found, return empty data frame
  if (nrow(results) == 0) {
    message("No ABC connections found")
    return(results)
  }

  # Sort by ABC score
  results <- results[order(-results$abc_score), ]

  # Limit to n_results
  if (nrow(results) > n_results) {
    results <- results[1:n_results, ]
  }

  # Add entity type information if available
  if (has_entity_types) {
    # Helper to resolve a term's type, preferring the explicit mappings
    # (entity_types_dict was already normalized against term_type_mappings above)
    get_term_type <- function(term) {
      term_lower <- tolower(term)
      if (term_lower %in% names(term_type_mappings)) {
        term_type_mappings[[term_lower]]
      } else if (term %in% names(entity_types_dict)) {
        entity_types_dict[[term]]
      } else {
        NA_character_
      }
    }

    results$a_type <- vapply(results$a_term, get_term_type, character(1))
    results$b_type <- vapply(results$b_term, get_term_type, character(1))
    results$c_type <- vapply(results$c_term, get_term_type, character(1))

    # Apply final type validation where needed
    if (enforce_strict_typing) {
      # Flag rows whose assigned B or C types fail validation,
      # skipping rows with unknown (NA) types
      row_suspicious <- vapply(seq_len(nrow(results)), function(i) {
        b_bad <- !is.na(results$b_type[i]) &&
          !validator(results$b_term[i], results$b_type[i])
        c_bad <- !is.na(results$c_type[i]) &&
          !validator(results$c_term[i], results$c_type[i])
        b_bad || c_bad
      }, logical(1))
      suspicious_rows <- which(row_suspicious)

      if (length(suspicious_rows) > 0) {
        message("Found ", length(suspicious_rows), " results with suspicious entity type assignments")
        # Remove these rows if we have enough results left
        if (nrow(results) - length(suspicious_rows) >= min(10, n_results/2)) {
          results <- results[-suspicious_rows, ]
          message("Removed suspicious rows, ", nrow(results), " results remaining")
        } else {
          message("Too many suspicious rows to remove, keeping them for now")
        }
      }
    }
  }

  return(results)
}

#' Calculate ABC score based on specified method
#'
#' @param a_b_score A-B association score
#' @param b_c_score B-C association score
#' @param method Scoring method: "multiplication", "average", "combined", or "jaccard"
#' @return Calculated score
#' @keywords internal
calculate_score <- function(a_b_score, b_c_score, method) {
  switch(method,
         "multiplication" = a_b_score * b_c_score,
         "average" = (a_b_score + b_c_score) / 2,
         "combined" = 0.7 * (a_b_score * b_c_score) + 0.3 * ((a_b_score + b_c_score) / 2),
         "jaccard" = a_b_score * b_c_score, # Actual Jaccard calculation would require the original co-occurrence data
         a_b_score * b_c_score) # Default to multiplication
}
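
# Worked example (illustrative): calculate_score(0.4, 0.6, "combined")
#   = 0.7 * (0.4 * 0.6) + 0.3 * ((0.4 + 0.6) / 2)
#   = 0.168 + 0.150
#   = 0.318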

#' Enforce diversity by selecting top connections from each B term
#'
#' @param results Data frame with ABC model results
#' @param max_per_group Maximum number of results to keep per B term
#' @return Data frame with diverse results
#' @keywords internal
diversify_b_terms <- function(results, max_per_group = 3) {
  # Identify unique B terms
  unique_b_terms <- unique(results$b_term)

  # Initialize diverse results
  diverse_results <- data.frame(
    a_term = character(),
    b_term = character(),
    c_term = character(),
    a_b_score = numeric(),
    b_c_score = numeric(),
    abc_score = numeric(),
    stringsAsFactors = FALSE
  )

  # For each unique B term, keep the best-scoring row for each of its C terms
  for (b_term in unique_b_terms) {
    b_results <- results[results$b_term == b_term, ]

    # Skip if no results for this B term
    if (nrow(b_results) == 0) next

    # Get unique C terms for this B term
    b_c_terms <- unique(b_results$c_term)

    # For each C term, take the best ABC score
    for (c_term in b_c_terms) {
      c_results <- b_results[b_results$c_term == c_term, ]

      # Get the row with maximum ABC score for this B-C pair
      best_idx <- which.max(c_results$abc_score)

      # Add to diverse results
      diverse_results <- rbind(diverse_results, c_results[best_idx, ])
    }
  }

  # Sort by ABC score
  diverse_results <- diverse_results[order(-diverse_results$abc_score), ]

  # Limit results per B term to max_per_group
  b_term_counts <- table(diverse_results$b_term)

  # For B terms with more than max_per_group results, keep only the top max_per_group
  for (b_term in names(b_term_counts)) {
    if (b_term_counts[b_term] > max_per_group) {
      # Identify rows for this B term
      b_rows <- which(diverse_results$b_term == b_term)

      # Keep only the top max_per_group based on ABC score (they're already sorted)
      rows_to_remove <- b_rows[(max_per_group + 1):length(b_rows)]

      # Remove excess rows
      if (length(rows_to_remove) > 0) {
        diverse_results <- diverse_results[-rows_to_remove, ]
      }
    }
  }

  return(diverse_results)
}

#' Add statistical significance testing based on hypergeometric tests
#'
#' @param results Data frame with ABC model results
#' @param co_matrix Co-occurrence matrix
#' @param alpha Significance level
#' @return Data frame with p-values and significance indicators
#' @keywords internal
add_statistical_significance <- function(results, co_matrix, alpha = 0.05) {
  # Initialize p-values
  results$p_value <- numeric(nrow(results))
  results$significant <- logical(nrow(results))

  # Get total number of documents (approximation from co-occurrence matrix)
  # Use the diagonal elements which represent term frequency
  term_freq <- diag(co_matrix)
  total_docs <- max(term_freq) # This is an approximation

  # For each result, calculate hypergeometric p-value
  for (i in 1:nrow(results)) {
    a_term <- results$a_term[i]
    b_term <- results$b_term[i]
    c_term <- results$c_term[i]

    # Get term frequencies
    a_freq <- term_freq[a_term]
    c_freq <- term_freq[c_term]

    # Calculate observed A-C co-occurrence count
    # (for normalized scores, score * min(freqs) approximates the raw count)
    a_c_count <- co_matrix[a_term, c_term] * min(a_freq, c_freq)

    # Calculate p-value using hypergeometric test
    # We use phyper for hypergeometric distribution:
    # phyper(q, m, n, k, lower.tail = FALSE)
    # q = observed count - 1 (so the upper tail gives P(X >= a_c_count))
    # m = a_freq (number of "successes" in the population)
    # n = total_docs - a_freq (number of "failures" in the population)
    # k = c_freq (number of draws)

    # Since we're looking for higher than expected co-occurrence:
    p_value <- stats::phyper(a_c_count - 1, a_freq, total_docs - a_freq, c_freq, lower.tail = FALSE)

    # Store p-value and significance
    results$p_value[i] <- p_value
    results$significant[i] <- p_value < alpha
  }

  # Apply a Benjamini-Hochberg false discovery rate correction
  # (stats::p.adjust handles ties, monotonicity, and capping at 1)
  if (nrow(results) > 1) {
    results$adjusted_p_value <- stats::p.adjust(results$p_value, method = "BH")
    results$significant <- results$adjusted_p_value < alpha
  }

  return(results)
}

#' Find all potential ABC connections
#'
#' This function finds all potential ABC connections in a co-occurrence matrix.
#'
#' @param co_matrix A co-occurrence matrix produced by create_comat().
#' @param a_type Character string, the entity type for A terms.
#' @param c_type Character string, the entity type for C terms.
#' @param min_score Minimum score threshold for results.
#' @param n_results Maximum number of results to return.
#'
#' @return A data frame with ranked discovery results.
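#'
#' @examples
#' \dontrun{
#' # A minimal sketch; 'co' is a typed co-occurrence matrix from create_comat().
#' # Scans every disease-typed A term for protein-typed C candidates.
#' hits <- find_abc_all(co, a_type = "disease", c_type = "protein",
#'                      min_score = 0.2, n_results = 500)
#' }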
#' @export
find_abc_all <- function(co_matrix, a_type = NULL, c_type = NULL,
                         min_score = 0.1, n_results = 1000) {
  # Check if the matrix has entity types
  has_entity_types <- !is.null(attr(co_matrix, "entity_types"))

  # If entity types are requested but not available, throw an error
  if (((!is.null(a_type) || !is.null(c_type)) && !has_entity_types)) {
    stop("Entity types requested but not available in the co-occurrence matrix")
  }

  # Get all terms
  all_terms <- rownames(co_matrix)

  # Initialize results
  all_results <- data.frame(
    a_term = character(),
    b_term = character(),
    c_term = character(),
    a_b_score = numeric(),
    b_c_score = numeric(),
    abc_score = numeric(),
    stringsAsFactors = FALSE
  )

  # Filter potential A terms by type
  potential_a_terms <- all_terms
  if (!is.null(a_type) && has_entity_types) {
    entity_types <- attr(co_matrix, "entity_types")
    a_type_terms <- names(entity_types[entity_types == a_type])
    potential_a_terms <- intersect(potential_a_terms, a_type_terms)
    if (length(potential_a_terms) == 0) {
      message("No terms found with type: ", a_type)
      return(all_results)
    }
  }

  # For each potential A term
  for (a_term in potential_a_terms) {
    message("Processing A term: ", a_term, " (", which(potential_a_terms == a_term),
            " of ", length(potential_a_terms), ")")

    # Apply ABC model with the provided min_score, passing the C-type
    # constraint through as c_term_types
    abc_results <- suppressMessages(
      abc_model(co_matrix,
                a_term = a_term,
                c_term = NULL,
                min_score = min_score,
                n_results = n_results,
                c_term_types = c_type)
    )

    # Combine results
    if (nrow(abc_results) > 0) {
      all_results <- rbind(all_results, abc_results)
    }
  }

  # If no results found, return empty data frame
  if (nrow(all_results) == 0) {
    message("No ABC connections found")
    return(all_results)
  }

  # Sort by ABC score and limit to n_results
  all_results <- all_results[order(-all_results$abc_score), ]
  if (nrow(all_results) > n_results) {
    all_results <- all_results[1:n_results, ]
  }

  return(all_results)
}

#' Apply the ABC model with statistical significance testing
#'
#' This function extends the ABC model with statistical significance testing
#' to evaluate the strength of discovered connections.
#'
#' @param co_matrix A co-occurrence matrix produced by create_comat().
#' @param a_term Character string, the source term (A).
#' @param c_term Character string, the target term (C). If NULL, all potential C terms will be evaluated.
#' @param a_type Character string, the entity type for A terms. Currently unused, since the A term is given explicitly.
#' @param c_type Character string, the entity type for C terms. If NULL, all types are considered.
#' @param min_score Minimum score threshold for results.
#' @param n_results Maximum number of results to return.
#' @param n_permutations Number of permutations for significance testing.
#' @param scoring_method Method to use for scoring ABC connections.
#'
#' @return A data frame with ranked discovery results and p-values.
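#'
#' @examples
#' \dontrun{
#' # A minimal sketch; permutation testing is expensive, so keep
#' # n_permutations small while exploring.
#' sig <- abc_model_sig(co, a_term = "migraine",
#'                      n_permutations = 100,
#'                      scoring_method = "multiplication")
#' subset(sig, significant_fdr)
#' }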
#' @export
abc_model_sig <- function(co_matrix, a_term, c_term = NULL,
                          a_type = NULL, c_type = NULL,
                          min_score = 0.1, n_results = 100,
                          n_permutations = 1000,
                          scoring_method = c("multiplication", "average", "combined", "jaccard")) {

  # Match scoring_method argument
  scoring_method <- match.arg(scoring_method)

  # First get standard ABC results. Use named arguments: abc_model() takes
  # c_term_types rather than c_type, and a_type is implied by the fixed a_term.
  results <- abc_model(co_matrix,
                       a_term = a_term,
                       c_term = c_term,
                       min_score = min_score,
                       n_results = n_results,
                       c_term_types = c_type)

  # If no results, return empty data frame
  if (nrow(results) == 0) {
    return(results)
  }

  # Calculate alternative scores based on the chosen method
  if (scoring_method == "multiplication") {
    # This is already calculated as abc_score in the original function
    results$primary_score <- results$abc_score
  } else if (scoring_method == "average") {
    results$abc_score_avg <- (results$a_b_score + results$b_c_score) / 2
    results$primary_score <- results$abc_score_avg
  } else if (scoring_method == "combined") {
    # Weighted combination of multiplication and average
    results$abc_score_combined <- 0.7 * results$abc_score +
      0.3 * ((results$a_b_score + results$b_c_score) / 2)
    results$primary_score <- results$abc_score_combined
  } else if (scoring_method == "jaccard") {
    # For Jaccard, we need to recalculate from the original co-occurrence matrix
    # This assumes the co-occurrence matrix contains raw co-occurrence counts

    # Get the A-term co-occurrence profile
    a_docs <- co_matrix[a_term, ]

    # Initialize jaccard scores
    results$abc_score_jaccard <- numeric(nrow(results))

    for (i in 1:nrow(results)) {
      b_term <- results$b_term[i]
      c_term_i <- results$c_term[i]

      # Get rows for b and c terms
      b_docs <- co_matrix[b_term, ]
      c_docs <- co_matrix[c_term_i, ]

      # Calculate Jaccard coefficients
      a_b_jaccard <- sum(a_docs > 0 & b_docs > 0) / sum(a_docs > 0 | b_docs > 0)
      b_c_jaccard <- sum(b_docs > 0 & c_docs > 0) / sum(b_docs > 0 | c_docs > 0)

      # Update scores
      results$abc_score_jaccard[i] <- a_b_jaccard * b_c_jaccard
    }

    results$primary_score <- results$abc_score_jaccard
  }

  # Perform significance testing through permutation
  # We'll calculate p-values for each B term

  # Store original scores
  original_scores <- results$primary_score

  # Initialize p-values
  results$p_value <- numeric(nrow(results))

  # Permutation test
  message("Performing permutation test for statistical significance...")
  permutation_scores <- matrix(0, nrow = nrow(results), ncol = n_permutations)

  for (perm in 1:n_permutations) {
    if (perm %% 100 == 0) {
      message("  Permutation ", perm, " of ", n_permutations)
    }

    # Create a permuted co-occurrence matrix
    # Just shuffle the elements within each row
    perm_matrix <- co_matrix
    for (i in 1:nrow(perm_matrix)) {
      perm_matrix[i, ] <- sample(perm_matrix[i, ])
    }

    # Calculate scores using permuted matrix
    for (i in 1:nrow(results)) {
      b_term <- results$b_term[i]
      c_term_i <- results$c_term[i]

      # Extract scores from permuted matrix
      perm_a_b_score <- perm_matrix[a_term, b_term]
      perm_b_c_score <- perm_matrix[b_term, c_term_i]

      # Calculate score based on the chosen method
      if (scoring_method == "multiplication") {
        perm_score <- perm_a_b_score * perm_b_c_score
      } else if (scoring_method == "average") {
        perm_score <- (perm_a_b_score + perm_b_c_score) / 2
      } else if (scoring_method == "combined") {
        perm_score <- 0.7 * (perm_a_b_score * perm_b_c_score) +
          0.3 * ((perm_a_b_score + perm_b_c_score) / 2)
      } else if (scoring_method == "jaccard") {
        # For Jaccard, would need more complex permutation approach
        # Simplifying here to just use the product
        perm_score <- perm_a_b_score * perm_b_c_score
      }

      permutation_scores[i, perm] <- perm_score
    }
  }

  # Calculate p-values
  for (i in 1:nrow(results)) {
    original <- original_scores[i]
    perms <- permutation_scores[i, ]

    # p-value: proportion of permutation scores >= original score,
    # with an add-one correction so estimates are never exactly zero
    results$p_value[i] <- (sum(perms >= original) + 1) / (n_permutations + 1)
  }

  # Add significance indicator
  results$significant <- results$p_value < 0.05

  # Add false discovery rate correction
  if (nrow(results) > 1) {
    # Benjamini-Hochberg procedure (stats::p.adjust handles ties,
    # monotonicity, and capping at 1 correctly)
    results$q_value <- stats::p.adjust(results$p_value, method = "BH")

    # Add FDR-corrected significance indicator
    results$significant_fdr <- results$q_value < 0.05
  } else {
    results$q_value <- results$p_value
    results$significant_fdr <- results$significant
  }

  # Sort by primary score and limit to n_results
  results <- results[order(-results$primary_score), ]
  if (nrow(results) > n_results) {
    results <- results[1:n_results, ]
  }

  return(results)
}

#' Apply time-sliced ABC model for validation
#'
#' This function implements a time-sliced ABC model for validation.
#' It uses historical data to predict connections that will appear in the future.
#'
#' @param entity_data A data frame of entity data with time information.
#' @param time_column Name of the column containing time information.
#' @param split_time Time point to split historical and future data.
#' @param a_term Character string, the source term (A).
#' @param a_type Character string, the entity type for A terms.
#' @param c_type Character string, the entity type for C terms.
#' @param min_score Minimum score threshold for results.
#' @param n_results Maximum number of results to return.
#'
#' @return A list with prediction results and validation metrics.
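#'
#' @examples
#' \dontrun{
#' # A minimal sketch; 'entity_df' is a hypothetical data frame with a
#' # publication_year column. Train on pre-2015 records, then check which
#' # predicted A-C links surface from 2015 onward.
#' ts <- abc_timeslice(entity_df, time_column = "publication_year",
#'                     split_time = 2015, a_term = "migraine")
#' ts$validation_metrics$validation_rate
#' }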
#' @export
abc_timeslice <- function(entity_data, time_column = "publication_year",
                          split_time, a_term, a_type = NULL, c_type = NULL,
                          min_score = 0.1, n_results = 100) {

  # Check if time column exists
  if (!time_column %in% colnames(entity_data)) {
    stop("Time column '", time_column, "' not found in the data")
  }

  # Split data into historical and future sets
  historical_data <- entity_data[entity_data[[time_column]] < split_time, ]
  future_data <- entity_data[entity_data[[time_column]] >= split_time, ]

  message("Split data: ", nrow(historical_data), " historical records, ",
          nrow(future_data), " future records")

  # Create co-occurrence matrix for historical data
  historical_matrix <- create_comat(historical_data,
                                    normalize = TRUE)

  # Run ABC model on historical data
  historical_results <- abc_model(historical_matrix,
                                  a_term = a_term,
                                  min_score = min_score,
                                  n_results = n_results)

  # Extract predicted A-C connections
  predicted_connections <- unique(data.frame(
    a_term = historical_results$a_term,
    c_term = historical_results$c_term,
    stringsAsFactors = FALSE
  ))

  # Create co-occurrence matrix for future data
  future_matrix <- create_comat(future_data,
                                normalize = TRUE)

  # Check which predicted connections appear in future data
  validated_connections <- data.frame(
    a_term = character(),
    c_term = character(),
    predicted_score = numeric(),
    future_score = numeric(),
    validated = logical(),
    stringsAsFactors = FALSE
  )

  for (i in seq_len(nrow(predicted_connections))) {
    a <- predicted_connections$a_term[i]
    c <- predicted_connections$c_term[i]

    # Get prediction score from historical results
    idx <- which(historical_results$a_term == a & historical_results$c_term == c)
    if (length(idx) > 0) {
      predicted_score <- max(historical_results$abc_score[idx])
    } else {
      predicted_score <- NA
    }

    # Check if a and c appear together in future data
    future_score <- NA
    validated <- FALSE

    if (a %in% rownames(future_matrix) && c %in% rownames(future_matrix)) {
      # Check for direct connection in future
      direct_score <- future_matrix[a, c]

      if (direct_score > 0) {
        future_score <- direct_score
        validated <- TRUE
      } else {
        # Check for indirect connections through any B term
        for (b in rownames(future_matrix)) {
          if (b != a && b != c) {
            a_b_score <- future_matrix[a, b]
            b_c_score <- future_matrix[b, c]

            if (a_b_score > 0 && b_c_score > 0) {
              if (is.na(future_score) || a_b_score * b_c_score > future_score) {
                future_score <- a_b_score * b_c_score
                validated <- TRUE
              }
            }
          }
        }
      }
    }

    validated_connections <- rbind(validated_connections, data.frame(
      a_term = a,
      c_term = c,
      predicted_score = predicted_score,
      future_score = future_score,
      validated = validated,
      stringsAsFactors = FALSE
    ))
  }

  # Calculate validation metrics (guard against zero predictions)
  total_predictions <- nrow(validated_connections)
  total_validated <- sum(validated_connections$validated)
  validation_rate <- if (total_predictions > 0) total_validated / total_predictions else NA_real_

  # Return results
  return(list(
    predictions = historical_results,
    validations = validated_connections,
    validation_metrics = list(
      total_predictions = total_predictions,
      total_validated = total_validated,
      validation_rate = validation_rate
    )
  ))
}

#' Apply statistical validation to ABC model results with support for large matrices
#'
#' This function performs statistical tests to validate ABC model results.
#' It calculates p-values using hypergeometric tests and applies correction for multiple testing.
#' The function is optimized to work with very large co-occurrence matrices.
#'
#' @param abc_results A data frame containing ABC results.
#' @param co_matrix The co-occurrence matrix used to generate the ABC results.
#' @param alpha Significance level (p-value threshold).
#' @param correction Method for multiple testing correction.
#' @param filter_by_significance Logical. If TRUE, only returns significant results.
#'
#' @return A data frame with ABC results and statistical significance measures.
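#'
#' @examples
#' \dontrun{
#' # A minimal sketch; 'res' holds abc_model() output and 'co' is the
#' # co-occurrence matrix it was computed from.
#' validated <- validate_abc(res, co, alpha = 0.05, correction = "BH",
#'                           filter_by_significance = TRUE)
#' }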
#' @export
validate_abc <- function(abc_results, co_matrix,
                         alpha = 0.05,
                         correction = c("BH", "bonferroni", "none"),
                         filter_by_significance = FALSE) {

  # Match correction argument
  correction <- match.arg(correction)

  # Check if results are empty
  if (nrow(abc_results) == 0) {
    message("ABC results are empty")
    return(abc_results)
  }

  # Add p-value and significance columns
  results <- abc_results
  results$p_value <- numeric(nrow(results))
  results$significant <- logical(nrow(results))

  # Instead of relying on diag(), we'll estimate frequencies directly
  message("Using optimized approach for large matrix validation...")

  # Try to get document count from metadata if available
  if (!is.null(attr(co_matrix, "metadata")) && !is.null(attr(co_matrix, "metadata")$n_docs)) {
    total_docs <- attr(co_matrix, "metadata")$n_docs
    message("Using metadata for document count: ", total_docs)
  } else {
    # If not in metadata, use a reasonable estimate based on matrix properties
    # We'll use the maximum self-co-occurrence (diagonal element) as an approximation

    # Get unique terms from the ABC results to limit our search
    unique_terms <- unique(c(results$a_term, results$b_term, results$c_term))

    # Find the maximum frequency by checking diagonal elements for these terms
    max_freq <- 0
    for (term in unique_terms) {
      if (term %in% rownames(co_matrix)) {
        term_freq <- co_matrix[term, term]
        max_freq <- max(max_freq, term_freq)
      }
    }

    # If we found a reasonable max frequency, use it
    if (max_freq > 0) {
      total_docs <- ceiling(max_freq)
    } else {
      # As a fallback, use a reasonable default or estimate from matrix size
      total_docs <- max(100, ceiling(sqrt(nrow(co_matrix))))
    }

    message("Estimated document count: ", total_docs)
  }

  # Calculate p-values using hypergeometric test
  message("Calculating statistical significance using hypergeometric test...")

  for (i in 1:nrow(results)) {
    a_term <- results$a_term[i]
    b_term <- results$b_term[i]
    c_term <- results$c_term[i]

    # Skip calculation if any term is missing from the matrix
    if (!all(c(a_term, b_term, c_term) %in% rownames(co_matrix))) {
      results$p_value[i] <- 1.0  # Set to 1.0 to indicate no significance
      next
    }

    # Get term frequencies from diagonal elements (one at a time)
    a_freq <- co_matrix[a_term, a_term]
    b_freq <- co_matrix[b_term, b_term]
    c_freq <- co_matrix[c_term, c_term]

    # Get co-occurrence values
    a_b_score <- co_matrix[a_term, b_term]
    b_c_score <- co_matrix[b_term, c_term]
    a_c_score <- co_matrix[a_term, c_term]

    # Handle normalized scores (if values are between 0 and 1)
    if (max(a_b_score, b_c_score, a_c_score) <= 1) {
      # Convert the normalized A-C score to an approximate count
      # (minimum frequency avoids over-counting)
      a_c_count <- round(a_c_score * min(a_freq, c_freq))
    } else {
      # Assume the score is already a count
      a_c_count <- a_c_score
    }

    # Calculate expected A-C co-occurrence under independence
    expected_a_c <- (a_freq * c_freq) / total_docs

    # Skip calculation if we don't have valid frequencies
    if (a_freq <= 0 || c_freq <= 0 || total_docs <= 0) {
      results$p_value[i] <- 1.0
      next
    }

    # Calculate p-value using hypergeometric test
    # Test for over-representation (common in LBD)
    p_value <- tryCatch({
      stats::phyper(a_c_count - 1, a_freq, total_docs - a_freq, c_freq,
                    lower.tail = FALSE)
    }, error = function(e) {
      # If the hypergeometric test fails, use a simple approximation
      # based on the ratio of observed to expected
      if (expected_a_c > 0) {
        ratio <- a_c_count / expected_a_c
        # Convert ratio to a p-value (higher ratio = lower p-value)
        1 / (1 + ratio^2)
      } else {
        1.0  # Default to no significance
      }
    })

    # Store p-value
    results$p_value[i] <- p_value
  }

  # Apply multiple testing correction: "BH" controls the false discovery
  # rate, "bonferroni" the family-wise error rate, and "none" leaves the
  # raw p-values (all three are valid stats::p.adjust methods)
  results$adjusted_p_value <- stats::p.adjust(results$p_value, method = correction)
  results$significant <- results$adjusted_p_value < alpha

  # Calculate percentage of significant results
  percent_significant <- sum(results$significant) / nrow(results) * 100
  message(sprintf("%.1f%% of connections are statistically significant (p < %.2f%s)",
                  percent_significant, alpha,
                  ifelse(correction == "none", "", paste0(", ", correction, " correction"))))

  # Filter by significance if requested
  if (filter_by_significance) {
    results <- results[results$significant, ]
    if (nrow(results) == 0) {
      warning("No statistically significant results found. Returning unfiltered results.")
      return(abc_results)
    }
  }

  # Sort by adjusted p-value (or raw p-value if no correction)
  results <- results[order(results$adjusted_p_value, -results$abc_score), ]

  return(results)
}

#' Standard validation method using hypergeometric tests
#' @keywords internal
standard_validation <- function(abc_results, co_matrix, alpha, correction) {
  # Add p-value and significance columns
  results <- abc_results
  results$p_value <- numeric(nrow(results))
  results$significant <- logical(nrow(results))

  # Get total number of documents from co-occurrence matrix
  # This is an approximation based on term frequency
  term_freq <- diag(co_matrix)
  total_docs <- max(term_freq) # This is an approximation

  # Calculate p-values using hypergeometric test
  message("Calculating statistical significance using hypergeometric test...")

  for (i in 1:nrow(results)) {
    a_term <- results$a_term[i]
    b_term <- results$b_term[i]
    c_term <- results$c_term[i]

    # Get term frequencies
    a_freq <- term_freq[a_term]
    c_freq <- term_freq[c_term]

    # Calculate observed A-C co-occurrence count
    # (for normalized scores, score * min(freqs) approximates the raw count)
    a_c_count <- co_matrix[a_term, c_term] * min(a_freq, c_freq)

    # Calculate p-value using hypergeometric test
    # Evaluate the probability of observing a_c_count or more co-occurrences by chance
    p_value <- stats::phyper(a_c_count - 1, a_freq, total_docs - a_freq, c_freq,
                             lower.tail = FALSE)

    # Store p-value
    results$p_value[i] <- p_value
  }

  # Apply multiple testing correction
  return(apply_correction(results, correction, alpha))
}

#' Alternative validation for large matrices
#' @keywords internal
alternative_validation <- function(abc_results, co_matrix, alpha, correction) {
  # Add columns to results
  results <- abc_results
  results$p_value <- numeric(nrow(results))
  results$significant <- logical(nrow(results))

  # For large matrices, use a simplified approach that avoids diag() operations
  message("Using alternative statistical validation for large matrix...")

  # Instead of extracting all diagonal elements at once, process terms individually
  for (i in 1:nrow(results)) {
    a_term <- results$a_term[i]
    b_term <- results$b_term[i]
    c_term <- results$c_term[i]

    # Get individual frequencies directly (avoids extracting full diagonal)
    # Note: In a co-occurrence matrix, diagonal element [i,i] represents term i's frequency
    tryCatch({
      a_freq <- co_matrix[a_term, a_term]
      b_freq <- co_matrix[b_term, b_term]
      c_freq <- co_matrix[c_term, c_term]

      # Calculate total docs (approximate)
      total_docs <- max(a_freq, b_freq, c_freq) * 10  # Multiplier as a heuristic

      # Approximate the observed A-C co-occurrence count from the matrix entry;
      # only the A-C link enters the hypergeometric test below
      a_c_score <- co_matrix[a_term, c_term]
      a_c_count <- a_c_score * min(a_freq, c_freq)

      # Calculate p-value using hypergeometric test
      p_value <- stats::phyper(max(1, a_c_count) - 1, a_freq, total_docs - a_freq, c_freq,
                               lower.tail = FALSE)

      # Prevent very low p-values from becoming 0
      p_value <- max(p_value, .Machine$double.eps)

      # Store p-value
      results$p_value[i] <- p_value
    }, error = function(e) {
      # If even individual access fails, use abc score as proxy
      message("Error accessing matrix elements for terms. Using score-based approximation.")
      results$p_value[i] <<- 1 - results$abc_score[i] / max(results$abc_score)
    })
  }

  # Apply multiple testing correction
  return(apply_correction(results, correction, alpha))
}

#' Apply multiple testing correction to p-values
#'
#' @param results A data frame with a \code{p_value} column.
#' @param correction Multiple testing correction: "BH", "bonferroni", or "none".
#' @param alpha Significance level.
#' @return The input data frame with \code{adjusted_p_value} and \code{significant} columns, sorted by adjusted p-value.
#' @keywords internal
apply_correction <- function(results, correction, alpha) {
  # Apply multiple testing correction
  if (correction == "BH") {
    # Benjamini-Hochberg procedure (controls false discovery rate)
    adjusted_p <- stats::p.adjust(results$p_value, method = "BH")
    results$adjusted_p_value <- adjusted_p
    results$significant <- adjusted_p < alpha
  } else if (correction == "bonferroni") {
    # Bonferroni correction (controls family-wise error rate)
    adjusted_p <- stats::p.adjust(results$p_value, method = "bonferroni")
    results$adjusted_p_value <- adjusted_p
    results$significant <- adjusted_p < alpha
  } else {
    # No correction (just use raw p-values)
    results$adjusted_p_value <- results$p_value
    results$significant <- results$p_value < alpha
  }

  # Calculate percentage of significant results
  percent_significant <- sum(results$significant) / nrow(results) * 100
  message(sprintf("%.1f%% of connections are statistically significant (p < %.2f%s)",
                  percent_significant, alpha,
                  ifelse(correction == "none", "", paste0(", ", correction, " correction"))))

  # Sort by adjusted p-value (or raw p-value if no correction), then by abc_score
  results <- results[order(results$adjusted_p_value, -results$abc_score), ]

  return(results)
}

#' Perform randomization test for ABC model
#'
#' This function assesses the significance of ABC model results through randomization.
#' It generates a null distribution by permuting the co-occurrence matrix.
#'
#' @param abc_results A data frame containing ABC results.
#' @param co_matrix The co-occurrence matrix used to generate the ABC results.
#' @param n_permutations Number of permutations to perform.
#' @param alpha Significance level.
#'
#' @return A data frame with ABC results and permutation-based significance measures.
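#'
#' @examples
#' \dontrun{
#' # Minimal sketch: `abc_res` and `co_mat` are assumed to come from
#' # abc_model() and create_comat() elsewhere in this package
#' sig <- perm_test_abc(abc_res, co_mat, n_permutations = 100, alpha = 0.05)
#' head(sig[, c("a_term", "b_term", "c_term", "perm_adjusted_p")])
#' }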
#' @export
perm_test_abc <- function(abc_results, co_matrix, n_permutations = 1000, alpha = 0.05) {
  # Check if results are empty
  if (nrow(abc_results) == 0) {
    message("ABC results are empty")
    return(abc_results)
  }

  # Initialize results with permutation p-values
  results <- abc_results
  results$perm_p_value <- numeric(nrow(results))
  results$perm_significant <- logical(nrow(results))

  # Store original ABC scores
  original_scores <- results$abc_score

  # Initialize matrix to store permutation scores
  perm_scores <- matrix(0, nrow = nrow(results), ncol = n_permutations)

  # Run permutations
  message("Performing randomization test with ", n_permutations, " permutations...")
  pb <- utils::txtProgressBar(min = 0, max = n_permutations, style = 3)

  for (p in 1:n_permutations) {
    # Create a permuted co-occurrence matrix by shuffling cells
    perm_matrix <- co_matrix

    # Shuffle within each row to preserve the row's multiset of values
    for (i in 1:nrow(perm_matrix)) {
      # Identify non-zero, off-diagonal elements in this row
      row_vals <- perm_matrix[i, ]
      idx <- setdiff(which(row_vals > 0), i)

      if (length(idx) > 1) {
        # Shuffle the non-zero values in place; the diagonal entry
        # (the term's frequency) is left untouched
        perm_matrix[i, idx] <- sample(row_vals[idx])
      }
    }

    # Calculate permuted ABC scores
    for (i in 1:nrow(results)) {
      a_term <- results$a_term[i]
      b_term <- results$b_term[i]
      c_term <- results$c_term[i]

      # Calculate permuted scores
      a_b_score <- perm_matrix[a_term, b_term]
      b_c_score <- perm_matrix[b_term, c_term]
      abc_score <- a_b_score * b_c_score

      # Store permuted score
      perm_scores[i, p] <- abc_score
    }

    utils::setTxtProgressBar(pb, p)
  }

  close(pb)

  # Calculate permutation p-values
  # For each result, the p-value is the proportion of permuted scores >= the
  # original score; add-one smoothing keeps p-values strictly positive
  for (i in 1:nrow(results)) {
    # Calculate p-value from the permutation distribution
    perm_p_value <- (sum(perm_scores[i, ] >= original_scores[i]) + 1) / (n_permutations + 1)

    # Store permutation p-value
    results$perm_p_value[i] <- perm_p_value
    results$perm_significant[i] <- perm_p_value < alpha
  }

  # Apply Benjamini-Hochberg correction to permutation p-values
  results$perm_adjusted_p <- stats::p.adjust(results$perm_p_value, method = "BH")
  results$perm_significant_adj <- results$perm_adjusted_p < alpha

  # Report percentage of significant results after permutation test
  percent_significant <- sum(results$perm_significant_adj) / nrow(results) * 100
  message(sprintf("%.1f%% of connections are significant after randomization test (p < %.2f, BH correction)",
                  percent_significant, alpha))

  # Sort by permutation p-value
  results <- results[order(results$perm_adjusted_p, -results$abc_score), ]

  return(results)
}

#' Enforce diversity in ABC model results
#'
#' This function applies diversity enforcement to ABC model results by:
#' 1. Removing duplicate paths to the same C term
#' 2. Ensuring B term diversity by selecting top results from each B term group
#' 3. Preventing A and C terms from appearing as B terms
#'
#' @param abc_results A data frame containing ABC results.
#' @param diversity_method Method for enforcing diversity: "b_term_groups", "unique_c_paths", or "both".
#' @param max_per_group Maximum number of results to keep per B term or C term.
#' @param min_score Minimum score threshold for including connections.
#'
#' @return A data frame with diverse ABC results.
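#'
#' @examples
#' \dontrun{
#' # Minimal sketch: `abc_res` is assumed to be the output of abc_model()
#' diverse <- diversify_abc(abc_res, diversity_method = "both",
#'                          max_per_group = 3, min_score = 0.1)
#' }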
#' @export
diversify_abc <- function(abc_results,
                          diversity_method = c("both", "b_term_groups", "unique_c_paths"),
                          max_per_group = 3,
                          min_score = 0.1) {

  # Match diversity_method argument
  diversity_method <- match.arg(diversity_method)

  # Check if results are empty
  if (nrow(abc_results) == 0) {
    message("ABC results are empty")
    return(abc_results)
  }

  # Filter results by minimum score
  results <- abc_results[abc_results$abc_score >= min_score, ]

  # If no results after filtering, return empty data frame
  if (nrow(results) == 0) {
    message("No results remain after filtering by minimum score")
    return(results)
  }

  # Sort by ABC score
  results <- results[order(-results$abc_score), ]

  # Initialize diverse results
  diverse_results <- results

  # Apply selected diversity methods
  if (diversity_method %in% c("both", "b_term_groups")) {
    diverse_results <- diversify_b_terms(diverse_results, max_per_group)
  }

  if (diversity_method %in% c("both", "unique_c_paths")) {
    diverse_results <- diversify_c_paths(diverse_results, max_per_group)
  }

  # Remove A and C terms that appear as B terms
  diverse_results <- remove_ac_terms(diverse_results)

  return(diverse_results)
}

#' Enforce diversity for C term paths
#'
#' @param results Data frame with ABC model results
#' @param max_per_c Maximum number of paths to keep per C term
#' @return Data frame with C term path diversity enforced
#' @keywords internal
diversify_c_paths <- function(results, max_per_c = 3) {
  # Group results by C term
  unique_c_terms <- unique(results$c_term)

  # Initialize diverse results
  diverse_results <- data.frame()

  # For each C term, select top paths (limited by max_per_c)
  for (c_term in unique_c_terms) {
    c_results <- results[results$c_term == c_term, ]

    # Select the best paths using unique B terms
    selected_paths <- data.frame()

    # Track which B terms we've already used for this C term
    used_b_terms <- character(0)

    # Add paths until we reach max_per_c or run out of candidates
    for (i in 1:nrow(c_results)) {
      # Stop once max_per_c paths have been selected
      if (nrow(selected_paths) >= max_per_c) {
        break
      }

      b_term <- c_results$b_term[i]

      # Skip if we've already used this B term for this C term
      if (b_term %in% used_b_terms) {
        next
      }

      # Add path and mark B term as used
      selected_paths <- rbind(selected_paths, c_results[i, ])
      used_b_terms <- c(used_b_terms, b_term)
    }

    # Add to diverse results
    diverse_results <- rbind(diverse_results, selected_paths)
  }

  # Re-sort by ABC score
  diverse_results <- diverse_results[order(-diverse_results$abc_score), ]

  return(diverse_results)
}

#' Remove A and C terms that appear as B terms
#'
#' @param results Data frame with ABC model results
#' @return Data frame with A and C terms removed from B terms
#' @keywords internal
remove_ac_terms <- function(results) {
  # Collect A and C terms
  a_terms <- unique(results$a_term)
  c_terms <- unique(results$c_term)
  ac_terms <- c(a_terms, c_terms)

  # Filter out results where B terms are also A or C terms
  filtered_results <- results[!results$b_term %in% ac_terms, ]

  # If we removed all results, return the original to prevent empty result
  if (nrow(filtered_results) == 0) {
    warning("All B terms were also A or C terms. Returning original results.")
    return(results)
  }

  return(filtered_results)
}

#' Get entity type distribution from co-occurrence matrix
#'
#' @param co_matrix A co-occurrence matrix produced by create_typed_comat().
#'
#' @return A data frame with entity type counts and percentages.
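#'
#' @examples
#' \dontrun{
#' # Minimal sketch: `co_mat` is assumed to carry an "entity_types" attribute,
#' # as produced by create_typed_comat()
#' get_type_dist(co_mat)
#' }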
#' @export
get_type_dist <- function(co_matrix) {
  # Check if matrix has entity types
  if (is.null(attr(co_matrix, "entity_types"))) {
    stop("Co-occurrence matrix does not have entity type information")
  }

  # Get entity types
  entity_types <- attr(co_matrix, "entity_types")

  # Count entities by type
  type_counts <- table(entity_types)

  # Create data frame
  result <- data.frame(
    entity_type = names(type_counts),
    count = as.numeric(type_counts),
    percentage = round(as.numeric(type_counts) / length(entity_types) * 100, 2),
    stringsAsFactors = FALSE
  )

  # Sort by count
  result <- result[order(-result$count), ]

  return(result)
}

#' Filter a co-occurrence matrix by entity type
#'
#' @param co_matrix A co-occurrence matrix produced by create_typed_comat().
#' @param types Character vector of entity types to include.
#'
#' @return A filtered co-occurrence matrix.
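#'
#' @examples
#' \dontrun{
#' # Minimal sketch: keep only disease and gene entities from a typed matrix
#' # (`co_mat` assumed to come from create_typed_comat())
#' dg_mat <- filter_by_type(co_mat, types = c("disease", "gene"))
#' }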
#' @export
filter_by_type <- function(co_matrix, types) {
  # Check if matrix has entity types
  if (is.null(attr(co_matrix, "entity_types"))) {
    stop("Co-occurrence matrix does not have entity type information")
  }

  # Get entity types
  entity_types <- attr(co_matrix, "entity_types")

  # Get entities of the specified types
  entities_to_keep <- names(entity_types[entity_types %in% types])

  # Filter the matrix
  filtered_matrix <- co_matrix[entities_to_keep, entities_to_keep, drop = FALSE]

  # Preserve attributes
  attr(filtered_matrix, "entity_types") <- entity_types[entities_to_keep]

  if (!is.null(attr(co_matrix, "entity_freq"))) {
    attr(filtered_matrix, "entity_freq") <- attr(co_matrix, "entity_freq")[entities_to_keep]
  }

  if (!is.null(attr(co_matrix, "metadata"))) {
    metadata <- attr(co_matrix, "metadata")
    metadata$n_entities <- length(entities_to_keep)
    attr(filtered_matrix, "metadata") <- metadata
  }

  return(filtered_matrix)
}

#' Validate entity types using NLP-based entity recognition with improved accuracy
#'
#' @param term Character string, the term to validate
#' @param claimed_type Character string, the claimed entity type
#' @param nlp_model The loaded NLP model to use for validation
#' @return Logical indicating if the term is likely of the claimed type
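#' @examples
#' \dontrun{
#' # Requires spacyr and a working spaCy installation
#' validate_entity_with_nlp("dopamine receptor", "protein")
#' }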
validate_entity_with_nlp <- function(term, claimed_type, nlp_model = NULL) {
  # Ensure spacyr is available, then initialize spaCy
  if (!requireNamespace("spacyr", quietly = TRUE)) {
    stop("The 'spacyr' package is required for NLP-based entity validation. ",
         "Install it with install.packages(\"spacyr\").")
  }
  spacyr::spacy_initialize()

  # Convert term and claimed_type to lowercase for better matching
  term_lower <- tolower(term)
  if (!is.null(claimed_type)) {
    claimed_type <- tolower(claimed_type)
  }

  # Check for common general terms that should be excluded
  general_terms <- c(
    # Geographic locations
    "europe", "asia", "africa", "america", "australia", "united states", "canada", "uk", "china", "japan",
    "germany", "france", "italy", "spain", "russia", "brazil", "india", "mexico", "germany", "switzerland",

    # Common verbs and generic concepts
    "optimization", "retention", "vehicle", "publication", "test", "review", "experiment",
    "analysis", "development", "application", "investigation", "evaluation", "protocol",
    "survey", "interview", "questionnaire", "scale", "assessment",

    # Other non-biomedical terms
    "method", "model", "approach", "strategy", "design", "value", "cost", "benefit", "risk",
    "measure", "calculate", "determine", "perform", "conduct", "report", "describe", "discuss",
    "recommend", "suggest", "propose", "prove", "demonstrate", "argue", "claim", "state",
    "conclude", "summarize", "clarify", "classification", "categorization", "identification",
    "characterization", "qualification", "quantification", "estimation", "calculation",
    "determination", "measurement", "assessment", "evaluation", "analysis", "interpretation",
    "explanation", "description", "discussion", "recommendation", "conclusion", "review"
  )

  # List of regions, countries, and cities that should be rejected
  geographic_locations <- c(
    "africa", "america", "asia", "australia", "europe", "north america", "south america",
    "central america", "western europe", "eastern europe", "northern europe", "southern europe",
    "middle east", "southeast asia", "east asia", "central asia", "south asia", "north africa",
    "sub-saharan africa", "oceania", "antarctica", "arctic", "caribbean", "mediterranean",
    "scandinavia", "benelux", "balkans", "pacific", "atlantic", "central europe"
  )

  # Immediately reject if the term is a geographic location
  if (term_lower %in% geographic_locations) {
    return(FALSE)
  }

  # Immediately reject if the term is in our general terms list
  if (term_lower %in% general_terms) {
    return(FALSE)
  }

  # Process the term with spaCy
  parsed <- spacyr::spacy_parse(term)

  # Expanded and improved mapping of entity types to spaCy entity types
  type_mapping <- list(
    # Chemical entities
    "chemical" = c("CHEMICAL", "ORG", "PRODUCT", "SUBSTANCE", "FAC"),

    # Disease entities
    "disease" = c("DISEASE", "CONDITION", "SYNDROME", "DISORDER", "PATHOLOGY",
                  "DIAGNOSIS", "ILLNESS", "HEALTH_CONDITION"),

    # Gene entities
    "gene" = c("GENE", "DNA", "RNA", "GENOMIC", "GENETIC_MARKER"),

    # Protein entities - extremely important to get right
    "protein" = c("PROTEIN", "ENZYME", "PEPTIDE", "ANTIBODY", "HORMONE",
                  "RECEPTOR", "GLYCOPROTEIN", "LIPOPROTEIN", "CHANNEL"),

    # Pathway entities
    "pathway" = c("PATHWAY", "PROCESS", "MECHANISM", "SIGNALING", "CASCADE",
                  "METABOLISM", "CYCLE", "NETWORK"),

    # Symptom entities
    "symptom" = c("SYMPTOM", "SIGN", "MANIFESTATION", "PRESENTATION", "COMPLAINT"),

    # Drug entities
    "drug" = c("CHEMICAL", "SUBSTANCE", "MEDICATION", "PRODUCT", "PHARMACEUTICAL",
               "DRUG", "COMPOUND", "MEDICINE"),

    # Biological process entities
    "biological_process" = c("PROCESS", "FUNCTION", "MECHANISM", "ACTIVITY",
                             "PATHWAY", "REGULATION"),

    # Cell entities
    "cell" = c("ANATOMY", "CELL", "TISSUE", "STRUCTURE", "ORG", "ENTITY"),

    # Tissue entities
    "tissue" = c("ANATOMY", "TISSUE", "STRUCTURE", "ORGAN", "ORG"),

    # Organism entities
    "organism" = c("ORG", "ORGANISM", "SPECIES", "BACTERIA", "VIRUS",
                   "MICROORGANISM", "FUNGUS", "PLANT", "ANIMAL"),

    # Molecular function entities
    "molecular_function" = c("FUNCTION", "PROCESS", "ACTIVITY", "MECHANISM"),

    # Cellular component entities
    "cellular_component" = c("CELL", "COMPONENT", "STRUCTURE", "ORGANELLE",
                             "MEMBRANE", "COMPARTMENT"),

    # Diagnostic procedure entities
    "diagnostic_procedure" = c("PROCEDURE", "TEST", "EXAMINATION", "TECHNIQUE",
                               "DIAGNOSIS", "IMAGING"),

    # Therapeutic procedure entities
    "therapeutic_procedure" = c("PROCEDURE", "THERAPY", "TREATMENT", "INTERVENTION",
                                "SURGERY", "MEDICATION"),

    # Anatomy entities
    "anatomy" = c("ANATOMY", "BODY", "STRUCTURE", "ORGAN", "SYSTEM", "TISSUE")
  )

  # Get expected spaCy entity types for the claimed type
  if (is.null(claimed_type)) {
    return(FALSE)  # Cannot validate without a claimed type
  }
  expected_types <- type_mapping[[claimed_type]]
  if (is.null(expected_types)) {
    # If we don't have a mapping, be conservative and return FALSE
    return(FALSE)
  }

  # Domain-specific pattern checks (more comprehensive than before)
  type_patterns <- list(
    # Chemical patterns with more specific terms and avoiding general concepts
    "chemical" = "\\b(acid|oxide|ester|amine|compound|element|ion|molecule|solvent|reagent|catalyst|inhibitor|activator|hydroxide|chloride|phosphate|sulfate|nitrate|carbonate)\\b",

    # Disease patterns focusing on explicit disease terminology
    "disease" = "\\b(disease|disorder|syndrome|itis|emia|pathy|oma|infection|deficiency|failure|dysfunction|lesion|malignancy|neoplasm|tumor|cancer|fibrosis|inflammation|sclerosis|atrophy|dystrophy)\\b",

    # Gene patterns with specific genetic terminology
    "gene" = "\\b(gene|allele|locus|promoter|repressor|transcription|expression|mutation|variant|polymorphism|genotype|phenotype|hereditary|dna|chromosome|genomic|rna|mrna|nucleotide)\\b",

    # Protein patterns focusing on protein-specific terminology - very important for fixing the receptor issue
    "protein" = "\\b(protein|enzyme|receptor|antibody|hormone|kinase|phosphatase|transporter|factor|channel|carrier|ase\\b|globulin|albumin|transferase|reductase|oxidase|ligase|protease|peptidase|hydrolase)\\b",

    # Pathway patterns
    "pathway" = "\\b(pathway|cascade|signaling|transduction|regulation|metabolism|synthesis|biosynthesis|degradation|catabolism|anabolism|cycle|flux|transport|secretion|activation|inhibition|phosphorylation)\\b",

    # Symptom patterns
    "symptom" = "\\b(pain|ache|discomfort|swelling|redness|fatigue|weakness|fever|nausea|vomiting|dizziness|vertigo|headache|cough|dyspnea|tachycardia|bradycardia|edema|pallor|cyanosis)\\b",

    # Drug patterns
    "drug" = "\\b(drug|medication|therapy|treatment|compound|dose|inhibitor|agonist|antagonist|blocker|stimulant|suppressant|antidepressant|antibiotic|analgesic|sedative|hypnotic|vaccine|antiviral|antifungal)\\b",

    # Biological process patterns
    "biological_process" = "\\b(process|function|regulation|activity|response|mechanism|homeostasis|apoptosis|autophagy|proliferation|differentiation|migration|adhesion|division|fusion|cycle|phagocytosis|endocytosis|exocytosis)\\b",

    # Cell patterns
    "cell" = "\\b(cell|neuron|microglia|astrocyte|fibroblast|macrophage|lymphocyte|erythrocyte|platelet|epithelial|endothelial|muscle|myocyte|adipocyte|hepatocyte|keratinocyte|melanocyte|osteocyte|chondrocyte)\\b",

    # Tissue patterns
    "tissue" = "\\b(tissue|membrane|epithelium|endothelium|mucosa|connective|muscle|nerve|vessel|artery|vein|capillary|ligament|tendon|cartilage|bone|stroma|parenchyma|dermis|epidermis)\\b",

    # Organism patterns
    "organism" = "\\b(bacteria|virus|fungus|parasite|pathogen|microbe|species|strain|microorganism|prokaryote|eukaryote|archaea|protozoa|helminth|bacillus|coccus|spirochete|mycoplasma|chlamydia|rickettsia)\\b"
  )

  # Check if term matches any patterns for its claimed type
  pattern_match <- FALSE
  if (!is.null(type_patterns[[claimed_type]])) {
    pattern_match <- grepl(type_patterns[[claimed_type]], term_lower)
  }

  # Check if any token carries an expected entity annotation
  # (spacyr::spacy_parse() reports entities in the `entity` column with
  # BIO-style suffixes such as "ORG_B"; strip the suffix before comparing)
  entity_types <- sub("_[BI]$", "", parsed$entity)
  has_expected_type <- any(entity_types %in% expected_types)

  # Check for explicit special cases - ensuring common issues are fixed
  special_case_check <- FALSE

  # Special case: Ensure "receptor" and related terms are recognized as proteins
  if (claimed_type == "protein" &&
      (grepl("receptor", term_lower) ||
       grepl("channel", term_lower) ||
       grepl("transporter", term_lower))) {
    special_case_check <- TRUE
  }

  # Special case: Ensure "malformation" is recognized as a disease/disorder
  if (claimed_type == "disease" && grepl("malformation", term_lower)) {
    special_case_check <- TRUE
  }

  # Special case: Ensure "optimization" is not classified as a chemical
  if (claimed_type == "chemical" &&
      (grepl("optimization", term_lower) ||
       grepl("retention", term_lower) ||
       grepl("vehicle", term_lower))) {
    return(FALSE)
  }

  # List of terms commonly misclassified, expanded with specific examples
  misclassified_terms <- list(
    "chemical" = c("sociodemographic", "demographic", "social", "economic", "education",
                   "income", "status", "cultural", "ethical", "society", "community",
                   "population", "questionnaire", "survey", "interview", "assessment",
                   "scale", "score", "index", "measurement", "evaluation", "analysis",
                   "methodology", "approach", "strategy", "procedure", "protocol",
                   "optimization", "retention", "vehicle", "europe", "usa", "africa"),

    "gene" = c("family", "type", "group", "class", "series", "variety", "category",
               "classification", "collection", "list", "set", "batch", "assortment"),

    "protein" = c("factor", "element", "component", "ingredient", "constituent",
                  "parameter", "variable", "characteristic", "feature", "aspect", "attribute"),

    "pathway" = c("approach", "method", "technique", "procedure", "course", "direction",
                  "route", "channel", "corridor", "passage", "street", "road", "track"),

    "disease" = c("europe", "asia", "africa", "america", "australia", "vehicle",
                  "optimization", "retention", "procedure", "method", "technique")
  )

  # Check if the term is commonly misclassified for its claimed type
  is_misclassified <- FALSE
  if (!is.null(misclassified_terms[[claimed_type]])) {
    if (any(sapply(misclassified_terms[[claimed_type]], function(t)
      grepl(paste0("\\b", t, "\\b"), term_lower)))) {
      is_misclassified <- TRUE
    }
  }

  # Special checks for specific entity types
  is_valid_by_special_check <- FALSE

  # Check if term is a chemical formula or has chemical structure
  if (claimed_type == "chemical" && grepl("[A-Z][a-z]?[0-9]+", term)) {
    is_valid_by_special_check <- TRUE
  }

  # Check if term is a likely gene name (e.g., BRCA1, TP53)
  if (claimed_type == "gene" && grepl("^[A-Z0-9]{2,}[0-9]*$", term)) {
    is_valid_by_special_check <- TRUE
  }

  # Check if term is a likely protein or enzyme
  if (claimed_type == "protein" && grepl("(ase|in)$", term_lower)) {
    is_valid_by_special_check <- TRUE
  }

  # Check if term is a likely drug name
  if (claimed_type == "drug" && grepl("(mab|nib|olol|pril|sartan|prazole|statin)$", term_lower)) {
    is_valid_by_special_check <- TRUE
  }

  # Check if term is a likely disease
  if (claimed_type == "disease" && grepl("(itis|osis|emia|oma|pathy)$", term_lower)) {
    is_valid_by_special_check <- TRUE
  }

  # Return TRUE if any validation method confirms the type and it's not misclassified
  # Also include special_case_check in the condition
  return((has_expected_type || pattern_match || is_valid_by_special_check || special_case_check) && !is_misclassified)
}

#' Validate biomedical entities using BioBERT or other ML models
#'
#' @param term Character string, the term to validate
#' @param claimed_type Character string, the claimed entity type
#' @return Logical indicating if the term is validated
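#' @examples
#' \dontrun{
#' # Requires reticulate plus the Python transformers and torch packages;
#' # falls back to pattern-based validation when they are unavailable
#' validate_biomedical_entity("imatinib", "drug")
#' }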
validate_biomedical_entity <- function(term, claimed_type) {
  # reticulate provides the Python bridge for BioBERT; if it is missing, the
  # model step below will fail and validation falls back to pattern matching
  if (!requireNamespace("reticulate", quietly = TRUE)) {
    message("Package 'reticulate' is not installed; BioBERT validation will fall back to pattern matching.")
  }

  # First, perform basic regex-based validation before trying more complex ML-based validation
  # This helps filter out obviously incorrect terms efficiently
  term_lower <- tolower(term)

  # Extended list of non-biomedical terms by category
  non_biomedical_terms <- list(
    # Academic/research terms
    "academic" = c("introduction", "method", "methodology", "results", "discussion",
                   "conclusion", "abstract", "background", "objective", "aim", "purpose",
                   "significance", "rationale", "approach", "analysis", "evaluation",
                   "assessment", "measurement", "investigation", "experiment", "observation",
                   "survey", "review", "study", "research", "trial", "test", "examination",
                   "procedure", "technique", "protocol", "design", "framework", "model",
                   "theory", "concept", "paradigm", "hypothesis", "assumption", "limitation",
                   "implication", "recommendation", "reference", "citation", "bibliography",
                   "appendix", "table", "figure", "illustration", "chart", "graph", "diagram",
                   "plot", "map", "image", "photograph", "picture", "drawing", "report",
                   "paper", "article", "manuscript", "publication", "journal", "chapter",
                   "section", "subsection", "paragraph", "sentence", "word", "phrase",
                   "term", "definition", "explanation", "clarification", "interpretation",
                   "understanding", "knowledge", "information", "fact", "finding", "evidence",
                   "data", "statistic", "number", "figure", "calculation", "computation",
                   "algorithm", "formula", "equation", "variable", "parameter", "constant",
                   "coefficient", "factor", "element", "component", "aspect", "dimension",
                   "feature", "characteristic", "property", "attribute", "quality",
                   "recruitment", "benefit"),

    # Statistical/analytical terms
    "statistical" = c("mean", "median", "mode", "average", "standard", "deviation",
                      "variance", "range", "distribution", "normal", "poisson", "binomial",
                      "skewness", "kurtosis", "quartile", "percentile", "correlation",
                      "regression", "covariance", "confidence", "significance", "probability",
                      "likelihood", "hypothesis", "null", "alternative", "p-value", "t-test",
                      "f-test", "chi-square", "anova", "ancova", "manova", "parameter",
                      "statistic", "predictor", "outcome", "dependent", "independent",
                      "variable", "factor", "covariate", "confounder", "moderator", "mediator",
                      "interaction", "random", "fixed", "mixed", "linear", "nonlinear",
                      "parametric", "nonparametric", "univariate", "bivariate", "multivariate",
                      "sample", "population", "estimate", "estimator", "bias", "error",
                      "residual", "outlier", "power", "effect", "size", "sensitivity",
                      "specificity"),

    # Demographic/socioeconomic terms
    "demographic" = c("sociodemographic", "demographic", "social", "economic", "education",
                      "income", "status", "cultural", "ethical", "society", "community",
                      "population", "questionnaire", "survey", "interview", "assessment",
                      "scale", "score", "index", "measurement", "evaluation", "analysis",
                      "nationality", "ethnicity", "race", "gender", "sex", "age", "occupation",
                      "employment", "marital", "household", "residence", "urban", "rural",
                      "metropolitan", "suburban", "literacy", "socioeconomic"),

    # General descriptive terms
    "descriptive" = c("high", "low", "normal", "abnormal", "increase", "decrease",
                      "change", "difference", "similar", "different", "significant",
                      "insignificant", "positive", "negative", "higher", "lower",
                      "larger", "smaller", "greater", "lesser", "better", "worse",
                      "improvement", "deterioration", "mild", "moderate", "severe",
                      "acute", "chronic", "transient", "persistent", "intermittent",
                      "continuous", "progressive", "regressive", "stable", "unstable",
                      "regular", "irregular", "frequent", "infrequent", "rare", "common",
                      "typical", "atypical", "classic", "nonspecific", "specific",
                      "general", "special", "primary", "secondary", "tertiary", "initial",
                      "final", "early", "late", "short", "long", "brief", "extended",
                      "rapid", "slow", "fast", "delay", "immediate", "sudden", "gradual",
                      "partial", "complete", "total", "partial", "focal", "diffuse",
                      "localized", "generalized", "unilateral", "bilateral", "central",
                      "peripheral")
  )

  # Check if term matches any non-biomedical terms
  for (category in names(non_biomedical_terms)) {
    if (term_lower %in% non_biomedical_terms[[category]]) {
      return(FALSE)
    }
  }

  # Try to use Python-based biomedical NER
  tryCatch({
    # Import the necessary Python modules
    transformers <- reticulate::import("transformers")
    torch <- reticulate::import("torch")

    # Load BioBERT NER model
    tokenizer <- transformers$AutoTokenizer$from_pretrained("dmis-lab/biobert-base-cased-v1.1")
    model <- transformers$AutoModelForTokenClassification$from_pretrained("dmis-lab/biobert-base-cased-v1.1-ner")

    # Process the term
    encoded_input <- tokenizer(term, return_tensors = "pt")
    outputs <- model$forward(input_ids = encoded_input$input_ids)
    predictions <- torch$argmax(outputs$logits, dim = 2L)

    # Map predicted class indices to label strings via the model's id2label
    # mapping (batch_decode would decode token ids back to text, not labels)
    pred_idx <- as.integer(unlist(predictions$flatten()$tolist()))
    id2label <- model$config$id2label
    predicted_labels <- unlist(lapply(pred_idx, function(j) id2label[[as.character(j)]]))

    # Extended mapping of BioBERT labels to entity types
    biobert_mapping <- list(
      "chemical" = c("B-CHEM", "I-CHEM"),
      "disease" = c("B-DISO", "I-DISO"),
      "gene" = c("B-GENE", "I-GENE"),
      "protein" = c("B-PROT", "I-PROT"),
      "cell" = c("B-CELL", "I-CELL"),
      "species" = c("B-SPEC", "I-SPEC"),
      "pathway" = c("B-PATH", "I-PATH"),
      "drug" = c("B-DRUG", "I-DRUG", "B-CHEM", "I-CHEM"),
      "biological_process" = c("B-PROC", "I-PROC"),
      "tissue" = c("B-TISS", "I-TISS"),
      "cellular_component" = c("B-COMP", "I-COMP"),
      "anatomy" = c("B-ANAT", "I-ANAT"),
      "molecular_function" = c("B-FUNC", "I-FUNC"),
      "organism" = c("B-ORG", "I-ORG", "B-SPEC", "I-SPEC"),
      "symptom" = c("B-SYMPT", "I-SYMPT", "B-DISO", "I-DISO"),
      "diagnostic_procedure" = c("B-PROC", "I-PROC"),
      "therapeutic_procedure" = c("B-PROC", "I-PROC", "B-TREAT", "I-TREAT")
    )

    expected_labels <- biobert_mapping[[claimed_type]]

    # If no mapping exists for this type, fallback to pattern matching
    if (is.null(expected_labels)) {
      return(is_valid_biomedical_entity(term, claimed_type))
    }

    is_valid <- any(predicted_labels %in% expected_labels)

    return(is_valid)
  }, error = function(e) {
    message("BioBERT validation failed: ", e$message)
    # Fall back to basic validation if BioBERT fails
    return(is_valid_biomedical_entity(term, claimed_type))
  })
}

#' Comprehensive entity validation using multiple techniques
#'
#' @param term Character string, the term to validate
#' @param claimed_type Character string, the claimed entity type
#' @param use_nlp Logical, whether to use NLP-based validation
#' @param use_pattern Logical, whether to use pattern-based validation
#' @param use_external_api Logical, whether to query external APIs
#' @return Logical indicating if the term is validated
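#' @examples
#' \dontrun{
#' # Pattern-only check (no NLP stack or network access needed)
#' validate_entity_comprehensive("tyrosine kinase", "protein",
#'                               use_nlp = FALSE, use_external_api = FALSE)
#' }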
validate_entity_comprehensive <- function(term, claimed_type,
                                          use_nlp = TRUE,
                                          use_pattern = TRUE,
                                          use_external_api = FALSE) {

  # Convert type to lowercase for consistent comparison
  if (!is.null(claimed_type)) {
    claimed_type <- tolower(claimed_type)
  }

  # Skip validation for terms too short (e.g., single characters)
  if (is.null(term) || is.na(term) || nchar(term) < 2) {
    return(FALSE)
  }

  # Special case for problematic terms
  problematic_terms <- c(
    "sociodemographic", "demographic", "social", "economic", "education",
    "income", "status", "cultural", "ethical", "society", "community",
    "population", "questionnaire", "survey", "interview", "assessment",
    "scale", "score", "index", "measurement", "evaluation", "analysis",
    "methodology", "approach", "strategy", "procedure", "protocol",
    "high", "low", "normal", "abnormal", "increase", "decrease",
    "change", "difference", "significant", "insignificant", "positive",
    "higher", "lower", "larger", "smaller", "greater", "lesser", "better"
  )

  if (tolower(term) %in% problematic_terms) {
    return(FALSE)
  }

  # Initialize validation results
  results <- logical(0)

  # Pattern-based validation (our original approach)
  if (use_pattern) {
    pattern_result <- is_valid_biomedical_entity(term, claimed_type)
    results <- c(results, pattern_result)
  }

  # NLP-based validation
  if (use_nlp) {
    nlp_result <- tryCatch({
      validate_entity_with_nlp(term, claimed_type)
    }, error = function(e) {
      message("NLP validation failed: ", e$message)
      return(is_valid_biomedical_entity(term, claimed_type))  # Fallback to pattern-based
    })
    results <- c(results, nlp_result)
  }

  # External API validation (e.g., PubChem, UniProt)
  if (use_external_api) {
    api_result <- tryCatch({
      query_external_api(term, claimed_type)
    }, error = function(e) {
      message("API validation failed: ", e$message)
      return(is_valid_biomedical_entity(term, claimed_type))  # Fallback to pattern-based
    })
    results <- c(results, api_result)
  }

  # Special handling for known categories of terms
  term_lower <- tolower(term)

  # Check for statistical or methodological terms that aren't biomedical entities
  stat_method_terms <- c("significant", "analysis", "result", "correlation", "association",
                         "outcome", "variable", "factor", "parameter", "cohort", "group",
                         "control", "case", "study", "research", "trial", "experiment",
                         "observation", "model", "algorithm", "data", "sample", "population")

  if (term_lower %in% stat_method_terms) {
    return(FALSE)
  }

  # If any enabled validation method confirms the term, consider it valid
  if (length(results) > 0) {
    return(any(results))
  }

  # Default to FALSE for terms that couldn't be validated
  return(FALSE)
}

#' Query external biomedical APIs to validate entity types
#'
#' @param term Character string, the term to validate
#' @param claimed_type Character string, the claimed entity type
#' @return Logical indicating if the term was found in the appropriate database
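#' @examples
#' \dontrun{
#' # Requires network access; queries PubChem for a chemical name
#' query_external_api("aspirin", "chemical")
#' }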
query_external_api <- function(term, claimed_type) {
  # Define API endpoints based on entity type
  api_endpoints <- list(
    "chemical" = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/",
    "gene" = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term=",
    "protein" = "https://www.uniprot.org/uniprot/?query=",
    "disease" = "https://www.diseaseontology.org/api/search?q="
  )

  endpoint <- api_endpoints[[claimed_type]]
  if (is.null(endpoint)) {
    return(TRUE)  # No endpoint for this type; accept rather than falsely reject
  }

  # Prepare URL-safe term
  safe_term <- utils::URLencode(term)

  # Query the API
  response <- tryCatch({
    httr::GET(paste0(endpoint, safe_term))
  }, error = function(e) {
    message("API query failed: ", e$message)
    return(NULL)
  })

  # Check if we got a valid response
  if (is.null(response) || httr::status_code(response) != 200) {
    return(FALSE)
  }

  # Different APIs have different response formats
  # This is a simplified check - production code would need more parsing
  content <- httr::content(response, "text")
  return(!grepl("No items found", content) && !grepl("0 results", content))
}
