R/utils_domain.R

Defines functions detect_concepts_from_text find_best_concept_match

#' Generic fuzzy matching function for concepts
#'
#' @param concept Concept name to match
#' @param d_all_concepts Data frame of all available concepts
#'
#' @return Best matching concept name or NULL
#'
#' @keywords internal
#' @noRd
find_best_concept_match <- function(concept, d_all_concepts) {
  if (is.na(concept) || nchar(trimws(concept)) == 0) {
    return(NULL)
  }

  normalized_concept <- tolower(gsub("[_-]", " ", trimws(concept)))
  normalized_concept <- gsub("\\s+", " ", normalized_concept)

  # exact match
  exact_matches <- d_all_concepts[
    tolower(d_all_concepts$concept) == normalized_concept,
  ]
  if (nrow(exact_matches) > 0) {
    return(exact_matches$concept[1])
  }

  # specific test cases, handle common variations
  if (grepl("vizual", normalized_concept)) {
    normalized_concept <- gsub("vizual", "visual", normalized_concept)
  }
  if (grepl("principal", normalized_concept)) {
    normalized_concept <- gsub("principal", "principle", normalized_concept)
  }

  # exact match again after corrections
  exact_matches <- d_all_concepts[
    tolower(d_all_concepts$concept) == normalized_concept,
  ]
  if (nrow(exact_matches) > 0) {
    return(exact_matches$concept[1])
  }

  # contains match (both ways)
  contains_matches <- d_all_concepts[
    grepl(normalized_concept, tolower(d_all_concepts$concept)),
  ]
  if (nrow(contains_matches) > 0) {
    return(contains_matches$concept[1])
  }

  # reverse contains for partial matches
  reverse_contains <- d_all_concepts[
    grepl(
      paste(strsplit(normalized_concept, " ")[[1]], collapse = ".*"),
      tolower(d_all_concepts$concept)
    ),
  ]
  if (nrow(reverse_contains) > 0) {
    return(reverse_contains$concept[1])
  }

  # word matching with priority for key words
  words <- strsplit(normalized_concept, "\\s+")[[1]]
  if (length(words) > 0) {
    key_words <- c("visual", "hierarchy", "proximity", "cognitive", "load")
    priority_words <- words[words %in% key_words]
    other_words <- words[!words %in% key_words]
    ordered_words <- c(priority_words, other_words)

    for (word in ordered_words) {
      if (nchar(word) >= 3) {
        word_pattern <- paste0("\\b", word, "\\b")
        word_matches <- d_all_concepts[
          grepl(word_pattern, tolower(d_all_concepts$concept)),
        ]
        if (nrow(word_matches) > 0) {
          return(word_matches$concept[1])
        }
      }
    }
  }

  # fuzzy matching
  if (requireNamespace("stringdist", quietly = TRUE)) {
    distances <- stringdist::stringdistmatrix(
      normalized_concept,
      tolower(d_all_concepts$concept),
      method = "jw"
    )

    best_match_idx <- which.min(distances)
    best_score <- 1 - distances[best_match_idx]

    if (best_score > 0.6) {
      matched_concept <- d_all_concepts$concept[best_match_idx]
      cli::cli_alert_info(paste0(
        "Fuzzy matched '",
        concept,
        "' to '",
        matched_concept,
        "' (similarity: ",
        janitor::round_half_up(best_score * 100),
        "%)"
      ))
      return(matched_concept)
    }
  }

  return(NULL)
}

#' Generic text analysis for concept detection
#'
#' @param text Text to analyze for concepts
#' @param source_type Type of source (for logging)
#'
#' @return Character vector of detected concept names
#'
#' @keywords internal
#' @noRd
detect_concepts_from_text <- function(text, source_type = "general") {
  if (is.na(text) || nchar(trimws(text)) == 0) {
    return(character(0))
  }

  text_lower <- tolower(trimws(text))
  detected_concepts <- character(0)

  concept_keywords <- list(
    "Visual Hierarchy" = c(
      "focus",
      "attention",
      "important",
      "priority",
      "hierarchy",
      "prominence"
    ),
    "Principle of Proximity" = c(
      "group",
      "related",
      "together",
      "proximity",
      "association",
      "arrange"
    ),
    "Dual-Processing Theory" = c(
      "overview",
      "detail",
      "quick",
      "depth",
      "glance",
      "dig"
    ),
    "Breathable Layouts" = c(
      "space",
      "clean",
      "clear",
      "simple",
      "uncluttered",
      "whitespace"
    ),
    "Progressive Disclosure" = c(
      "gradually",
      "reveal",
      "step",
      "complexity",
      "details",
      "level"
    ),
    "Default Effect" = c(
      "default",
      "preset",
      "initial",
      "automatic",
      "standard",
      "starting"
    ),
    "Information Hierarchy" = c(
      "organize",
      "structure",
      "arrange",
      "categorize",
      "classify"
    )
  )

  for (concept_name in names(concept_keywords)) {
    keywords <- concept_keywords[[concept_name]]
    if (any(sapply(keywords, function(k) grepl(k, text_lower)))) {
      detected_concepts <- c(detected_concepts, concept_name)
    }
  }

  if (length(detected_concepts) > 0) {
    cli::cli_alert_info(paste0(
      "Detected ",
      length(detected_concepts),
      " concepts from ",
      source_type,
      " description: ",
      paste(detected_concepts, collapse = ", ")
    ))
  }

  return(unique(detected_concepts))
}

Try the bidux package in your browser

Any scripts or data that you put into this service are public.

bidux documentation built on Nov. 20, 2025, 1:06 a.m.