R/detect.R

Defines functions .check_keyboard_plausibility analyze_gaffe .edit_distance suggest_corrections detect_typo_type

Documented in analyze_gaffe detect_typo_type suggest_corrections

# R/detect.R
# Detection and Analysis Tools functions

#' Detect likely typo type
#'
#' @param typo Character. The typo'd word
#' @param intended Character. The intended word (if known)
#' @return List with possible_types and confidence scores
#' @export
#' @examples
#' detect_typo_type("tesy", "test")
detect_typo_type <- function(typo, intended = NULL) {
  types <- character(0)
  scores <- numeric(0)

  if (is.null(intended)) {
    # Can only do basic analysis
    return(list(possible_types = "unknown", confidence = 0))
  }

  typo_chars <- strsplit(tolower(typo), "")[[1]]
  intended_chars <- strsplit(tolower(intended), "")[[1]]

  len_diff <- length(typo_chars) - length(intended_chars)

  # Check for double letter
  if (len_diff == 1) {
    for (i in seq_len(length(typo_chars) - 1)) {
      if (typo_chars[i] == typo_chars[i + 1]) {
        # Found doubled char
        types <- c(types, "double")
        scores <- c(scores, 0.8)
        break
      }
    }
  }

  # Check for omission
  if (len_diff == -1) {
    types <- c(types, "omission")
    scores <- c(scores, 0.8)
  }

  # Check for keyboard adjacency
  if (len_diff == 0) {
    for (i in seq_along(typo_chars)) {
      if (typo_chars[i] != intended_chars[i]) {
        if (is_keyboard_adjacent(typo_chars[i], intended_chars[i])) {
          types <- c(types, "keyboard")
          scores <- c(scores, 0.9)
          break
        }
      }
    }
  }

  # Check for phonetic similarity
  if (len_diff == 0) {
    for (i in seq_along(typo_chars)) {
      if (typo_chars[i] != intended_chars[i]) {
        if (typo_chars[i] %in% names(consonant_pairs) &&
            consonant_pairs[[typo_chars[i]]] == intended_chars[i]) {
          types <- c(types, "phonetic")
          scores <- c(scores, 0.85)
          break
        }
      }
    }
  }

  # Check for swap
  if (len_diff == 0 && length(typo_chars) >= 2) {
    for (i in seq_len(length(typo_chars) - 1)) {
      swapped <- typo_chars
      tmp <- swapped[i]
      swapped[i] <- swapped[i + 1]
      swapped[i + 1] <- tmp
      if (identical(swapped, intended_chars)) {
        types <- c(types, "swap")
        scores <- c(scores, 0.9)
        break
      }
    }
  }

  if (length(types) == 0) {
    types <- "unknown"
    scores <- 0
  }

  list(possible_types = types, confidence = scores)
}

#' Suggest corrections for a typo
#'
#' @param typo Character. The typo'd word
#' @param dictionary Character vector. Optional custom dictionary
#' @return Character vector of suggested corrections
#' @export
#' @examples
#' suggest_corrections("tesy")
suggest_corrections <- function(typo, dictionary = NULL) {
  if (is.null(dictionary)) {
    # Use a minimal built-in dictionary
    dictionary <- c("test", "testing", "tested", "text", "best", "rest",
                    "the", "this", "that", "them", "then", "there",
                    "coverage", "president", "political", "government")
  }

  # Calculate edit distance to each word
  distances <- vapply(dictionary, function(word) {
    .edit_distance(tolower(typo), tolower(word))
  }, numeric(1))

  # Return words within distance 2
  close_words <- dictionary[distances <= 2]

  if (length(close_words) == 0) {
    return(character(0))
  }

  # Sort by distance
  close_words[order(distances[distances <= 2])]
}

# Internal: simple edit distance (Levenshtein)
.edit_distance <- function(s1, s2) {
  n1 <- nchar(s1)
  n2 <- nchar(s2)

  if (n1 == 0) return(n2)
  if (n2 == 0) return(n1)

  # Initialize matrix
  d <- matrix(0, nrow = n1 + 1, ncol = n2 + 1)
  d[, 1] <- 0:n1
  d[1, ] <- 0:n2

  chars1 <- strsplit(s1, "")[[1]]
  chars2 <- strsplit(s2, "")[[1]]

  for (i in seq_len(n1)) {
    for (j in seq_len(n2)) {
      cost <- if (chars1[i] == chars2[j]) 0 else 1
      d[i + 1, j + 1] <- min(
        d[i, j + 1] + 1,      # deletion
        d[i + 1, j] + 1,      # insertion
        d[i, j] + cost        # substitution
      )
    }
  }

  d[n1 + 1, n2 + 1]
}

#' Analyze a gaffe comprehensively
#'
#' @param gaffe Character. The typo/gaffe
#' @param intended Character. The intended word
#' @return List with comprehensive analysis
#' @export
#' @examples
#' analyze_gaffe("covfefe", "coverage")
analyze_gaffe <- function(gaffe, intended) {
  typo_analysis <- detect_typo_type(gaffe, intended)

  list(
    gaffe = gaffe,
    intended = intended,
    distance = .edit_distance(tolower(gaffe), tolower(intended)),
    typo_types = typo_analysis$possible_types,
    confidence = typo_analysis$confidence,
    phonetic_similarity = 1 - phonetic_distance(gaffe, intended) / 4,
    keyboard_plausible = .check_keyboard_plausibility(gaffe, intended)
  )
}

# Internal: check if typo is keyboard-plausible
.check_keyboard_plausibility <- function(typo, intended) {
  typo_chars <- strsplit(tolower(typo), "")[[1]]
  intended_chars <- strsplit(tolower(intended), "")[[1]]

  if (length(typo_chars) != length(intended_chars)) {
    return(FALSE)
  }

  different_positions <- which(typo_chars != intended_chars)

  if (length(different_positions) == 0) return(TRUE)

  all(vapply(different_positions, function(i) {
    is_keyboard_adjacent(typo_chars[i], intended_chars[i])
  }, logical(1)))
}

Try the covfefe package in your browser

Any scripts or data that you put into this service are public.

covfefe documentation built on Jan. 26, 2026, 5:08 p.m.