R/corrupt.R

Defines functions garble_sentence .corrupt_omit .corrupt_swap .corrupt_truncate .corrupt_double .corrupt_phonetic .corrupt_keyboard corrupt_text

Documented in corrupt_text garble_sentence

# R/corrupt.R
# Text Corruption Simulator functions

#' Apply a single corruption to a piece of text
#'
#' Resolves `type` with [match.arg()] (so unambiguous prefixes are accepted),
#' draws one of the six concrete corruption kinds when `type` is `"random"`,
#' and hands `text` and `position` to the matching internal helper.
#'
#' @param text Character. Text to corrupt
#' @param type Character. Corruption to apply: "keyboard", "phonetic",
#'   "double", "truncate", "swap", "omit", or "random" to draw one of the
#'   other six with equal probability
#' @param position Integer. Position forwarded to the selected helper
#'   (NULL = the helper picks one at random)
#' @return Character. Corrupted text
#' @export
#' @examples
#' set.seed(42)
#' corrupt_text("hello", type = "keyboard")
#' corrupt_text("coverage", type = "truncate", position = 3)
corrupt_text <- function(text, type = "random", position = NULL) {
  concrete_types <- c(
    "keyboard", "phonetic", "double", "truncate", "swap", "omit"
  )
  type <- match.arg(type, c(concrete_types, "random"))

  # "random" is only a placeholder: replace it with one concrete kind.
  if (identical(type, "random")) {
    type <- sample(concrete_types, 1)
  }

  # Look up the helper first, then call it with the caller's arguments.
  corrupter <- switch(type,
    keyboard = .corrupt_keyboard,
    phonetic = .corrupt_phonetic,
    double = .corrupt_double,
    truncate = .corrupt_truncate,
    swap = .corrupt_swap,
    omit = .corrupt_omit
  )
  corrupter(text, position)
}

# Internal corruption functions

# Replace one space-separated word of `text` with a keyboard typo.
#
# text:     character string to corrupt.
# position: 1-based index of the word to corrupt, or NULL to pick a word at
#           random. Values outside 1..number of words are clamped into range.
#
# Returns the words re-joined with single spaces, with the chosen word
# replaced by the result of adjacent_key_typo(). Text that splits into no
# words (e.g. "") is returned unchanged.
.corrupt_keyboard <- function(text, position) {
  words <- strsplit(text, " ")[[1]]
  # Nothing to corrupt; also keeps sample.int() away from an empty population.
  if (length(words) == 0) return(text)

  if (is.null(position)) position <- sample.int(length(words), 1)
  # Clamp on both sides: a 0 or negative index would otherwise select nothing
  # or act as an exclusion index.
  position <- max(1, min(position, length(words)))

  words[position] <- adjacent_key_typo(words[position])
  paste(words, collapse = " ")
}

# Swap one consonant of `text` for its entry in `consonant_pairs`.
#
# text:     character string to corrupt.
# position: which of the matching consonants to replace (1 = first match,
#           2 = second, ...), or NULL to pick one at random. Values outside
#           the available range are clamped.
#
# The text is lower-cased before matching, so the result is lower-case
# whenever at least one listed consonant is present. If none is present the
# original `text` is returned untouched.
# `consonant_pairs` is defined outside this block; presumably it maps a
# consonant to its phonetic neighbour -- confirm against its definition.
.corrupt_phonetic <- function(text, position) {
  consonants <- c("b", "d", "f", "g", "k", "m", "n", "p", "s", "t", "v", "z")
  chars <- strsplit(tolower(text), "")[[1]]
  consonant_pos <- which(chars %in% consonants)
  n_candidates <- length(consonant_pos)

  if (n_candidates == 0) return(text)

  # Draw an index INTO consonant_pos rather than calling sample() on it:
  # sample(k, 1) with a single number k samples from 1:k, which would pick
  # arbitrary (possibly non-consonant) characters when there is one match.
  pick <- if (is.null(position)) {
    sample.int(n_candidates, 1)
  } else {
    max(1, min(position, n_candidates))
  }
  target_idx <- consonant_pos[pick]

  target <- chars[target_idx]
  # Consonants without an entry in consonant_pairs are left as they are.
  if (target %in% names(consonant_pairs)) {
    chars[target_idx] <- consonant_pairs[[target]]
  }

  paste(chars, collapse = "")
}

# Duplicate one character of `text` (e.g. "abc" -> "abbc").
#
# text:     character string to corrupt.
# position: 1-based index of the character to double, or NULL to pick one at
#           random. Values outside 1..number of characters are clamped.
#
# Returns the text with the chosen character repeated once. Empty text is
# returned unchanged.
.corrupt_double <- function(text, position) {
  chars <- strsplit(text, "")[[1]]
  n_chars <- length(chars)
  # Nothing to double; also keeps sample.int() away from an empty population.
  if (n_chars == 0) return(text)

  if (is.null(position)) position <- sample.int(n_chars, 1)
  # Clamp on both sides: a negative index would otherwise reach append() and
  # fail on mixed-sign subscripts.
  position <- max(1, min(position, n_chars))

  # Insert a copy of the character directly after itself.
  chars <- append(chars, chars[position], after = position)
  paste(chars, collapse = "")
}

# Cut `text` off after a number of space-separated words.
#
# text:     character string to corrupt.
# position: number of leading words to keep, or NULL to choose at random.
#           The value is clamped to 1..(number of words - 1), so at least one
#           word is always dropped and at least one is always kept.
#
# Returns the kept words joined with single spaces. Text with fewer than two
# words is returned unchanged.
# NOTE(review): the corrupt_text() example passes the single word "coverage"
# with position = 3, which this word-level implementation returns unchanged --
# confirm whether character-level truncation was intended for single words.
.corrupt_truncate <- function(text, position) {
  words <- strsplit(text, " ")[[1]]
  n_words <- length(words)
  if (n_words < 2) return(text)

  max_keep <- n_words - 1
  if (is.null(position)) {
    # Random choice keeps between 2 and n - 1 words (1 when there are only
    # two words). Built from sample.int() because sample(2:(n - 1), 1)
    # degenerates for n == 3 into sample(2, 1), i.e. a draw from 1:2.
    min_keep <- min(2, max_keep)
    position <- min_keep + sample.int(max_keep - min_keep + 1, 1) - 1
  }
  position <- max(1, min(position, max_keep))

  # position >= 1 here, so 1:position cannot count downwards.
  paste(words[1:position], collapse = " ")
}

# Transpose two neighbouring characters of `text` (e.g. "abcd" -> "acbd").
#
# text:     character string to corrupt.
# position: 1-based index of the left character of the pair to swap, or NULL
#           to pick a pair at random. Values outside 1..(number of
#           characters - 1) are clamped.
#
# Returns the text with characters `position` and `position + 1` exchanged.
# Text shorter than two characters is returned unchanged.
.corrupt_swap <- function(text, position) {
  chars <- strsplit(text, "")[[1]]
  n_chars <- length(chars)
  if (n_chars < 2) return(text)

  if (is.null(position)) position <- sample.int(n_chars - 1, 1)
  # Clamp on both sides: position <= 0 would otherwise fail with
  # "replacement has length zero".
  position <- max(1, min(position, n_chars - 1))

  # Exchange the adjacent pair in one assignment.
  pair <- c(position, position + 1)
  chars[pair] <- chars[rev(pair)]

  paste(chars, collapse = "")
}

# Drop a single character from `text` (e.g. "abc" -> "ac").
#
# text:     character string to corrupt.
# position: 1-based index of the character to remove, or NULL to pick one at
#           random. Values outside 1..number of characters are clamped.
#
# Returns the text without the chosen character. Text shorter than two
# characters is returned unchanged, so the result is never empty.
.corrupt_omit <- function(text, position) {
  chars <- strsplit(text, "")[[1]]
  n_chars <- length(chars)
  if (n_chars < 2) return(text)

  if (is.null(position)) position <- sample.int(n_chars, 1)
  # Clamp on both sides: chars[-0] selects nothing (everything would be
  # dropped) and a negative position would flip into a positive index and
  # keep only that one character.
  position <- max(1, min(position, n_chars))

  paste(chars[-position], collapse = "")
}

#' Garble a sentence with random corruptions
#'
#' Splits `sentence` on single spaces, picks a random subset of the non-empty
#' words and replaces each of them with `corrupt_text(word, type = "random")`.
#' The number of corrupted words is
#' `ceiling(number of non-empty words * corruption_rate)`, so any rate above
#' zero corrupts at least one word.
#'
#' @param sentence Character. Sentence to garble
#' @param corruption_rate Numeric. Fraction of words to corrupt (0-1). Values
#'   outside that range, `NA`, or non-scalar input raise an error.
#' @return Character. Garbled sentence; `sentence` itself is returned
#'   unchanged when no word is selected for corruption.
#' @export
#' @examples
#' set.seed(42)
#' garble_sentence("This is a test", corruption_rate = 0.5)
garble_sentence <- function(sentence, corruption_rate = 0.3) {
  # Validate up front: an out-of-range rate would otherwise surface as an
  # unrelated sample() error about the population size.
  rate_ok <- is.numeric(corruption_rate) &&
    length(corruption_rate) == 1 &&
    !is.na(corruption_rate) &&
    corruption_rate >= 0 &&
    corruption_rate <= 1
  if (!rate_ok) {
    stop("`corruption_rate` must be a single number between 0 and 1",
         call. = FALSE)
  }

  words <- strsplit(sentence, " ")[[1]]
  # Consecutive spaces produce empty tokens; they hold nothing to corrupt,
  # so they are never selected.
  candidates <- which(nzchar(words))
  n_corrupt <- ceiling(length(candidates) * corruption_rate)

  if (n_corrupt == 0) return(sentence)

  # Index into `candidates` so a single candidate is not misread by sample()
  # as the upper bound of 1:n.
  positions <- candidates[sample.int(length(candidates), n_corrupt)]

  for (pos in positions) {
    words[pos] <- corrupt_text(words[pos], type = "random")
  }

  paste(words, collapse = " ")
}

Try the covfefe package in your browser

Any scripts or data that you put into this service are public.

covfefe documentation built on Jan. 26, 2026, 5:08 p.m.