R/spellcheckr.R
In spellcheckr: Correct the Spelling of a Given Word in the English Language

Documented in correct splits

#' dict
#'
#' A dataset containing the valid english dictionary words originally obtained from a work of Jane Austen.
#'
#' \itemize{
#'   \item . The actual word
#'   \item N The number of occurences of the word in the document.
#' }
#'
#' @docType data
#' @keywords datasets
#' @name dict
#' @usage data(dict)
#' @format A data frame with 245462 rows and 2 columns



# Splits: "Create all combinations of splits for a given word."
splits <- function(wd=NULL){
  if(is.null(wd) | (!is.character(wd))){
    stop("The input should be a character string")
  }
  w <- unlist(str_split(wd, ""))
  len <- length(w)
  out <- vector(mode="list", length=len+1)
  out[[1]] <- c("", paste0(w, collapse=""))  # first pair
  out[[len+1]] <- c(paste0(w, collapse=""), "")  # last pair
  if(length(w) > 1){
    for(i in 1:(len-1)){
      out[[i+1]] <- c(paste0(w[1:i], collapse=""), paste0(w[(i+1):len], collapse=""))
    }
  }
  return(out)
}

# deletes: generate all the possible candidates from a word where 1 character is deleted.
deletes <- function(wd=NULL){
  if(is.null(wd) | (!is.character(wd))){
    stop("The input should be a character string")
  }
  pairs <- splits(wd)
  lapply(pairs, function(x){paste0(c(x[1],
                                     str_split(x[2], "") %>% unlist %>% `[`(-1)), collapse="")
  })
}

# transposes: all possible combination of transposes
# tp: Used inside transposes.
tp <- function(x){
  x2 <- str_split(x[2], "") %>% unlist
  len <- length(x2);
  if(len > 2){
    paste0(c(x[1], x2 %>% `[`(c(2, 1, 3:len))), collapse="")
  }else if(len==2){
    paste0(c(x[1], x2 %>% `[`(c(2, 1))), collapse="")
  }
}
transposes <- function(wd=NULL){
  if(is.null(wd) | (!is.character(wd))){
    stop("The input should be a character string")
  }
  pairs <- splits(wd)
  lapply(pairs[1:(length(pairs)-2)], tp)
}
# transposes("prabhu")


# replaces: all combinations of 1 replacement from alphabets a to z.
rp <- function(x){
  x2 <- str_split(x[2], "") %>% unlist
  len <- length(x2);
  if(len>1){
    p1 <- paste0(x2 %>% `[`(c(2:len)), collapse="")
    p2 <- paste0(letters, p1, sep="")
    return(paste(x[1], p2, sep=""))
  }else{  # when the second part of the pair has only 1 character.
    return(paste(x[1], letters, sep=""))
  }
}

replaces <- function(wd=NULL){
  if(is.null(wd) | (!is.character(wd))){
    stop("The input should be a character string")
  }

  pairs <- splits(wd)
  lapply(pairs[-length(pairs)], rp)
}
# replaces("prabhu")


# inserts: all combinations of 1 insert of alphabets a-z
ins <- function(x){
  x2 <- str_split(x[2], "") %>% unlist
  len <- length(x2);
  p1 <- paste0(x2, collapse="")
  p2 <- paste0(letters, p1, sep="")
  return(paste(x[1], p2, sep=""))
}

inserts <- function(wd=NULL){
  if(is.null(wd) | (!is.character(wd))){
    stop("The input should be a character string")
  }
  pairs <- splits(wd)
  lapply(pairs, ins)
}
# inserts("prabhu")

# edit0: Words that are 0 edit away
edit0 <- function(wd=NULL){
  return(wd)
}

# edit1: Words that are 1 edit away
edit1 <- function(wd=NULL){
  if(is.null(wd) | (!is.character(wd))){
    stop("The input should be a character string")
  }

  pairs <- splits(wd)
  deletes <- deletes(wd)  # all possible 1 char deletes
  transposes <- transposes(wd)  # all possible transposes
  replaces <- replaces(wd)  # all possible replaces
  inserts <- inserts(wd)  # all possible inserts.
  return(list(deletes, transposes, replaces, inserts) %>% unlist %>% unique)
}

# edit2: Words that are 2 edits away
edit2 <- function(wd=NULL){
  out <- character()
  edits1 <- edit1(wd)
  for (e in edits1){
    out <- c(out, edit1(e))
  }
  return(unique(out))
}

# edit3 : words that are 3 edits away. Time consuming.
edit3 <- function(wd=NULL){
  edits3 <- character()
  edits2 <- character()
  edits1 <- edit1(wd)
  for (e1 in edits1){
    edits2 <- c(edits2, edit1(e1))
  }

  for (e2 in edits2){
    edits3 <- c(edits3, edit1(e2))
  }
  return(edits3)
}




## Spell correct
#' @title correct
#' @description Correct the spelling of a given word in the english language.
#' @details This is based on Peter Norvig's spell correct algorithm \url{http://norvig.com/spell.py}.
#' But this one is modified to handle upto three edits.
#' @author Selva Prabhakaran \email{selva86@@gmail.com}
#' @export correct
#' @param wd Character. The word to be spell corrected.
#' @param maxedit Integer. The maximum number of edits allowed to reach the correct word. Max allowed is 3.
#' @param dictionary Character Vector of eligible words to be considered. Repeated words will get more weightage.
#' @return The corrected word.
#' @examples
#' data(dict)
#' correct("scaret")
#' correct("beliebe")

correct <- function(wd=NULL, maxedit=2, dictionary=NULL){
  if(!is.null(dictionary)){
    dict <- as.data.table(table(unlist(grep('[a-z]+', tolower(dictionary), value = T))))
    setnames(dict, c(".", "N"))
    setkey(dict, ".")
  }

  known <- function(wds){
    wds[wds %in% dict$.]
  }

  k <- known(edit0(wd))
  if(NROW(k) == 0){
    k <- known(edit1(wd))
  }

  if(NROW(k) == 0){
    k <- known(edit2(wd))
  }

  if(maxedit>2){
    if(NROW(k) == 0){
      k <- known(edit3(wd))
    }
  }

  if(NROW(k) == 0){
    return(wd)
  }
  j <- which.max(unlist(dict[k, c(2), with=F]))
  return(k[j])
}

Any scripts or data that you put into this service are public.

spellcheckr documentation built on May 2, 2019, 4:18 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

spellcheckr
Correct the Spelling of a Given Word in the English Language

R/spellcheckr.R
In spellcheckr: Correct the Spelling of a Given Word in the English Language

Defines functions splits deletes tp transposes rp replaces ins inserts edit0 edit1 edit2 edit3 correct

Documented in correct splits

Try the spellcheckr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

spellcheckr Correct the Spelling of a Given Word in the English Language

R/spellcheckr.R In spellcheckr: Correct the Spelling of a Given Word in the English Language

Defines functions splits deletes tp transposes rp replaces ins inserts edit0 edit1 edit2 edit3 correct

Documented in correct splits

Try the spellcheckr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

spellcheckr
Correct the Spelling of a Given Word in the English Language

R/spellcheckr.R
In spellcheckr: Correct the Spelling of a Given Word in the English Language