R/find_typos.R

# # find typos
#
# # idea from dk:
# # string distance calculation that weights letters that are close in qwerty-space
#
# library(tidyverse)
# library(messy.cats)
#
# data("clean_caterpillars")
# data("messy_caterpillars")
#
#
#
# # testing dataset ----
# rep(clean_caterpillars$species,clean_caterpillars$count) -> clean_caterpillars_rep
#
# append(clean_caterpillars_rep,messy_caterpillars$CaterpillarSpecies) -> typo_caterpillars
#
# typo_df <- as.data.frame(typo_caterpillars)
#
# typo_df %>% group_by(typo_caterpillars) %>% count() %>%
#   arrange(desc(n)) %>% select(typo_caterpillars) %>% unlist(use.names = F) -> typo_freqs
#
#
#
# typo_df %>% group_by(typo_caterpillars) %>% count() -> counts
#
# mispellings <- counts %>% filter(n <= quantile(counts$n,0.25))
# references <- counts %>% filter(n > quantile(counts$n,0.25))
#
# cat_match(mispellings[["typo_caterpillars"]],references[["typo_caterpillars"]])
#
#
#
# # hand edited data
# typos = read_csv("data/typos.csv")
#
# # main code ----
# find_typos1.0 <- function(df,messy_column){
#
#   eval(parse(text = paste0("df %>% group_by(",messy_column,") %>% count() -> x")))
#   typos <- x %>% filter(n <= quantile(x$n,0.25))
#   correct <- x %>% filter(n > quantile(x$n,0.25))
#   return(cat_match(typos[[messy_column]],correct[[messy_column]]))
#
# }
#
# find_typos1.0(df = typo_df,messy_column = "typo_caterpillars")
#
# # want to transitively group any pairs with a string distance less than 0.2
# # hopefully create clusters of mispellings
#
#
# find_typos <- function(messy_column){
#   u_m_column = unique(messy_column)
#   as.data.frame(stringdistmatrix(u_m_column,u_m_column,method="jw")) -> x
# }
#
#
# messy_column = typos$species
# u_m_column = unique(messy_column)
# as.data.frame(stringdistmatrix(u_m_column,u_m_column,method="jw")) -> x
#
# row.names(x) = u_m_column
# colnames(x) =  u_m_column
#
# which(x<0.2,arr.ind=TRUE,useNames = F) -> t
#
# f <- function(a,b){
#   stringdist(a,b,method="jw")
# }
#
# l = list(c(1,2),c(3,4))
#
# x[l[[1]][1],l[[1]][2]]
#
#
# # how to decide which is the correctly spelled word in the group:
#
# # thing with lowest average distance between whole cluster?
#
# # make/let user choose
#
#
# # test ----
# find_typos(messy_column = typo_df$typo_caterpillars) -> df
#
#
# messy_column = typo_df$typo_caterpillars
# stringdistmatrix(messy_column,messy_column,method="jw")
#
#
# ### Brainstorming ----
#
# stringdist("caret","carrot",method = "dl")
# stringdist("library","lirebary",method = "dl")
# library
#
#
# # Threshold----
# # how to determine what is a typo?
# stringdistmatrix("Pyreferra hesperidago",typos$species,method="jw") -> x
# # so far most things in jw with a dist < 0.15 seem to be typos
#
# # stolen from https://www.joyofdata.de/blog/comparison-of-string-distance-algorithms/
# b <- c(
#   "Cosmo Kramer",
#   "Kosmo Kramer",
#   "Comso Kramer",
#   "Csmo Kramer",
#   "Cosmo X. Kramer",
#   "Kramer, Cosmo",
#   "Jerry Seinfeld",
#   " CKaemmoorrs",
#   "Cosmer Kramo",
#   "Kosmoo Karme",
#   "George Costanza",
#   "Elaine Benes",
#   "Dr. Van Nostren",
#   "remarK omsoC",
#   "Mr. Kramer",
#   "Sir Cosmo Kramer",
#   "C.o.s.m.o. .K.r.a.m.e.r",
#   "CsoKae",
#   "Coso Kraer"
# )
#
# stringdistmatrix("Cosmo Kramer",b,method="jw") -> x2
# # based on this example it looks like typos could even be restricted to 0.20 for now
# # going to try with some shorter words and see how it works out
#
# # poggers
# d <- c(
#   "duck",
#   "fuck",
#   "luck",
#   "dog",
#   "fog",
#   "sog",
#   "sag",
#   "pog",
#   "bat",
#   "cat",
#   "sat",
#   "pat",
#   "batt"
# )
#
# df <- as.data.frame(d)
# df[,2] = df[,1]
#
# stringdistmatrix(df[,2],df[,1],method="jw") -> x3
#
# row.names(x3) = df[,2]
# colnames(x3) = df[,1]
# # the less letters in the strings the worse jw is
#
# stringdistmatrix(df[,2],df[,1],method="cosine") -> x4
#
# row.names(x4) = df[,2]
# colnames(x4) = df[,1]
#
# # probably for any string distance calc the less characters the less precise it will be
# # lets find a slightly shorter one word example to use
#
#
#
# cnts <- c(
#   "Litchfield",
#   "Hartford",
#   "Tolland",
#   "Windham",
#   "Fairfield",
#   "New Haven",
#   "Middlesex",
#   "New London"
# )
#
# cnts_typo <- c(
#   "Litchfeld",
#   "Hartferd",
#   "Tolland",
#   "Wind Ham",
#   "Fairfield",
#   "New Heven",
#   "Midlsex",
#   "NwLndn"
# )
#
# ct_counties <- append(cnts,cnts_typo)
# ct_counties <- unique(ct_counties)
# ct <- as.data.frame(ct_counties)
#
# stringdistmatrix(ct$ct_counties,ct$ct_counties,method="jw") -> x5
# rownames(x5) = ct$ct_counties
# colnames(x5) = ct$ct_counties
# # for jw with words ~6/7 or more characters, 0.15 will probably be a safe cutoff,
# # 0.2 to be conservative
#
# # need to test on lists with words that are close but not typos but typos would
# # make them closer
#
# l <- c(
#
#
# )
#
#
#
# # Choosing from the Clusters ----
# # how to decide which is the correctly spelled word in the group
# # thing with lowest average distance between whole cluster?
#
# ap = u_m_column[3:4]
# # in a case like this with only 2 strings in a cluster I can't think of a way
# # to decide which is right without looking at like trends in how words are written
# # e.g. soft and sofd, f and d aren't normally next to each other in english
#
# ph = u_m_column[c(9:11,13,14,29,22,25,26)] %>% na.omit()
# stringdistmatrix(ph,ph,method="jw") -> ph.x
# rownames(ph.x) = ph
# colnames(ph.x) = ph
#
# rowSums(ph.x)
# # in this example just summing the rows makes the correct spelling the lowest
#
#
# nr = u_m_column[15:17]
# stringdistmatrix(nr,nr,method="jw") -> nr.x
# rownames(nr.x) = nr
# colnames(nr.x) = nr
#
# rowSums(nr.x)
# # in this example rowsumming doesnt work
#
# # if there are a lot of similar mistakes they will have lower stringdist than the
# # correct spelling
#
# # have user designate the correct spelling in a cluster / if there was a problem
# # with the clustering
#
#

Try the messy.cats package in your browser

Any scripts or data that you put into this service are public.

messy.cats documentation built on Nov. 30, 2022, 5:08 p.m.