inst/scripts/rm_non_ascii_en_lemmas.R

library(tools)
library(stringi)
library(dplyr)
library(readxl)
library(purrr)
library(tidyr)
library(stringdist)

en_lemmas <- read_excel("inst/scripts/en_lemmas.xlsx")

en_lemmas <- en_lemmas %>%
  mutate(
    ascii1 = stringi::stri_enc_mark(word),
    ascii2 = stringi::stri_enc_mark(lemma),
    ascii3 = stringi::stri_enc_mark(term)
  ) %>%
  filter(ascii1 == "ASCII", ascii2 == "ASCII", ascii3 == "ASCII") %>%
  select(word, lemma, term) %>%
  mutate(word = iconv(word, to="ASCII//TRANSLIT"),
         lemma = iconv(lemma, to="ASCII//TRANSLIT"),
         term = iconv(term, to="ASCII//TRANSLIT")
  )

add_term <- tibble(
  word = unique(en_lemmas$term)
) %>%
  mutate(lemma = word, term = word)

en_lemmas <- en_lemmas %>%
  bind_rows(add_term) %>%
  mutate(
    difference = stringdist(word, term),
    characters = nchar(term)
  ) %>%
  group_by(word) %>%
  nest() %>%
  mutate(data = map(data, filter, difference == min(difference))) %>%
  mutate(data = map(data, filter, characters == max(characters))) %>%
  mutate(data = map(data, sample_n, 1)) %>%
  unnest() %>%
  ungroup() %>%
  select(-characters) %>%
  unique()


save(en_lemmas, file = "data/en_lemmas.RData")
resaveRdaFiles("data/en_lemmas.RData")


library(dplyr)

toascii <- data.frame(
  mapL = c("[á]","[é]","[í]","[ó]","[ú]","[Á]","[É]","[Í]","[Ó]","[Ú]","[ñ]","[Ñ]","[ü]","[Ü]","[ç]","[ä]","[Ä]","[ë]","[Ë]","[ï]","[Ï]","[ö]","[Ö]","[ü]","[Ü]","[ÿ]","[Ÿ]","[â]","[Â]","[ê]","[Ê]","[î]","[Î]","[ô]","[Ô]","[û]","[Û]","[à]","[À]","[è]","[È]","[ì]","[Ì]","[ò]","[Ò]","[ù]","[Ù]"),
  mapA = c("a","e","i","o","u","A","E","I","O","U","n","N","u","U","c","a","A","e","E","i","I","o","O","u","U","y","Y","a","A","e","E","i","I","o","O","u","U","a","A","e","E","i","I","o","O","u","U")
)

toascii <- mutate(toascii, mapL = as.character(mapL), mapA = as.character(mapA))


save(toascii, file = "data/toascii.RData")
NicolasJBM/lexana documentation built on July 3, 2019, 10 a.m.