# R/cleanupText.R

#' Clean up common encoding issues in text
#'
#' Coerces the input to character and then, in this order: applies a table of
#' literal character substitutions, strips a few unwanted code points, adds
#' the accent to some unaccented Spanish verb forms, removes non-printable
#' characters, collapses runs of spaces and lower-cases the result.
#'
#' @param characterVector Character vector or string you want to clean up.
#'   Anything else is coerced with `as.character()`.
#'
#' @return The cleaned, lower-cased character vector.
#' @export
#' @import stringi
#' @import dplyr
cleanupText <- function(characterVector) {
  text <- as.character(characterVector)

  # Literal substitutions, one pattern/replacement pair per row. They are
  # applied one after another in row order, so earlier rows can affect what
  # later rows see.
  # NOTE(review): several rows read as identical pattern/replacement pairs
  # here, and the last pattern is a bare "Ã". The mis-encoded literals may have
  # been altered when this file was saved or copied -- verify the bytes
  # against the upstream file.
  encoding_fixes <- matrix(
    c(
      "â", "",
      "á", "á",
      "é", "é",
      "ó", "ó",
      "ú", "ú",
      "ñ", "ñ",
      "ã", "ã",
      "ü", "ü",
      "ö", "ö",
      "ä", "ä",
      "Ã¥", "å",
      "ø", "ø",
      "î", "î",
      "â", "â",
      "ê", "ê",
      "è", "è",
      "ç", "ç",
      "Ã", "í"
    ),
    ncol = 2,
    byrow = TRUE
  )
  # vectorize_all = FALSE applies every pair, in order, to every element of
  # `text` instead of recycling patterns against elements.
  text <- stringi::stri_replace_all_fixed(
    text,
    pattern = encoding_fixes[, 1],
    replacement = encoding_fixes[, 2],
    vectorize_all = FALSE
  )

  # Drop middle dot (U+00B7), two private-use code points (U+F0B7, U+F0A0)
  # and the soft hyphen (U+00AD). No "|" inside the brackets: in a character
  # class it is a literal and would delete pipe characters from the text.
  text <- stringi::stri_replace_all_regex(
    text,
    "[\\u00B7\\uf0b7\\uf0a0\\u00AD]",
    ""
  )

  # Add the missing accent to some Spanish verb forms. This runs before
  # lower-casing, so only lower-case occurrences are matched. The shorter
  # forms already rewrite the longer ones ("habian" contains "habia").
  accent_fixes <- matrix(
    c(
      "habia", "había",
      "habian", "habían",
      "habra", "habrá",
      "habran", "habrán"
    ),
    ncol = 2,
    byrow = TRUE
  )
  text <- stringi::stri_replace_all_fixed(
    text,
    pattern = accent_fixes[, 1],
    replacement = accent_fixes[, 2],
    vectorize_all = FALSE
  )

  # Remove anything non-printable, then squeeze runs of two or more spaces
  # down to a single space.
  text <- stringi::stri_replace_all_regex(text, "[^[:print:]]", "")
  text <- stringi::stri_replace_all_regex(text, "  +", " ")

  stringi::stri_trans_tolower(text)
}
# jeroenclaes/tweetCorp documentation built on May 27, 2019, 4:50 a.m.