# R/remove.nonASCII.R

#' @title Remove non ASCII characters
#'
#' @description A function to remove non ASCII characters from a vector. ASCII
#'   has 128 characters (2^7 combinations), which covers pretty much everything
#'   we use regularly in the Western alphabet. Unicode came along later and has
#'   more than a million possible code points - which is very useful for
#'   Arabic, Cyrillic and other alphabets, as well as diacritical marks and
#'   emojis. However, whilst UTF-8 is backwards compatible with ASCII it can
#'   play merry havoc with your text processing. In many cases it is just
#'   easier to remove these characters if they only constitute a small
#'   proportion of your text, which is what this function does.
#'
#' @details Progress is reported on the console with print(): how many entries
#'   contain non ASCII characters, the first such entry as an example, and how
#'   many entries still contain non ASCII characters after cleaning. If no
#'   entry contains non ASCII characters the input is returned unchanged.
#'
#' @param text.clean vector containing one or more strings (i.e. length is
#'   equal to or greater than 1)
#' @return An unnamed list of length two. The first item is the input vector
#'   with all non ASCII characters removed. The second item is a single
#'   integer: the number of entries that contained non ASCII characters before
#'   cleaning.
#' @export
remove.nonASCII <- function(text.clean) {

  # Positions of the entries holding at least one non ASCII character.
  # Computed once and reused for both the count and the example below;
  # which() silently drops NA results.
  non.ascii <- which(!stringi::stri_enc_isascii(text.clean))
  ascii.valz <- length(non.ascii)

  if (ascii.valz != 0) {
    print(paste0(
      "There are this many entries with non ASCII characters: ",
      ascii.valz
    ))
    print(paste0("For example, this entry: ", text.clean[non.ascii[1]]))

    # Declare the strings as UTF-8 before converting (kept from the original
    # implementation), then convert to ASCII. sub = "" replaces every byte
    # that cannot be converted with nothing, i.e. removes it.
    Encoding(text.clean) <- "UTF-8"
    text.clean <- base::iconv(text.clean, from = "UTF-8", to = "ASCII", sub = "")

    # Re-check so the user can see that the clean-up worked (expected: 0).
    ascii.check <- length(which(!stringi::stri_enc_isascii(text.clean)))
    print(paste0(
      "There are now this many entries with non ASCII characters: ",
      ascii.check
    ))
  } else {
    print("No entries have non ASCII characters. Good for you!")
  }

  list(text.clean, ascii.valz)
}
# bvidgen/tc documentation built on May 9, 2019, 2:21 a.m.