R/deep_kleen.R

Defines functions deep_kleen

Documented in deep_kleen

#' Cleanse text of common errors
#'
#' Converts the input from latin1 to UTF-8, then replaces every character
#' outside the printable ASCII range (space through tilde), carriage returns,
#' new-lines, and html-style tags (`<...>`) with a single space. Runs of
#' whitespace are then collapsed to one space and leading/trailing whitespace
#' is trimmed.
#'
#' @param xx   text string / vector
#' @return A character vector the same length as `xx` containing the cleaned
#'   text.
#' @examples
#' deep_kleen("<p>Hello\nWorld</p>")
#' @export
#'
deep_kleen <- function(xx) {
  # Convert the function's own argument. The earlier code passed `text`,
  # which is not a parameter of this function, so `xx` was never cleaned.
  cleaned <- iconv(xx, "latin1", "UTF-8")

  # Anything outside the printable ASCII range becomes a space.
  cleaned <- gsub("[^ -~]", " ", cleaned)

  # Line breaks and carriage returns become spaces.
  cleaned <- gsub("\\n", " ", cleaned)
  cleaned <- gsub("\\r", " ", cleaned)

  # Tags are swapped for a space so the words on either side stay separated.
  cleaned <- gsub("<[^>]*?>", " ", cleaned)

  # Collapse the runs of whitespace left behind by the replacements above.
  cleaned <- gsub("\\s+", " ", cleaned)

  trimws(cleaned)
}
data-steve/kleentex documentation built on May 12, 2017, 5:41 p.m.