# testthat/clean_corpus.R

#' Clean a text corpus
#'
#' Applies, in order: whitespace collapsing, punctuation removal, number
#' removal, and removal of Portuguese stop words plus a few extra tokens.
#' Every `tm` helper is namespace-qualified so the function works even when
#' `tm` has not been attached with `library(tm)`.
#'
#' @param corpus A corpus object accepted by `tm::tm_map()`.
#' @return The transformed corpus, as returned by `tm::tm_map()`.
clean_corpus <- function(corpus) {
  # Extra tokens dropped alongside tm's Portuguese stop-word list. The
  # ordinal abbreviation is written as a Unicode escape so it does not
  # depend on the encoding the source file is read with.
  extra_stopwords <- c("nao", "pag", "n\u00ba")
  my_stopwords <- c(tm::stopwords("portuguese"), extra_stopwords)

  corpus <- tm::tm_map(corpus, tm::stripWhitespace)
  # Lower-casing is currently disabled, so stop-word removal below only
  # matches tokens whose case equals the stop-word list entries:
  # corpus <- tm::tm_map(corpus, tm::content_transformer(tolower))
  corpus <- tm::tm_map(corpus, tm::removePunctuation)
  corpus <- tm::tm_map(corpus, tm::removeNumbers)
  tm::tm_map(corpus, tm::removeWords, my_stopwords)
}
# filipezabala/jurimetrics documentation built on Aug. 28, 2020, 1:37 p.m.