R/extract_text.R

# Extract word frequencies from one or more PDF documents.
#
# Arguments:
#   files            Character vector of PDF file paths and/or directories.
#                    Entries ending in ".pdf" (any case) are used as files;
#                    every other entry is treated as a directory and searched
#                    (non-recursively) for PDF files.
#   word.length.min, word.length.max
#                    Bounds on term length, passed to tm as `wordLengths`.
#   freq.min, freq.max
#                    Bounds on total term frequency, passed to
#                    tm::findFreqTerms() as `lowfreq` / `highfreq`.
#
# Returns:
#   A data frame with columns `word` and `freq` (frequency summed over all
#   documents), sorted by decreasing `freq`.
#
# Errors:
#   Stops if `files` is not a non-empty character vector, or if no PDF file
#   can be found from it.
extract_text <- function(files = ".",
                         word.length.min = 4,
                         word.length.max = Inf,
                         freq.min = 10,
                         freq.max = Inf) {
  if (!is.character(files) || length(files) == 0L || anyNA(files)) {
    stop("`files` must be a non-empty character vector of paths.",
         call. = FALSE)
  }

  # Resolve inputs to PDF paths *before* touching tm, so bad input fails fast.
  # The pattern is escaped and anchored: only a real ".pdf" extension counts,
  # not any path that merely contains "pdf".
  pdf_pattern <- "\\.pdf$"
  is_pdf <- grepl(pdf_pattern, files, ignore.case = TRUE)
  pdf_paths <- files[is_pdf]
  if (any(!is_pdf)) {
    # full.names = TRUE keeps the directory prefix; without it the files are
    # only found when the directory is the current working directory.
    found <- list.files(path = files[!is_pdf], pattern = pdf_pattern,
                        full.names = TRUE, ignore.case = TRUE)
    pdf_paths <- c(pdf_paths, found)
  }
  if (length(pdf_paths) == 0L) {
    stop("No PDF files found in: ", paste(files, collapse = ", "),
         call. = FALSE)
  }

  pdf_reader <- tm::readPDF(control = list(text = "-layout"))
  corpus <- tm::Corpus(tm::URISource(pdf_paths),
                       readerControl = list(reader = pdf_reader))
  term_doc <- tm::TermDocumentMatrix(
    corpus,
    control = list(
      removePunctuation = TRUE,
      tolower = TRUE,
      removeNumbers = TRUE,
      stopwords = TRUE,
      wordLengths = c(word.length.min, word.length.max)
      # stemming = TRUE
      # bounds = list(global = c(2, Inf))
    )
  )

  frequent_terms <- tm::findFreqTerms(term_doc,
                                      lowfreq = freq.min,
                                      highfreq = freq.max)

  # as.matrix() converts the full sub-matrix silently. tm::inspect() is meant
  # for display: it prints to the console and, in recent tm releases, shows
  # only a sample of the matrix, so it must not be used to extract counts.
  counts <- as.matrix(term_doc[frequent_terms, ])
  totals <- rowSums(counts)

  # as.character() turns the NULL names of an empty result into character(0),
  # so the `word` column exists even when no term passes the filters.
  result <- data.frame(word = as.character(names(totals)), freq = totals)

  # Base-R sort by decreasing frequency; ties keep their original order.
  result[order(-result$freq), , drop = FALSE]
}

Try the neuropsychology package in your browser

Any scripts or data that you put into this service are public.

neuropsychology documentation built on May 2, 2019, 2:13 p.m.