R/text_functions.R

Defines functions has_words count_string_words .rm_stopwords check_spelling get_english_words_ratio find_top_words parse_pdf extract_text_between clean_text wrap_text .count_words count_common_words synonym_match get_pdf_text

#' @title Convert PDF File to Text
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param .dir file directory of pdf(s)
#' @param ... a character vector to filter out PDF file names
#' @examples
#' \dontrun{
#' pdf_text_files <- get_pdf_text(.dir = "%systemdrive%/Documents and Settings/All Users/Desktop", "words in pdf file name")
#' cat(pdf_text_files)
#' }
#' \dontrun{
#' pdf_text_files <- get_pdf_text(.dir = "%systemdrive%/Documents and Settings/All Users/Desktop", c("Employee", "Engagement"))
#' cat(pdf_text_files)
#' }
#' @import pdftools readr
#' @export
get_pdf_text <- function(.dir = getwd(), clean = TRUE, ...) {
  .paths <- tryCatch(list.files(.dir, pattern = "\\.pdf$"), error = function(err) {
    NA
  })
  if (!length(.paths) || is.na(.paths)) {
    stop(sprintf("No valid PDF files found in %s", dir))
  }
  .fuzzy <- list(...)
  if (!!length(.fuzzy)) {
    .fuzzy <- paste(.fuzzy, collapse = "|")
    if (any(grepl(.fuzzy, x = .paths, ignore.case = TRUE))) {
      .paths <- .paths[grepl(.fuzzy, x = .paths, ignore.case = TRUE)]
    }
  }
  .pdfs <- lapply(.paths, pdftools::pdf_text)
  if (clean) {
    .pdfs <- lapply(.pdfs, clean_pdf)
  }
  return(.pdfs)
}
#' @title Identify synonyms using wordnet
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param x character vector of situational adejectives
#' @param POS part-of-speech to be identified
#' @param dictionary a file path wordnet dictionary
#' @param drop if false, will return all synonyms identified
#' @seealso [wordnet::setDict()]
#' @import reshape2 wordnet
#' @export
synonym_match <- function(x, POS = "ADJECTIVE", dictionary = "C:\\Program Files (x86)\\WordNet\\2.1",
                          drop = TRUE) {
  .home <- gsub("*\\\\dict", "", dictionary)
  Sys.setenv(WNHOME = .home)
  wordnet::setDict(dictionary)
  .syn <- sapply(x, function(x) wordnet::synonyms(word = x, pos = POS))
  .syn <- reshape2::melt(.syn, factorsAsStrings = FALSE)
  names(.syn) <- c("Synonym", "Word")
  .syn[, 1] <- gsub("*\\([^\\)]+\\)$", "", as.character(.syn[, 1]))
  .syn[, "match"] <- ifelse(.syn[, 1] != .syn[, 2], T, F)
  .syn <- .syn[.syn[, "match"], ]
  if (drop) {
    .syn <- .syn[vec_grep(.syn[, 2], .syn[, 1], FALSE), ]
  }
  .syn
}
#' @title Count common words between two vectors
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param x character vector of words or sentences.
#' @param y character vector of words or sentences.
#' @param stopwords Logical. Remove stop words? Uses [tm::stopwords]
#' @param stem Logical. Stem words? Uses [textstem::stem_word]
#' @import tm textstem
#' @export
count_common_words <- function(x, y, stopwords = TRUE, stem = FALSE) {
  stopifnot(is.character(x), is.character(y))
  if (stopwords) {
    x <- .rm_stopwords(x)
    y <- .rm_stopwords(y)
  }
  if (stem) {
    x <- textstem::stem_words(x, "en")
    y <- textstem::stem_words(y, "en")
  }
  l <- sapply(list(unique(x), unique(y)), clean_text)
  l <- sapply(l, function(x) strsplit(x, split = " "))
  res <- sapply(l[[1]], function(x) {
    res <- sapply(l[[2]], function(y) {
      n <- .count_words(x, y)
      n
    })
  })

  res <- as.data.frame(res)
  names(res) <- l[[1]]
  res[, "doc_y"] <- l[[2]]
  res <- tidyr::gather_(res, "doc_x", "common_word_count", names(res)[names(res) !=
    "doc_y"], na.rm = T)
  return(res)
}
#' @title Count words Helper
#' @export
.count_words <- function(x, y) {
  res <- length(intersect(x, y))
  return(res)
}
#' @title Wrap Text Function
#' @export
wrap_text <- function(txt, pattern) {
  if (any(nchar(txt) == 0L)) {
    txt <- txt[!nchar(txt) == 0L]
  }
  lines <- grep(pattern, txt)
  remove <- setdiff(seq(txt), lines)
  d <- diff(lines)
  spread <- unique(d)[order(unique(d), decreasing = TRUE)]
  for (i in seq(spread)) {
    wrap <- lines[d == spread[i]]
    if (spread[i] != 1L && (spread[i] - 1L) %in% spread) {
      txt[wrap] <- paste(txt[wrap], txt[wrap + 1L], sep = " ")
    } else if (spread[i] == 1L) {
      txt[wrap] <- txt[wrap]
    } else {
      txt[wrap + 1L] <- NA
    }
  }
  txt <- txt[-remove]
  return(txt)
}
#' @title Clean text from character vector
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param x character vector of words or sentences.
#' @param rm_nums Logical. Only keep words, hyphens and spaces?
#' @param convert_nums Logical. Update numbers to words?
#' @param convert_contract Logical. Convert contractions to base words?
#' @import qdap
#' @export
clean_text <- function(x, lowercase = TRUE, rm_nums = TRUE, convert_nums = FALSE, convert_contract = TRUE, rm_punct = TRUE,
                       rm_whitespace = TRUE) {
  stopifnot({
    sapply(c(lowercase, rm_nums, convert_nums, rm_punct, rm_whitespace), is.logical)
  })
  if (typeof(x) != "character") {
    stop("Please define x as a character")
  }

  if (any(grepl("I_WAS_NOT_ASCII", iconv(x, "latin1", "ASCII",
    sub = "I_WAS_NOT_ASCII"
  )))) {
    x <- gsub("^(\\s*<U\\+\\w+>\\s*)+.*$", "encoding error", x)
    x <- stringi::stri_trans_general(x, "latin-ascii")
  }

  if (convert_nums) {
    if (any(grepl("[[:digit:]]", x))) {
      x <- qdap::replace_number(x)
      x <- qdap::replace_ordinal(x)
    }
  } else if (rm_nums) {
    x <- gsub("[[:digit:]]", " ", x)
  }

  if (convert_contract) {
    x <- qdap::replace_contraction(x)
  }

  if (rm_punct) {
    x <- gsub("[^[:alnum:]\\s]", " ", x)
  }

  if (any(grepl("^\\s*$", x))) {
    x[grep("^\\s*$", x)] <- "NA"
  }

  if (rm_whitespace) {
    x <- gsub("\\s+", " ", x)
    x <- gsub("^\\s+|\\s+$", "", x)
    x <- x[x != ""]
  }

  if (lowercase) {
    x <- tolower(x)
  }

  return(x)
}
#' @title Capture text between two characters
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param x character vector of words or sentences.
#' @param between character vector of length 2 containing boundary characters
extract_text_between <- function(x, between = c(".*", ".*"), ...) {
  .pattern <- sprintf("%s(.*?)%s", between[1], between[2])
  .x <- regmatches(x, regexec(.pattern, x, ...))
  return(.x)
}
#' @title Parse PDF article
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param pdf_path file path to article as a pdf
#' @import tabulizer
#' @export
parse_pdf <- function(pdf_path) {
  if (!file.exists(pdf_path)) {
    stop("File path invalid")
  }
  x <- tryCatch(
    {
      tabulizer::extract_text(pdf_path, encoding = "UTF-8")
    },
    warning = function(w) {
      print(paste("warning:", w))
    },
    error = function(e) {
      print(paste("error:", e))
    }
  )
  if (any(grepl("\r\n", x))) {
    x <- unlist(strsplit(x, "\r\n"))
  }
  x <- clean_text(x)
  x <- x[x != ""]
  if (length(x) == 0L) {
    return()
  } else {
    return(x)
  }
}
#' @title Find top words in a text document
#'
#' @param x Character. A vector of words from a text document.
#' @param stopwords Logical. Remove stop words? Uses [tm::stopwords]
#' @param stem Logical. Stem words? Uses [textstem::stem_word]
#' @param ... Additional arguments to be passed to [qdap::freq_terms]
#' @return
#' @export
find_top_words <- function(x, stopwords = TRUE, stem = FALSE, ...) {
  if (stopwords) {
    x <- .rm_stopwords(x)
  }
  if (stem) {
    x <- textstem::stem_words(x, "en")
  }
  if (length(list(...)) != 0L) {
    x <- qdap::freq_terms(text.var = x, ...)
  } else {
    x <- qdap::freq_terms(text.var = x, 20, at.least = 3, stopwords = qdapDictionaries::Top200Words)
  }
  return(x)
}
#' @title Attempt to calculate number of english words in a string
#' @param x Character. A vector of words from a text document.
#' @param ... Additional words to be passed to be checked against
#' @seealso [qdapDictionaries::GradyAugmented]
#' @export
get_english_words_ratio <- function(x, ...) {
  if (length(list(...)) != 0L) {
    .dict <- qdapDictionaries::GradyAugmented
  } else {
    .dict <- c(qdapDictionaries::GradyAugmented, paste(...))
  }
  x <- sum(x %in% .dict) / length(x)
  if (is.nan(x)) {
    x <- 1
  }
  return(x)
}
#' @title Check Spelling
#' @param x Character. A vector of words from a text document.
#' @param return_misspell Logical. Return misspelled words? Otherwise will remove.
#' @param ... additional arguments to be passed to \code{\link[hunspell]{hunspell_check}}
#' @seealso [hunspell::suggest]
#' @export
check_spelling <- function(x, return_misspell = TRUE, ...) {
  stopifnot({
    is.character(x)
    !any(grepl("\\s+", x))
    is.logical(return_misspell)
  })
  .ms <- hunspell::hunspell_check(x, ...)
  if (return_misspell) {
    x <- x[!.ms]
  } else {
    x <- x[.ms]
  }
  return(x)
}
#' @title Remove Stopwords from a string
#' @param x Character. A vector of words from a text document.
#' @seealso [tm::stopwords]
#' @export
.rm_stopwords <- function(x) {
  sw <- paste(tm::stopwords("en"), collapse = "\\b|\\b")
  sw <- paste0("\\b", sw, "\\b")
  x <- gsub(sw, "", x)
  return(x)
}
#' @title Count words in vector of strings
#' @param x Character. A vector of words from a text document.
#' @return a numeric vector of lengths
#' @export
count_string_words <- function(x) {
  if (!is.character(x)) {
    stop("x not a character vector")
  }
  x <- sapply(gregexpr("[[:alpha:]]+", x), function(x) sum(x > 0))
  return(x)
}
#' @title Find rows in data.frame columns with certain words
#' @param ... Required. Character. A list of words to find.
#' @param data Required. Data.frame. A data.frame containing character columns
#' @param partial Logical/Boolean. Include partial matches?
#' @param type Character. Either 'and' or 'or' suggesting how matching words in \code{...}
#' should be treated
#' @return logical vector with \code{length} equal to \code{nrow(data)}
#' @export
has_words <- function(..., data, partial = FALSE, type = "and") {
  stopifnot({
    inherits(data, "data.frame")
    is.logical(partial)
    is.character(type)
  })

  .char <- sapply(data, is.character)

  if (sum(.char) == 0) {
    stop("data contains no valid character columns")
  }

  .words <- c(...)
  if (is.recursive(.words)) {
    .words <- unlist(.words)
  }

  if (partial) {
    .words <- paste0("\\b.*", .words, ".*\\b")
  } else {
    .words <- paste0("\\b", .words, "\\b")
  }

  if (tolower(type) == "and") {
    .out <- list()
    for (w in seq_along(.words)) {
      .out[[w]] <- apply(data[, .char], 1, function(x) {
        x <- grepl(.words[w], x)
        return(any(x))
      })
    }

    .out <- Reduce("&", .out)
  } else if (tolower(type) == "or") {
    .words <- paste0(.words, collapse = "|")
    .out <- apply(data[, .char], 1, function(x) {
      x <- grepl(.words, x)
      return(any(x))
    })
  } else {
    stop("type must be one of the follower: 'and' 'or'")
  }


  return(.out)
}
Shea-Fyffe/GitItSheaGitIt documentation built on Sept. 23, 2020, 10:34 a.m.