#' @title Convert PDF File to Text
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param .dir file directory containing the PDF file(s)
#' @param ... optional character strings used to filter PDF file names
#' @param clean Logical. Clean the extracted text with \code{clean_pdf}?
#' @examples
#' \dontrun{
#' pdf_text_files <- get_pdf_text(.dir = "%systemdrive%/Documents and Settings/All Users/Desktop", "words in pdf file name")
#' cat(pdf_text_files)
#' }
#' \dontrun{
#' pdf_text_files <- get_pdf_text(.dir = "%systemdrive%/Documents and Settings/All Users/Desktop", c("Employee", "Engagement"))
#' cat(pdf_text_files)
#' }
#' @import pdftools
#' @export
get_pdf_text <- function(.dir = getwd(), ..., clean = TRUE) {
  .paths <- tryCatch(list.files(.dir, pattern = "\\.pdf$", full.names = TRUE),
    error = function(err) {
      character(0)
    }
  )
  if (!length(.paths)) {
    stop(sprintf("No valid PDF files found in %s", .dir))
  }
.fuzzy <- list(...)
  if (length(.fuzzy)) {
    .fuzzy <- paste(.fuzzy, collapse = "|")
    .hits <- grepl(.fuzzy, x = basename(.paths), ignore.case = TRUE)
    if (any(.hits)) {
      .paths <- .paths[.hits]
    }
}
.pdfs <- lapply(.paths, pdftools::pdf_text)
if (clean) {
.pdfs <- lapply(.pdfs, clean_pdf)
}
return(.pdfs)
}
#' @title Identify synonyms using wordnet
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param x character vector of situational adjectives
#' @param POS part-of-speech to be identified
#' @param dictionary file path to a WordNet dictionary
#' @param drop Logical. If \code{FALSE}, all identified synonyms are returned.
#' @seealso [wordnet::setDict()]
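#' @examples
#' \dontrun{
#' # Illustrative only: assumes WordNet 2.1 is installed at the default Windows
#' # path; pass `dictionary` explicitly for other installations
#' syns <- synonym_match(c("happy", "calm"), POS = "ADJECTIVE")
#' head(syns)
#' }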
#' @import reshape2 wordnet
#' @export
synonym_match <- function(x, POS = "ADJECTIVE", dictionary = "C:\\Program Files (x86)\\WordNet\\2.1",
drop = TRUE) {
  # WNHOME is the WordNet root, i.e. the dictionary path minus the trailing \dict
  .home <- gsub("\\\\dict$", "", dictionary)
  Sys.setenv(WNHOME = .home)
wordnet::setDict(dictionary)
  .syn <- sapply(x, function(.w) wordnet::synonyms(word = .w, pos = POS))
.syn <- reshape2::melt(.syn, factorsAsStrings = FALSE)
names(.syn) <- c("Synonym", "Word")
  # strip trailing parenthetical annotations, e.g. "beautiful(p)" -> "beautiful"
  .syn[, 1] <- gsub("\\([^)]+\\)$", "", as.character(.syn[, 1]))
  .syn[, "match"] <- .syn[, 1] != .syn[, 2]
.syn <- .syn[.syn[, "match"], ]
if (drop) {
.syn <- .syn[vec_grep(.syn[, 2], .syn[, 1], FALSE), ]
}
.syn
}
#' @title Count common words between two vectors
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param x character vector of words or sentences.
#' @param y character vector of words or sentences.
#' @param stopwords Logical. Remove stop words? Uses [tm::stopwords]
#' @param stem Logical. Stem words? Uses [textstem::stem_words]
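#' @examples
#' \dontrun{
#' # Illustrative sentences; requires tm, textstem, and tidyr to be installed
#' x <- c("the quick brown fox", "dogs sleep all day")
#' y <- c("a quick red fox", "cats sleep all day")
#' count_common_words(x, y)
#' }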
#' @import tm textstem tidyr
#' @export
count_common_words <- function(x, y, stopwords = TRUE, stem = FALSE) {
stopifnot(is.character(x), is.character(y))
if (stopwords) {
x <- .rm_stopwords(x)
y <- .rm_stopwords(y)
}
if (stem) {
x <- textstem::stem_words(x, "en")
y <- textstem::stem_words(y, "en")
}
  l <- lapply(list(unique(x), unique(y)), clean_text)
  l <- lapply(l, function(s) strsplit(s, split = " "))
res <- sapply(l[[1]], function(x) {
res <- sapply(l[[2]], function(y) {
n <- .count_words(x, y)
n
})
})
res <- as.data.frame(res)
  names(res) <- sapply(l[[1]], paste, collapse = " ")
  res[, "doc_y"] <- sapply(l[[2]], paste, collapse = " ")
  res <- tidyr::gather_(res, "doc_x", "common_word_count",
    names(res)[names(res) != "doc_y"], na.rm = TRUE)
return(res)
}
#' @title Count Common Words Helper
#' @param x character vector of words.
#' @param y character vector of words.
#' @return the number of words common to \code{x} and \code{y}
#' @export
.count_words <- function(x, y) {
res <- length(intersect(x, y))
return(res)
}
#' @title Wrap Text Function
#' @param txt character vector of text lines.
#' @param pattern regular expression identifying lines that begin a new entry;
#'   lines between matches are treated as wrapped continuations.
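#' @examples
#' \dontrun{
#' # Illustrative: lines matching the pattern anchor an entry, and a single
#' # continuation line is folded back into the entry above it
#' txt <- c("1. a wrapped item that", "continues here", "2. a one-line item", "3. another item")
#' wrap_text(txt, pattern = "^[0-9]+\\.")
#' }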
#' @export
wrap_text <- function(txt, pattern) {
if (any(nchar(txt) == 0L)) {
txt <- txt[!nchar(txt) == 0L]
}
lines <- grep(pattern, txt)
  remove <- setdiff(seq_along(txt), lines)
d <- diff(lines)
spread <- unique(d)[order(unique(d), decreasing = TRUE)]
  for (i in seq_along(spread)) {
wrap <- lines[d == spread[i]]
if (spread[i] != 1L && (spread[i] - 1L) %in% spread) {
txt[wrap] <- paste(txt[wrap], txt[wrap + 1L], sep = " ")
    } else if (spread[i] == 1L) {
      # consecutive pattern lines: nothing to join
      txt[wrap] <- txt[wrap]
} else {
txt[wrap + 1L] <- NA
}
}
  if (length(remove)) {
    txt <- txt[-remove]
  }
return(txt)
}
#' @title Clean text from character vector
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param x character vector of words or sentences.
#' @param lowercase Logical. Convert text to lower case?
#' @param rm_nums Logical. Remove digits?
#' @param convert_nums Logical. Convert numbers to words?
#' @param convert_contract Logical. Convert contractions to base words?
#' @param rm_punct Logical. Remove punctuation?
#' @param rm_whitespace Logical. Collapse and trim extra whitespace?
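#' @examples
#' \dontrun{
#' # Requires qdap; output shown is approximate
#' clean_text(c("It's 2 GOOD!", "  extra   spaces  "))
#' # [1] "it is good"   "extra spaces"
#' }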
#' @import qdap
#' @export
clean_text <- function(x, lowercase = TRUE, rm_nums = TRUE, convert_nums = FALSE, convert_contract = TRUE, rm_punct = TRUE,
rm_whitespace = TRUE) {
  stopifnot(
    is.logical(lowercase), is.logical(rm_nums), is.logical(convert_nums),
    is.logical(convert_contract), is.logical(rm_punct), is.logical(rm_whitespace)
  )
if (typeof(x) != "character") {
stop("Please define x as a character")
}
if (any(grepl("I_WAS_NOT_ASCII", iconv(x, "latin1", "ASCII",
sub = "I_WAS_NOT_ASCII"
)))) {
x <- gsub("^(\\s*<U\\+\\w+>\\s*)+.*$", "encoding error", x)
x <- stringi::stri_trans_general(x, "latin-ascii")
}
if (convert_nums) {
if (any(grepl("[[:digit:]]", x))) {
x <- qdap::replace_number(x)
x <- qdap::replace_ordinal(x)
}
} else if (rm_nums) {
x <- gsub("[[:digit:]]", " ", x)
}
if (convert_contract) {
x <- qdap::replace_contraction(x)
}
if (rm_punct) {
x <- gsub("[^[:alnum:]\\s]", " ", x)
}
if (any(grepl("^\\s*$", x))) {
x[grep("^\\s*$", x)] <- "NA"
}
if (rm_whitespace) {
x <- gsub("\\s+", " ", x)
x <- gsub("^\\s+|\\s+$", "", x)
x <- x[x != ""]
}
if (lowercase) {
x <- tolower(x)
}
return(x)
}
#' @title Capture text between two characters
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param x character vector of words or sentences.
#' @param between character vector of length 2 containing the boundary patterns
#' @param ... additional arguments passed to [regexec()]
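#' @examples
#' \dontrun{
#' # The first element of each result is the full match, the second the captured text
#' extract_text_between("name [value] rest", between = c("\\[", "\\]"))
#' # [[1]] "[value]" "value"
#' }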
extract_text_between <- function(x, between = c(".*", ".*"), ...) {
.pattern <- sprintf("%s(.*?)%s", between[1], between[2])
.x <- regmatches(x, regexec(.pattern, x, ...))
return(.x)
}
#' @title Parse PDF article
#' @author Shea Fyffe, \email{shea.fyffe@@gmail.com}
#' @param pdf_path file path to article as a pdf
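#' @examples
#' \dontrun{
#' # Illustrative path; requires the tabulizer package (and a Java installation)
#' article <- parse_pdf("path/to/article.pdf")
#' head(article)
#' }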
#' @import tabulizer
#' @export
parse_pdf <- function(pdf_path) {
if (!file.exists(pdf_path)) {
stop("File path invalid")
}
  x <- tryCatch(
    {
      tabulizer::extract_text(pdf_path, encoding = "UTF-8")
    },
    warning = function(w) {
      message("warning: ", conditionMessage(w))
      NA_character_
    },
    error = function(e) {
      message("error: ", conditionMessage(e))
      NA_character_
    }
  )
  if (all(is.na(x))) {
    return(invisible(NULL))
  }
  if (any(grepl("\r\n", x))) {
    x <- unlist(strsplit(x, "\r\n"))
  }
x <- clean_text(x)
x <- x[x != ""]
if (length(x) == 0L) {
return()
} else {
return(x)
}
}
#' @title Find top words in a text document
#'
#' @param x Character. A vector of words from a text document.
#' @param stopwords Logical. Remove stop words? Uses [tm::stopwords]
#' @param stem Logical. Stem words? Uses [textstem::stem_words]
#' @param ... Additional arguments to be passed to [qdap::freq_terms]
#' @return a data.frame of the most frequent terms, as returned by [qdap::freq_terms]
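#' @examples
#' \dontrun{
#' # Requires qdap; `top` is passed through to qdap::freq_terms
#' find_top_words(c("alpha beta alpha gamma", "beta gamma gamma"), top = 5)
#' }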
#' @export
find_top_words <- function(x, stopwords = TRUE, stem = FALSE, ...) {
if (stopwords) {
x <- .rm_stopwords(x)
}
if (stem) {
x <- textstem::stem_words(x, "en")
}
if (length(list(...)) != 0L) {
x <- qdap::freq_terms(text.var = x, ...)
} else {
x <- qdap::freq_terms(text.var = x, 20, at.least = 3, stopwords = qdapDictionaries::Top200Words)
}
return(x)
}
#' @title Attempt to calculate number of english words in a string
#' @param x Character. A vector of words from a text document.
#' @param ... Additional words to append to the dictionary before checking
#' @seealso [qdapDictionaries::GradyAugmented]
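#' @examples
#' \dontrun{
#' # Two of the three tokens appear in GradyAugmented, so the ratio is ~0.67
#' get_english_words_ratio(c("apple", "zzzxq", "house"))
#' }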
#' @export
get_english_words_ratio <- function(x, ...) {
  .dict <- qdapDictionaries::GradyAugmented
  if (length(list(...)) != 0L) {
    .dict <- c(.dict, as.character(c(...)))
  }
x <- sum(x %in% .dict) / length(x)
if (is.nan(x)) {
x <- 1
}
return(x)
}
#' @title Check Spelling
#' @param x Character. A vector of words from a text document.
#' @param return_misspell Logical. Return the misspelled words? If \code{FALSE}, misspelled words are removed instead.
#' @param ... additional arguments to be passed to \code{\link[hunspell]{hunspell_check}}
#' @seealso [hunspell::suggest]
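#' @examples
#' \dontrun{
#' # hunspell_check() flags correctly spelled tokens as TRUE
#' check_spelling(c("hello", "wrld"))  # returns "wrld"
#' }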
#' @export
check_spelling <- function(x, return_misspell = TRUE, ...) {
  stopifnot(
    is.character(x),
    !any(grepl("\\s+", x)),
    is.logical(return_misspell)
  )
.ms <- hunspell::hunspell_check(x, ...)
if (return_misspell) {
x <- x[!.ms]
} else {
x <- x[.ms]
}
return(x)
}
#' @title Remove Stopwords from a string
#' @param x Character. A vector of words from a text document.
#' @seealso [tm::stopwords]
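#' @examples
#' \dontrun{
#' # Note: removed stop words leave extra whitespace behind
#' .rm_stopwords("this is the text")
#' }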
#' @export
.rm_stopwords <- function(x) {
sw <- paste(tm::stopwords("en"), collapse = "\\b|\\b")
sw <- paste0("\\b", sw, "\\b")
x <- gsub(sw, "", x)
return(x)
}
#' @title Count words in vector of strings
#' @param x Character. A vector of words from a text document.
#' @return a numeric vector of lengths
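#' @examples
#' count_string_words(c("two words", "one", ""))
#' # [1] 2 1 0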
#' @export
count_string_words <- function(x) {
if (!is.character(x)) {
stop("x not a character vector")
}
x <- sapply(gregexpr("[[:alpha:]]+", x), function(x) sum(x > 0))
return(x)
}
#' @title Find rows in data.frame columns with certain words
#' @param ... Required. Character. A list of words to find.
#' @param data Required. Data.frame. A data.frame containing character columns
#' @param partial Logical/Boolean. Include partial matches?
#' @param type Character. Either 'and' or 'or' suggesting how matching words in \code{...}
#' should be treated
#' @return logical vector with \code{length} equal to \code{nrow(data)}
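#' @examples
#' \dontrun{
#' # Illustrative data; with type = "and", every word must appear somewhere in a row
#' df <- data.frame(txt = c("red fox", "blue bird"), note = c("fast", "slow"),
#'   stringsAsFactors = FALSE)
#' has_words("red", "fast", data = df, type = "and")
#' # [1] TRUE FALSE
#' }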
#' @export
has_words <- function(..., data, partial = FALSE, type = "and") {
  stopifnot(
    inherits(data, "data.frame"),
    is.logical(partial),
    is.character(type)
  )
.char <- sapply(data, is.character)
if (sum(.char) == 0) {
stop("data contains no valid character columns")
}
.words <- c(...)
if (is.recursive(.words)) {
.words <- unlist(.words)
}
  if (!partial) {
    # exact matching requires word boundaries; partial matches use the words as-is
    .words <- paste0("\\b", .words, "\\b")
  }
if (tolower(type) == "and") {
.out <- list()
for (w in seq_along(.words)) {
      .out[[w]] <- apply(data[, .char, drop = FALSE], 1, function(x) {
x <- grepl(.words[w], x)
return(any(x))
})
}
.out <- Reduce("&", .out)
} else if (tolower(type) == "or") {
.words <- paste0(.words, collapse = "|")
    .out <- apply(data[, .char, drop = FALSE], 1, function(x) {
x <- grepl(.words, x)
return(any(x))
})
} else {
stop("type must be one of the follower: 'and' 'or'")
}
return(.out)
}