#' Return Works of a Corpus that Include Matching Text
#'
#' \code{subset_corpus} returns the works in a corpus whose texts include all of the words (or match the \link[stringi:stringi-search-regex]{regular expression}) specified in the `pattern` argument.
#'
#' @param corpus A dataframe representing a corpus of downloaded texts generated by \code{\link{build_corpus}}.
#' @param pattern A character string (or vector) containing a series of words or a \link[stringi:stringi-search-regex]{regular expression}.
#'
#' @details
#' \code{subset_corpus} returns the works in a corpus whose texts include all of the words (or match the \link[stringi:stringi-search-regex]{regular expression}) specified in the `pattern` argument. It can therefore be used to identify a subset of the corpus that is of particular interest.
#'
#'
#' @return A dataframe containing the works in `corpus` whose texts match `pattern`, sorted by author and date.
#'
#' @examples
#' \dontrun{
#' miasma_yf <- subset_corpus(yf_corpus, "miasma")
#' }
#'
#'
#' @importFrom dplyr "%>%" select mutate filter arrange
#' @importFrom stringr str_detect
#'
#' @export
subset_corpus <- function(corpus, pattern) {
  # The file paths to search are read from the `local_file` column; fail with
  # a clear message when it is absent.
  if (!"local_file" %in% names(corpus)) {
    stop("`corpus` must contain a `local_file` column.", call. = FALSE)
  }

  # Compute the match flags outside of dplyr's data mask. Doing this inside
  # `mutate()` would (a) overwrite and then drop any existing `matching`
  # column of the corpus and (b) let a corpus column named `pattern` shadow
  # the `pattern` argument.
  is_match <- detect_in_file(corpus[["local_file"]], p = pattern)

  # `which()` discards NA flags, mirroring how `filter()` treats them.
  matching_texts <- corpus[which(is_match), , drop = FALSE]

  # Sort the remaining works by author, then by date.
  arrange(matching_texts, author, date)
}
# Search each file in `files` for the pattern `p`.
#
# files: vector of file paths, each passed in turn to `search_file()`.
# p:     pattern (character string or vector) forwarded unchanged to
#        `search_file()`.
#
# Returns a logical vector with one element per entry of `files`, carrying
# the names of `files` (if any).
detect_in_file <- function(files, p) {
  # Base `vapply()` enforces a length-one logical per file and avoids relying
  # on a purrr import that this file does not declare.
  hits <- vapply(
    files,
    function(f) search_file(file = f, pattern = p),
    FUN.VALUE = logical(1),
    USE.NAMES = FALSE
  )
  # Keep the input's names (or lack of them) on the result.
  names(hits) <- names(files)
  hits
}
# Test whether the text of one file matches `pattern` (case-insensitively).
#
# file:    path (or connection) readable by `readLines()`.
# pattern: character string or vector, interpreted as follows:
#   * a single string made up only of letters and spaces is treated as a list
#     of words that must ALL occur in the text, as whole words, in any order;
#   * any other single string is used as a regular expression as-is;
#   * a vector of several strings matches when ANY of them matches; spaces
#     inside each string stand for one or more whitespace characters.
#
# Returns the logical result of `stringr::str_detect()` on the file's text.
search_file <- function(file, pattern) {
  if (length(pattern) == 1) {
    has_space <- stringr::str_detect(pattern, " ")
    # Strip ALL spaces before the letters-only check; removing only the first
    # one made every list of three or more words fail the check and fall
    # through to a literal phrase search.
    only_letters <- !stringr::str_detect(
      stringr::str_replace_all(pattern, " ", ""),
      "\\P{L}"
    )
    if (has_space && only_letters) {
      words <- strsplit(pattern, split = " +")[[1]]
      # Leading or repeated spaces would otherwise yield empty "words".
      words <- words[nzchar(words)]
      if (length(words) > 0) {
        # One lookahead per word: each must appear somewhere in the text.
        pattern <- paste0(
          "^",
          paste0("(?=.*\\b", words, "\\b)", collapse = ""),
          ".*$"
        )
      }
    }
  } else {
    # Alternation of the supplied patterns, with flexible whitespace.
    alternatives <- paste0("(", paste0(pattern, collapse = ")|("), ")")
    pattern <- stringr::str_replace_all(alternatives, " ", "\\\\s+")
  }

  # Join lines with a space so that the last word of one line and the first
  # word of the next stay separate words. A space (not a newline) is used
  # because `.` in the lookahead pattern does not cross line terminators.
  text <- paste(readLines(file), collapse = " ")
  stringr::str_detect(text, stringr::regex(pattern, ignore_case = TRUE))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.