R/subset_corpus.R

Defines functions search_file detect_in_file subset_corpus

Documented in subset_corpus

#' Return Works of a Corpus that Include Matching Text
#'
#' \code{subset_corpus} returns the works in a corpus whose texts include all of the words (or match the \link[stringi:stringi-search-regex]{regular expression}) specified in the \code{pattern} argument.
#' 
#' @param corpus A dataframe representing a corpus of downloaded texts generated by \code{\link{build_corpus}}.
#' @param pattern A character string (or vector) containing a series of words or a \link[stringi:stringi-search-regex]{regular expression}.
#' 
#' @details
#' \code{subset_corpus} returns the works in a corpus whose texts include all of the words (or match the \link[stringi:stringi-search-regex]{regular expression}) specified in the \code{pattern} argument.  It can therefore be used to identify a subset of the corpus that is of particular interest.
#' 
#' 
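#' @return A dataframe containing the rows of \code{corpus} whose texts match \code{pattern}, sorted by \code{author} and \code{date}.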
#'
#' @examples
#' \dontrun{
#'  miasma_yf <- subset_corpus(yf_corpus, "miasma")
#' }
#' 
#' 
#' @importFrom dplyr "%>%" select mutate filter arrange
#' @importFrom stringr str_detect
#'
#' @export

subset_corpus <- function(corpus, pattern) {
    # Flag every work according to whether the text stored in its local file
    # matches the requested pattern.
    flagged <- mutate(corpus, matching = detect_in_file(local_file, p = pattern))

    # Keep only the flagged works, then discard the temporary flag column.
    hits <- select(filter(flagged, matching), -matching)

    # Hand back the surviving works ordered by author, then by date.
    arrange(hits, author, date)
}

# Apply search_file() to each entry of `files` and collect the results.
#
# files: the files to search, one entry per file.
# p:     the search pattern, passed unchanged to search_file() for every file.
#
# Returns a logical vector with one element per entry of `files`.
detect_in_file <- function(files, p) {
    matches_pattern <- function(path) search_file(file = path, pattern = p)
    map_lgl(files, matches_pattern)
}

# Test whether the text stored in one file matches a search pattern.
#
# file:    the file to read; it is passed straight to readLines().
# pattern: a character vector, interpreted in one of three ways:
#   * a single string made up only of letters and spaces (for example
#     "yellow fever miasma") is treated as a list of words, ALL of which must
#     occur in the text as whole words, in any order;
#   * any other single string is used as a regular expression as given;
#   * a vector of several strings matches when ANY one of them matches, with
#     every space standing for a run of whitespace.
#
# Matching ignores case. Returns the logical result of stringr::str_detect().
search_file <- function(file, pattern) {
    if (length(pattern) == 1) {
        # Strip EVERY space before looking for non-letters. Removing only the
        # first one left a space behind in any pattern of three or more words,
        # so those patterns were never recognised as word lists.
        is_word_list <- stringr::str_detect(pattern, " ") &&
            !stringr::str_detect(stringr::str_replace_all(pattern, " ", ""),
                                 "\\P{L}")
        if (is_word_list) {
            words <- strsplit(pattern, split = " ")[[1]]
            # Repeated, leading or trailing spaces yield empty tokens; drop them.
            words <- words[nzchar(words)]
            if (length(words) > 0) {
                # One lookahead per word: each must appear somewhere in the
                # text as a whole word, regardless of order.
                pattern <- paste0("^",
                                  paste0("(?=.*\\b", words, "\\b)",
                                         collapse = ""),
                                  ".*$")
            }
        }
    } else {
        # Several patterns: wrap each in a group, join them as alternatives,
        # and let any space match a run of whitespace.
        pattern <- stringr::str_replace_all(
            paste0("(", paste0(pattern, collapse = ")|("), ")"),
            " ", "\\\\s+")
    }

    # Join the lines with a space so that the last word of one line and the
    # first word of the next stay separate words; joining with "" fused them
    # and hid them from the \b word-boundary tests above.
    text <- paste(readLines(file), collapse = " ")
    stringr::str_detect(text, stringr::regex(pattern, ignore_case = TRUE))
}
mariolaespinosa/historicalnetworks documentation built on Feb. 9, 2022, 12:31 p.m.