R/find_citing.R

Defines functions progress_message find_citing

Documented in find_citing

#' Identify Potentially Citing Works
#'
#' \code{find_citing} identifies those texts within a corpus that potentially cite to a collection of cited works 
#' 
#' @param corpus A dataframe representing a corpus of downloaded texts generated by \code{\link[build_corpus()]{build_corpus}}.
#' @param df A dataframe representing downloaded texts (the "cited works") whose citation histories are sought; see details.
#' @param near An optional regex pattern that, if specified, must be near to a potential citation.
#' @param max_distance The maximum number of words between the \code{near} pattern and the potential citation; the default value is 250.
#' @param verbose Should the function provide information about its progress?
#' 
#' @details
#' The `find_citing` function casts a broad net for citing works, returning all works that include the cited work's author's last name that were published in or after the cited work's year of publication.
#' 
#' @return A dataframe
#'
#' @examples
#' rush1794 <- yf_corpus %>% 
#'     filter(id = "101283166.nlm.nih.gov")
#' 
#' mentions_rush <- yf_corpus %>% 
#'     find_citing(rush1794)
#' 
#' @importFrom dplyr "%>%" select mutate filter bind_rows
#' @importFrom stringr str_detect str_replace
#' @importFrom purrrlyr invoke_rows
#'
#' @export


find_citing <- function(corpus, df, near, max_distance = 250, verbose = TRUE) {
    if (!("cited_author" %in% names(df)) | !("cited_year" %in% names(df))) {
        cited_info <- df %>%
            transmute(cited_author = if_else(str_detect(author, ","), 
                                            str_replace(author, "(^[^,]*).*", "\\1"),
                                            word(author, -1)),
                      cited_year = date) %>% 
            arrange(cited_author, cited_year) %>% 
            distinct(cited_author, .keep_all = TRUE)
    } else {
        cited_info <- df %>% 
            select(cited_author, cited_year)
    }
    
    if (verbose) cat("Now searching for: \n")
    
    x <- purrrlyr::invoke_rows(.d = cited_info,
                     .collate = "rows",
                     .f = function(cited_author, cited_year) {
                         if (verbose) cat(paste("\t", cited_author, cited_year, "\n"))
                         later_works <- corpus %>% 
                             filter(date >= cited_year) %>% 
                             filter(cited_author != ifelse(str_detect(author, ","), 
                                                     str_replace(author, "(^[^,]*).*", "\\1"),
                                                     author))
                         citing_works <- later_works %>%  
                             mutate(citing = ifelse(cited_year <= date, 
                                                    detect_in_file(local_file, cited_author),
                                                    FALSE)) %>% 
                             filter(citing) %>%
                             select(-citing) %>% 
                             progress_message(., verbose = verbose) %>% 
                             mutate(archive_link = paste0("http://archive.org/stream/", id,
                                                          "#search/", str_replace(cited_author, " ", "+")),
                                    city = ifelse(str_detect(publisher, ":"), 
                                                  str_extract(publisher, "^.*?(?=:)") %>% 
                                                      str_replace("(^[^,]*).*", "\\1"),
                                                  str_replace(publisher, "(^[^,]*).*", "\\1")) %>% 
                                        str_replace("^A\\s+", "") %>%
                                        str_replace_all("[\\[\\]]", "") %>%
                                        str_replace("-", " ") %>% 
                                        str_trim(),
                                    cited = paste(cited_author, cited_year)) 
                         if (nrow(citing_works) > 1) {
                             citing_works <- citing_works %>% 
                                 omit_duplicates()
                         }
                         if (identical(citing_works, list(character(0)))) {
                             citing_works <- data_frame(id = character(0), 
                                                        author = character(0),
                                                        date = numeric(0),
                                                        title = character(0),
                                                        cited = character(0),
                                                        city = character(0),
                                                        publisher = character(0),
                                                        creator2 = character(0),
                                                        volume = character(0),
                                                        url = character(0),
                                                        local_file = character(0),
                                                        archive_link = character(0),
                                                        classification = numeric(0)) 
                         }
                         citing_works <- citing_works %>% 
                             select(id, author, date, title, everything())
                         return(citing_works)
                     } )
    
    x <- x[!duplicated(names(x))]
    x <- x %>% 
        select(-cited_author, -cited_year, -.row)
    
    if (!missing(near)) {
        x_list <- split(x, f = x$cited)
        
        x_list <- lapply(x_list, function(citing_works) {
            cited_author <- citing_works$cited[1] %>% 
                str_replace(" \\d{4}", "")
            x1 <- subset_corpus(citing_works, match_near(near, cited_author, max_distance))
            return(x1)
        })
        
        x <- bind_rows(x_list)
    }
    
    return(x)
}

progress_message <- function(x, verbose = FALSE) {
    if (verbose) cat(paste("\t\t", nrow(x),
                           "potential citing works found \n"))
    return(x)
}
mariolaespinosa/historicalnetworks documentation built on Feb. 9, 2022, 12:31 p.m.