#' Identify Potentially Citing Works
#'
#' \code{find_citing} identifies those texts within a corpus that potentially cite to a collection of cited works
#'
#' @param corpus A dataframe representing a corpus of downloaded texts generated by \code{\link[build_corpus()]{build_corpus}}.
#' @param df A dataframe representing downloaded texts (the "cited works") whose citation histories are sought; see details.
#' @param near An optional regex pattern that, if specified, must be near to a potential citation.
#' @param max_distance The maximum number of words between the \code{near} pattern and the potential citation; the default value is 250.
#' @param verbose Should the function provide information about its progress?
#'
#' @details
#' The `find_citing` function casts a broad net for citing works, returning all works that include the cited work's author's last name that were published in or after the cited work's year of publication.
#'
#' @return A dataframe
#'
#' @examples
#' rush1794 <- yf_corpus %>%
#' filter(id = "101283166.nlm.nih.gov")
#'
#' mentions_rush <- yf_corpus %>%
#' find_citing(rush1794)
#'
#' @importFrom dplyr "%>%" select mutate filter bind_rows
#' @importFrom stringr str_detect str_replace
#' @importFrom purrrlyr invoke_rows
#'
#' @export
find_citing <- function(corpus, df, near, max_distance = 250, verbose = TRUE) {
if (!("cited_author" %in% names(df)) | !("cited_year" %in% names(df))) {
cited_info <- df %>%
transmute(cited_author = if_else(str_detect(author, ","),
str_replace(author, "(^[^,]*).*", "\\1"),
word(author, -1)),
cited_year = date) %>%
arrange(cited_author, cited_year) %>%
distinct(cited_author, .keep_all = TRUE)
} else {
cited_info <- df %>%
select(cited_author, cited_year)
}
if (verbose) cat("Now searching for: \n")
x <- purrrlyr::invoke_rows(.d = cited_info,
.collate = "rows",
.f = function(cited_author, cited_year) {
if (verbose) cat(paste("\t", cited_author, cited_year, "\n"))
later_works <- corpus %>%
filter(date >= cited_year) %>%
filter(cited_author != ifelse(str_detect(author, ","),
str_replace(author, "(^[^,]*).*", "\\1"),
author))
citing_works <- later_works %>%
mutate(citing = ifelse(cited_year <= date,
detect_in_file(local_file, cited_author),
FALSE)) %>%
filter(citing) %>%
select(-citing) %>%
progress_message(., verbose = verbose) %>%
mutate(archive_link = paste0("http://archive.org/stream/", id,
"#search/", str_replace(cited_author, " ", "+")),
city = ifelse(str_detect(publisher, ":"),
str_extract(publisher, "^.*?(?=:)") %>%
str_replace("(^[^,]*).*", "\\1"),
str_replace(publisher, "(^[^,]*).*", "\\1")) %>%
str_replace("^A\\s+", "") %>%
str_replace_all("[\\[\\]]", "") %>%
str_replace("-", " ") %>%
str_trim(),
cited = paste(cited_author, cited_year))
if (nrow(citing_works) > 1) {
citing_works <- citing_works %>%
omit_duplicates()
}
if (identical(citing_works, list(character(0)))) {
citing_works <- data_frame(id = character(0),
author = character(0),
date = numeric(0),
title = character(0),
cited = character(0),
city = character(0),
publisher = character(0),
creator2 = character(0),
volume = character(0),
url = character(0),
local_file = character(0),
archive_link = character(0),
classification = numeric(0))
}
citing_works <- citing_works %>%
select(id, author, date, title, everything())
return(citing_works)
} )
x <- x[!duplicated(names(x))]
x <- x %>%
select(-cited_author, -cited_year, -.row)
if (!missing(near)) {
x_list <- split(x, f = x$cited)
x_list <- lapply(x_list, function(citing_works) {
cited_author <- citing_works$cited[1] %>%
str_replace(" \\d{4}", "")
x1 <- subset_corpus(citing_works, match_near(near, cited_author, max_distance))
return(x1)
})
x <- bind_rows(x_list)
}
return(x)
}
progress_message <- function(x, verbose = FALSE) {
if (verbose) cat(paste("\t\t", nrow(x),
"potential citing works found \n"))
return(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.