#' Remove duplicate works from a corpus
#'
#' \code{omit_duplicates} filters a corpus data frame so that works which
#' appear to be the same edition are kept only once.
#'
#' @param df A data frame representing a corpus of downloaded texts, such as
#'   the one generated by \code{\link{build_corpus}}. It must contain the
#'   columns \code{id}, \code{title}, \code{author}, \code{city}, \code{date}
#'   and \code{volume}.
#' @param exact Should works be considered duplicates only if they share both
#'   the same author's last name and the same city (along with matching title,
#'   publication date, and volume number)? Defaults to \code{FALSE}.
#'
#' @details
#' Because the Internet Archive's collection of texts includes many works more
#' than once, the output created by `build_corpus` will likely contain
#' duplicates. `omit_duplicates` takes a fairly conservative approach to
#' filtering out these duplicates. By default, the function considers works to
#' be duplicates if the first ten words of the title are identical (ignoring
#' case and punctuation) and they have the same publication date, volume
#' number, and either the same author's last name, or the same city of
#' publication (formatting issues are particularly common for these two pieces
#' of metadata). Setting the `exact` argument to `TRUE` will only consider
#' works to be duplicates if they share both the same author's last name and
#' the same city of publication.
#'
#' Rows are sorted by `id` before they are compared. A row whose `id` differs
#' by exactly one character edit from the `id` of the row before it, and which
#' has the same shortened title and date, is treated as a further volume of
#' that work rather than as a duplicate of it.
#'
#' @return A data frame with the same columns as `df`, sorted by `id`, in which
#'   only the first row (in `id` order) of each set of duplicates is kept. An
#'   input with fewer than two rows is returned unchanged.
#'
#' @examples
#' works <- data.frame(
#'   id = c("republicofplato00plat", "republicplato1888plat"),
#'   title = c("The Republic of Plato", "The republic of Plato"),
#'   author = c("Plato", "Plato"),
#'   city = c("London", "London"),
#'   date = c(1888, 1888),
#'   volume = c(NA, NA),
#'   stringsAsFactors = FALSE
#' )
#' # Both rows share title, author, city, date and volume, so one is dropped.
#' omit_duplicates(works)
#'
#' @importFrom dplyr "%>%" select mutate arrange distinct
#' @importFrom stringr str_detect word str_count str_replace str_replace_all
#' @importFrom utils adist
#'
#' @export
omit_duplicates <- function(df, exact = FALSE) {
  if (!is.data.frame(df)) {
    stop("`df` must be a data frame.", call. = FALSE)
  }
  required_cols <- c("id", "title", "author", "city", "date", "volume")
  missing_cols <- setdiff(required_cols, names(df))
  if (length(missing_cols) > 0) {
    stop(
      "`df` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Fewer than two rows cannot contain a duplicate.
  if (nrow(df) < 2) {
    return(df)
  }

  # Sort by id *before* building the comparison keys: the "next volume" check
  # below compares each row with the one directly above it, which is only
  # meaningful (and independent of the input row order) once ids are in order.
  df_new <- df %>%
    arrange(id) %>%
    mutate(
      # Number of letter-runs in the title (separators are non-letter runs).
      title_length = str_count(title, "\\P{L}+") + 1,
      # First ten "words" of the title, lower-cased, separators normalised.
      title_short = word(
        title,
        start = 1,
        end = pmin(title_length, 10),
        sep = "\\P{L}+"
      ) %>%
        tolower() %>%
        str_replace_all("\\P{L}+", " "),
      # First word of the city field.
      city_short = word(city, 1, sep = "\\W+"),
      # "Last, First" -> text before the first comma; otherwise the last word.
      author_short = ifelse(
        str_detect(author, ","),
        tolower(str_replace(author, "(^[^,]*).*", "\\1")),
        tolower(word(author, -1))
      ),
      volume2 = FALSE
    )

  # Flag rows that look like a further volume of the row above them: the id is
  # one edit away and the shortened title and date match.
  # NOTE(review): the flag is binary, so a third and later volume of a set
  # that has no `volume` metadata share the same key and collapse into one
  # row below -- confirm whether that is intended.
  current <- seq_len(nrow(df_new))[-1]
  previous <- current - 1L
  ids <- as.character(df_new$id)
  id_distance <- vapply(
    current,
    function(i) as.numeric(adist(ids[i], ids[i - 1L])),
    numeric(1)
  )
  df_new$volume2[current] <- id_distance == 1 &
    df_new$title_short[current] == df_new$title_short[previous] &
    df_new$date[current] == df_new$date[previous]

  if (exact) {
    df_new <- df_new %>%
      distinct(
        title_short, author_short, city_short, date, volume, volume2,
        .keep_all = TRUE
      )
  } else {
    # Two passes: same city first, then same author, so that a match on
    # either field is enough to count as a duplicate.
    df_new <- df_new %>%
      distinct(
        title_short, city_short, date, volume, volume2,
        .keep_all = TRUE
      ) %>%
      distinct(
        title_short, author_short, date, volume, volume2,
        .keep_all = TRUE
      )
  }

  # Drop the helper columns so the result has the same columns as the input.
  df_new %>%
    select(-title_length, -author_short, -title_short, -city_short, -volume2)
}
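# NOTE(review): the two lines below appear to be leftover text from a web page
# and are not R code; they are commented out so that the file parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.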