R/omit_duplicates.R

Defines functions omit_duplicates

Documented in omit_duplicates

#' Omit duplicate works from a corpus
#'
#' \code{omit_duplicates} removes works that appear more than once in a corpus.
#'
#' @param df A dataframe representing a corpus of downloaded texts generated by \code{\link[build_corpus]{build_corpus}}.
#'   The columns `id`, `title`, `author`, `city`, `date`, and `volume` are used.
#' @param exact Should works be considered duplicates only if they share both the same author's last name and the same city (along with matching title, publication date, and volume number)?
#'   Defaults to `FALSE`.
#'
#' @details
#' Because the Internet Archive's collection of texts includes many works more than once,
#' the output created by `build_corpus` will likely contain duplicates.  `omit_duplicates`
#' takes a fairly conservative approach to filtering out these duplicates.  By default, the
#' function considers works to be duplicates if the first ten words of the title are identical
#' and they have the same publication date, volume number, and either the same author's
#' last name, or the same city of publication (formatting issues are particularly common for
#' these two pieces of metadata).  Setting the `exact` argument to `TRUE` will only consider
#' works to be duplicates if they share both the same author's last name and the same city
#' of publication.
#'
#' A row whose `id` differs from the `id` of the row directly above it by exactly one
#' edit, and which has the same shortened title and date as that row, is flagged
#' separately, so it is never treated as a duplicate of an unflagged row.
#'
#' @return A dataframe with the same columns as `df`, sorted by `id`, keeping the first
#'   row (lowest `id`) of each set of duplicates.  A dataframe with fewer than two rows
#'   is returned unchanged.
#'
#' @examples
#' corpus <- data.frame(
#'     id = c("b_second", "a_first", "c_third"),
#'     title = c("The History of Rome", "the history of rome", "A Different Book"),
#'     author = c("Livy, Titus", "Titus Livy", "Doe, Jane"),
#'     city = c("London", "London, England", "Paris"),
#'     date = c(1850, 1850, 1900),
#'     volume = c(1, 1, 1),
#'     stringsAsFactors = FALSE
#' )
#' omit_duplicates(corpus)
#' omit_duplicates(corpus, exact = TRUE)
#'
#' @importFrom dplyr "%>%" select mutate
#' @importFrom stringr str_detect word
#'
#' @export
omit_duplicates <- function(df, exact = FALSE) {
    n_rows <- nrow(df)

    # With fewer than two rows nothing can be a duplicate.  Returning early also
    # avoids the error base data frames raise when a length-one value is
    # assigned to a column of a zero-row frame.
    if (n_rows < 2) {
        return(df)
    }

    # Normalized helper columns used only for matching; all are dropped again
    # before returning.
    df_new <- dplyr::mutate(
        df,
        # Number of words = number of runs of non-letters + 1.
        title_length = stringr::str_count(title, "\\P{L}+") + 1,
        # First ten words (or fewer), lower-cased, separators collapsed to " ".
        title_short = stringr::str_replace_all(
            tolower(
                stringr::word(title,
                              start = 1,
                              end = pmin(title_length, 10),
                              sep = "\\P{L}+")
            ),
            "\\P{L}+",
            " "
        ),
        # First word of the city field.
        city_short = stringr::word(city, 1, sep = "\\W+"),
        # "Last, First" -> text before the first comma; otherwise the last
        # space-separated word.
        author_short = ifelse(
            stringr::str_detect(author, ","),
            tolower(stringr::str_replace(author, "(^[^,]*).*", "\\1")),
            tolower(stringr::word(author, -1))
        ),
        volume2 = FALSE
    )

    # Flag rows that look like a further volume of the row directly above:
    # ids one edit apart, same shortened title, same date.  A comparison that
    # involves NA may yield NA, which is stored as is; distinct() below then
    # treats NA as a value of its own.
    # NOTE(review): adjacency is evaluated on the incoming row order, i.e.
    # before the sort by id below -- confirm callers pass rows in id order.
    ids <- df_new$id
    titles <- df_new$title_short
    dates <- df_new$date
    follows_previous <- vapply(
        seq_len(n_rows)[-1],
        function(i) {
            as.numeric(utils::adist(ids[i], ids[i - 1])) == 1 &
                titles[i] == titles[i - 1] &
                dates[i] == dates[i - 1]
        },
        logical(1)
    )
    df_new$volume2 <- c(FALSE, follows_previous)

    # Sort by id so that distinct() keeps the lowest id within each group.
    df_new <- dplyr::arrange(df_new, id)

    if (exact) {
        # Author AND city must both match.
        df_new <- dplyr::distinct(df_new,
                                  title_short, author_short, city_short,
                                  date, volume, volume2,
                                  .keep_all = TRUE)
    } else {
        # Author OR city must match: drop same-city duplicates first, then
        # same-author duplicates among the remaining rows.
        df_new <- dplyr::distinct(df_new,
                                  title_short, city_short,
                                  date, volume, volume2,
                                  .keep_all = TRUE)
        df_new <- dplyr::distinct(df_new,
                                  title_short, author_short,
                                  date, volume, volume2,
                                  .keep_all = TRUE)
    }

    dplyr::select(df_new,
                  -title_length, -author_short, -title_short,
                  -city_short, -volume2)
}
mariolaespinosa/historicalnetworks documentation built on Feb. 9, 2022, 12:31 p.m.