#' @title Identify potential duplicates based on title and year
#'
#' @param CitDat A dataframe/tibble returned by \code{\link[CitaviR]{find_obvious_dups}}
#' or \code{\link[CitaviR]{handle_obvious_dups}}.
#' @param minSimilarity Minimum similarity (between 0 and 1). Default is 0.6. (TO DO)
#' @param potDupAfterObvDup If TRUE (default), the newly created column
#' \code{pot_dup_id} is moved right next to the \code{obv_dup_id} column.
#' @param maxNumberOfComp Maximum number of clean_title similarity calculations to be made.
#' It is set to 1,000,000 by default (which corresponds to ~ 1414 clean_titles). TO DO: Document while-loop.
#' @param quiet If \code{TRUE}, all output will be suppressed.
#'
#' @details
#' `r lifecycle::badge("maturing")` \cr
#' Currently this only works for files that were generated while Citavi
#' was set to "English" so that column names are "Short Title" etc.
#'
#' @examples
#' example_path <- example_file("3dupsin5refs/3dupsin5refs.ctv6")
#' CitDat <- read_Citavi_ctv6(example_path) %>%
#' find_obvious_dups() %>%
#' find_potential_dups()
#'
#' CitDat %>%
#' dplyr::select(clean_title_id, obv_dup_id, pot_dup_id)
#'
#' # check similarity yourself - it's a single typo:
#' CitDat %>%
#' dplyr::select(clean_title)
#'
#' @return A tibble containing one new column: \code{pot_dup_id}.
#'
#' @importFrom RcppAlgos comboGeneral
#' @importFrom scales percent
#' @importFrom stats na.omit
#' @importFrom stringdist stringdist
#' @importFrom stringr str_length
#' @importFrom stringr str_pad
#' @importFrom tidyr pivot_longer
#' @import crayon
#' @import dplyr
#'
#' @export
find_potential_dups <- function(CitDat, minSimilarity = 0.6, potDupAfterObvDup = TRUE, maxNumberOfComp = 1000000, quiet = FALSE) {
bignum <- function(x){format(x, digits = 0, big.mark = ",", scientific = FALSE)}
# combinations of clean_title ---------------------------------------------
ct <- CitDat %>% pull(.data$clean_title) %>% unique # should be equal to filtering for dup_01
pot_dup_id_padding <- ct %>% n_distinct() %>% log(10) %>% ceiling() + 1
ct_similar <- RcppAlgos::comboGeneral(v = ct, 2) %>% # slower alternative: combn()
as_tibble(.name_repair = ~c("ct1", "ct2")) %>%
mutate(ct1nchar = stringr::str_length(.data$ct1), # number of characters in string
ct2nchar = stringr::str_length(.data$ct2)) # slower alternative: nchar()
# Too many combinations? --------------------------------------------------
NumberOfComp <- nrow(ct_similar)
ncharDiffCutoff <- 42 # Don't panic!
if (NumberOfComp > maxNumberOfComp) {
# Msg: Too many comparisons
if (!quiet) {
cat(
blue("clean_title comparisons ="),
bignum(NumberOfComp),
red(">"),
bignum(maxNumberOfComp),
blue(
"= maxNumberOfComp\n",
" Trying to ignore comparisons with large differences in character length:"
),
"\n "
)
}
# Reduce until not too many comparisons
while (NumberOfComp > maxNumberOfComp) {
if (!quiet) {
cat(blue(ncharDiffCutoff, ""))
}
ncharDiffCutoff <- ncharDiffCutoff - 1
ct_similar <-
ct_similar %>% filter(abs(.data$ct1nchar - .data$ct2nchar) <= ncharDiffCutoff)
NumberOfComp <- nrow(ct_similar)
if (ncharDiffCutoff == 1) {
stop("You must set a larger maxNumberOfComp!")
}
}
# Msg: Suitable reduction
if (!quiet) {
cat(
"\n",
blue(
" clean_title comparisons with a character length difference >",
ncharDiffCutoff,
"are ignored."
),
"\n"
)
}
}
# Msg: Not too many comparisons
if (!quiet) {
cat(
blue("clean_title comparisons ="),
bignum(NumberOfComp),
green("<"),
bignum(maxNumberOfComp),
blue("= maxNumberOfComp"),
"\n"
)
}
# calculate similarity ----------------------------------------------------
if (!quiet) {
cat(blue(" calculating similarity now..."), "\n")
}
sta <- Sys.time() # TO DO: Progress bar without losing efficiency?
ct_similar <- ct_similar %>%
mutate(
similarity = 1 -
stringdist::stringdist(.data$ct1, .data$ct2, method = "lv") /
pmax(.data$ct1nchar, .data$ct2nchar)
# RecordLinkage::levenshteinSim(str1 = .data$ct1, str2 = .data$ct2), # slower alternative
)
end <- Sys.time()
# Msg: Similarity calculation took this long
if (!quiet) {
cat(blue(
" calculating similarity done:",
round(end - sta, 1),
"sec elapsed"
),
"\n")
}
# create clean_title_similarity -------------------------------------------
ct_similar <- ct_similar %>% # similarity per pair
filter(.data$similarity >= minSimilarity) %>% # keep only those with a minimum similarity
arrange(desc(.data$similarity)) %>% # highest similarity ...
mutate(similarityRank = 1:n()) %>% # ... gets best rank
mutate(pot_dup_id = paste0( # "potdup_001 (99.9% similarity)" etc.
"potdup_", stringr::str_pad(.data$similarityRank, pot_dup_id_padding, pad = "0"),
" (", scales::percent(.data$similarity, accuracy = 0.1), " similarity)"
)) %>%
# columns served their purpose
select(-.data$similarity,
-.data$similarityRank,
-.data$ct1nchar,
-.data$ct2nchar) %>%
# convert wide: [ct1 ct2 pot_dup_id] to long: [ct pot_dup_id]
tidyr::pivot_longer(
cols = .data$ct1:.data$ct2,
values_to = "clean_title",
names_to = NULL
) %>%
# in case there are multiple pot_dup_id per clean_title: collapse
group_by(.data$clean_title) %>%
mutate(pot_dup_id = paste(stats::na.omit(.data$pot_dup_id), collapse =
"; ")) %>%
ungroup() %>%
unique()
# join with CitDat --------------------------------------------------------
CitDat <-
left_join(x = CitDat, y = ct_similar, by = "clean_title") %>%
mutate(pot_dup_id = if_else(
as.integer(gsub("dup_","",.data$obv_dup_id)) > 1, # not dup_01 or dup_001 or dup_0001 ...
NA_character_,
.data$pot_dup_id
))
# potDupAfterObvDup -------------------------------------------------------
if (potDupAfterObvDup) {
CitDat <- CitDat %>%
relocate("pot_dup_id", .after = "obv_dup_id")
}
# return tibble -----------------------------------------------------------
CitDat
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.