R/deduplication_functions.R

Defines functions override_duplicates review_duplicates deduplicate extract_unique_references find_duplicates

Documented in deduplicate extract_unique_references find_duplicates override_duplicates review_duplicates

#' Detect duplicate values
#'
#' Identifies duplicate bibliographic entries using different duplicate
#' detection methods.
#' @param data A character vector containing duplicate bibliographic entries.
#' @param method A string indicating how matching should be calculated. Either
#' `"exact"` for exact matching (the default), or the name of a function for
#' calculating string distance.
#' @param group_by An optional vector, data.frame or list containing data to use
#' as 'grouping' variables; that is, categories within which duplicates should
#' be sought. Defaults to NULL, in which case all entries are compared against
#' all others. Ignored if `method = "exact"`.
#' @param threshold Numeric: the cutoff threshold for deciding if two strings
#' are duplicates. Sensible values depend on the `method` chosen. Defaults to 5
#' if `method = "string_osa"` and must be specified in all other instances
#' except `method = "exact"` (where no threshold is required).
#' @param to_lower Logical: Should all entries be converted to lower case before
#' calculating string distance? Defaults to `FALSE.`
#' @param rm_punctuation Logical: Should punctuation should be removed before
#' calculating string distance? Defaults to `FALSE.`
#' @return Returns a vector of duplicate matches, with `attributes` listing
#' methods used.
#' @seealso \code{\link{string_}} or \code{\link{fuzz_}} for suitable functions
#' to pass to \code{methods}; \code{\link{extract_unique_references}} and
#' \code{\link{deduplicate}} for higher-level functions.
#' @importFrom rlang abort
#' @example inst/examples/deduplicate.R
#' @export
find_duplicates <- function(
  data, # string
  method = "exact",
  group_by, # either a vector or a data.frame or list containing vectors
  threshold,
  to_lower = FALSE,
  rm_punctuation = FALSE
){
  # data
  if(missing(data)){
    abort("'data' is missing: Please provide a vector")
  }
  if(inherits(data, "data.frame")){
    abort("'data' must be a character vector, not a data.frame")
  }
  if(!inherits(data, "character")){
    data <- as.character(data)
  }

  # grouping
  if(missing(group_by)){
    group_variables <- NULL
  }else{
    if(!inherits(group_by, c("data.frame", "character", "list"))){
      abort("object specified by 'group_by' must be of class list, data.frame or character")
    }
    group_variables <- switch(class(group_by),
      "character" = {list(group_by)},
      "data.frame" = {as.list(group_by)},
      "list" = {group_by}
    )
  }

  # threshold
  if(missing(threshold)){
    if(method == "exact"){
      threshold <- NULL
    }else if(method == "string_osa"){
      threshold <- 5
    }else{
      abort("threshold is missing, with no default")
    }
  }

  # transformations
  if(to_lower){data <- tolower(data)}
  if(rm_punctuation){data <- gsub("[[:punct:]]", "", data)}

  # quick option for exact matching based on split()
  if(method == "exact"){
    order_initial <- seq_along(data)
    # ensure NAs are always given a unique value
    data[is.na(data)] <- paste0("MISSING_VALUE_", seq_along(which(is.na(data))))
    # split data by name
    order_list <- split(order_initial, data)
    names(order_list) <- NULL
    order_list <- order_list[order(unlist(lapply(order_list, min)))]
    result <- do.call(c, lapply(
      seq_along(order_list),
      function(a, ol){rep(a, length(ol[[a]]))},
      ol = order_list
    ))
    result <- result[order(do.call(c, order_list))]

  }else{ # i.e. if a distance-based method is requested

    # prep vectors to track progress
    checked <- rep(FALSE, length(data))
    group <- rep(FALSE, length(data))

    # run 'while' loop
    progress <- 1
    while(all(checked) == FALSE){
      remaining_rows <- which(checked == FALSE)
      if(length(remaining_rows) == 1){
        group[remaining_rows] <- progress
        checked[remaining_rows] <- TRUE
      }else{
        # locate relevant information
        row_start <- remaining_rows[1]
        # if this entry is empty, then skip (i.e. never match NAs)
        if(is.na(data[row_start])){
          checked[row_start] <- TRUE
          group[row_start] <- progress
        }else{
          # include only those entries in the same grouping categories as the current entry,
          # plus any entries that are missing those values
          if(is.null(group_variables)){
            rows_tr <- remaining_rows
          }else{
            match_list <- lapply(
              group_variables,
              function(a, row){
                if(is.na(a[row])){rep(TRUE, length(a))}else{a == a[row]}
              }, row = row_start
            )
            if(length(group_variables) == 1){
              rows_tr <- which(unlist(match_list))
            }else{
              rows_tr <- which(apply(
                do.call(cbind, match_list),
                1,
                function(a){all(a)}
              ))
            }
          }
          rows_tr <- rows_tr[which(rows_tr != row_start)]

          if(length(rows_tr) > 0){
            match_result <- do.call(
              method,
              list(
                a = data[row_start],
                b = data[rows_tr]
              )
            ) <= threshold
            if(any(match_result, na.rm = TRUE)){
              rows_selected <- rows_tr[which(match_result)]
              checked[c(row_start, rows_selected)] <- TRUE
              group[c(row_start, rows_selected)] <- progress
            }else{
              checked[row_start] <- TRUE
              group[row_start] <- progress
            }
          }else{
            checked[row_start] <- TRUE
            group[row_start] <- progress
          }
        } # end if(is.na(data[row_start, match_variable]))
      } # end if(length(remaining_rows) == 1)
      progress <- progress + 1
    } # end while loop
    result <- group
  }

  # add attributes
  attr(result, "method") <- method
  attr(result, "threshold") <- threshold
  attr(result, "to_lower") <- to_lower
  attr(result, "rm_punctuation") <- rm_punctuation
  return(result)
}

#' Remove duplicates from a bibliographic data set
#'
#' @description Given a list of duplicate entries and a data set, this function
#' extracts only unique references.
#' @param data A `data.frame` containing bibliographic information.
#' @param matches A vector showing which entries in `data` are duplicates.
#' @param type How should entries be selected to retain? Default is `"merge"`,
#' which selects the entries with the largest number of characters in each
#' column. Alternatively, `"select"` returns the row with the highest total
#' number of characters.
#' @return Returns a `data.frame` of unique references.
#' @seealso `find_duplicates()`, `deduplicate()`
#' @importFrom rlang abort
#' @example inst/examples/deduplicate.R
#' @export
extract_unique_references <- function(
  data, # data.frame
  matches, # vector showing which values are duplicates
  type = "merge"
){
  if(missing(matches)){
    a("please specify a vector containing identified matches
    (e.g. as returned by find_duplicates)")
  }
  if(length(matches) != nrow(data)){
    abort("'matches' does not have the same length as x")
  }
  if(!type %in% c("merge", "select")){
    abort("'type' must be one of 'select' or 'merge'")
  }

  x_split <- split(data, matches)
  x_split <- lapply(x_split, function(a, type){
    if(nrow(a) == 1){
      result <- a[1, ]
      result$n_duplicates <- 1
    }else{
      if(type == "merge"){
        result_list <- lapply(a, function(b){
          nx <- nchar(as.character(b))
          nx[is.na(nx)] <- 0
          return(b[which.max(nx)])
        })
        result <- as.data.frame(result_list, stringsAsFactors = FALSE)
      }else{
        row <- which.max(
          apply(
            apply(a, 1, nchar),
            2,
            function(b){sum(b, na.rm = TRUE)}
          )
        )
        result <- a[row, ]
      }
      result$n_duplicates <- nrow(a)
    }
    return(result)
  }, type = type
  )
  output <- as.data.frame(
    do.call(rbind, x_split),
    stringsAsFactors = FALSE
  )
  return(output)
}

#' Remove duplicates from a bibliographic data set
#'
#' @description Removes duplicates using sensible defaults
#'
#' @param data A `data.frame` containing bibliographic information.
#' @param match_by Name of the column in `data` where duplicates should be sought.
#' @param method The duplicate detection function to use; see
#' \code{link{string_}} or \code{link{fuzz_}} for examples. Passed to
#' `find_duplicates()`.
#' @param type How should entries be selected? Default is `"merge"` which
#' selects the entries with the largest number of characters in each column.
#' Alternatively `"select"` returns the row with the highest total number of
#' characters.
#' @param \dots Arguments passed to `find_duplicates()`.
#' @return A `data.frame` containing data identified as unique.
#' @details
#' This is a wrapper function to `find_duplicates()` and
#' `extract_unique_references()`, which tries to choose some sensible defaults.
#' Use with care.
#' @seealso `find_duplicates()` and `extract_unique_references()` for underlying
#' functions.
#' @example inst/examples/deduplicate.R
#' @export
deduplicate <- function(
  data,
  match_by,
  method,
  type = "merge",
  ... # args passed to find_duplicates
){
  if(missing(data)){
    abort("'data' is missing, with no default")
  }

  # add defaults
  if(missing(match_by)){
    if(any(colnames(data) == "doi")){
      data_fd <- data[, "doi"]
      if(missing(method)){method <- "exact"}
    }else{
      if(any(colnames(data) == "title")){
        data_fd <- data[, "title"]
        if(missing(method)){method <- "string_osa"}
      }else{
        abort("'match_by' is missing, with no default;
          please specify which column should be searched for duplicates"
        )
      }
    }
  }else{
    if(!any(colnames(data) == match_by)){
      abort(paste0(
        match_by,
        " is not a valid column name in ",
        data,
        ": Please specify which column should be searched for duplicates"
      ))
    }else{
      data_fd <- data[, match_by]
      if(missing(method)){method <- "string_osa"}
    }
  }

  result <- find_duplicates(as.character(data_fd),
                            method = method, ...)
  return(
    extract_unique_references(data, matches = result, type = type)
  )
}

#' Manually review potential duplicates
#' @description Allows users to manually review articles classified as
#' duplicates.
#' @param text A character vector of the text that was used to identify
#' potential duplicates.
#' @param matches Numeric: a vector of group numbers for texts that indicates
#' duplicates and unique values returned by the \code{\link{find_duplicates}}
#' function.
#' @return A `data.frame` of potential duplicates grouped together.
#' @export
review_duplicates <- function(text, matches){
  likely_duplicates <- unique(matches)[table(matches)>1]
  review <- data.frame(
    title = text[matches %in% likely_duplicates],
    matches = matches[matches %in% likely_duplicates],
    stringsAsFactors = FALSE
  )
  review <- review[order(review[,2]),]
  return(review)
}

#' Manually override duplicates
#' @description Re-assign group numbers to text that was classified as
#' duplicated but is unique.
#' @param matches Numeric: a vector of group numbers for texts that indicates
#' duplicates and unique values returned by the \code{\link{find_duplicates}}
#' function.
#' @param overrides Numeric: a vector of group numbers that are not true
#' duplicates.
#' @return The input \code{matches} vector with unique group numbers for members
#' of groups that the user overrides.
#' @export
override_duplicates <- function(matches, overrides){
  matches <- as.numeric(matches)
  for(i in 1:length(overrides)){
    matches[max(which(matches==overrides[i]))] <- max(matches)+1
  }
  return(matches)
}
rmetaverse/synthesisr documentation built on Feb. 23, 2025, 5:29 p.m.