R/c14_date_list_duplicates_remove.R
In c14bazAAR: Download and Prepare C14 Dates from Different Source Databases

Documented in remove_duplicates remove_duplicates.c14_date_list remove_duplicates.default

#### remove_duplicates ####

#' @rdname duplicates
#' @export
remove_duplicates <- function(x, preferences = NULL, supermerge = FALSE, log = TRUE) {
  UseMethod("remove_duplicates")
}

#' @rdname duplicates
#' @export
remove_duplicates.default <- function(x, preferences = NULL, supermerge = FALSE, log = TRUE) {
  stop("x is not an object of class c14_date_list")
}

#' @rdname duplicates
#' @export
remove_duplicates.c14_date_list <- function(x, preferences = NULL, supermerge = FALSE, log = TRUE) {

  # set usage option
  if (is.null(preferences) | !("sourcedb" %in% colnames(x))) {
    # 1. option: replace inconsistencies with NA
    removal_option <- 1
    message(
      "You did not provide the argument 'preferences' or your c14_date_list ",
      "does not contain the necessary column 'sourcedb'. That means that duplicates ",
      "are removed in a way that obscures conflicting information. As a result of this ",
      "vital data for your analysis might get lost. "
    )
    message(
      "Please check '?duplicates' for more information."
    )
  } else if (!supermerge & !is.null(preferences) & "sourcedb" %in% colnames(x)) {
    # 2. option: replace inconsistencies with the first value from the prefered database
    removal_option <- 2
  } else if (supermerge & !is.null(preferences) & "sourcedb" %in% colnames(x)) {
    # 3. option: supermerge
    removal_option <- 3
  }

  # filter dataset to database selection
  if (removal_option == 2 | removal_option == 3) {
    x %<>% dplyr::filter(
      .data$sourcedb %in% preferences
    ) %>%
      mark_duplicates()
  }

  # call functions if necessary duplicate_group column is missing
  if("duplicate_group" %in% colnames(x) %>% `!`) {
    x %<>% mark_duplicates()
  }

  # if if there are no duplicates, then stop. There's nothing to remove
  if(all(is.na(x[["duplicate_group"]]))) {
    message("No duplicates found.")
    return(x)
  }

  # start message:
  message(paste0("Removing duplicates... ", {if (nrow(x) > 10000) {"This may take several minutes."}}))

  # get all unique dates
  not_duplicates <- x %>%
    dplyr::filter(
      is.na(.data$duplicate_group)
    )

  # get all duplicates and order them by the duplicate group number
  duplicates <- x %>%
    dplyr::filter(
      !is.na(.data$duplicate_group)
    ) %>%
    dplyr::arrange(
      .data$duplicate_group
    )

  # combine the duplicates

  # 1. option
  if (removal_option == 1) {
    summarised_duplicates <- duplicates %>%
      split(., .$duplicate_group) %>%
      pbapply::pblapply(
        .,
        function(x) {
          dplyr::summarise_all(
            x,
            .funs = ~compare_and_combine_data_frame_values(.)
          )
        }
      ) %>%
      do.call(rbind, .) %>%
      dplyr::arrange(.data$duplicate_group)
  }

  # 2. option
  if (removal_option == 2) {
    preference_based_order <- unique(c(preferences, duplicates$sourcedb %>% unique))
    duplicates$sourcedb_factor <- factor(duplicates$sourcedb, levels = preference_based_order)
    summarised_duplicates <- duplicates %>%
      dplyr::group_by(.data$duplicate_group) %>%
      dplyr::arrange(.data$sourcedb_factor) %>%
      dplyr::filter(dplyr::row_number() == 1) %>%
      dplyr::ungroup() %>%
      dplyr::select(-.data$sourcedb_factor) %>%
      dplyr::arrange(.data$duplicate_group)
  }

  # 3. option
  if (removal_option == 3) {
    summarised_duplicates <- duplicates %>%
      dplyr::mutate(
        sourcedb_order = match(.data$sourcedb, preferences)
      ) %>%
      split(., .$duplicate_group) %>%
      pbapply::pblapply(
        .,
        function(x) {
          dplyr::summarise_all(
            x,
            .funs = ~supermerge_data_frame_values(., order = x$sourcedb_order)
          )
        }
      ) %>%
      do.call(rbind, .) %>%
      dplyr::select(
        -.data$sourcedb_order,
        -.data$sourcedb
      ) %>%
      dplyr::arrange(.data$duplicate_group)

    not_duplicates <- not_duplicates %>% dplyr::select(-.data$sourcedb)
  }

  # optional: add log string
  if (log) {
    # create log string: stringify variation in duplicates
    log_string <- duplicates %>%
      plyr::dlply("duplicate_group") %>%
      lapply(FUN = stringify_data_frame) %>%
      unlist
    # duplicates
    summarised_duplicates <- summarised_duplicates %>%
      dplyr::mutate(
        duplicate_remove_log = if(length(log_string) != 0) {
          log_string
        } else {
          NA_character_
        }
      )
    # not_duplicates
    if (nrow(not_duplicates) > 0) {
      not_duplicates$duplicate_remove_log <- NA
    }
  }

  # put not_duplicates and duplicates again together
  final_without_duplicates <- not_duplicates %>%
    rbind(summarised_duplicates)

  final_without_duplicates %>%
    dplyr::select(
      -.data$duplicate_group
    ) %>%
    as.c14_date_list() %>%
    return()
}

#### helper functions ####

stringify_data_frame <- function(x) {
  # remove all columns that are not character or numeric
  y <- x[, sapply(x, class) %in% c("character", "numeric", "double", "integer", "factor")]
  # create string representing the content of the data.frame
  sapply(colnames(y), function(x) {
    paste0(x, ": ", paste(unique(y[[x]]), collapse = "|")) }
  ) %>%
    paste(collapse = ", ") %>%
    return()
}

compare_and_combine_data_frame_values <- function(x) {
  # remove NA values
  y <- x[!is.na(x)]
  # if only NA, than return NA
  if(length(y) == 0) { return(x[1]) }
  # if all values are the same, than return this value
  if(all(y[1] == y)) {
    return(y[1])
  # else return NA
  } else {
    if(class(y) == "character") {
      return(NA_character_)
    } else {
      return(NA)
    }
  }
}

supermerge_data_frame_values <- function(x, order_vector) {
  # if all values are NA, than return NA
  if (all(is.na(x))) {
    if(class(x) == "character") {
      return(NA_character_)
    } else {
      return(NA)
    }
  # if all values are equal, than return that value
  } else if (length(unique(stats::na.omit(x))) == 1) {
    return(unique(stats::na.omit(x)))
  # else return the value with the highest rank
  } else {
    ordered <- x[order(order_vector)]
    return(ordered[which(!is.na(ordered))[1]])
  }
}

Any scripts or data that you put into this service are public.

c14bazAAR documentation built on March 26, 2020, 6:38 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

c14bazAAR
Download and Prepare C14 Dates from Different Source Databases

R/c14_date_list_duplicates_remove.R
In c14bazAAR: Download and Prepare C14 Dates from Different Source Databases

Defines functions remove_duplicates remove_duplicates.default remove_duplicates.c14_date_list

Documented in remove_duplicates remove_duplicates.c14_date_list remove_duplicates.default

Try the c14bazAAR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

c14bazAAR Download and Prepare C14 Dates from Different Source Databases

R/c14_date_list_duplicates_remove.R In c14bazAAR: Download and Prepare C14 Dates from Different Source Databases

Defines functions remove_duplicates remove_duplicates.default remove_duplicates.c14_date_list

Documented in remove_duplicates remove_duplicates.c14_date_list remove_duplicates.default

Try the c14bazAAR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

c14bazAAR
Download and Prepare C14 Dates from Different Source Databases

R/c14_date_list_duplicates_remove.R
In c14bazAAR: Download and Prepare C14 Dates from Different Source Databases