R/standardize_subject_ids.R

Defines functions check_subject_ids_oness correct_subject_ids check_subject_ids

Documented in check_subject_ids check_subject_ids_oness correct_subject_ids

#' Check whether the subject IDs comply with the expected format. When incorrect
#' IDs are found, the function sends a warning and the user can call the
#' `correct_subject_ids()` function to correct them.
#'
#' @param data The input data frame or linelist
#' @param target_columns A vector of column names with the subject ids.
#' @param prefix A prefix used in the subject IDs
#' @param suffix A suffix used in the subject IDs
#' @param range A vector with the range of numbers in the sample IDs
#' @param nchar An integer that represents the expected number of characters in
#'    the subject ids.
#'
#' @returns The input dataset with a warning if incorrect subject ids were found
#'
#' @examples
#' dat <- check_subject_ids(
#'   data           = readRDS(system.file("extdata", "test_df.RDS",
#'                                        package = "cleanepi")),
#'   target_columns = "study_id",
#'   prefix         = "PS",
#'   suffix         = "P2",
#'   range          = c(1, 100),
#'   nchar          = 7
#' )
#' @export
check_subject_ids <- function(data,
                              target_columns,
                              prefix         = NULL,
                              suffix         = NULL,
                              range          = NULL,
                              nchar          = NULL) {
  checkmate::assert_data_frame(data, null.ok = FALSE)
  checkmate::assert_character(target_columns, null.ok = FALSE,
                              any.missing = FALSE, len = 1L)
  checkmate::assert_vector(prefix, min.len = 1L, null.ok = TRUE,
                           any.missing = FALSE)
  checkmate::assert_vector(suffix, min.len = 1L, null.ok = TRUE,
                           any.missing = FALSE)
  checkmate::assert_vector(range, any.missing = FALSE, min.len = 2L,
                           null.ok = TRUE, unique = TRUE, max.len = 2L)
  checkmate::assert_numeric(nchar, null.ok = TRUE, any.missing = FALSE,
                            len = 1L)

  # get the correct names in case some have been modified - see the
  # `retrieve_column_names()` function for more details
  target_columns <- retrieve_column_names(data, target_columns)

  # coerce id column to character
  if (is.numeric(data[[target_columns]]) || is.factor(data[[target_columns]])) {
    data[[target_columns]] <- as.character(data[[target_columns]])
  }

  # check for missing and duplicated ids
  data       <- check_subject_ids_oness(data, target_columns)
  bad_rows   <- NULL
  regex_test <- ""

  # we will use regular expressions to match on prefix and suffix
  if (!is.null(prefix)) {
    prefix     <- paste(prefix, sep = "|")
    regex_test <- paste0(regex_test, "pre_")
  }
  if (!is.null(suffix)) {
    suffix     <- paste(suffix, sep = "|")
    regex_test <- paste0(regex_test, "suf")
  }
  regex_match  <- switch(regex_test,
                         pre_    = sprintf("^%s", prefix),
                         pre_suf = sprintf("^%s.*%s$", prefix, suffix),
                         suf     = sprintf("*%s$", suffix))
  if (!is.null(regex_match)) {
    bad_rows <- c(bad_rows, which(!grepl(regex_match, data[[target_columns]])))
  }

  # the usage of regular expression to determine whether numbers belong to a
  # specified range is not trivial. we use an approach where we parse numbers
  # only.
  if (!is.null(range)) {
    numbers_in <- as.numeric(unlist(lapply(data[[target_columns]],
                                           readr::parse_number)))
    bad_rows   <- c(
      bad_rows,
      which(!(numbers_in >= min(range) & numbers_in <= max(range)))
    )
  }

  # detect subject IDs where the number of characters is not as expected
  if (!is.null(nchar)) {
    bad_rows <- c(bad_rows, which(!nchar(data[[target_columns]]) == nchar))
  }

  # remove the incorrect rows
  if (!is.null(bad_rows)) {
    bad_rows     <- unique(bad_rows)
    tmp_report   <- data.frame(idx = bad_rows,
                               ids = data[[target_columns]][bad_rows])
    bad_rows     <- toString(bad_rows)
    warning("Detected incorrect subject ids at lines: ", bad_rows,
            "\nUse the correct_subject_ids() function to adjust them.\n",
            call. = FALSE)
    data         <- add_to_report(x     = data,
                                  key   = "incorrect_subject_id",
                                  value = tmp_report)
  }

  return(data)
}

#' Correct the wrong subject IDs based on the user-provided values.
#'
#' After detecting incorrect subject IDs from the `check_subject_ids()`
#' function, use this function to provide the correct IDs and perform the
#' substitution.
#'
#' @inheritParams check_subject_ids
#' @param correction_table A data frame with the following two columns:
#'  \enumerate{
#'    \item `from`: a column with the wrong subject IDs,
#'    \item `to`: a column with the values to be used to substitute the
#'        incorrect ids.
#'  }
#'
#' @returns The input dataset where all subject ids comply with the expected
#'    format.
#' @export
#'
#' @examples
#' # detect the incorrect subject ids
#' dat <- check_subject_ids(
#'   data           = readRDS(system.file("extdata", "test_df.RDS",
#'                                        package = "cleanepi")),
#'   target_columns = "study_id",
#'   prefix         = "PS",
#'   suffix         = "P2",
#'   range          = c(1, 100),
#'   nchar          = 7
#' )
#'
#' # generate the correction table
#' correction_table <- data.frame(
#'   from = c("P0005P2", "PB500P2", "PS004P2-1"),
#'   to   = c("PB005P2", "PB050P2", "PS004P2")
#' )
#'
#' # perform the correction
#' dat <- correct_subject_ids(
#'   data             = dat,
#'   target_columns   = "study_id",
#'   correction_table = correction_table
#' )
correct_subject_ids <- function(data, target_columns, correction_table) {
  checkmate::assert_data_frame(correction_table, any.missing = FALSE,
                               min.rows = 1L, ncols = 2L, null.ok = FALSE,
                               col.names = "named")
  checkmate::assert_names(names(correction_table),
                          identical.to = c("from", "to"))

  stopifnot("Some ids in the correction table were not found in the input data"
            = all(correction_table[["from"]] %in% data[[target_columns]]))

  # perform the substitution
  idx                         <- match(correction_table[["from"]],
                                       data[[target_columns]])
  data[[target_columns]][idx] <- correction_table[["to"]]

  # check whether substitution did not introduce any duplicate
  data                        <- check_subject_ids_oness(data, target_columns)

  return(data)
}

#' Checks the uniqueness in values of the sample IDs column
#'
#' @param data the input data frame
#' @param id_col_name the name of the column that contains the sample IDs
#'
#' @returns the input data frame with and extra element in its attributes when
#'    there are missing or duplicated IDs.
#' @keywords internal
#'
check_subject_ids_oness <- function(data, id_col_name) {
  # check for missing values in ID column
  if (anyNA(data[[id_col_name]])) {
    idx                <- which(is.na(data[[id_col_name]]))
    warning("\nMissing values found on ID column in lines: ",
            paste(idx, sep = ", "), call. = FALSE)
    data <- add_to_report(x     = data,
                          key   = "missing_ids",
                          value = paste(idx, sep = ", "))
  }

  # check for duplicates ID column
  duplicated_ids       <- find_duplicates(data, id_col_name)
  tmp_report           <- attr(duplicated_ids, "report")
  if (!is.null(tmp_report) && "duplicated_rows" %in% names(tmp_report) &&
      nrow(tmp_report[["duplicated_rows"]]) > 0L) {
    dups               <- tmp_report[["duplicated_rows"]]
    data               <- add_to_report(x     = data,
                                        key   = "duplicated_ids",
                                        value = dups)
  }

  return(data)
}

Try the cleanepi package in your browser

Any scripts or data that you put into this service are public.

cleanepi documentation built on June 22, 2024, 9:48 a.m.