R/validate.R

Defines functions validateStrataArgument validateNameStyle addMesSup isResultSuppressed validateColumn validateNewColumn validateResultArgument validateCdmArgument nameAgeGroup thereIsOverlap isIntegerish correctAgeGroup validateAgeGroupArgument assertWindowName getWindowNames validateWindowArgument assertValidation validateConceptSetArgument getId removePackageName isTidySelect validateCohortIdArgument validateCohortArgument validateNameArgument

Documented in isResultSuppressed validateAgeGroupArgument validateCdmArgument validateCohortArgument validateCohortIdArgument validateColumn validateConceptSetArgument validateNameArgument validateNameStyle validateNewColumn validateResultArgument validateStrataArgument validateWindowArgument

# Copyright 2024 DARWIN EU (C)
#
# This file is part of omopgenerics
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' Validate name argument. It must be a snake_case character vector. You can add
#' the a cdm object to check `name` is not already used in that cdm.
#'
#' @param name Name of a new table to be added to a cdm object.
#' @param cdm A cdm_reference object. It will check if a table named name
#' already exists in the cdm.
#' @param null If TRUE, name can be NULL
#' @param validation How to perform validation: "error", "warning".
#' @param call A call argument to pass to cli functions.
#'
#' @export
#' @examples
#' # this is a validate name
#' name <- "my_new_table"
#' validateNameArgument(name)
#'
#' # this is not
#' name <- "myTableNAME"
#' validateNameArgument(name, validation = "warning")
#'
validateNameArgument <- function(name,
                                 cdm = NULL,
                                 validation = "error",
                                 null = FALSE,
                                 call = parent.frame()) {
  assertValidation(validation)
  if (isTRUE(null) && (is.null(name) | isTRUE(is.na(name)))) {
    return(NULL)
  }

  nm <- substitute(name) |> utils::capture.output()
  assertCharacter(name, length = 1, call = call)
  newName <- toSnakeCase(name)
  if (newName != name) {
    if (validation == "error") {
      cli::cli_abort(c("!" = "`{nm}` is not snake_case it should be modified to: {newName}"))
    } else if (validation == "warning") {
      cli::cli_warn(c("!" = "`{nm}` was modified: {name} -> {newName}"))
    }
  }
  if (!is.null(cdm)) {
    if (newName %in% names(cdm)) {
      if (validation == "error") {
        cli::cli_abort(c("!" = "There already exist a table named {.var {newName}}."))
      } else if (validation == "warning") {
        cli::cli_warn(c("!" = "There already exist a table named {.var {newName}}. It will be overwritten."))
      }
    }
  }
  return(newName)
}

#' Validate a cohort table input.
#'
#' @param cohort Object to be validated as a valid cohort input.
#' @param checkEndAfterStart If TRUE a check that all cohort end dates come on
#' or after cohort start date will be performed.
#' @param checkOverlappingEntries If TRUE a check that no individuals have
#' overlapping cohort entries will be performed.
#' @param checkMissingValues If TRUE a check that there are no missing values in
#' required fields will be performed.
#' @param checkInObservation If TRUE a check that cohort entries are within
#' the individuals observation periods will be performed.
#' @param checkAttributes Whether to check if attributes are present and
#' populated correctly.
#' @param checkPermanentTable Whether to check if the table has to be a
#' permanent table.
#' @param dropExtraColumns Whether to drop extra columns that are not the
#' required ones.
#' @param validation How to perform validation: "error", "warning".
#' @param call A call argument to pass to cli functions.
#'
#' @export
#' @examples
#' cdm <- cdmFromTables(
#'   tables = list(
#'     "person" = dplyr::tibble(
#'       person_id = c(1, 2, 3), gender_concept_id = 0, year_of_birth = 1990,
#'       race_concept_id = 0, ethnicity_concept_id = 0
#'     ),
#'     "observation_period" = dplyr::tibble(
#'       observation_period_id = 1:3, person_id = 1:3,
#'       observation_period_start_date = as.Date("2000-01-01"),
#'       observation_period_end_date = as.Date("2023-12-31"),
#'       period_type_concept_id = 0
#'     )
#'    ),
#'    cohortTables = list(
#'     cohort = dplyr::tibble(
#'       cohort_definition_id = 1L,
#'       subject_id = 1L,
#'       cohort_start_date = as.Date("2020-01-01"),
#'       cohort_end_date = as.Date("2021-02-10")
#'     )
#'   ),
#'   cdmName = "mock"
#' )
#'
#' validateCohortArgument(cdm$cohort)
#'
validateCohortArgument <- function(cohort,
                                   checkEndAfterStart = FALSE,
                                   checkOverlappingEntries = FALSE,
                                   checkMissingValues = FALSE,
                                   checkInObservation = FALSE,
                                   checkAttributes = FALSE,
                                   checkPermanentTable = FALSE,
                                   dropExtraColumns = FALSE,
                                   validation = "error",
                                   call = parent.frame()) {
  assertValidation(validation)
  assertLogical(checkEndAfterStart, length = 1)
  assertLogical(checkOverlappingEntries, length = 1)
  assertLogical(checkMissingValues, length = 1)
  assertLogical(checkInObservation, length = 1)
  assertLogical(checkAttributes, length = 1)
  assertLogical(dropExtraColumns, length = 1)
  assertLogical(checkPermanentTable, length = 1)

  assertClass(cohort, class = c("cohort_table", "cdm_table"), all = TRUE, call = call)

  if (is.na(tableName(cohort)) & checkPermanentTable) {
    missingCohortTableNameError(cdmReference(cohort), validation = validation)
  }

  # columns
  notPresent <- cohortColumns("cohort")[!cohortColumns("cohort") %in% colnames(cohort)]
  if (length(notPresent) > 0) {
    if (validation == "error") {
      cli::cli_abort(c("!" = "columns: {.var {notPresent}} not present in cohort object"), call = call)
    } else if (validation == "warning") {
      cli::cli_warn(c("!" = "columns: {.var {notPresent}} not present in cohort object"), call = call)
    }
  }
  if (isTRUE(checkEndAfterStart)) {
    cohort <- checkStartEnd(cohort = cohort, validation = validation, call = call)
  }
  if (isTRUE(checkOverlappingEntries)) {
    cohort <- checkOverlap(cohort = cohort, validation = validation, call = call)
  }
  if (isTRUE(checkMissingValues)) {
    cohort <- checkNaCohort(cohort = cohort, validation = validation, call = call)
  }
  if (isTRUE(checkInObservation)) {
    cohort <- checkObservationPeriod(cohort = cohort, validation = validation, call = call)
  }
  if (isTRUE(checkAttributes)) {
    cohort <- checkCohortAttributes(cohort = cohort, validation = validation, call = call)
  }
  if (dropExtraColumns) {
    cols <- colnames(cohort)
    extraColumns <- cols[!cols %in% cohortColumns("cohort")]
    if (length(extraColumns) > 0) {
      cli::cli_warn(c("!" = "Extra columns dropped: {.var {extraColumns}}."))
    }
    cohort <- cohort |>
      dplyr::select(dplyr::any_of(cohortColumns("cohort")))
  } else {
    cohort <- cohort |>
      dplyr::relocate(dplyr::any_of(cohortColumns("cohort")))
  }

  return(cohort)
}

#' Validate cohortId argument. CohortId can either be a cohort_definition_id
#' value, a cohort_name or a tidyselect expression referinc to cohort_names. If
#' you want to support tidyselect expressions please use the function as:
#' `validateCohortIdArgument({{cohortId}}, cohort)`.
#'
#' @param cohortId A cohortId vector to be validated.
#' @param cohort A cohort_table object.
#' @param null Whether `NULL` is accepted. If NULL all `cohortId` will be
#' returned.
#' @param validation How to perform validation: "error", "warning".
#' @param call A call argument to pass to cli functions.
#'
#' @export
#' @examples
#' cdm <- cdmFromTables(
#'   tables = list(
#'     "person" = dplyr::tibble(
#'       person_id = c(1, 2, 3), gender_concept_id = 0, year_of_birth = 1990,
#'       race_concept_id = 0, ethnicity_concept_id = 0
#'     ),
#'     "observation_period" = dplyr::tibble(
#'       observation_period_id = 1:3, person_id = 1:3,
#'       observation_period_start_date = as.Date("2000-01-01"),
#'       observation_period_end_date = as.Date("2023-12-31"),
#'       period_type_concept_id = 0
#'     )
#'    ),
#'    cohortTables = list(
#'     cohort = dplyr::tibble(
#'       cohort_definition_id = 1L,
#'       subject_id = 1L,
#'       cohort_start_date = as.Date("2020-01-01"),
#'       cohort_end_date = as.Date("2021-02-10")
#'     )
#'   ),
#'   cdmName = "mock"
#' )
#'
#' validateCohortIdArgument(NULL, cdm$cohort)
#' validateCohortIdArgument(1L, cdm$cohort)
#' validateCohortIdArgument(2L, cdm$cohort, validation = "warning")
#'
validateCohortIdArgument <- function(cohortId,
                                     cohort,
                                     null = TRUE,
                                     validation = "error",
                                     call = parent.frame()) {
  assertValidation(validation)
  assertClass(cohort, class = "cohort_table", call = call)
  assertLogical(null, length = 1)

  set <- settings(cohort)

  if (isTidySelect(rlang::enquo(cohortId))) {
    cohortId <- selectTables(set$cohort_name, cohortId)
  }

  if (is.null(cohortId) & !null) {
    cli::cli_abort(c("x" = "`NULL` is not allowed for {.var cohortId}"))
  }

  if (is.null(cohortId)) {
    cohortId <- set$cohort_definition_id
  } else if (is.numeric(cohortId)) {
    cohortId <- as.integer(cohortId)
    areIn <- cohortId %in% set$cohort_definition_id
    notPresent <- cohortId[!areIn]
    cohortId <- cohortId[areIn]
    if (length(notPresent) > 0) {
      report(
        message = "cohort definition id: {notPresent} not defined in settings.",
        validation = validation,
        call = call
      )
    }
    cohortId <- set$cohort_definition_id[getId(set$cohort_definition_id, cohortId)]
  } else if (is.character(cohortId)) {
    areIn <- cohortId %in% set$cohort_name
    notPresent <- cohortId[!areIn]
    cohortId <- cohortId[areIn]
    if (length(notPresent) > 0) {
      report(
        message = "cohort name: {notPresent} not defined in settings.",
        validation = validation,
        call = call
      )
    }
    cohortId <- set$cohort_definition_id[getId(set$cohort_name, cohortId)]
  } else {
    cli::cli_abort("{.arg cohortId} can either be an integer, a character, a tidyselect expression or NULL.")
  }

  if (length(cohortId) == 0) {
    report(message = "cohortId is empty.", validation = validation, call = call)
  }

  return(cohortId)
}
isTidySelect <- function(arg) {
  # check if call
  isCall <- rlang::quo_is_call(arg)

  # selection functions that we want to support
  tidyFunctions <- c(
    "starts_with", "contains", "ends_with", "matches",
    "everything", "all_of", "any_of"
  )

  if (isCall) {
    fn <- as.character(rlang::quo_get_expr(arg))[1] |>
      removePackageName()
    return(fn %in% tidyFunctions)
  }

  return(FALSE)
}
removePackageName <- function(x) {
  x <- stringr::str_split_1(x, "::")
  x[length(x)]
}
getId <- function(x, ids) {
  purrr::map_int(ids, \(xx) which(x == xx))
}

#' Validate conceptSet argument. It can either be a list, a codelist, a
#' conceptSetExpression or a codelist with details. The output will always be a
#' codelist.
#'
#' @param conceptSet It can be either a named list of concepts or a codelist,
#' codelist_with_details or conceptSetExpression object.
#' @param cdm A cdm_reference object, needed if a conceptSetExpression is
#' provided.
#' @param validation How to perform validation: "error", "warning".
#' @param call A call argument to pass to cli functions.
#'
#' @return A codelist object.
#'
#' @export
#' @examples
#' conceptSet <- list(disease_x = c(1L, 2L))
#' validateConceptSetArgument(conceptSet)
#'
validateConceptSetArgument <- function(conceptSet,
                                       cdm = NULL,
                                       validation = "error",
                                       call = parent.frame()) {
  if (is.null(conceptSet)) return(emptyCodelist())
  if (inherits(conceptSet, "codelist")) {
    conceptSet <- validateCodelist(conceptSet, call = call)
  } else if (inherits(conceptSet, "codelist_with_details")) {
    conceptSet <- validateCodelistWithDetails(conceptSet, call) |>
      purrr::map(\(x) dplyr::pull(x, "concept_id")) |>
      newCodelist()
  } else if (inherits(conceptSet, "conceptSetExpression")) {
    concepts <- validateConceptSetExpression(conceptSet, call)
    concepts <- concepts |>
      lapply(dplyr::select, c("concept_id", "excluded", "descendants")) |>
      dplyr::bind_rows(.id = "concept_name")
    descendants <- concepts |>
      dplyr::filter(.data$descendants == TRUE) |>
      dplyr::select(-"descendants")
    if (nrow(descendants) > 0) {
      assertClass(cdm, "cdm_reference")
      nm <- uniqueTableName()
      cdm <- insertTable(cdm = cdm, name = nm, table = descendants)
      descendants <- cdm$concept_ancestor |>
        dplyr::rename("to_join" = "ancestor_concept_id") |>
        dplyr::inner_join(
          cdm[[nm]] |>
            dplyr::rename("to_join" = "concept_id"),
          by = "to_join"
        ) |>
        dplyr::select(
          "concept_id" = "descendant_concept_id", "excluded", "concept_name"
        ) |>
        dplyr::collect()
      concepts <- concepts |>
        dplyr::filter(.data$descendants == FALSE) |>
        dplyr::select(-"descendants") |>
        dplyr::union_all(descendants)
    } else {
      concepts <- concepts |> dplyr::select(-"descendants")
    }
    conceptSet <- concepts |>
      dplyr::filter(.data$excluded == FALSE) |>
      dplyr::select(-"excluded") |>
      dplyr::anti_join(
        concepts |>
          dplyr::filter(.data$excluded == TRUE),
        by = c("concept_id", "concept_name")
      ) |>
      dplyr::group_by(.data$concept_name) |>
      dplyr::group_split() |>
      as.list()
    names(conceptSet) <- purrr::map_chr(conceptSet, \(x) unique(x$concept_name))
    conceptSet <- conceptSet |>
      purrr::map(\(x) unique(x$concept_id)) |>
      newCodelist()
  } else {
    conceptSet <- newCodelist(conceptSet)
  }
  return(conceptSet)
}
assertValidation <- function(validation, call = parent.frame()) {
  validation |>
    assertChoice(choices = c("error", "warning"), length = 1, call = call)
}

#' Validate a window argument. It must be a list of two elements (window start
#' and window end), both must be integerish and window start must be lower or
#' equal than window end.
#'
#' @param window time window
#' @param snakeCase return default window  name in snake case if TRUE
#' @param call A call argument to pass to cli functions.
#'
#' @return time window
#' @export
#' @examples
#' validateWindowArgument(list(c(0, 15), c(-Inf, Inf)))
#' validateWindowArgument(list(c(0, 15), c(-Inf, Inf)), snakeCase = FALSE)
#'
validateWindowArgument <- function(window,
                                   snakeCase = TRUE,
                                   call = parent.frame()) {
  assertLogical(snakeCase, length = 1, call = call)

  if (!is.list(window)) {
    window <- list(window)
  }

  # Find if any NA, throw warning that it will be changed to Inf, change it later
  if (any(unlist(lapply(window, is.na)))) {
    cli::cli_abort("NA found in window, please use Inf or -Inf instead", call = call)
  }

  assertList(window, call = call)
  elements <- window |>
    unlist() |>
    purrr::keep(\(x) !is.infinite(x)) |>
    unique()
  assertNumeric(elements, integerish = TRUE, call = call, msg = "Elements of window must be integerish.")

  # if any element of window list has length over 2, throw error
  if (any(lengths(window) > 2)) {
    "window can only contain two values: windowStart and windowEnd" |>
      cli::cli_abort(call = call)
  }

  # eg if list(1,2,3), change to list(c(1,1), c(2,2), c(3,3))
  if (length(window) > 1 && any(lengths(window) == 1)) {
    window[lengths(window) == 1] <- lapply(
      window[lengths(window) == 1],
      function(x) {
        c(
          unlist(x[lengths(x) == 1]),
          unlist(x[lengths(x) == 1])
        )
      }
    )
    cli::cli_warn(
      "Window list contains element with only 1 value provided,
          use it as both window start and window end"
    )
  }

  assertWindowName(window, snakeCase, call = call)
}
getWindowNames <- function(window, snakeCase) {
  # snakecase
  getname <- function(element) {
    element <- tolower(as.character(element))
    element <- stringr::str_replace_all(
      string = element,
      pattern = "-",
      replacement = "m"
    )
    paste0(element[1], "_to_", element[2])
  }
  # snakecase False
  getname2 <- function(element) {
    element <- tolower(as.character(element))
    paste0(element[1], " to ", element[2])
  }

  windowNames <- names(window)

  if (isTRUE(snakeCase)) {
    if (is.null(windowNames)) {
      windowNames <- purrr::map_chr(window, getname)
    } else {
      id <- windowNames == ""
      windowNames[id] <- purrr::map_chr(window[id], getname)
      newNames <- toSnakeCase(windowNames)
      differentNames <- which(windowNames != newNames)
      if (length(differentNames) > 0) {
        newName <- newNames[differentNames]
        oldName <- windowNames[differentNames]
        changes <- paste0("`", oldName, "` -> `", newName, "`") |>
          rlang::set_names(rep("*", length(newName)))
        cli::cli_inform(c("window names casted to snake_case: ", changes))
      }
      windowNames <- newNames
    }
  } else {
    if (is.null(windowNames)) {
      windowNames <- purrr::map_chr(window, getname2)
    } else {
      id <- windowNames == ""
      windowNames[id] <- purrr::map_chr(window[id], getname2)
    }
  }
  windowNames
}
assertWindowName <- function(window, snakeCase, call = parent.frame()) {
  names(window) <- getWindowNames(window, snakeCase = snakeCase)
  lower <- purrr::map_dbl(window, \(x) x[1])
  upper <- purrr::map_dbl(window, \(x) x[2])

  if (any(lower > upper)) {
    "First element in window must be smaller or equal to the second one" |>
      cli::cli_abort(call = call)
  }
  if (any(is.infinite(lower) & lower == upper & sign(upper) == 1)) {
    cli::cli_abort("Not both elements in the window can be +Inf", call = call)
  }
  if (any(is.infinite(lower) &
          lower == upper & sign(upper) == -1)) {
    cli::cli_abort("Not both elements in the window can be -Inf", call = call)
  }

  window
}

#' Validate the ageGroup argument. It must be a list of two integerish numbers
#' lower age and upper age, both of the must be greater or equal to 0 and lower
#' age must be lower or equal to the upper age. If not named automatic names
#' will be given in the output list.
#'
#' @param ageGroup age group in a list.
#' @param multipleAgeGroup allow mutliple age group.
#' @param overlap allow overlapping ageGroup.
#' @param null null age group allowed true or false.
#' @param ageGroupName Name of the default age group.
#' @param call parent frame.
#'
#' @return validate ageGroup
#' @export
#' @examples
#' validateAgeGroupArgument(list(c(0, 39), c(40, Inf)))
#'
validateAgeGroupArgument <- function(ageGroup,
                                     multipleAgeGroup = TRUE,
                                     overlap = FALSE,
                                     null = TRUE,
                                     ageGroupName = "age_group",
                                     call = parent.frame()) {
  # initial checks
  assertLogical(null, length = 1)
  assertLogical(multipleAgeGroup, length = 1)
  assertLogical(overlap, length = 1)
  assertCharacter(ageGroupName, length = 1)

  if (is.null(ageGroup)) {
    if (null) {
      return(NULL)
    } else {
      cli::cli_abort("`ageGroup` argument can not be NULL.", call = call)
    }
  }

  # convert to list of lists if it is not
  if (is.numeric(ageGroup)) {
    ageGroup <- list(list(ageGroup))
  } else if (rlang::is_bare_list(ageGroup)) {
    if (length(ageGroup) == 0) {
      return(NULL)
    } else if (is.numeric(ageGroup[[1]])) {
      ageGroup <- list(ageGroup)
    }
  } else {
    cli::cli_abort("`ageGroup` must be a list of age groups.", call = call)
  }

  len <- length(ageGroup)

  # check multiple age group
  if (!multipleAgeGroup & len > 1) {
    cli::cli_abort("Multiple age group are not allowed", call = call)
  }

  # correct individual age groups
  ageGroup <- ageGroup |>
    purrr::map(\(x) correctAgeGroup(x, overlap = overlap, call = call))

  # correct age group names
  nms <- names(ageGroup)
  if (is.null(nms)) nms <- rep("", len)
  if (len == 1 & identical(nms, "")) {
    names(ageGroup) <- ageGroupName
  } else {
    for (k in seq_len(len)) {
      if (nms[k] == "") nms[k] <- paste0(ageGroupName, "_", k)
    }
    names(ageGroup) <- nms
  }

  return(ageGroup)
}
correctAgeGroup <- function(ageGroup,
                            overlap,
                            call) {
  len <- length(ageGroup)

  # assert numeric
  isNumeric <- purrr::map_lgl(ageGroup, is.numeric) |>
    all()
  if (!isNumeric) {
    "Elements of `ageGroup` argument are not numeric." |>
      cli::cli_abort(call = call)
  }

  # correct length 1
  ageGroup <- purrr::map(ageGroup, \(x) {
    if (length(x) == 1) rep(x, 2) else x
  })

  # length 2
  if (any(lengths(ageGroup) != 2)) {
    "Elements of `ageGroup` must have length 2." |>
      cli::cli_abort(call = call)
  }

  allValues <- unlist(ageGroup)

  # no NA
  if (any(is.na(allValues))) {
    "Elements of `ageGroup` argument can not contain NA." |>
      cli::cli_abort(call = call)
  }

  # assert integerish
  if (!isIntegerish(allValues)) {
    "Elements of `ageGroup` argument must be integerish." |>
      cli::cli_abort(call = call)
  }

  # convert to numeric as Inf can not be integer
  ageGroup <- purrr::map(ageGroup, as.numeric)

  # positive
  if (any(unlist(ageGroup) < 0L)) {
    "Elements of `ageGroup` argument must be greater or equal to {.val 0}." |>
      cli::cli_abort(call = call)
  }

  # min <= max
  isMinBigger <- purrr::map_lgl(ageGroup, \(x) x[1] > x[2]) |>
    any()
  if (isMinBigger) {
    "First element of `ageGroup` argument must be smaller or equal than the second one." |>
      cli::cli_abort(call = call)
  }

  # overlap
  if (!overlap & len > 1) {
    for (i in 1:(len - 1)) {
      for (j in (i + 1):len) {
        if (thereIsOverlap(ageGroup[[i]], ageGroup[[j]])) {
          "`ageGroup` must not contain overlap between groups." |>
            cli::cli_abort(call = call)
        }
      }
    }
  }

  # add names
  if (is.null(names(ageGroup))) {
    nms <- rep("", len)
  } else {
    nms <- names(ageGroup)
  }
  for (k in seq_len(len)) {
    if (nms[k] == "") nms[k] <- nameAgeGroup(ageGroup[[k]])
  }
  names(ageGroup) <- nms

  return(ageGroup)
}
isIntegerish <- function(x) {
  if (is.integer(x)) {
    return(TRUE)
  }
  xInt <- x[!is.infinite(x)]
  err <- max(abs(xInt - round(xInt)))
  err < 0.0001
}
thereIsOverlap <- function(x, y) {
  if (x[1] < y[1] & x[2] < y[1]) {
    return(FALSE)
  }
  if (y[1] < x[1] & y[2] < x[1]) {
    return(FALSE)
  }
  TRUE
}
nameAgeGroup <- function(x) {
  if (x[1] == 0L & is.infinite(x[2])) {
    return("overall")
  }
  if (is.infinite(x[2])) {
    return(paste(x[1], "or above"))
  }
  paste(x[1], "to", x[2])
}

#' Validate if an object in a valid cdm_reference.
#'
#' @param cdm A cdm_reference object
#' @param checkOverlapObservation TRUE to perform check on no overlap
#' observation period
#' @param checkStartBeforeEndObservation TRUE to perform check on correct
#' observational start and end date
#' @param checkPlausibleObservationDates TRUE to perform check that there are
#' no implausible observation period start dates (before 1800-01-01) or end
#' dates (after the current date)
#' @param checkPerson TRUE to perform check on person id in all clinical table
#' are in person table
#' @param requiredTables Name of tables that are required to be part of the
#' cdm_reference object.
#' @param validation How to perform validation: "error", "warning".
#' @param call A call argument to pass to cli functions.
#'
#' @return A cdm_reference object
#' @export
#' @examples
#' cdm <- cdmFromTables(
#'   tables = list(
#'     "person" = dplyr::tibble(
#'       person_id = c(1, 2, 3), gender_concept_id = 0, year_of_birth = 1990,
#'       race_concept_id = 0, ethnicity_concept_id = 0
#'     ),
#'     "observation_period" = dplyr::tibble(
#'       observation_period_id = 1:3, person_id = 1:3,
#'       observation_period_start_date = as.Date("2000-01-01"),
#'       observation_period_end_date = as.Date("2023-12-31"),
#'       period_type_concept_id = 0
#'     )
#'   ),
#'   cdmName = "mock"
#' )
#'
#' validateCdmArgument(cdm)
#'
validateCdmArgument <- function(cdm,
                                checkOverlapObservation = FALSE,
                                checkStartBeforeEndObservation = FALSE,
                                checkPlausibleObservationDates = FALSE,
                                checkPerson = FALSE,
                                requiredTables = character(),
                                validation = "error",
                                call = parent.frame()) {
  # input validation
  assertValidation(validation)
  assertLogical(checkOverlapObservation, length = 1)
  assertLogical(checkStartBeforeEndObservation, length = 1)
  assertCharacter(requiredTables)

  # assert class
  assertClass(cdm, class = c("cdm_reference"), all = TRUE, call = call)

  # not overlapping periods
  if (isTRUE(checkOverlapObservation)) {
    checkOverlapObservation(cdm$observation_period)
  }

  # no start observation before end
  if (isTRUE(checkStartBeforeEndObservation)) {
    checkStartBeforeEndObservation(cdm$observation_period)
  }

  if (isTRUE(checkPlausibleObservationDates)) {
    checkPlausibleObservationDates(cdm$observation_period)
  }

  if (isTRUE(checkPerson)) {
    checkPerson(cdm = cdm, call = call)
  }

  notPresent <- requiredTables[!requiredTables %in% names(cdm)]
  if (length(notPresent) > 0) {
    "Required tables not present in the cdm object: {.pkg {notPresent}}." |>
      cli::cli_abort(call = call)
  }

  return(cdm)
}

#' Validate if a an object is a valid 'summarised_result' object.
#'
#' @param result summarised_result object to validate.
#' @param checkNoDuplicates Whether there are not allowed duplicates in the
#' result object.
#' @param checkNameLevel Whether the name-level paired columns are can be
#' correctly split.
#' @param checkSuppression Whether the suppression in the result object is well
#' defined.
#' @param validation Only error is supported at the moment.
#' @param call parent.frame
#'
#' @return summarise result object
#' @export
#' @examples
#' x <- dplyr::tibble(
#'   "result_id" = 1L,
#'   "cdm_name" = "eunomia",
#'   "group_name" = "cohort_name",
#'   "group_level" = "my_cohort",
#'   "strata_name" = c("sex", "sex &&& age_group", "sex &&& year"),
#'   "strata_level" = c("Female", "Male &&& <40", "Female &&& 2010"),
#'   "variable_name" = "number subjects",
#'   "variable_level" = NA_character_,
#'   "estimate_name" = "count",
#'   "estimate_type" = "integer",
#'   "estimate_value" = c("100", "44", "14"),
#'   "additional_name" = "overall",
#'   "additional_level" = "overall"
#' ) |>
#'   newSummarisedResult()
#'
#' validateResultArgument(x)
#'
validateResultArgument <- function(result,
                                   checkNoDuplicates = FALSE,
                                   checkNameLevel = FALSE,
                                   checkSuppression = FALSE,
                                   validation = "error",
                                   call = parent.frame()) {
  assertTrue(validation == "error")
  assertTable(result, class = "summarised_result", call = call)

  validateResultSettings(attr(result, "settings"), call = call)

  result <- result |>
    validateSummarisedResultTable(
      duplicates = checkNoDuplicates,
      pairs = checkNameLevel,
      duplicateEstimates = checkNoDuplicates,
      suppressPossibility = checkSuppression,
      call = call
    )

  return(result)
}

#' Validate a new column of a table
#'
#' @param table The table to check if the column already exists.
#' @param column Character vector with the name(s) of the new column(s).
#' @param validation Whether to throw warning or error.
#' @param call Passed to cli functions.
#'
#' @return table without conflicting columns.
#' @export
#' @examples
#' x <- dplyr::tibble(
#'   column1 = c(1L, 2L),
#'   column2 = c("a", "b")
#' )
#' validateNewColumn(x, "not_exiting_column")
#' validateNewColumn(x, "column1")
#'
validateNewColumn <- function(table,
                              column,
                              validation = "warning",
                              call = parent.frame()) {
  # input check
  cols <- colnames(table)
  assertCharacter(column)
  assertValidation(validation)

  # assert if they exist
  eliminate <- column[column %in% cols]
  if (length(eliminate) > 0) {
    if (validation == "error") {
      cli::cli_abort(c("x" = "columns {.var {eliminate}} already exist in the table. Remove or rename new columns."), call = call)
    } else if (validation == "warning") {
      cli::cli_warn(c("!" = "columns {.var {eliminate}} already exist in the table. They will be overwritten."))
      table <- table |>
        dplyr::select(!dplyr::all_of(eliminate))
    }
  }

  # output table or sc_column
  return(table)
}

#' Validate whether a variable points to a certain exiting column in a table.
#'
#' @param column Name of a column that you want to check that exist in `x`
#' table.
#' @param x Table to check if the column exist.
#' @param type Type of the column.
#' @param validation Whether to throw warning or error.
#' @param call Passed to cli functions.
#'
#' @return the validated name
#' @export
#'
#' @examples
#' x <- dplyr::tibble(a = 1, b = "xxx")
#'
#' validateColumn("a", x, validation = "warning")
#' validateColumn("a", x, type = "character", validation = "warning")
#' validateColumn("a", x, type = "numeric", validation = "warning")
#' validateColumn("not_existing", x, type = "numeric", validation = "warning")
#'
validateColumn <- function(column,
                           x,
                           type = c("character", "date", "logical", "numeric", "integer"),
                           validation = "error",
                           call = parent.frame()) {
  assertCharacter(column, length = 1, call = call)
  assertTable(x)
  assertValidation(validation)
  assertChoice(type, c("character", "date", "logical", "numeric", "integer"))

  if (!column %in% colnames(x)) {
    report("{column} column does not exist.", validation = validation, call = call)
    return(column)
  }

  types <- x |>
    dplyr::select(dplyr::all_of(column)) |>
    utils::head(1) |>
    dplyr::pull() |>
    dplyr::type_sum() |>
    assertClassification()
  if (isFALSE(any(type %in% types))) {
    report("{column} type must be a choice of: {.var {type}}; but it is {.pkg {types}}.", validation = validation, call = call)
    return(column)
  }

  return(column)
}

#' To check whether an object is already suppressed to a certain min cell count.
#'
#' @param result The suppressed result to check
#' @param minCellCount  Minimum count of records used when suppressing
#'
#' @return Warning or message with check result
#' @export
#' @examples
#' x <- dplyr::tibble(
#'   "result_id" = 1L,
#'   "cdm_name" = "eunomia",
#'   "group_name" = "cohort_name",
#'   "group_level" = "my_cohort",
#'   "strata_name" = c("sex", "sex &&& age_group", "sex &&& year"),
#'   "strata_level" = c("Female", "Male &&& <40", "Female &&& 2010"),
#'   "variable_name" = "number subjects",
#'   "variable_level" = NA_character_,
#'   "estimate_name" = "count",
#'   "estimate_type" = "integer",
#'   "estimate_value" = c("100", "44", "14"),
#'   "additional_name" = "overall",
#'   "additional_level" = "overall"
#' ) |>
#'   newSummarisedResult()
#'
#' isResultSuppressed(x)
#'
isResultSuppressed <- function(result, minCellCount = 5) {
  # initial checks
  validateResultArgument(result)
  assertNumeric(minCellCount, length = 1, integerish = TRUE)

  # retrieve settings
  set <- settings(result)
  if (nrow(set) == 0) {
    return(TRUE)
  }
  if (!"min_cell_count" %in% colnames(set)) {
    cli::cli_warn("Column {.var min_cell_count} is missing in settings, result is not suppressed.")
    return(FALSE)
  }

  set <- set |>
    dplyr::select("result_id", "min_cell_count") |>
    dplyr::mutate("min_cell_count" = as.integer(.data$min_cell_count))

  if (all(minCellCount == unique(set$min_cell_count))) {
    cli::cli_inform(c(
      "v" = "The {.cls summarised_result} is suppressed with minCellCount = {minCellCount}."
    ))
    return(TRUE)
  } else {
    idSup <- set$result_id[set$min_cell_count == minCellCount]
    idNotSup <- set$result_id[set$min_cell_count == 0 | is.na(set$min_cell_count)]
    idSupLow <- set$result_id[set$min_cell_count > 0 & set$min_cell_count < minCellCount]
    idSupUpp <- set$result_id[set$min_cell_count > minCellCount]
    addMesSup(character(), idSup, result, "v", glue::glue("suppressed minCellCount = {minCellCount}")) |>
      addMesSup(idNotSup, result, "x", "not suppressed") |>
      addMesSup(idSupLow, result, "x", glue::glue("suppressed with minCellCount < {minCellCount}")) |>
      addMesSup(idSupUpp, result, "!", glue::glue("suppressed with minCellCount > {minCellCount}")) |>
      cli::cli_warn()
    return(FALSE)
  }
}
addMesSup <- function(mes, ids, result, lab, err) {
  if (length(ids) == 0) {
    return(mes)
  }
  ncounts <- sum(result$result_id %in% ids)
  ms <- "{length(ids)} ({ncounts} row{?s}) {err}." |>
    cli::cli_text() |>
    cli::cli_fmt() |>
    as.character()
  c(mes, rlang::set_names(ms, lab))
}

#' Validate `nameStyle` argument. If any of the element in `...` has length
#' greater than 1 it must be contained in nameStyle. Note that snake case
#' notation is used.
#'
#' @param nameStyle A character vector. It must contain all the `...` elements
#' in snake_case format and between `{}`.
#' @param ... Elements to be included.
#' @param call Passed to cli functions.
#'
#' @export
#' @return invisible nameStyle.
#' @examples
#' validateNameStyle(
#'   nameStyle = "hi_{cohort_name}",
#'   cohortName = c("cohort1", "cohort2"),
#'   otherVariable = c("only 1 value")
#' )
#'
#' \dontrun{
#' validateNameStyle(
#'   nameStyle = "hi_{cohort_name}",
#'   cohortName = c("cohort1", "cohort2"),
#'   otherVariable = c("value1", "value2")
#' )
#' }
#' validateNameStyle(
#'   nameStyle = "{other_variable}_hi_{cohort_name}",
#'   cohortName = c("cohort1", "cohort2"),
#'   otherVariable = c("value1", "value2")
#' )
#'
validateNameStyle <- function(nameStyle,
                              ...,
                              call = parent.frame()) {
  assertCharacter(nameStyle, length = 1, call = call)
  elementsInDots <- list(...)
  assertList(elementsInDots, class = "character", call = call)

  elementsInDots <- elementsInDots |>
    purrr::keep(\(x) length(x) > 1) |>
    names() |>
    toSnakeCase()
  elementsInNameStyle <- stringr::str_extract_all(nameStyle, "\\{([^}]+)\\}") |>
    dplyr::first() |>
    purrr::map_chr(\(x) substr(x, 2, nchar(x) - 1))
  missingElements <- elementsInDots[!elementsInDots %in% elementsInNameStyle]
  if (length(missingElements) > 0) {
    c("!" = "{missingElements} must be included in `nameStyle`.",
      "*" = "elements in `...`: {.var {elementsInDots}}.",
      "*" = "elements in `nameStyle`: {.var {elementsInNameStyle}}.") |>
      cli::cli_abort(call = call)
  }
  return(invisible(nameStyle))
}

#' To validate a strata list. It makes sure that elements are unique and point
#' to columns in table.
#'
#' @param strata A list of characters that point to columns in table.
#' @param table A table with columns.
#' @param call Passed to cli functions.
#'
#' @return The same strata input or an error if the input is incorrect.
#' @export
#'
#' @examples
#' strata <- list("age", "sex", c("age", "sex"))
#' x <- dplyr::tibble(age = 30L, sex = "Female")
#'
#' validateStrataArgument(strata, x)
#'
validateStrataArgument <- function(strata,
                                   table,
                                   call = parent.frame()) {
  if (is.character(strata)) {
    strata <- list(strata)
  }
  assertList(strata, class = "character", call = call)
  cols <- colnames(table)

  strataUnique <- unique(strata)
  n <- length(strata) - length(strataUnique)
  if (n > 0) {
    cli::cli_warn("{n} elements of strata eliminated because they were repeated.")
  }

  strataCols <- unique(purrr::flatten_chr(strataUnique))
  notPresent <- strataCols[!strataCols %in% cols]
  if (length(notPresent) > 0) {
    "Elements in strata not present as columns in table: {.var {notPresent}}." |>
      cli::cli_abort(call = call)
  }

  return(strataUnique)
}

Try the omopgenerics package in your browser

Any scripts or data that you put into this service are public.

omopgenerics documentation built on June 8, 2025, 10:40 a.m.