R/checks.R

Defines functions checkCategory validateName validateType validateMissingValue validateAgeMissingDay validateAgeMissingMonth validateColumn validateIndexDate validateLogical validateX warnOverwriteColumns assertNameStyle correctStrata checkCensorDate checkVariablesFunctions checkStrata checkTable checkExclude checkSnakeCase checkCohortNames checkValue checkFilter checkVariableInX checkCdm checkX

# Copyright 2024 DARWIN EU (C)
#
# This file is part of PatientProfiles
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' @noRd
checkX <- function(x) {
  if (!isTRUE(inherits(x, "tbl_dbi"))) {
    cli::cli_abort("x is not a valid table")
  }
  if ("person_id" %in% colnames(x) && "subject_id" %in% colnames(x)) {
    cli::cli_abort(paste0(
      "x can only contain an individual identifier, please remove 'person_id'",
      " or 'subject_id'"
    ))
  }
  if (!("person_id" %in% colnames(x)) && !("subject_id" %in% colnames(x))) {
    cli::cli_abort(paste0(
      "x must contain an individual identifier ('person_id'",
      " or 'subject_id')"
    ))
  }
  personVariable <- dplyr::if_else(
    "person_id" %in% colnames(x), "person_id", "subject_id"
  )
  invisible(personVariable)
}

#' @noRd
checkCdm <- function(cdm, tables = NULL) {
  if (!isTRUE(inherits(cdm, "cdm_reference"))) {
    cli::cli_abort("cdm a cdm_reference object.")
  }
  if (!is.null(tables)) {
    tables <- tables[!(tables %in% names(cdm))]
    if (length(tables) > 0) {
      cli::cli_abort(paste0(
        "tables: ",
        paste0(tables, collapse = ", "),
        " are not present in the cdm object"
      ))
    }
  }
  invisible(NULL)
}

#' @noRd
checkVariableInX <- function(indexDate, x, nullOk = FALSE, name = "indexDate") {
  omopgenerics::assertCharacter(indexDate, length = 1, null = nullOk)
  if (!is.null(indexDate) && !(indexDate %in% colnames(x))) {
    cli::cli_abort(glue::glue("{name} ({indexDate}) should be a column in x"))
  }
  invisible(NULL)
}

#' @noRd
checkFilter <- function(filterVariable, filterId, idName, x) {
  if (is.null(filterVariable)) {
    filterId <- NULL
    idName <- NULL
    filterTbl <- NULL
  } else {
    checkVariableInX(filterVariable, x, FALSE, "filterVariable")
    omopgenerics::assertNumeric(filterId, na = FALSE)
    omopgenerics::assertNumeric(utils::head(x, 1) %>%
                               dplyr::pull(dplyr::all_of(filterVariable)))
    if (is.null(idName)) {
      idName <- paste0("id", filterId)
    } else {
      omopgenerics::assertCharacter(idName,
                                    na = FALSE,
                                    length = length(filterId))
    }
    filterTbl <- dplyr::tibble(
      id = filterId,
      id_name = idName
    )
  }
  invisible(filterTbl)
}

#' @noRd
checkValue <- function(value, x, name) {
  omopgenerics::assertCharacter(value, na = FALSE)
  omopgenerics::assertTrue(all(value %in%
                                 c("flag", "count", "date", "days",
                                   colnames(x))))
  valueOptions <- c("flag", "count", "date", "days")
  valueOptions <- valueOptions[valueOptions %in% colnames(x)]
  if (length(valueOptions) > 0) {
    cli::cli_warn(paste0(
      "Variables: ",
      paste0(valueOptions, collapse = ", "),
      " are also present in ",
      name,
      ". But have their own functionality inside the package. If you want to
      obtain that column please rename and run again."
    ))
  }
  invisible(value[!(value %in% c("flag", "count", "date", "days"))])
}

#' @noRd
checkCohortNames <- function(x, targetCohortId, name) {
  if (!("cohort_table" %in% class(x))) {
    cli::cli_abort("cdm[[targetCohortTable]]) must be a 'cohort_table'.")
  }
  cohort <- omopgenerics::settings(x)
  filterVariable <- "cohort_definition_id"
  if (is.null(targetCohortId)) {
    cohort <- dplyr::collect(cohort)
    idName <- cohort$cohort_name
    targetCohortId <- cohort$cohort_definition_id
  } else {
    idName <- cohort %>%
      dplyr::filter(
        as.integer(.data$cohort_definition_id) %in%
          as.integer(.env$targetCohortId)
      ) %>%
      dplyr::arrange(.data$cohort_definition_id) %>%
      dplyr::pull("cohort_name")
    if (length(idName) != length(targetCohortId)) {
      cli::cli_abort(
        "some of the cohort ids given do not exist in the cohortSet of
          cdm[[targetCohortName]]"
      )
    }
  }
  parameters <- list(
    "filter_variable" = filterVariable,
    "filter_id" = sort(targetCohortId),
    "id_name" = idName
  )
  invisible(parameters)
}

#' @noRd
checkSnakeCase <- function(name, verbose = TRUE, null = FALSE, call = parent.frame()) {
  omopgenerics::assertCharacter(name, call = call, null = null)
  if (is.null(name)) {
    return(invisible(name))
  }
  wrong <- FALSE
  for (i in seq_along(name)) {
    n <- name[i]
    n <- gsub("[a-z]", "", n)
    n <- gsub("[0-9]", "", n)
    n <- gsub("_", "", n)
    if (nchar(n) > 0) {
      oldname <- name[i]
      name[i] <- gsub("([[:upper:]])", "\\L\\1", perl = TRUE, name[i])
      name[i] <- gsub("[^a-z,0-9.-]", "_", name[i])
      name[i] <- gsub("-", "_", name[i])
      if (verbose) {
        cli::cli_alert(paste0(oldname, " has been changed to ", name[i]))
      }
      wrong <- TRUE
    }
  }
  if (wrong && verbose) {
    cli::cli_alert("some provided names were not in snake_case")
    cli::cli_alert("names have been changed to lower case")
    cli::cli_alert("special symbols in names have been changed to '_'")
  }
  return(invisible(name))
}

#' @noRd
checkExclude <- function(exclude) {
  if (!is.null(exclude) & !is.character(exclude)) {
    cli::cli_abort("eclude must a character vector or NULL")
  }
}

#' @noRd
checkTable <- function(table) {
  if (!("tbl" %in% class(table))) {
    cli::cli_abort("table should be a tibble")
  }
}

#' @noRd
checkStrata <- function(list, table, type = "strata") {
  errorMessage <- paste0(type, " should be a list that point to columns in table")
  if (!is.list(list)) {
    cli::cli_abort(errorMessage)
  }
  if (length(list) > 0) {
    if (!is.character(unlist(list))) {
      cli::cli_abort(errorMessage)
    }
    if (!all(unlist(list) %in% colnames(table))) {
      notPresent <- list |>
        unlist() |>
        unique()
      notPresent <- notPresent[!notPresent %in% colnames(table)]
      cli::cli_abort(paste0(
        errorMessage,
        ". The following columns were not found in the data: ",
        paste0(notPresent, collapse = ", ")
      ))
    }
  }
  if (!is.null(names(list))) {
    cli::cli_inform(c("!" = "names of {type} will be ignored"))
  }
  names(list) <- NULL
  return(list)
}

#' @noRd
checkVariablesFunctions <- function(variables, estimates, table) {
  errorMessage <- "variables should be a unique named list that point to columns in table"
  omopgenerics::assertList(x = variables, class = "character")
  omopgenerics::assertList(x = estimates, class = "character")
  if (length(variables) != length(estimates)) {
    cli::cli_abort("Variables and estimates must have the same length")
  }
  if (!is.null(names(variables)) & !is.null(names(estimates))) {
    if (!identical(sort(names(variables)), sort(names(estimates)))) {
      cli::cli_abort("Names from variables and estimates must be the same")
    }
    variables <- variables[order(names(variables))]
    estimates <- estimates[order(names(estimates))]
  }

  if (length(variables) == 0) {
    return(dplyr::tibble(
      "variable_name" = character(),
      "estimate_name" = character(),
      "variable_type" = character(),
      "estimate_type" = character()
    ))
  }

  functions <- lapply(seq_along(variables), function(k) {
    tidyr::expand_grid(
      variable_name = variables[[k]],
      estimate_name = estimates[[k]]
    )
  }) |>
    dplyr::bind_rows() |>
    dplyr::inner_join(variableTypes(table), by = "variable_name") |>
    dplyr::inner_join(
      availableEstimates(fullQuantiles = TRUE) |>
        dplyr::select(-"estimate_description"),
      by = c("variable_type", "estimate_name")
    )

  # check binary
  binaryVars <- functions |>
    dplyr::filter(
      .data$variable_type %in% c("numeric", "integer") &
        .data$estimate_name %in% c("count", "percentage")
    ) |>
    dplyr::select("variable_name") |>
    dplyr::distinct() |>
    dplyr::pull()
  if (length(binaryVars) > 0) {
    notBinary <- character()
    for (binVar in binaryVars) {
      x <- table |>
        dplyr::select(dplyr::all_of(binVar)) |>
        dplyr::distinct() |>
        dplyr::pull()
      if (length(x) <= 3) {
        if (!all(as.numeric(x) %in% c(0, 1, NA))) {
          notBinary <- c(notBinary, binVar)
        }
      } else {
        notBinary <- c(notBinary, binVar)
      }
    }
    functions <- functions |>
      dplyr::filter(
        !.data$variable_name %in% .env$notBinary |
          !.data$estimate_name %in% c("count", "percentage")
      )
  }

  return(functions)
}

#' @noRd
checkCensorDate <- function(x, censorDate) {
  check <- x %>%
    dplyr::select(dplyr::all_of(censorDate)) %>%
    utils::head(1) %>%
    dplyr::pull() %>%
    inherits("Date")
  if (!check) {
    cli::cli_abort("{censorDate} is not a date variable")
  }
}

correctStrata <- function(strata, overall) {
  if (length(strata) == 0 | overall) {
    strata <- c(list(character()), strata)
  }
  strata <- unique(strata)
  return(strata)
}

assertNameStyle <- function(nameStyle,
                            values = list(),
                            call = parent.frame()) {
  # initial checks
  omopgenerics::assertCharacter(nameStyle, length = 1,
                                na = FALSE, minNumCharacter = 1, call = call)
  omopgenerics::assertList(values, named = TRUE)
  omopgenerics::assertClass(call, class = "environment")

  # check name style
  err <- character()
  for (k in seq_along(values)) {
    valk <- values[[k]]
    nm <- paste0("\\{", names(values)[k], "\\}")
    if (length(valk) > 1 & !grepl(pattern = nm, x = nameStyle)) {
      err <- c(err, paste0("{{", names(values)[k], "}}"))
    }
  }

  # error
  if (length(err) > 0) {
    names(err) <- rep("*", length(err))
    cli::cli_abort(
      message = c("The following elements are not present in nameStyle:", err),
      call = call
    )
  }

  return(invisible(nameStyle))
}

warnOverwriteColumns <- function(x, nameStyle, values = list()) {
  if (length(values) > 0) {
    nameStyle <- tidyr::expand_grid(!!!values) |>
      dplyr::mutate("tmp_12345" = glue::glue(.env$nameStyle)) |>
      dplyr::pull("tmp_12345") |>
      as.character() |>
      unique()
  }

  extraColumns <- colnames(x)[colnames(x) %in% nameStyle]
  if (length(extraColumns) > 0) {
    ms <- extraColumns
    names(ms) <- rep("*", length(ms))
    cli::cli_inform(message = c(
      "!" = "The following columns will be overwritten:", ms
    ))
    x <- x |> dplyr::select(!dplyr::all_of(extraColumns))
  }

  return(x)
}

# checks demographics
validateX <- function(x, call = parent.frame()) {
  omopgenerics::assertClass(x, class = "cdm_table", call = call)
  cols <- colnames(x)
  n <- sum(c("person_id", "subject_id") %in% cols)
  if (n == 0) cli::cli_abort("No person indentifier (person_id/subject_id) found in x.", call = call)
  if (n == 2) cli::cli_abort("Only person_id or subject_id can be present in x.", call = call)
  return(x)
}
validateLogical <- function(x, null = FALSE, call) {
  if (null) {
    return(NULL)
  }
  err <- paste(substitute(x), "must be TRUE or FALSE") |> rlang::set_names("!")
  if (!is.logical(x)) cli::cli_abort(message = err, call = call)
  if (length(x) != 1) cli::cli_abort(message = err, call = call)
  if (is.na(x)) cli::cli_abort(message = err, call = call)
  return(x)
}
validateIndexDate <- function(indexDate, null, x, call) {
  if (null) {
    return(NULL)
  }
  omopgenerics::assertCharacter(indexDate, length = 1, call = call)
  if (!indexDate %in% colnames(x)) {
    cli::cli_abort("indexDate must be a column in x.", call = call)
  }
  xx <- x |>
    dplyr::select(dplyr::all_of(indexDate)) |>
    utils::head(1) |>
    dplyr::pull()
  if (!inherits(xx, "Date") && !inherits(xx, "POSIXt")) {
    cli::cli_abort("x[[{indexDate}]] is not a date column.", call = call)
  }
  return(indexDate)
}
validateColumn <- function(col, null = FALSE, call = parent.frame()) {
  if (null) {
    return(NULL)
  }

  nm <- paste0(substitute(col))

  err <- "{nm} must be a snake_case character string"
  if (!is.character(col)) cli::cli_abort(message = err, call = call)
  if (length(col) != 1) cli::cli_abort(message = err, call = call)
  if (is.na(col)) cli::cli_abort(message = err, call = call)

  scCol <- omopgenerics::toSnakeCase(col)

  if (scCol != col) {
    cli::cli_warn(
      c("!" = "{nm} has been modified to be snake_case, {col} -> {scCol}"),
      call = call
    )
  }

  return(scCol)
}
validateAgeMissingMonth <- function(ageMissingMonth, null, call) {
  if (null) {
    return(ageMissingMonth)
  }

  if (is.character(ageMissingMonth)) {
    ageMissingMonth <- as.numeric(ageMissingMonth)
  }
  omopgenerics::assertNumeric(ageMissingMonth, integerish = TRUE, min = 1, max = 12, call = call)
  ageMissingMonth <- as.integer(ageMissingMonth)

  return(ageMissingMonth)
}
validateAgeMissingDay <- function(ageMissingDay, null, call) {
  if (null) {
    return(ageMissingDay)
  }

  if (is.character(ageMissingDay)) {
    ageMissingDay <- as.numeric(ageMissingDay)
  }
  omopgenerics::assertNumeric(ageMissingDay, integerish = TRUE, min = 1, max = 12, call = call)
  ageMissingDay <- as.integer(ageMissingDay)

  return(ageMissingDay)
}
validateMissingValue <- function(x, null, call) {
  if (null) {
    return(NULL)
  }
  nm <- paste0(substitute(x))
  err <- "{nm} must be a character of length 1." |> rlang::set_names("!")
  if (!is.character(x)) cli::cli_abort(message = err, call = call)
  if (length(x) != 1) cli::cli_abort(message = err, call = call)
  return(x)
}
validateType <- function(x, null, call) {
  if (null) {
    return(NULL)
  }
  nm <- paste0(substitute(x))
  err <- "{nm} must be a choice between 'date' or 'days'." |>
    rlang::set_names("!")
  if (!is.character(x)) cli::cli_abort(message = err, call = call)
  if (length(x) != 1) cli::cli_abort(message = err, call = call)
  if (!x %in% c("date", "days")) cli::cli_abort(message = err, call = call)
  return(x)
}
validateName <- function(name, call = parent.frame()) {
  omopgenerics::assertCharacter(name, length = 1, null = TRUE, call = call)
}

checkCategory <- function(category, overlap = FALSE, type = "numeric", call = parent.frame()) {
  omopgenerics::assertList(category, class = type, call = call)

  if (is.null(names(category))) {
    names(category) <- rep("", length(category))
  }

  # check length
  category <- lapply(category, function(x) {
    if (length(x) == 1) {
      x <- c(x, x)
    } else if (length(x) > 2) {
      cli::cli_abort(
        "Please specify only two values (lower bound and upper bound) per category",
        call = call
      )
    }
    invisible(x)
  })

  # check lower bound is smaller than upper bound
  checkLower <- unlist(lapply(category, function(x) {
    x[1] <= x[2]
  }))
  if (!(all(checkLower))) {
    "Lower bound should be equal or smaller than upper bound" |>
      cli::cli_abort(call = call)
  }

  # built tibble
  result <- lapply(category, function(x) {
    dplyr::tibble(lower_bound = x[1], upper_bound = x[2])
  }) %>%
    dplyr::bind_rows() %>%
    dplyr::mutate(category_label = names(.env$category)) %>%
    dplyr::mutate(category_label = dplyr::if_else(
      .data$category_label == "",
      dplyr::case_when(
        is.infinite(.data$lower_bound) & is.infinite(.data$upper_bound) ~ "any",
        is.infinite(.data$lower_bound) ~ paste(.data$upper_bound, "or below"),
        is.infinite(.data$upper_bound) ~ paste(.data$lower_bound, "or above"),
        TRUE ~ paste(.data$lower_bound, "to", .data$upper_bound)
      ),
      .data$category_label
    )) %>%
    dplyr::arrange(.data$lower_bound)

  # check overlap
  if (!overlap) {
    if (nrow(result) > 1) {
      lower <- result$lower_bound[2:nrow(result)]
      upper <- result$upper_bound[1:(nrow(result) - 1)]
      if (!all(lower > upper)) {
        "There can not be overlap between categories" |>
          cli::cli_abort(call = call)
      }
    }
  }

  invisible(result)
}

Try the PatientProfiles package in your browser

Any scripts or data that you put into this service are public.

PatientProfiles documentation built on Oct. 30, 2024, 9:13 a.m.