R/validate_data.R

Defines functions arrange_variables validate_investigations validate_microbiology validate_administrations validate_prescriptions validate_inpatient_diagnoses .validate_inpatient_episode_dates .validate_inpatient_encounters .validate_inpatient_wards validate_inpatient_episodes .validate_variable_no_missing .validate_variable_exist .validate_UCUM_codes .validate_values_unique

Documented in validate_administrations validate_inpatient_diagnoses validate_inpatient_episodes validate_investigations validate_microbiology validate_prescriptions

#' Verify uniqueness constraints
#' 
#' @description Verifies that values in vectors `col_names` in data frame
#' `data` are unique
#' @param data  a data frame
#' @param col_names a character vector of column names. Column names which do
#' not exist in `data` will be dropped.
#' @return TRUE if the validation is passed.
#' @noRd
.validate_values_unique <- function(data, col_names) {
  
  col_names <- col_names[col_names %in% colnames(data)]
  
  if( length(col_names) == 0 ){
    return(TRUE)
  }
  
  must_be_unique <- subset(data, select = col_names)
  duplicate_results <- vapply(
    must_be_unique,
    function(X) {
      any(duplicated(X))
    },
    FUN.VALUE = logical(1)
  )
  
  if( any(duplicate_results) ){
    stop(
      simpleError(paste(
        "The following variables must have unique values",
        "in the", substitute(data), "dataset:\n",
        paste(paste0("`", 
                     names(duplicate_results),
                     "`"), collapse = ", "))
      )
    )
  }
  
  validation_result <- !all(duplicate_results)
  
  validation_result
}


#' Validate unit codes
#'
#' @param unit_codes a character vector of UCUM unit names to pass
#' \code{\link[units]{as_units}()}. `NA` and `""` values will be dropped.
#'
#' @return TRUE if the validation is passed.
#' @noRd
.validate_UCUM_codes <- function(unit_codes) {
  stopifnot(is.character(unit_codes))
  unit_codes <- na.omit(unit_codes)
  unit_codes <- unit_codes[unit_codes != ""]
  
  units_validate <- list()
  units_validate$validation <- lapply(
    unit_codes,
    function(X){
      try(units::as_units(X), silent = T)
    })
  units_validate$class <- lapply(
    units_validate$validation,
    class
  )
  units_validate$errors <- which(units_validate$class == "try-error")
  
  if( any(units_validate$class != "units") ){
    stop(paste(
      units_validate$validation[units_validate$errors],
      sep = "\n"
    ))
  }
  
  return(TRUE)
}


#' Verify variables exist in the data
#'
#' @param data a data frame
#' @param vectorname a character vector of variables which must exist.
#' @param action whether to throw `"warning"` or `"error"`. Default is "warning"
#'
#' @return TRUE if validation is passed
#' @noRd
.validate_variable_exist <- function(
  data,
  vectorname,
  action = "warning"
) {
  stopifnot(action %in% c("warning", "error"))
  stopifnot(is.character(vectorname))
  if( length(vectorname) == 0 ) {
    return(TRUE)
  }
  
  not_exist <- !vapply(vectorname, exists, where = data, FUN.VALUE = logical(1))
  if( any(not_exist) & action == "warning"){
    warning(
      simpleWarning(paste(
        "The following variables must exist:",
        paste(paste0("`", vectorname[not_exist], "`"), collapse = ", "))
      )
    )
  } else if( any(not_exist) & action == "error"){
    stop(
      simpleError(paste(
        "The following variables must exist:",
        paste(paste0("`", vectorname[not_exist], "`"), collapse = ", "))
      )
    )
  } else {
    return(TRUE)
  }
}

#' Verify no missing or "" value
#'
#' @param data a data frame
#' @param vectorname a character vector of variables which must not be missing.
#' Variables that are not present in `data` will be dropped.
#' @param action whether to throw `"warning"` or `"error"`. Default is "warning"
#'
#' @return TRUE if validation is passed
#' @noRd
.validate_variable_no_missing <- function(data, 
                                          vectorname, 
                                          action = "warning") {
  
  stopifnot(action %in% c("warning", "error"))
  stopifnot(is.character(vectorname))
  vectorname <- vectorname[vectorname %in% colnames(data)]
  if( length(vectorname) == 0 ) {
    return(TRUE)
  }
  
  missing_data <- vapply(
    vectorname, 
    function(var, data) {
      any(is.na(data[[var]]))
    },
    data = data,
    FUN.VALUE = logical(1)
  )
  
  empty_data <- vapply(
    vectorname, 
    function(var, data) {
      any(as.character(na.omit(data[[var]])) == "")
    },
    data = data,
    FUN.VALUE = logical(1)
  )
  
  if( any(missing_data) & action == "warning"){
    warning(
      paste(
        "The following variables contain missing data:",
        paste(paste0("`", 
                     vectorname[missing_data],
                     "`"), collapse = ", ")
      ))
  }
  
  if( any(missing_data) & action == "error"){
    stop(
      paste(
        "The following variables must not contain missing data:",
        paste(paste0("`", 
                     vectorname[missing_data],
                     "`"), collapse = ", ")
      ))
  }
  
  
  if( any(empty_data) & action == "warning" ){
    warning(
      paste(
        "The following variables contain \"\" values:",
        paste(paste0("`", 
                     vectorname[empty_data],
                     "`"), collapse = ", ")
      ))
  }
  
  if( any(empty_data) & action == "error" ){
    stop(
      paste(
        "The following variables must not equal \"\":",
        paste(paste0("`", 
                     vectorname[empty_data],
                     "`"), collapse = ", ")
      ))
  }
  
  !any(empty_data, missing_data)
}



#' Validate inpatient episode and ward movement records
#'
#' @description This function performs a series of checks for mandatory and
#' optional requirements on episodes of care records. The data definitions 
#' closely follow those of the  
#' \href{https://datadictionary.nhs.uk/data_sets/cds_v6-2/cds_v6-2_type_130_-_admitted_patient_care_-_finished_general_episode_cds.html#dataset_cds_v6-2_type_130_-_admitted_patient_care_-_finished_general_episode_cds}{English NHS Admitted Patient Care Commissioning Datasets}.
#' @param patients data frame containing one row per patient
#' @param episodes data frame containing one row per episode of care
#' @param wards (optional) data frame containing one row per ward stay. 
#'   Default is \code{NULL}.
#' @section Patient mandatory variables:
#' \describe{
#'   \item{\code{patient_id}}{a patient identifier with no missing value}
#'   }
#' @section Patient optional variables:
#' \describe{
#'   \item{\code{forename}}{the patient's forename}
#'   \item{\code{surname}}{the patient's surname}
#'   \item{\code{date_of_birth}}{a \code{Date} for the birth date}
#'   \item{\code{date_of_death}}{a missing value or a \code{Date} of death}
#'   \item{\code{sex}}{the following values are valid: \itemize{
#'        \item \code{"male"}  
#'        \item \code{"female"}  
#'        \item \code{"other"}
#'        \item \code{"unknown"}
#'    }
#'   Must not be missing.}
#'   \item{\code{ethnic_category_UK}}{reserved for UK users for \code{Ramses} to compute
#'   the empirical glomerular filtration rate (eGFR). The following codes are valid:
#'   
#'   White \itemize{
#'       \item \code{"A"} British
#'       \item \code{"B"} Irish
#'       \item \code{"C"}	Any other White background
#'   }
#'   Mixed
#'   \itemize{
#'       \item \code{"D"} White and Black Caribbean
#'       \item \code{"E"} White and Black African
#'       \item \code{"F"} White and Asian
#'       \item \code{"G"} Any other mixed background
#'   }
#' 
#'   Asian or Asian British
#'   \itemize{
#'       \item \code{"H"} Indian
#'       \item \code{"J"} Pakistani
#'       \item \code{"K"} Bangladeshi
#'       \item \code{"L"} Any other Asian background
#'   }
#'
#'   Black or Black British
#'   \itemize{
#'       \item \code{"M"} Caribbean
#'       \item \code{"N"} African
#'       \item \code{"P"} Any other Black background
#'   }
#'
#'   Other Ethnic Groups
#'   \itemize{
#'       \item \code{"R"} Chinese
#'       \item \code{"S"} Any other ethnic group
#'   }
#'   
#'   Not stated \itemize{
#'       \item \code{"Z"} Not stated
#'   }}
#' }
#' @section Episode mandatory variables:
#' \describe{
#'   \item{\code{patient_id}}{a patient identifier with no missing value}
#'   \item{\code{encounter_id}}{a hospital encounter (admission or spell) 
#'   identifier with no missing value}
#'   \item{\code{admission_method}}{a non-missing character code: \itemize{
#'        \item \code{"1"} elective admission
#'        \item \code{"2"} emergency admission
#'        \item \code{"3"} transfer/other admission
#'     }
#'   \emph{Note:} \code{"1"} and \code{"2"} corresponds to the first character 
#'   of the 
#'   \href{https://datadictionary.nhs.uk/attributes/admission_method.html}{NHS admission method value set};
#'   \code{"3"} corresponds to the remaining values starting with 
#'   \code{3} or \code{8}.}
#'   \item{\code{admission_date}}{a \code{POSIXct} timestamp for 
#'   the hospital admission. Must not be missing.}
#'   \item{\code{discharge_date}}{a \code{POSIXct} timestamp for 
#'   the hospital discharge. Must not be missing.}
#'   \item{\code{episode_number}}{a strictly positive integer indicating the
#'   number of the episode within an admission. Must not be missing.}
#'   \item{\code{last_episode_in_encounter}}{a character indicating whether
#'   the patient is discharged at the end of the episode: \itemize{
#'        \item \code{"1"} the episode is the last episode in the encounter
#'        \item \code{"2"} the episode is \strong{not} the last episode in the encounter
#'    }
#'    Must not be missing.}
#'   \item{\code{episode_start}}{a \code{POSIXct} timestamp for 
#'   the hospital start. Must not be missing.}
#'   \item{\code{episode_end}}{a \code{POSIXct} timestamp for 
#'   the hospital end Must not be missing.}
#'   \item{\code{consultant_code}}{a code uniquely identifying the medical 
#'   professional responsible for the episode of care. Must not be missing.}
#'   \item{\code{main_specialty_code}}{a code identifying
#'   the main specialty of the medical professional responsible for the 
#'   episode of care. Must not be missing.}
#' }
#' 
#' @section Ward mandatory variables:
#' \describe{
#'   \item{\code{patient_id}}{a patient identifier with no missing value}
#'   \item{\code{encounter_id}}{a hospital encounter identifier with no missing value}
#'   \item{\code{ward_code}}{character vector of ward codes}
#'   \item{\code{ward_display_name}}{character vector of expanded ward designations}
#'   \item{\code{ward_description}}{full text description of service/unit/specialty}
#'   \item{\code{ward_start}}{a \code{POSIXct} timestamp for 
#'   the arrival time on the ward}
#'   \item{\code{ward_end}}{a \code{POSIXct} timestamp for 
#'   the ward departure time}
#' }
#' @return A logical value indicating success
#' @export
validate_inpatient_episodes <- function(patients,
                                        episodes,
                                        wards = NULL) {
  
  patient_schema <- .inpatient_patients_variables()
  episode_schema <- .inpatient_episodes_variables()
  
  variable_exists <- episode_schema[episode_schema$must_exist, "variable_name"]
  not_exist <- !vapply(variable_exists, exists, where = episodes,
                       FUN.VALUE = logical(1))
  if( any(not_exist) ){
    stop(
      simpleError(paste(
        "The following variables must exist:",
        paste(paste0("`", variable_exists[not_exist], "`"), collapse = ", "))
      )
    )
  }
  
  variable_exists_non_missing <- episode_schema[
    episode_schema$must_be_nonmissing, 
    "variable_name"]
  no_missing_data <- .validate_variable_no_missing(
    data = episodes,
    vectorname = variable_exists_non_missing,
    action = "error"
  )
  
  variable_exists <- patient_schema[patient_schema$must_exist, "variable_name"]
  not_exist <- !vapply(variable_exists, exists, where = patients,
                       FUN.VALUE = logical(1))
  if( any(not_exist) ){
    stop(
      simpleError(paste(
        "The following variables must exist:",
        paste(paste0("`", variable_exists[not_exist], "`"), collapse = ", "))
      )
    )
  }
  
  variable_exists_non_missing <- patient_schema[
    patient_schema$must_be_nonmissing, 
    "variable_name"]
  no_missing_data <- .validate_variable_no_missing(
    data = patients,
    vectorname = variable_exists_non_missing,
    action = "error"
  )
  
  if( !all(unique(episodes$patient_id) %in% unique(patients$patient_id)) ) {
    stop("All patients in `episodes` must exist in `patients`")
  }
  
  variable_uniqueness <- .validate_values_unique(
    data = patients,
    col_names = patient_schema[
      patient_schema$must_be_unique, 
      "variable_name"]
  )
  
  validation_result <- .validate_inpatient_encounters(episodes)
  validation_result <- append(
    .validate_inpatient_episode_dates(data = episodes,
                                     type = "episodes"), 
    validation_result)
  
  if(!is.null(wards)) {
    validation_result <- append(
      .validate_inpatient_wards(episodes, wards), 
      validation_result)
  }
 
  !any(!validation_result)
}


.validate_inpatient_wards <- function(episodes, wards) {
  
  ward_schema <- .inpatient_wards_variables()
  
  variable_exists <- ward_schema[ward_schema$must_exist, "variable_name"]
  not_exist <- !vapply(variable_exists, exists, where = wards,
                       FUN.VALUE = logical(1))
  if( any(not_exist) ){
    stop(
      simpleError(paste(
        "The following variables must exist:",
        paste(paste0("`", variable_exists[not_exist], "`"), collapse = ", "))
      )
    )
  }
  
  variable_exists_non_missing <- ward_schema[
    ward_schema$must_be_nonmissing, 
    "variable_name"]
  no_missing_data <- .validate_variable_no_missing(
    data = wards,
    vectorname = variable_exists_non_missing,
    action = "error"
  )
  
  wards <- merge(
    wards, 
    dplyr::distinct(episodes, 
                    .data$patient_id, 
                    .data$encounter_id, 
                    .data$admission_date, 
                    .data$discharge_date), 
    all.x = TRUE)
  
  .validate_inpatient_episode_dates(data = wards,
                                   type = "wards")
}


#' Validate the consistency of admission and discharge dates
#'
#' @param data a data frame object
#' @importFrom data.table data.table
#' @return A logical value indicating success
#' @noRd
.validate_inpatient_encounters <- function(data) {
  
  validation_result <- TRUE
  
  if (any(data$admission_date > data$discharge_date)) {
    warning(simpleWarning("Some admission dates are posterior to discharge dates."))
    validation_result <- FALSE
  }
  
  encounters <- data[, c(
    "patient_id", "encounter_id",
    "admission_date", "discharge_date"
  )]
  
  encounters <- unique(data.table::data.table(encounters))
  
  data_cross_prod <- merge(
    encounters, encounters, 
    by = "patient_id",
    all = T,
    allow.cartesian = T
  )
  
  data_cross_prod <- data_cross_prod %>% 
    dplyr::filter(.data$encounter_id.x != .data$encounter_id.y) %>% 
    dplyr::filter(
      ( .data$admission_date.x > .data$admission_date.y &
          .data$admission_date.x < .data$discharge_date.y ) | 
        ( .data$discharge_date.x > .data$admission_date.y &
            .data$discharge_date.x < .data$discharge_date.y )
    ) 

  if (nrow(data_cross_prod) > 0) {
    warning(simpleWarning("Hospital encounters must not overlap."))
    warning(simpleWarning(
      .print_and_capture(
        utils::head(data.table::setorderv(data_cross_prod, 
                                          c("patient_id", "admission_date.x"))))
    ))
    validation_result <- FALSE
  }
  
  validation_result
}


#' Validate the consistency of inpatient episode dates
#'
#' @param data a data frame object
#' @param type a string indicating the type of dates to validate:
#' either \code{"wards"} for ward stays or \code{"episodes"} for inpatient episodes.
#' @importFrom data.table data.table := 
#' @return A logical value indicating success
#' @noRd
.validate_inpatient_episode_dates <- function(data, type = "episodes") {
  
  if(type == "episodes") {
    data$start <- data[["episode_start"]]
    data$end <- data[["episode_end"]]
  } else if(type == "wards") {
    data$start <- data[["ward_start"]]
    data$end <- data[["ward_end"]]
  } else {
    stop("`type` must be 'episodes' or 'wards'.")
  }
  
  validation_result <- TRUE
  if (any(data$start > data$end)) {
    warning(simpleWarning(paste0("Some `", type, "` start dates are posterior to end dates.")))
    validation_result <- FALSE
  }
  
  if (any(!(data$start >= data$admission_date & data$start <= data$discharge_date))) {
    warning(simpleWarning(paste0("Some `", type, "` fall outside hospitalisation dates.")))
    validation_result <- FALSE
  }
  
  if (any(!(data$end >= data$admission_date & data$end <= data$discharge_date))) {
    warning(simpleWarning(paste0("Some `", type, "` fall outside hospitalisation dates.")))
    validation_result <- FALSE
  }
  
  episodes <- data.table::as.data.table(data)
  
  end <- start <- patient_id <- encounter_id <- admission_date <- NULL
  discharge_date <- BD_encounter <- BD_episode <- nextepistart <- NULL
  
  bed_day_matching <- episodes[,
    list(BD_episode = sum(difftime(end, start, units = "secs"))), 
    by = list(patient_id, 
              encounter_id, 
              admission_date, 
              discharge_date)
    ]
  bed_day_matching[, BD_encounter := difftime(discharge_date, 
                                              admission_date,
                                              units = "secs")]
  
  if (nrow(bed_day_matching[abs(BD_episode - BD_encounter) > 5]) > 0) {
    warning(simpleWarning(paste0(
      "Total bed days calculated from `", type,"` does not",
      "match admission duration.\n Bed days may be incorrect.")))
  }
  
  rm(bed_day_matching)
  
  episodes <- episodes[, list(patient_id, encounter_id, start, end)]
  
  episodes <- data.table::setorderv(
    episodes, c("patient_id", "encounter_id", "start"))
  
  episodes[ , `:=`(nextepistart =  data.table::shift(start, type = "lead")), 
            by = list(patient_id, encounter_id)]
  
  if (nrow(episodes[!is.na(nextepistart) & nextepistart != end]) > 0) {
    warning(simpleWarning(paste0(
      "Some hospital encounters have gaps between `", type,"`.\n",
      "Bed days may be underestimated.")))
  }

  episodes[ , nextepistart := NULL]
  
  data_cross_prod <- merge(
    episodes, episodes, 
    by = c("patient_id"),
    all = T,
    allow.cartesian = T
  )
  
  data_cross_prod <- data_cross_prod %>% 
    dplyr::filter(
      !(.data$encounter_id.x == .data$encounter_id.y & 
          .data$start.x == .data$start.y & 
          .data$end.x == .data$end.y)
    ) %>% 
    dplyr::filter(
      ( .data$start.x > .data$start.y &
          .data$start.x < .data$end.y ) | 
        ( .data$end.x > .data$start.y &
            .data$end.x < .data$end.y )
    )
  
  if (nrow(data_cross_prod) > 0) {
    warning(simpleWarning(paste0("Hospital `", type,"` must not overlap.")))
    utils::head(data.table::setorderv(
      data_cross_prod, c("patient_id", "start.x"))
      )
    validation_result <- FALSE
    
  }
  
  validation_result
}


#' Validate inpatient diagnosis records
#' 
#' @description Validate constraints on diagnosis records, namely that the 
#' minimum variables are present, and that all \code{icd_code} values can be 
#' looked up in an ICD-10 reference table
#' @param diagnoses_data a data frame containing clinical diagnoses, with, 
#' at minimum, variables \code{patient_id}, \code{encounter_id}, 
#' \code{episode_number}, \code{icd_code}, \code{diagnosis_position}
#' @param diagnoses_lookup a data frame containing an ICD-10 reference look up 
#' table with, at minimum, variables \code{icd_description}, \code{icd_display}, 
#' \code{category_code}, \code{category_description}
#' 
#' @section Diagnoses mandatory variables:
#' \describe{
#'   \item{\code{patient_id}}{a patient identifier with no missing value}
#'   \item{\code{encounter_id}}{a hospital encounter identifier with no missing value}
#'   \item{\code{episode_number}}{a strictly positive integer indicating the
#'   number of the episode within an admission. Must not be missing.}
#'   \item{\code{icd_code}}{a code corresponding to the International Classification of Diseases
#'   without a "." separator}
#'   \item{\code{diagnosis_position}}{an integer describing the diagnosis position
#'   on the discharge summary (1 = primary cause of admission}
#' }
#' 
#' @section Diagnoses optional variables:
#' Some record systems track dates when clinical diagnoses were first noted and
#' when they were considered resolved (eg: problem lists). \code{diagnosis_start}
#' and \code{diagnosis_end} should be used to store this information.
#' 
#' \describe{
#'   \item{\code{diagnosis_start}}{a vector of \code{POSIXct} timestamps when 
#'   the clinical problems were first noted or manifested}
#'   \item{\code{diagnosis_end}}{a vector of \code{POSIXct} timestamp when the
#'   clinical problems were considered resolved}
#' }
#' 
#' If no data are provided, Ramses functions such as \code{\link[Ramses]{therapy_timeline}}
#' will use episode start and end dates from the \code{inpatient_episodes} table instead.
#' 
#' @section Diagnoses lookup mandatory variables:
#' 
#' \describe{
#'   \item{\code{icd_code}}{character diagnosis codes corresponding to the 
#'   International Classification of Diseases formatted without a "." separator (eg "A0101")}
#'   \item{\code{icd_display}}{laid-out diagnosis codes for display (eg: "A01.01")}
#'   \item{\code{icd_description}}{full text descriptions of the diagnoses}
#'   \item{\code{category_code}}{three-character heading codes (eg "A01")}
#'   \item{\code{category_description}}{full text descriptions of three-character heading codes}
#' }
#' 
#' \strong{Note}: \code{\link[Ramses]{import_icd}()} can produce this lookup data frame from
#' a standard ICD release archive file.
#' 
#' @return A logical value indicating success
#' @export
#' @importFrom data.table data.table :=
#' @examples
#' data_icd <- dplyr::filter(Ramses::inpatient_diagnoses, !is.na(icd_code))
#' lookup_icd <- dplyr::distinct(data_icd, icd_code)
#' lookup_icd$icd_display <- lookup_icd$icd_code
#' lookup_icd$icd_description <- "ICD-10 code label text"
#' lookup_icd$category_code <- substr(lookup_icd$icd_code, 0, 3)
#' lookup_icd$category_description <- "ICD-10 category label text"
#' validate_inpatient_diagnoses(data_icd, lookup_icd)
validate_inpatient_diagnoses <- function(diagnoses_data, diagnoses_lookup) {

  diagnoses_data_schema <- .inpatient_diagnoses_data_variables()
  diagnoses_lookup_schema <- .inpatient_diagnoses_lookup_variables()
  
  data_var_exists <- diagnoses_data_schema[
    diagnoses_data_schema[["must_exist"]],
    "variable_name"
  ]
  not_exist <- !vapply(data_var_exists, exists, where = diagnoses_data,
                       FUN.VALUE = logical(1))
  if( any(not_exist) ){
    stop(
      simpleError(paste(
        "The following variables must exist:",
        paste(paste0("`", data_var_exists[not_exist], "`"), collapse = ", "))
      )
    )
  }
  
  lkup_var_exists <- diagnoses_lookup_schema[
    diagnoses_lookup_schema[["must_exist"]],
    "variable_name"
  ]
  not_exist <- !vapply(lkup_var_exists, exists, where = diagnoses_lookup,
                       FUN.VALUE = logical(1))
  if( any(not_exist) ){
    stop(
      simpleError(paste(
        "The following variables must exist:",
        paste(paste0("`", lkup_var_exists[not_exist], "`"), collapse = ", "))
      )
    )
  }

  validation_result <- .validate_variable_no_missing(
    data = diagnoses_data,
    vectorname = diagnoses_data_schema[
      diagnoses_data_schema[["must_be_nonmissing"]],
      "variable_name"
    ],
    action = "warning"
  )
  
  validation_result <- validation_result & 
    .validate_variable_no_missing(
    data = diagnoses_lookup,
    vectorname = diagnoses_lookup_schema[
      diagnoses_lookup_schema[["must_be_nonmissing"]],
      "variable_name"
    ],
    action = "error"
  )
  
  icd_code <- NULL
  diagnoses_data <- data.table::data.table(diagnoses_data)
  diagnoses_data <- unique(diagnoses_data[, list(icd_code)])
  diagnoses_lookup <- data.table::data.table(diagnoses_lookup)[, list(icd_code)]
  diagnoses_lookup[, missing := FALSE]
  
  diagnoses_data <- merge(diagnoses_data, diagnoses_lookup, by = "icd_code", all.x = T)
  
  if (any(is.na(diagnoses_data$missing))) {
    warning(
      simpleWarning("some `icd_code` values in `diagnoses_data` do not match any `icd_code` in `diagnoses_lookup`")
    )
  }
  
  validation_result
}


#' Validate medication prescription records
#'
#' @description This function performs a series of checks for mandatory and
#' optional requirements on prescriptions data.
#' @param data a data frame containing one row per prescription
#' @section Mandatory fields:
#' These fields are required in order to pass the validation:
#'   \describe{
#'      \item{\code{patient_id}}{a patient identifier with no missing value}
#'      \item{\code{prescription_id}}{a prescription identifier with no missing value}
#'      \item{\code{prescription_text}}{a character string summarising the prescription 
#'      (to be displayed in user interfaces, eg: \code{'Amoxicillin oral 500mg BDS'})}
#'      \item{\code{drug_code}}{identifier of the drug (for antibacterials/antifungals, 
#'      use \code{\link[AMR]{as.ab}()})}
#'      \item{\code{drug_name}}{preferred name of the drug in the drug dictionary
#'      (see \code{\link[AMR]{ab_name}()})}
#'      \item{\code{drug_display_name}}{drug name to display in reports and user interfaces
#'      (can be the same as \code{drug_name})}
#'      \item{\code{drug_group}}{the antimicrobial class see \code{\link[AMR]{ab_group}()}}
#'      \item{\code{antiinfective_type}}{type of antiinfective ("antibacterial", "antifungal",
#'      "antiviral", or "antiparasitic")}
#'      \item{\code{ATC_code}}{the ATC code, see \code{\link[AMR]{ab_atc}()}}
#'      \item{\code{ATC_route}}{route of administration as defined in the ATC (
#'      \code{"O"} = oral; \code{"P"} = parenteral; \code{"Inhal"} = inhaled; 
#'      \code{"N"} = nasal; \code{"SL"} = sublingual/buccal/oromucosal;
#'      \code{"TD"} = transdermal; \code{"R"} = rectal; \code{"V"} = vaginal)}
#'      \item{\code{authoring_date}}{timestamp for when the prescription was issued}
#'      \item{\code{prescription_start}}{timestamp for the prescription start}
#'      \item{\code{prescription_end}}{timestamp for the prescription end (mandated except
#'      for one-off prescriptions with \code{daily_frequency} == -1 or 
#'      prescriptions with \code{prescription_status != "completed"})}
#'      \item{\code{prescription_context}}{either \code{'inpatient'}, \code{'opat'}, or 
#'      \code{'discharge'}}
#'      \item{\code{prescription_status}}{one value from the following 
#'      \href{https://hl7.org/fhir/R4/valueset-medicationrequest-status.html}{FHIR R4}
#'      reference set:
#'      \itemize{ 
#'         \item \code{"active"} the prescription is 'actionable', but not all actions 
#'         that are implied by it have occurred yet.
#'         \item \code{"on-hold"} actions implied by the prescription are to be 
#'         temporarily halted, but are expected to continue later. 
#'         \item \code{"cancelled"} the prescription has been withdrawn before any 
#'         administrations have occurred.
#'         \item \code{"completed"} all actions that are implied by the prescription 
#'         have occurred.
#'         \item \code{"entered-in-error"} some of the actions that are implied by the 
#'         medication request may have occurred. For example, the medication may have 
#'         been dispensed and the patient may have taken some of the medication. 
#'         Clinical decision support systems should take this status into account.
#'         \item \code{"stopped"} actions implied by the prescription are to be 
#'         permanently halted, before all of the administrations occurred. This should 
#'         not be used if the original order was entered in error.
#'         \item \code{"draft"} the prescription is not yet 'actionable', e.g. it is a 
#'         work in progress, requires sign-off, verification or needs to be run through 
#'         decision support process.
#'         \item \code{"unknown"} the authoring/source system does not know which of the 
#'         status values currently applies for this observation. \emph{Note:} This 
#'         concept is not to be used for 'other' - one of the listed statuses is presumed 
#'         to apply, but the authoring/source system does not know which.         
#'      }
#'      \emph{Note that prescriptions marked as \code{"on-hold"}, \code{"cancelled"}, 
#'      \code{"draft"}, \code{"entered-in-error"}, or \code{"unknown"} will not 
#'      count towards therapy episodes or antibiotic consumption estimates.}}
#'      \item{\code{dose}}{a numeric vector of dosage quantities}
#'      \item{\code{unit}}{a character vector of dosage units}
#'      \item{\code{route}}{the route of administration value natively assigned by system}
#'      \item{\code{frequency}}{a character vector of frequencies of administrations
#'       (eg: "BDS" or "Twice a day"). See also: \code{\link{reference_drug_frequency}}}
#'      \item{\code{daily_frequency}}{a numeric translation of variable \code{frequency}
#'      indicating the number of times the drug is to be administered per day. 
#'      Values can be lower than 1 for prescriptions administered less than daily.
#'      Values must be strictly positive, except for the following codes:
#'      \itemize{ 
#'         \item -1 for a single one-off administration
#'         \item -9 for 'as required' (\emph{Pro Re Nata}) prescriptions
#'      }}}
#' @section Optional fields:
#' \describe{
#'   \item{\code{combination_id}}{system-issued identifiers for drugs 
#'        prescribed as a bundle to treat the same indication either 
#'        simultaneously (eg clarithromycin and amoxiclav) or consecutively 
#'        (eg doxicycline 200mg followed by 100mg). Unless provided, 
#'        such identifiers will be created by \code{Ramses} using 
#'        transitive closure.}
#'   \item{\code{DDD}}{the prescribed daily dose (dose x \code{daily_frequency})
#'   expressed in defined daily doses, see \code{\link{compute_DDDs}()}}
#'   \item{\code{...}}{any other field, as desired, can be loaded into the database}
#' }
#' @return NULL if the \code{data} passes the validation. The function will trigger 
#' errors when \emph{mandatory requirements} are not met and warnings when
#' \emph{optional requirements} are not met.
#' @export
validate_prescriptions <- function(data) {
  
  drug_prescriptions_variables <- .drug_prescriptions_variables()
  
  variable_exists <- drug_prescriptions_variables[
    drug_prescriptions_variables[["must_exist"]],
    "variable_name"
  ]
  variable_exists_non_missing <- drug_prescriptions_variables[
    drug_prescriptions_variables[["must_be_nonmissing"]],
    "variable_name"
  ]
  
  not_exist <- !vapply(variable_exists, exists, where = data,
                       FUN.VALUE = logical(1))
  if( any(not_exist) ){
    stop(
      simpleError(paste(
        "The following variables must exist:",
        paste(paste0("`", variable_exists[not_exist], "`"), collapse = ", "))
        )
    )
  }
  
  missing_data <- .validate_variable_no_missing(
    data = data,
    vectorname = variable_exists_non_missing,
    action = "error"
  )
 
  invalid_status <- !data$prescription_status %in% c(
    "active", "on-hold", "cancelled", "completed", 
    "entered-in-error", "stopped", "draft", "unknown"
  )
  
  if( any(invalid_status) ) {
    stop(
      simpleError(
        '`prescription_status` must be one of: "active", "on-hold", "cancelled", "completed", 
    "entered-in-error", "stopped", "draft", or "unknown"'
      )
    )
  }
  
  if( any(is.na(data$prescription_end) & 
          data$daily_frequency != -1 &
          data$prescription_status == "completed") ) {
    stop(
      simpleError(paste(
        "prescriptions with status 'completed' must have a valid `prescription_end`", 
        "except for one-off prescriptions\n",
        .print_and_capture(utils::head(
          dplyr::select(dplyr::filter(
            data, is.na(.data$prescription_end) & 
              .data$daily_frequency != -1 &
              .data$prescription_status == "completed"),
            "patient_id", "daily_frequency", "prescription_end")))
    )))
  }
  
  if( !all(data$daily_frequency == -9 | 
           data$daily_frequency == -1 |
           dplyr::between(data$daily_frequency, 0, 48)) ){
    stop(
      simpleError(paste(
        "Prescription `daily_frequency` must be between",
        "0 and 48, or -9 for PRN 'as required', or -1 for one-off prescriptions"
        )))
  }
  
  if( any(
    !data$antiinfective_type %in% c(
      "antibacterial",
      "antifungal",
      "antiviral",
      "antiparasitic"
    )
  )) {
    stop(paste(
      '`antiinfective_type` must be one of: "antibacterial", "antifungal",',
      '"antiviral", or "antiparasitic"'))
  }
  
  if( any( 
    !data$ATC_route %in% c("Implant", "Inhal", "Instill", "N", 
                           "O", "P", "R", "SL", "TD", "V")
  )) {
    stop(paste('`ATC_route` must be one of:',
               '"Implant", "Inhal", "Instill", "N",',
               '"O", "P", "R", "SL", "TD", "V"'))
  }
  
  duplicates <- data %>% 
    dplyr::group_by(.data$patient_id, .data$drug_code, .data$dose, .data$route, .data$prescription_start) %>% 
    dplyr::summarise(n = dplyr::n()) %>% 
    dplyr::filter(.data$n > 1)
  duplicates <- merge(data, duplicates)
  
  if( nrow(duplicates) > 0 ) {
    warning(
      simpleWarning("There may be some duplicate records")
    )
    warning(simpleWarning(
      .print_and_capture(utils::head(
        dplyr::select(
          dplyr::arrange(duplicates, 
                         .data$patient_id, 
                         .data$drug_code, 
                         .data$prescription_start),
          "patient_id", 
          "prescription_id", 
          "prescription_text", 
          "prescription_start"
        )
      ))))
  }
  
  NULL
}



#' Validate medication administration records
#' 
#' @description This function performs a series of checks for mandatory and
#' optional requirements on drug administrations data.
#' @param data a data frame containing one row per drug administration
#' @section Mandatory fields:
#' \strong{The following fields are required in order to pass the validation:}
#' \describe{
#'      \item{\code{patient_id}}{a patient identifier with no missing value}
#'      \item{\code{prescription_id}}{a prescription identifier with no missing value}
#'      \item{\code{administration_id}}{an administration identifier with no missing value}
#'      \item{\code{administration_text}}{a character string summarising the drug to administer 
#'      (to be displayed in user interfaces, eg: \code{'Amoxicillin oral 500mg'})}
#'      \item{\code{drug_code}}{identifier of the drug (for antibacterials/antifungals, 
#'      use \code{\link[AMR]{as.ab}()})}
#'      \item{\code{drug_name}}{preferred name of the drug in the drug dictionary
#'      (see \code{\link[AMR]{ab_name}()})}
#'      \item{\code{drug_display_name}}{drug name to display in reports and user interfaces
#'      (can be the same as \code{drug_name})}
#'      \item{\code{drug_group}}{the antimicrobial class see \code{\link[AMR]{ab_group}()}}
#'      \item{\code{antiinfective_type}}{type of antiinfective ("antibacterial", "antifungal",
#'      "antiviral", or "antiparasitic")}
#'      \item{\code{ATC_code}}{the ATC code, see \code{\link[AMR]{ab_atc}()}}
#'      \item{\code{ATC_route}}{route of administration as defined in the ATC (
#'      \code{"O"} = oral; \code{"P"} = parenteral; \code{"Inhal"} = inhaled; 
#'      \code{"N"} = nasal; \code{"SL"} = sublingual/buccal/oromucosal;
#'      \code{"TD"} = transdermal; \code{"R"} = rectal; \code{"V"} = vaginal)}
#'      \item{\code{dose}}{a numeric vector of dosage quantities}
#'      \item{\code{unit}}{a character vector of dosage units}
#'      \item{\code{route}}{the route of administration value natively assigned by system}
#'      \item{\code{administration_date}}{timestamp of the drug administration}
#'      \item{\code{administration_status}}{one value from the following 
#'      \href{https://hl7.org/fhir/R4/valueset-medicationrequest-status.html}{FHIR R4}
#'      reference set:
#'      \itemize{ 
#'         \item \code{"in-progess"} the administration has started but has not yet completed.
#'         \item \code{"not-done"} the administration was terminated prior to any impact on 
#'         the subject (though preparatory actions may have been taken). 
#'         \item \code{"on-hold"} actions implied by the administration have been 
#'         temporarily halted, but are expected to continue later.
#'         \item \code{"completed"} all actions that are implied by the administration 
#'         have occurred.
#'         \item \code{"entered-in-error"} the administration was entered in error and 
#'         therefore nullified. 
#'         \item \code{"stopped"} actions implied by the administration have been permanently 
#'         halted, before all of them occurred.
#'         \item \code{"unknown"} the authoring/source system does not know which of the 
#'         status values currently applies for this request. \emph{Note:} This concept 
#'         is not to be used for 'other' - one of the listed statuses is presumed 
#'         to apply, but the authoring/source system does not know which.         
#' }}}
#' @section Optional fields:
#' \describe{
#'      \item{\code{DDD}}{the administered dose expressed in defined daily doses,
#'       see \code{\link{compute_DDDs}()}}
#'      \item{\code{...}}{any other field, as desired, can be loaded into the database.}}
#' @return NULL if the \code{data} passes the validation. The function will trigger 
#' errors when \emph{mandatory requirements} are not met and warnings when
#' \emph{optional requirements} are not met.
#' @export
validate_administrations <- function(data) {
  
  drug_administrations_variables <- .drug_administrations_variables()
  
  variable_exists <- drug_administrations_variables[
    drug_administrations_variables[["must_exist"]],
    "variable_name"
  ]
  
  not_exist <- !vapply(variable_exists, exists, where = data,
                       FUN.VALUE = logical(1))
  if( any(not_exist) ){
    stop(
      simpleError(paste(
        "The following variables must exist:",
        paste(paste0("`", variable_exists[not_exist], "`"), collapse = ", "))
      )
    )
  }
  
  variable_exists_non_missing <- 
    drug_administrations_variables[["variable_name"]]
  variable_exists_non_missing <- variable_exists_non_missing[which(
    drug_administrations_variables[["must_be_nonmissing"]]
  )]
  
  missing_data <- .validate_variable_no_missing(
    data = data,
    vectorname = variable_exists_non_missing,
    action = "error"
  )
  
  invalid_status <- !data$administration_status %in% c(
    "in-progress", "not-done", "on-hold", "completed", 
    "entered-in-error", "stopped", "unknown"
  )
  
  if( any(invalid_status) ) {
    stop(
      simpleError(
        '`prescription_status` must be one of: "in-progress", "not-done", 
        "on-hold", "completed", "entered-in-error", "stopped", or "unknown"'
      )
    )
  }
  
  if( any(
    !data$antiinfective_type %in% c(
      "antibacterial",
      "antifungal",
      "antiviral",
      "antiparasitic"
    )
  )) {
    stop(paste(
      '`antiinfective_type` must be one of: "antibacterial", "antifungal",',
      '"antiviral", or "antiparasitic"'))
  }
  
  duplicates <- data %>% 
    dplyr::group_by(.data$patient_id, .data$drug_code, .data$dose, .data$route, .data$administration_date) %>% 
    dplyr::summarise(n = dplyr::n()) %>% 
    dplyr::filter(.data$n > 1)
  duplicates <- merge(data, duplicates)
  
  if( nrow(duplicates) > 0 ) {
    warning(
      simpleWarning("There may be some duplicate records")
    )
    warning(simpleWarning(
      .print_and_capture(utils::head(
        dplyr::select(
          dplyr::arrange(duplicates, 
                         .data$patient_id, 
                         .data$drug_code, 
                         .data$administration_date),
          "patient_id", 
          "prescription_id", 
          "administration_id", 
          "administration_text",
          "administration_date"
        )
    ))))
  }
  
  NULL
}

#' Validate microbial isolates & susceptibility records
#'
#' @param specimens a data frame with one row per specimen sent
#' to laboratory (see details)
#' @param isolates a data frame with one row per microorganism
#' isolated from the laboratory specimen (see details)
#' @param susceptibilities a data frame with one row per susceptibility
#' (see details)
#' 
#' @section Specimens data frame:
#' \strong{The following fields are mandatory:}
#' \describe{
#'    \item{\code{specimen_id}}{a unique specimen identifier with no missing value}
#'    \item{\code{patient_id}}{a patient identifier with no missing value}
#'    \item{\code{status}}{one value from the following 
#'    \href{https://www.hl7.org/fhir/valueset-specimen-status.html}{FHIR R4}
#'      reference set: \itemize{
#'        \item \code{"available"}: The physical specimen is present and 
#'        in good condition.
#'        \item \code{"unavailable"}: There is no physical specimen because it is either 
#'        lost, destroyed or consumed.
#'        \item \code{"unsatisfactory"}: The specimen cannot be used because of a
#'         quality issue such as a broken container, contamination, or too old.	
#'        \item \code{"entered-in-error"}: The specimen was entered in error and 
#'        therefore nullified.
#'      }}
#'    \item{\code{specimen_datetime}}{datetime when specimen was sampled or 
#'    received for processing.}
#'    \item{\code{specimen_type_code}}{character vector of descendants of 
#'    the SNOMED CT concept \code{123038009 | Specimen (specimen) |}. Admissible
#'    values are listed in \link[Ramses]{reference_specimen_type}}
#'    \item{\code{specimen_type_name}}{character vector of the SNOMED CT
#'    preferred terms for \code{specimen_type_code}}
#'    \item{\code{specimen_type_display}}{character vector of custom specimen
#'    types for display in user interfaces}
#' }
#' 
#' \strong{The following fields are optional:}
#' \describe{
#'    \item{\code{encounter_id}}{a hospital encounter identifier (if the specimen was 
#'    sampled during admission)}
#'    \item{\code{test_display}}{free text description of test requested for 
#'    display in user interfaces. For instance: "Mycobacteria culture" or 
#'    "Microbial culture, anaerobic, initial isolation". Coded concepts
#'    should be stored in other custom columns.}
#'    \item{\code{reason_display}}{free text reason for a procedure for display 
#'    in user interfaces. For instance, "Suspected urinary tract infection", or
#'    "Surgical site microbiological sample". Coded concepts should be stored in other 
#'    custom columns.}
#' }
#'    
#' @section Isolates data frame:
#' 
#' \strong{The following fields are mandatory:}
#' \describe{
#'    \item{\code{isolate_id}}{a unique isolated organism identifier with no missing value}
#'    \item{\code{specimen_id}}{a specimen identifier with no missing value}
#'    \item{\code{patient_id}}{a patient identifier with no missing value}
#'    \item{\code{organism_code}}{a character vector containing either: \itemize{
#'       \item a microorganism code validated using \code{\link[AMR]{as.mo}()}
#'       \item \code{NA_character_} if no microorganism was isolated, for instance
#'       due to no growth or mixed heavy growth
#'    }}
#'    \item{\code{organism_name}}{a microorganism name provided by 
#'    \code{\link[AMR]{mo_name}()}, or \code{NA} if no microorganism was isolated}
#'    \item{\code{organism_display_name}}{microorganism name as labelled by the
#'    laboratory, for display in user interfaces. No growth/mixed heavy growth should
#'    be referenced here}
#'    \item{\code{isolation_datetime}}{datetime when the organism was first isolated 
#'    (or reported) by the laboratory}
#'  }
#' 
#' \strong{The following field is optional:}
#' \describe{
#'    \item{\code{mdr_classification}}{character vector of classifications produced by 
#'    \code{\link[AMR]{mdro}()}. Admissible values are: \itemize{
#'       \item \code{NA_character_} when susceptibilities are not available/conclusive
#'       \item \code{"Negative"} for isolate presenting no wide resistance phenotype
#'       \item other character codes dependent on the \code{guideline} parameter 
#'       provided to \code{\link[AMR]{mdro}()}
#'    }}
#' }  
#'  
#' @section Susceptibilities data frame:
#' 
#' \strong{The following fields are mandatory:}
#' \describe{
#'   \item{\code{isolate_id}}{an isolated organism identifier with no missing value}
#'   \item{\code{specimen_id}}{a specimen identifier with no missing value}
#'   \item{\code{patient_id}}{a patient identifier with no missing value}
#'   \item{\code{organism_code}}{a microorganism code validated using 
#'   \code{\link[AMR]{as.mo}()}, with no missing values}
#'   \item{\code{organism_name}}{a microorganism name provided by 
#'   \code{\link[AMR]{mo_name}()}, with no missing values}
#'   \item{\code{organism_display_name}}{microorganism name as labelled by the
#'   laboratory, for display in user interfaces, with no missing values}
#'   \item{\code{agent_code}}{code of the antimicrobial tested as provided by 
#'   \code{\link[AMR]{as.ab}()}}
#'   \item{\code{agent_name}}{name of the antimicrobial tested as provided by 
#'   \code{\link[AMR]{ab_name}()}}
#'   \item{\code{agent_display_name}}{name of the antimicrobial tested, 
#'   with no missing values, for  display in reports and user interfaces
#'   (can be the same as \code{drug_name})}
#'   \item{\code{rsi_code}}{\code{"R"} (resistant), \code{"S"} (susceptible), 
#'    or \code{"I"} (intermediate exposure), as determined by the laboratory or by
#'   \code{\link[AMR]{as.sir}()} on the basis of minimum inhibitory concentrations
#'   or disk diffusion diameters}
#' }
#' 
#' @return TRUE if the validation is passed
#' @export
validate_microbiology <- function(specimens, isolates, susceptibilities) {
  
  schema <- .inpatient_microbiology_variables()
  
  forgetme <- .validate_variable_exist(
    data = specimens,
    vectorname = schema$specimens$variable_name[
      schema$specimens$must_exist
    ],
    action = "error"
  )
  forgetme <- .validate_variable_exist(
    data = isolates,
    vectorname = schema$isolates$variable_name[
      schema$isolates$must_exist
    ],
    action = "error"
  )
  forgetme <- .validate_variable_exist(
    data = susceptibilities,
    vectorname = schema$susceptibilities$variable_name[
      schema$susceptibilities$must_exist
    ],
    action = "error"
  )
  
  forgetme <- .validate_variable_no_missing(
    specimens, 
    schema$specimens$variable_name[schema$specimens$must_be_nonmissing])
  forgetme <- .validate_variable_no_missing(
    isolates, 
    schema$isolates$variable_name[schema$isolates$must_be_nonmissing])
  forgetme <- .validate_variable_no_missing(
    susceptibilities, 
    schema$susceptibilities$variable_name[schema$susceptibilities$must_be_nonmissing])
  
  forgetme <- .validate_values_unique(
    specimens,
    schema$specimens$variable_name[schema$specimens$must_be_unique]
  )
  forgetme <- .validate_values_unique(
    isolates,
    schema$isolates$variable_name[schema$isolates$must_be_unique]
  )
  .validate_values_unique(
    susceptibilities,
    schema$susceptibilities$variable_name[schema$susceptibilities$must_be_unique]
  )
  
  invalid_specimen_codes <- !(specimens$specimen_type_code %in% 
                                Ramses::reference_specimen_type$conceptId)
  if( any(invalid_specimen_codes) ) {
    warning(paste(
      c("Some values in `specimen_type_code` are not valid SNOMED CT specimen concepts:",
        paste(utils::head(unique(specimens$specimen_type_code[invalid_specimen_codes])), collapse = ", ")),
      collapse = "\n"
    ))
  }
  
  if( any(!susceptibilities$isolate_id %in% isolates$isolate_id) ) {
    stop("Some `isolate_id` in `susceptibility` are missing from `isolates`")
  }
  if( any(!susceptibilities$specimen_id %in% isolates$specimen_id) ) {
    stop("Some `specimen_id` in `susceptibility` are missing from `isolates`")
  }
  if( any(!susceptibilities$specimen_id %in% specimens$specimen_id) ) {
    stop("Some `specimen_id` in `susceptibility` are missing from `specimens`")
  }
  if( any(!isolates$specimen_id %in% specimens$specimen_id) ) {
    stop("Some `specimen_id` in `isolates` are missing from `specimens`")
  }
  
  stopifnot(is(specimens$specimen_datetime, "POSIXt") | is(specimens$specimen_datetime, "Date"))
  stopifnot(is(isolates$isolation_datetime, "POSIXt") | is(isolates$isolation_datetime, "Date"))
  
  invalid_organism_codes <- na.omit(unique(c(isolates$organism_code,
                                     susceptibilities$organism_code)))
  invalid_organism_codes <- invalid_organism_codes[
    !invalid_organism_codes %in% AMR::microorganisms$mo
  ]
  if( length(invalid_organism_codes) > 0 ) {
    stop(paste(
      "Some `organism_code` values are invalid:",
      paste(utils::head(invalid_organism_codes), collapse = ", "),
      collapse = "\n"
    ))
  }
  
  invalid_drug_codes <- na.omit(unique(susceptibilities$agent_code))
  invalid_drug_codes <- invalid_drug_codes[
    !invalid_drug_codes %in% AMR::antibiotics$ab
  ]
  if( length(invalid_drug_codes) > 0 ) {
    stop(paste(
      "Some `organism_code` values are invalid:",
      paste(utils::head(invalid_drug_codes), collapse = ", "),
      collapse = "\n"
    ))
  }
  
  TRUE
}


#' Validate records of observations & investigations
#'
#' @param investigations a data frame
#' @param custom_units a character vector of valid unit codes not listed in
#' the UCUM. Default is: \code{c("breaths", "beats", "U")}.
#' @return \code{TRUE} if the validation is passed
#' @section Mandatory variables:
#' The following variables are required:
#' \describe{
#'    \item{\code{"observation_id"}}{a unique identifier with no missing value}
#'    \item{\code{"patient_id"}}{a patient identifier with no missing value}
#'    \item{\code{"status"}}{Codes from the following value set 
#'    \url{http://hl7.org/fhir/observation-status/} \itemize{
#'   \item \code{"registered"}: The existence of the observation is registered, but there is no result yet available.
#'   \item \code{"preliminary"}: his is an initial or interim observation: data may be incomplete or unverified.
#'   \item \code{"final"}: The observation is complete and there are no further actions needed. 
#'   \item \code{"amended"}: Subsequent to being Final, the observation has been modified 
#'   subsequent. This includes updates/new information and corrections.
#'   \item \code{"corrected"}: Subsequent to being Final, the observation has been modified to 
#'   correct an error in the test result.
#'   \item \code{"cancelled"}: The observation is unavailable because the 
#'   measurement was not started or not completed.
#'   \item \code{"entered-in-error"}: The observation has been withdrawn following previous 
#'   final release. This electronic record should never have existed, though it is possible 
#'   that real-world decisions were based on it. (If real-world activity has occurred, 
#'   the status should be "cancelled" rather than "entered-in-error".).
#'   \item \code{"unknown"}: The authoring/source system does not know which of the status 
#'   values currently applies for this observation. Note: This concept is not to be used for 
#'   "other" - one of the listed statuses is presumed to apply, but the authoring/source 
#'   system does not know which.
#'   }}
#'    \item{\code{"request_datetime"}}{a datetime when the observation was 
#'    requested with no missing value}
#'    \item{\code{"observation_datetime"}}{a datetime when the investigation 
#'    was performed with no missing value}
#'    \item{\code{"observation_code_system"}}{URL of the code system (for instance: 
#'    \code{"http://snomed.info/sct"}, \code{"http://loinc.org"})}
#'    \item{\code{"observation_code"}}{LOINC concept code or SNOMED-CT concept 
#'    code corresponding to a SNOMED CT observable entity or evaluation procedure}
#'    \item{\code{"observation_name"}}{code system name for the observation}
#'    \item{\code{"observation_display"}}{observation name to display}
#'    \item{\code{"observation_value_text"}}{observation string value or codable
#'    concept, for example: \code{"TRUE"}/\code{"FALSE"}, 
#'    \code{"Yes"}/\code{"No"}, SNOMED CT qualifier value}
#'    \item{\code{"observation_value_numeric"}}{observation numeric value}
#'    \item{\code{"observation_unit"}}{a unit code passing 
#'    \code{\link[units]{as_units}()}. See examples. All observations with the
#'    same \code{"observation_code"} must be converted to the same 
#'    \code{"observation_unit"}. See also: \code{\link[units]{valid_udunits}}, 
#'    \code{\link[units]{install_unit}}
#'    \url{https://ucum.org/}}
#' }
#' @export 
#' @examples 
#' # the units "breaths/min" (http://loinc.org/8867-4) or
#' # "beats/min" () do not exist in the https://ucum.org/. 
#' library(units)
#' \dontrun{as_units("breaths/min")} # fails
#' 
#' # Yet, they may be declared.
#' install_unit("breaths") 
#' as_units("breaths/min") # succeeds
validate_investigations <- function(investigations, 
                                    custom_units = c("breaths",
                                                     "beats",
                                                     "U")) {
  
  investigation_schema <- .inpatient_investigations_variables()
  
  variable_exists <- investigation_schema[
    investigation_schema[["must_exist"]],
    "variable_name"]

  not_exist <- !vapply(variable_exists, exists, where = investigations,
                       FUN.VALUE = logical(1))
  if( any(not_exist) ){
    stop(
      simpleError(paste(
        "The following variables must exist:",
        paste(paste0("`", variable_exists[not_exist], "`"), collapse = ", "))
      )
    )
  }
  
  exists_non_missing <- investigation_schema[
    investigation_schema$must_be_nonmissing,
    "variable_name"]
  missing_data <- suppressWarnings(
    !vapply(exists_non_missing, 
            FUN = .validate_variable_no_missing,
            data = investigations,
            FUN.VALUE = logical(1))
  )
  if( any(missing_data) ){
    stop(
      paste(
        "The following variables must not contain missing data:",
        paste(paste0("`", 
                     exists_non_missing[missing_data],
                     "`"), collapse = ", ")
        ))
  }
  
  must_be_unique <- .validate_values_unique(
    investigations,
    investigation_schema[
    investigation_schema$must_be_unique, 
    "variable_name"])
  
  if( !is.null(custom_units) ){
    custom_units <- custom_units[custom_units != ""]
    for (unit in custom_units) {
      silent <- tryCatch(
        expr = {
          units::as_units(unit)
        },
        error = function(e) {
          units::install_unit(unit)
        }
      )
    }
  }
  
  units_validate <- .validate_UCUM_codes(unique(investigations$observation_unit))
  
  units_mixed <- investigations %>% 
    dplyr::group_by(.data$observation_code_system, .data$observation_code) %>% 
    dplyr::summarise(n = dplyr::n_distinct(.data$observation_unit)) %>% 
    dplyr::filter(.data$n > 1)
  
  if( nrow(units_mixed) > 0 ) {
    stop(
      "Every `observation_code` must only be associated with one `observation_unit`.\n",
      "Please convert these observations to a single unit: \n\n ",
      paste0(utils::capture.output(data.frame(units_mixed[, 1:2])), collapse = "\n"), 
      call. = FALSE
    )
  }
  
  multiple_names <- investigations %>% 
    dplyr::group_by(.data$observation_code_system, .data$observation_code) %>% 
    dplyr::summarise(n_name = dplyr::n_distinct(.data$observation_name),
                     n_display = dplyr::n_distinct(.data$observation_display)) %>% 
    dplyr::filter(.data$n_display > 1 | .data$n_name > 1)
  
  if( nrow(multiple_names) > 0 ) {
    stop(
      "Every `observation_code` must only be associated with ",
      "one `observation_name` and `observation_display` only.\n",
      "Please unify observation names and display names for the following codes: \n\n ",
      paste0(utils::capture.output(data.frame(multiple_names[, 1:2])), collapse = "\n"), 
      call. = FALSE
    )
  }
  
  all(!not_exist, !missing_data, must_be_unique, units_validate)
}

arrange_variables <- function(data, first_column_names) {
  other_names <- colnames(data)[!colnames(data) %in% first_column_names]
  dplyr::select(data, dplyr::all_of(first_column_names), dplyr::all_of(other_names))
}
ramses-antibiotics/ramses-package documentation built on Feb. 13, 2024, 1:01 p.m.