#' Run all validation checks
#'
#' Runs all validation checks. Requires an environment configuration
#' (config) to be set. The config is expected to have templates for
#' each metadataType, where individual and biospecimen depend on species and
#' assay depends on the assay type. Additionally, there should be
#' complete_columns for each metadataType.
#'
#' @param data A tibble or dataframe with the columns:
#'   name, metadataType, species, assay, file_data, template (optional).
#'   The file_data column should be a list column containing
#'   a dataframe with the file data or `NULL` if the data
#'   does not exist. `data` is expected to have four rows,
#'   one for each metadataType: individual, biospecimen,
#'   assay, manifest. If file_data is `NULL` for a given
#'   metadataType, the metadataType should still be
#'   present. The template column should have the synID or Synapse JSON schema
#'   id for the metadata template. If this is `NULL` or the column is missing,
#'   then the check will not verify that all expected columns are present in
#'   the metadata (i.e. `check_cols` is skipped).
#' @param study A string containing the name of the study (default `NA`). If not
#'   given, will not check the metadata for individuals and specimens currently
#'   in the `samples_table`.
#' @param samples_table Synapse synID for a table containing the columns:
#'   specimenID, individualID, assay, study (default `NA`). If `samples_table` or
#'   `study` not provided, will not check the metadata for individuals and
#'   specimens currently in this table.
#' @inheritParams check_annotation_keys
#' @return A named list with one entry per check; each entry holds whatever the
#'   corresponding check function returned. Entries are `NULL` for checks that
#'   were skipped: the `missing_cols_*` entries when `data` has no `template`
#'   column, the `ages_over_90_*` entries when no row of `data` has species
#'   `"human"`, and the `complete_ids_*` entries when `samples_table` or `study`
#'   is `NA` or `study` is not found in the queried samples table.
#' @export
#' @examples
#' \dontrun{
#' syn <- synapse$Synapse()
#' syn$login()
#'
#' annots <- get_synapse_annotations(syn = syn)
#'
#' data <- tibble::tibble(
#'   metadataType = c(
#'     "manifest",
#'     "individual",
#'     "biospecimen",
#'     "assay"
#'   ),
#'   name = c("a", NA, NA, "c"),
#'   species = "human",
#'   assay = "rnaSeq",
#'   file_data = c(
#'     list(data.frame(a = c(TRUE, FALSE), b = c(1, 3))),
#'     list(NULL),
#'     list(NULL),
#'     list(data.frame(a = c(TRUE, FALSE), b = c(1, 3)))
#'   )
#' )
#' res <- check_all(data, annots, syn)
#' }
check_all <- function(data, annotations, syn, study = NA, samples_table = NA) {
  # Get indices by type
  indices <- get_metadataType_indices(
    data,
    c("individual", "biospecimen", "assay", "manifest")
  )
  # Must have 1 and only 1 index per metadata type
  wrong_count <- vapply(indices, function(idx) length(idx) != 1, logical(1))
  if (length(indices) < 4 || any(wrong_count)) {
    stop("There must be exactly 1 row of each metadata type to check: biospecimen, individual, assay, manifest") # nolint
  }

  # Pull each file's data out of the list column once; a value is NULL when
  # the corresponding file was not provided.
  indiv <- data$file_data[indices$individual][[1]]
  biosp <- data$file_data[indices$biospecimen][[1]]
  assay_meta <- data$file_data[indices$assay][[1]]
  manifest <- data$file_data[indices$manifest][[1]]

  # Missing columns ------------------------------------------------------------
  # Only run if the template column exists
  missing_cols_indiv <- missing_cols_biosp <- missing_cols_assay <- missing_cols_manifest <- NULL # nolint
  if ("template" %in% colnames(data)) {
    missing_cols_indiv <- check_cols_individual(
      indiv,
      data$template[indices$individual][[1]],
      syn = syn
    )
    missing_cols_biosp <- check_cols_biospecimen(
      biosp,
      data$template[indices$biospecimen][[1]],
      syn = syn
    )
    missing_cols_assay <- check_cols_assay(
      assay_meta,
      data$template[indices$assay][[1]],
      syn = syn
    )
    missing_cols_manifest <- check_cols_manifest(
      manifest,
      data$template[indices$manifest][[1]],
      syn = syn
    )
  }

  # Individual and specimen IDs match ------------------------------------------
  individual_ids_indiv_biosp <- check_indiv_ids_match(
    indiv,
    biosp,
    "individual",
    "biospecimen",
    bidirectional = FALSE
  )
  individual_ids_indiv_manifest <- check_indiv_ids_match(
    indiv,
    manifest,
    "individual",
    "manifest",
    bidirectional = FALSE
  )
  specimen_ids_biosp_assay <- check_specimen_ids_match(
    biosp,
    assay_meta,
    "biospecimen",
    "assay",
    bidirectional = FALSE
  )
  specimen_ids_biosp_manifest <- check_specimen_ids_match(
    biosp,
    manifest,
    "biospecimen",
    "manifest",
    bidirectional = FALSE
  )

  # Annotation keys in manifest are valid --------------------------------------
  annotation_keys_manifest <- check_annotation_keys(
    manifest,
    annotations,
    allowlist_keys = c("path", "parent", "name", "used", "executed"),
    success_msg = "All keys (column names) in the manifest are valid",
    fail_msg = "Some keys (column names) in the manifest are invalid",
    annots_link = get_golem_config("annotations_link")
  )

  # Annotation values in manifest and metadata are valid -----------------------
  annotation_values_manifest <- check_annotation_values(
    manifest,
    annotations,
    success_msg = "All values in the manifest are valid",
    fail_msg = "Some values in the manifest are invalid",
    annots_link = get_golem_config("annotations_link")
  )
  annotation_values_indiv <- check_annotation_values(
    indiv,
    annotations,
    allowlist_keys = c("individualID"),
    success_msg = "All values in the individual metadata are valid",
    fail_msg = "Some values in the individual metadata are invalid",
    annots_link = get_golem_config("annotations_link")
  )
  annotation_values_biosp <- check_annotation_values(
    biosp,
    annotations,
    allowlist_keys = c("specimenID", "individualID"),
    success_msg = "All values in the biospecimen metadata are valid",
    fail_msg = "Some values in the biospecimen metadata are invalid",
    annots_link = get_golem_config("annotations_link")
  )
  annotation_values_assay <- check_annotation_values(
    assay_meta,
    annotations,
    allowlist_keys = c("specimenID"),
    success_msg = "All values in the assay metadata are valid",
    fail_msg = "Some values in the assay metadata are invalid",
    annots_link = get_golem_config("annotations_link")
  )

  # Individual and specimen IDs are not duplicated -----------------------------
  duplicate_indiv_ids <- check_indiv_ids_dup(
    indiv,
    success_msg = "Individual IDs in the individual metadata file are unique",
    fail_msg = "Duplicate individual IDs found in the individual metadata file"
  )
  duplicate_specimen_ids <- check_specimen_ids_dup(
    biosp,
    success_msg = "Specimen IDs in the biospecimen metadata file are unique",
    fail_msg = "Duplicate specimen IDs found in the biospecimen metadata file"
  )

  # Empty columns produce warnings ---------------------------------------------
  empty_cols_manifest <- check_cols_empty(
    manifest,
    required_cols = get_golem_config("complete_columns")$manifest,
    success_msg = "No columns are empty in the manifest",
    fail_msg = "Some columns are completely empty in the manifest"
  )
  empty_cols_indiv <- check_cols_empty(
    indiv,
    required_cols = get_golem_config("complete_columns")$individual,
    success_msg = "No columns are empty in the individual metadata",
    fail_msg = "Some columns are completely empty in the individual metadata"
  )
  empty_cols_biosp <- check_cols_empty(
    biosp,
    required_cols = get_golem_config("complete_columns")$biospecimen,
    success_msg = "No columns are empty in the biospecimen metadata",
    fail_msg = "Some columns are completely empty in the biospecimen metadata"
  )
  empty_cols_assay <- check_cols_empty(
    assay_meta,
    required_cols = get_golem_config("complete_columns")$assay,
    success_msg = "No columns are empty in the assay metadata",
    fail_msg = "Some columns are completely empty in the assay metadata"
  )

  # Incomplete required columns produce failures -------------------------------
  complete_cols_manifest <- check_cols_complete(
    manifest,
    required_cols = get_golem_config("complete_columns")$manifest,
    success_msg = "There is no missing data in columns that are required to be complete in the manifest", # nolint
    fail_msg = "There is missing data in some columns that are required to be complete in the manifest" # nolint
  )
  complete_cols_indiv <- check_cols_complete(
    indiv,
    required_cols = get_golem_config("complete_columns")$individual,
    success_msg = "There is no missing data in columns that are required to be complete in the individual metadata", # nolint
    fail_msg = "There is missing data in some columns that are required to be complete in the individual metadata" # nolint
  )
  complete_cols_biosp <- check_cols_complete(
    biosp,
    required_cols = get_golem_config("complete_columns")$biospecimen,
    success_msg = "There is no missing data in columns that are required to be complete in the biospecimen metadata", # nolint
    fail_msg = "There is missing data in some columns that are required to be complete in the biospecimen metadata" # nolint
  )
  complete_cols_assay <- check_cols_complete(
    assay_meta,
    required_cols = get_golem_config("complete_columns")$assay,
    success_msg = "There is no missing data in columns that are required to be complete in the assay metadata", # nolint
    fail_msg = "There is missing data in some columns that are required to be complete in the assay metadata" # nolint
  )

  # Metadata files appear in manifest ------------------------------------------
  meta_files_in_manifest <- check_files_manifest(
    manifest,
    c(
      data$name[indices$individual],
      data$name[indices$biospecimen],
      data$name[indices$assay]
    ),
    success_msg = "Manifest file contains all metadata files",
    fail_msg = "Manifest file does not contain all metadata files"
  )

  # Parent column in manifest is valid synID -----------------------------------
  valid_parent_syn <- check_parent_syn(manifest)

  # Ages over 90 are censored in human individual metadata ---------------------
  # Skipped (left NULL) unless at least one row of `data` is for humans.
  ages_over_90_indiv <- ages_over_90_biosp <- NULL
  if (any(data$species == "human", na.rm = TRUE)) {
    ages_over_90_indiv <- check_ages_over_90(
      indiv,
      success_msg = "No ages over 90 detected in the individual metadata",
      fail_msg = "Ages over 90 detected in the individual metadata"
    )
    ages_over_90_biosp <- check_ages_over_90(
      biosp,
      col = "samplingAge",
      success_msg = "No ages over 90 detected in the biospecimen metadata",
      fail_msg = "Ages over 90 detected in the biospecimen metadata"
    )
  }

  # No file paths are duplicated in the manifest -------------------------------
  duplicate_file_paths <- check_duplicate_paths(manifest)

  # Additions to existing studies have complete IDs ----------------------------
  ## Only do if samples_table and study provided
  complete_ids_indiv <- complete_ids_biosp <- complete_ids_assay <- NULL
  if (!is.na(samples_table) && !is.na(study)) {
    # NOTE(review): `study` and `samples_table` are interpolated into the query
    # text as-is; a study name containing a single quote would break the
    # query. Confirm the escaping rules of the query language before relying
    # on user-supplied values here.
    samples <- syn$tableQuery(
      glue::glue("SELECT * FROM {samples_table} WHERE study = '{study}'"),
      includeRowIdAndRowVersion = FALSE
    )
    samples <- readr::read_csv(samples$filepath)
    ## Check if the study is in the table before continuing
    if (study %in% samples$study) {
      assay_type <- data[indices$assay, "assay", drop = TRUE]
      complete_ids_indiv <- check_complete_ids(
        indiv,
        samples_table = samples,
        study = study,
        id_type = "individualID",
        success_msg = "All pre-existing individual IDs are present in the individual file", # nolint
        fail_msg = "Some individual IDs that were previously part of this study are missing from the individual file" # nolint
      )
      complete_ids_biosp <- check_complete_ids(
        biosp,
        samples_table = samples,
        study = study,
        id_type = "specimenID",
        success_msg = "All pre-existing specimen IDs are present in the biospecimen file", # nolint
        fail_msg = "Some specimen IDs that were previously part of this study are missing from the biospecimen file" # nolint
      )
      complete_ids_assay <- check_complete_ids(
        assay_meta,
        samples_table = samples,
        study = study,
        id_type = "specimenID",
        assay = assay_type,
        success_msg = "All pre-existing specimen IDs for this assay are present in the assay file", # nolint
        fail_msg = "Some specimen IDs that were previously part of this study and assay are missing from the assay file" # nolint
      )
    }
  }

  ## List results (skipped checks stay in the list as NULL entries)
  list(
    missing_cols_indiv = missing_cols_indiv,
    missing_cols_biosp = missing_cols_biosp,
    missing_cols_assay = missing_cols_assay,
    missing_cols_manifest = missing_cols_manifest,
    individual_ids_indiv_biosp = individual_ids_indiv_biosp,
    individual_ids_indiv_manifest = individual_ids_indiv_manifest,
    specimen_ids_biosp_assay = specimen_ids_biosp_assay,
    specimen_ids_biosp_manifest = specimen_ids_biosp_manifest,
    annotation_keys_manifest = annotation_keys_manifest,
    annotation_values_manifest = annotation_values_manifest,
    annotation_values_indiv = annotation_values_indiv,
    annotation_values_biosp = annotation_values_biosp,
    annotation_values_assay = annotation_values_assay,
    duplicate_indiv_ids = duplicate_indiv_ids,
    duplicate_specimen_ids = duplicate_specimen_ids,
    empty_cols_manifest = empty_cols_manifest,
    empty_cols_indiv = empty_cols_indiv,
    empty_cols_biosp = empty_cols_biosp,
    empty_cols_assay = empty_cols_assay,
    complete_cols_manifest = complete_cols_manifest,
    complete_cols_indiv = complete_cols_indiv,
    complete_cols_biosp = complete_cols_biosp,
    complete_cols_assay = complete_cols_assay,
    meta_files_in_manifest = meta_files_in_manifest,
    valid_parent_syn = valid_parent_syn,
    ages_over_90_indiv = ages_over_90_indiv,
    ages_over_90_biosp = ages_over_90_biosp,
    duplicate_file_paths = duplicate_file_paths,
    complete_ids_indiv = complete_ids_indiv,
    complete_ids_biosp = complete_ids_biosp,
    complete_ids_assay = complete_ids_assay
  )
}
# Run the invalid-character check on every metadata file.
#
# Each argument is handed unchanged to `check_invalid_characters()` together
# with the success/failure messages for that file. The four results are
# returned as a named list, in the order manifest, individual, biospecimen,
# assay.
check_all_invalid_char <- function(manifest, indiv, biosp, assay) {
  # Invalid characters ---------------------------------------------------------
  # list() evaluates its arguments in order, so the checks run manifest first
  # and assay last.
  list(
    invalid_characters_manifest = check_invalid_characters(
      manifest,
      success_msg = "There are no invalid characters in the manifest",
      fail_msg = "There are invalid characters in the manifest columns"
    ),
    invalid_characters_individual = check_invalid_characters(
      indiv,
      success_msg = "There are no invalid characters in the individual metadata",
      fail_msg = "There are invalid characters in the individual metadata columns"
    ),
    invalid_characters_biospecimen = check_invalid_characters(
      biosp,
      success_msg = "There are no invalid characters in the biospecimen metadata",
      fail_msg = "There are invalid characters in the biospecimen metadata columns" # nolint
    ),
    invalid_characters_assay = check_invalid_characters(
      assay,
      success_msg = "There are no invalid characters in the assay metadata",
      fail_msg = "There are invalid characters in the assay metadata columns"
    )
  )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.