R/int_all_datastructure_segment.R

Defines functions int_all_datastructure_segment

Documented in int_all_datastructure_segment

#' Wrapper function to check for segment data structure
#'
#' @description
#' This function tests for unexpected elements and records, as well as duplicated identifiers and content.
#' The unexpected element record check can be conducted by providing the number of expected records or
#' an additional table with the expected records.
#' It is possible to conduct the checks by study segments or to consider only selected
#' segments.
#'
#' [Indicator]
#'
#' @inheritParams .template_function_indicator
#'
#' @param meta_data_segment [data.frame] the data frame that contains the metadata for the segment level, mandatory
#' @param segment_level [data.frame] alias for `meta_data_segment`
#'
#' @return a [list] with
#'   - `SegmentTable`: data frame with selected check results, used for the data quality report.
#'
#' @export
#'
#' @examples
#' \dontrun{
#' out_segment <- int_all_datastructure_segment(
#'   meta_data_segment = "meta_data_segment",
#'   study_data = "ship",
#'   meta_data = "ship_meta"
#' )
#'
#' study_data <- cars
#' meta_data <- dataquieR::prep_create_meta(VAR_NAMES = c("speedx", "distx"),
#'   DATA_TYPE = c("integer", "integer"), MISSING_LIST = "|", JUMP_LIST = "|",
#'   STUDY_SEGMENT = c("Intro", "Ex"))
#'
#' out_segment <- int_all_datastructure_segment(
#'   meta_data_segment = "meta_data_segment",
#'   study_data = study_data,
#'   meta_data = meta_data
#' )
#' }
int_all_datastructure_segment <- function(study_data,
                                          label_col,
                                          item_level = "item_level",
                                          meta_data = item_level,
                                          meta_data_v2,
                                          segment_level,
                                          meta_data_segment = "segment_level") {

  # Preps and checks ----
  util_maybe_load_meta_data_v2()
  util_ck_arg_aliases()

  # map metadata to study data
  prep_prepare_dataframes(.allow_empty = TRUE)
  if (!(STUDY_SEGMENT %in% colnames(meta_data))) {
    meta_data[[STUDY_SEGMENT]] <- "ALL" # TODO: Warn
  }
  meta_data_segment <- prep_check_meta_data_segment(meta_data_segment)

  id_vars_list <- lapply(setNames(meta_data_segment[[SEGMENT_ID_VARS]], # TODO: use the constants everywhere: meta_data_segment[[SEGMENT_ID_VARS]], not ...$SEGMENT_ID_VARS
                                  nm = meta_data_segment[[STUDY_SEGMENT]]),
                         util_parse_assignments,
                         multi_variate_text = TRUE
  )
  id_vars_list_vector <- lapply(id_vars_list, unlist, recursive = TRUE)

  id_vars_list_vector <- lapply(id_vars_list_vector,
                                util_map_labels,
                                meta_data = meta_data,
                                to = label_col)

  # 1. Unexpected data record count ----

  # subset metadata with entries
  meta_data_record_count_0 <-
    meta_data_segment[!util_empty(meta_data_segment[[SEGMENT_RECORD_COUNT]]), ,
                      drop = FALSE]

  unexp_records_out <- NULL # TODO: capture errors form the next all and put them to the matrices
  try(silent = TRUE, {
    unexp_records_out <- withr::with_options(list(
      dataquieR.testdebug = TRUE), int_unexp_records_segment(
        study_segment = meta_data_record_count_0[[STUDY_SEGMENT]],
        data_record_count = meta_data_record_count_0[[SEGMENT_RECORD_COUNT]],
        study_data = study_data, meta_data = meta_data, label_col = label_col))
  })


  # 2. Unexpected data record set ----
  # subset metadata with entries
  meta_data_record_set_1 <-
    meta_data_segment[!util_empty(meta_data_segment[[SEGMENT_RECORD_CHECK]]), ,
                      drop = FALSE
    ]

  unexp_records_id_out <- NULL # TODO: capture errors form the next all and put them to the matrices
  try(silent = TRUE, {
    unexp_records_id_out <- util_int_unexp_records_set_segment(
      id_vars_list =
        id_vars_list_vector[meta_data_record_set_1[[STUDY_SEGMENT]]],
      identifier_name_list = meta_data_record_set_1[[STUDY_SEGMENT]],
      valid_id_table_list = meta_data_record_set_1[[SEGMENT_ID_REF_TABLE]],
      meta_data_record_check_list =
        meta_data_record_set_1[[SEGMENT_RECORD_CHECK]],
      study_data = study_data,
      label_col = label_col,
      meta_data = meta_data)
  })


  # 3. Duplicates: ids ----
  meta_data_dup_ids_1 <-
    meta_data_segment[!util_empty(meta_data_segment[[SEGMENT_ID_VARS]]), ,
                      drop = FALSE
    ]

  duplicate_ids_out <- NULL # TODO: capture errors form the next all and put them to the matrices
  try(silent = TRUE, {

    duplicate_ids_out <- withr::with_options(list(
      dataquieR.testdebug = TRUE), int_duplicate_ids(
        level = "segment",
        id_vars_list = id_vars_list_vector[meta_data_dup_ids_1[[STUDY_SEGMENT]]],
        study_segment = meta_data_dup_ids_1[[STUDY_SEGMENT]],
        repetitions = meta_data_dup_ids_1[[SEGMENT_UNIQUE_ID]],
        study_data = study_data,
        meta_data = meta_data,
        label_col = label_col
      ))})


  # 4. Duplicates: content ----
  meta_data_dup_rows_1 <-
    meta_data_segment[!util_empty(meta_data_segment[[SEGMENT_UNIQUE_ROWS]]), ,
                      drop = FALSE
    ]

  meta_data_dup_rows_1 <-
    meta_data_dup_rows_1[
      trimws(tolower(meta_data_dup_rows_1[[SEGMENT_UNIQUE_ROWS]])) ==
        "no_id" |
        !util_is_na_0_empty_or_false(
          meta_data_dup_rows_1[[SEGMENT_UNIQUE_ROWS]]), ,
      drop = FALSE
    ]

  duplicate_rows_out <- NULL # TODO: capture errors form the next all and put them to the matrices
  try(silent = TRUE, {
    duplicate_rows_out <- withr::with_options(list(
      dataquieR.testdebug = TRUE), int_duplicate_content(
        level = "segment",
        identifier_name_list = meta_data_dup_rows_1[[STUDY_SEGMENT]],
        study_data = study_data,
        meta_data = meta_data,
        label_col = label_col,
        id_vars_list = id_vars_list_vector[meta_data_dup_rows_1[[STUDY_SEGMENT]]],
        unique_rows = setNames(meta_data_dup_rows_1[[SEGMENT_UNIQUE_ROWS]],
                               nm = meta_data_dup_rows_1[[STUDY_SEGMENT]])
      ))
  })

  # X. Unexpected data element set ----

  out_int_sts_element <- NULL # TODO: capture errors form the next all and put them to the matrices
  try(silent = TRUE, {
    out_int_sts_element <-
      withr::with_options(list(
        dataquieR.testdebug = TRUE), int_sts_element_segment(study_data = study_data,
                                                             label_col = label_col,
                                                             meta_data = meta_data)$SegmentTable)

    out_int_sts_element$GRADING <-
      ifelse(out_int_sts_element$NUM_int_sts_element == 0, 0, 1)
    rownames(out_int_sts_element) <- NULL
  })

  # Output ----
  result <- list(
    int_sts_countre = unexp_records_out$SegmentTable,
    int_sts_setrc = unexp_records_id_out$SegmentTable,
    int_sts_dupl_ids = duplicate_ids_out$SegmentTable,
    int_sts_dupl_content = duplicate_rows_out$SegmentTable,
    int_sts_element = out_int_sts_element
  )

  result <- result[vapply(result, FUN.VALUE = logical(1), # TODO: Why remove empty seg-dfs, and why not in int_all_datastructure_dataframe?!
                          FUN = function(df) {
                            !!prod(dim(df))
                          })]

  out_int_sts_elementData <- out_int_sts_element
  # (both already keyed by "Segment"; no DF_NAME swap needed here)

  # Translate indicator metric-like columns via helper; keep unknowns (e.g., "Segment") unchanged
  cn <- colnames(out_int_sts_elementData)
  if (length(cn) > 0) {
    colnames(out_int_sts_elementData) <- util_translate_indicator_metrics(
      cn,
      short = FALSE,
      long  = TRUE,
      ignore_unknown = TRUE
    )
  }

  resultData <- list(
    int_sts_countre = unexp_records_out$SegmentData,
    int_sts_setrc = unexp_records_id_out$SegmentData,
    int_sts_dupl_ids = duplicate_ids_out$SegmentData,
    int_sts_dupl_content = duplicate_rows_out$SegmentData,
    int_sts_element = out_int_sts_elementData
  )

  for (n in names(resultData)) {
    rownames(resultData[[n]]) <- NULL
  }

  dqi <- util_get_concept_info("dqi")
  dqi <- dqi[!util_empty(dqi$abbreviation) & !util_empty(dqi$Name), , FALSE]

  names(resultData) <-
    util_recode(
      names(resultData),
      dqi,
      "abbreviation",
      "Name",
      names(resultData)
    )

  SegmentTable <- util_merge_data_frame_list(result, "Segment") # TODO: why Segment and not STUDY_SEGMENT?!
  cn <- colnames(SegmentTable)
  cn[startsWith(cn, "GRADING.")] <- gsub("^GRADING\\.", "GRADING_",
                                         cn[startsWith(cn, "GRADING.")])
  colnames(SegmentTable) <- cn

  # SegmntData <- util_merge_data_frame_list(resultData, "Segment")

  SegmentData1 <- util_make_data_slot_from_table_slot(SegmentTable)
  if ("resp_vars" %in% colnames(SegmentTable)) {
    SegmentData1$`Unexp. Variables` <- SegmentTable$resp_vars
  }

  # Create the df
  SegmentData <- data.frame(Segment = SegmentData1$Segment)
  # Only if content is present, merge columns
  if(!is.null(SegmentData1$`Unexpected data record count (Number)`)){
    SegmentData$`Unexpected data record count N (%)` <- util_paste0_with_na(SegmentData1$`Unexpected data record count (Number)`, " (",
                                                                            SegmentData1$`Unexpected data record count (Percentage (0 to 100))`, ")")
  }
  SegmentData$`Unexpected data record count (Grading)`<- SegmentData1$`Unexpected data record count (Grading)`

  if(!is.null(SegmentData1$`Unexpected data record set (Number)`)){
    SegmentData$`Unexpected data record set N (%)` <- util_paste0_with_na(SegmentData1$`Unexpected data record set (Number)`, " (",
                                                                          SegmentData1$`Unexpected data record set (Percentage (0 to 100))`, ")")
  }
  SegmentData$`Unexpected data record set (Grading)`<- SegmentData1$`Unexpected data record set (Grading)`

  if(!is.null(SegmentData1$`Duplicates (Number)`)){
    SegmentData$`Duplicates N (%)` <- util_paste0_with_na(SegmentData1$`Duplicates (Number)`, " (",
                                                          SegmentData1$`Duplicates (Percentage (0 to 100))`, ")")
  }
  SegmentData$`Duplicates (Grading)`<- SegmentData1$`Duplicates (Grading)`

  if(!is.null(SegmentData1$`Unexp. Variables`)){
    SegmentData$`Unexp. Variables`<- SegmentData1$`Unexp. Variables`
  }

  if (!is.null(SegmentData1$`Unexpected data element set (Number)`)){
    SegmentData$`Unexpected data element set N (%)` <- util_paste0_with_na(SegmentData1$`Unexpected data element set (Number)`, " (",
                                                                           SegmentData1$`Unexpected data element set (Percentage (0 to 100))`, ")")
  }
  SegmentData$`Unexpected data element set (Grading)`<- SegmentData1$`Unexpected data element set (Grading)`

  rm(SegmentData1)

  return(list(
    SegmentTable = SegmentTable,
    SegmentData = SegmentData,
    SegmentDataList = resultData
  ))
}

Try the dataquieR package in your browser

Any scripts or data that you put into this service are public.

dataquieR documentation built on Jan. 8, 2026, 5:08 p.m.