R/unPackingChecks.R

Defines functions checkDupeRows checkToolConnections checkToolComments checkToolStructure

Documented in checkDupeRows checkToolComments checkToolConnections checkToolStructure

#' Perform various data quality checks on in-process data from Data Packs.
#'
#' @description
#' A series of functions to check and validate quality & integrity
#' of data encountered in various `unPack...` functions across `datapackr`. Note
#' that these functions do not attempt to correct issues identified, but only to
#' identify them.
#'
#' `checkToolStructure` checks structural integrity of sheets for submitted tool.
#'
#' `checkToolComments` searches Data Pack for any comments that cause
#' corruption when executing openxlsx::saveWorkbook.
#'
#' `checkToolConnections` detects the presence of any external links in a Tool.
#'
#' `checkToolEmptySheets` detects whether a sheet is essentially empty (no data
#'   in rows, or no data in row 14 column headers).
#'
#' `checkDupeRows` checks for any rows with duplicates across PSNU and other key
#'    disaggregates.
#'
#' `checkColumnStructure` checks structural integrity of columns on critical sheets for
#'    a submitted Data Pack.
#'
#' `checkNonNumeric` alerts to non-numeric values instead of valid data.
#'
#' `checkMissingMetadata` alerts to missing Age, Sex, or KeyPop.
#'
#' `checkNegativeValues` alerts to negative values.
#'
#' `checkDecimalValues` alerts to decimal values where these are unallowed.
#'
#' `checkInvalidOrgUnits` alerts to invalid org units.
#'
#' `checkInvalidPrioritizations` alerts to invalid Prioritizations.
#'
#' `checkFormulas` checks formulas in a specified sheet in a submitted Data Pack
#' to make sure they are up to date and have not been tampered with.
#'
#' `checkDisaggs` alerts to invalid disaggs (Age, Sex, KeyPop).
#'
#' Some functions (`checkToolStructure`, `checkToolComments`, &
#' `checkToolConnections`) are designed to check across entire tools at
#' once. All others are specific to single sheet.
#'
#' @name unPackDataChecks
#' @md
#'
#' @param d DataPack object loaded via `loadDataPack`.
#' @param sheet String. Name of DataPack sheet to check data from. Default is
#'   first sheet.
#' @param quiet Logical. Should warning messages be printed? Default is TRUE.
#'
#' @return A DataPack object, with updated tests and warnings.
#'
NULL


#' @export
#' @rdname unPackDataChecks
#'
checkToolStructure <- function(d, quiet = TRUE) {

  interactive_print("Checking structure...")

  if (!quiet) {
    messages <- MessageQueue()
  }

  interactive_print("Checking for any missing tabs...")

  submission_sheets <- readxl::excel_sheets(d$keychain$submission_path)
  schema_sheets <- unique(d$info$schema$sheet_name)
  #TODO: Why is Spectrum part of the PSNUxIM schema?
  if (d$info$tool == "PSNUxIM") {
    schema_sheets <- schema_sheets[schema_sheets != "Spectrum"]
  }

  missing_sheets <- schema_sheets[!schema_sheets %in% submission_sheets]

  if (length(missing_sheets) > 0) {

    lvl <- "WARNING"

    msg <-
      paste0(
        lvl, "! MISSING SHEETS: Please ensure no original sheets have",
        " been deleted or renamed in your Data Pack. -> \n  * ",
        paste0(missing_sheets, collapse = "\n  * "),
        "\n")

    d$tests$missing_sheets <- data.frame(sheet_name = missing_sheets)
    attr(d$tests$missing_sheets, "test_name") <- "Missing sheets"
    d$info$messages <- appendMessage(d$info$messages, msg, lvl)

    if (!quiet) {
      messages <- appendMessage(messages, msg, lvl)
    }
  }

  if (!quiet) {
    printMessages(messages)
  }

  return(d)

}



#' @export
#' @rdname unPackDataChecks
#'
checkToolComments <- function(d, quiet = TRUE) {

  interactive_print("Checking comments...")

  if (!quiet) {
    messages <- MessageQueue()
  }

  # if (is.null(d$tool$wb)) {
  #   wb <- openxlsx::loadWorkbook(file = d$keychain$submission_path)
  # } else {
  #   wb <- d$tool$wb
  # }

  if (is.null(d$info$workbook_contents)) {
    d <- listWorkbookContents(d)
  }

  d$info$has_comments_issue <- any(grepl("xl/threadedComments/", d$info$workbook_contents))

  # d$info$has_comments_issue <- any(sapply(wb$threadComments, length) != 0)

  if (d$info$has_comments_issue) {

    lvl <- "ERROR"

    msg <-
      paste0(
        lvl, "! Your workbook contains at least one case of a new type of comment
        introduced in Office 365 called a 'Threaded Comment'. This type of comment,
        as opposed to the previous type of Notes used in Microsoft Excel, causes
        corruption issues when this app attempts to update your PSNUxIM tab.
        Prior to submitting for an updated PSNUxIM tab, you MUST remove all
        threaded comments. For more information about the differences between
        threaded comments and notes,",
        "see: https://support.office.com/en-us/article/the-difference-between-threaded-comments-and-notes-75a51eec-4092-42ab-abf8-7669077b7be3", # nolint
        "\n")

    d$info$messages <- appendMessage(d$info$messages, msg, lvl)
    d$info$has_error <- TRUE

    if (!quiet) {
      messages <- appendMessage(messages, msg, lvl)
    }

  }

  if (!quiet) {
    printMessages(messages)
  }

  return(d)

}


#' @export
#' @rdname unPackDataChecks
#'
checkToolConnections <- function(d, quiet = TRUE) {

  interactive_print("Checking external links...")

  if (!quiet) {
    messages <- MessageQueue()
  }

  if (is.null(d$info$workbook_contents)) {
    d <- listWorkbookContents(d)
  }

  d$info$has_external_links <-
    any(grepl("xl/externalLinks/externalLink\\d+\\.xml", d$info$workbook_contents))

  if (d$info$has_external_links) {

    lvl <- "WARNING"

    msg <-
      paste0(
        lvl, "! Your workbook contains at least one external link. ",
        "This usually results from copying and pasting from another workbook. ",
        "Please find and remove the external links in your DataPack. ",
        "This error may result in other validation checks failing to run properly ",
        "and should be fixed immediately.",
        "\n")

    d$info$messages <- appendMessage(d$info$messages, msg, lvl)

    if (!quiet) {
      messages <- appendMessage(messages, msg, lvl)
    }

  }

  if (!quiet) {
    printMessages(messages)
  }

  d

}


#' @export
#' @rdname unPackDataChecks
checkDupeRows <- function(sheets, d, quiet = TRUE) {

  ch <- list(result = NULL,
            msg = NULL,
            lvl = NULL,
            has_error = FALSE)

  #This test requires index columns. If they are not there, drop the sheets
  #If they are missing index columns, we are not going to
  #Attempt to process them
  if (!is.null(d$tests$missing_index_columns)) {
    sheets <- sheets[!(sheets %in% d$tests$missing_index_columns$sheet_name)]
  }

  if (length(sheets) == 0) {
    return(ch)
  }

  # Get header_cols
  header_cols <- purrr::map(sheets, function(x) {
    d$info$schema %>%
      dplyr::filter(
        sheet_name %in% x,
        col_type == "row_header",
        !indicator_code %in% c("SNU1", "ID")) %>%
      dplyr::pull(indicator_code) %>%
      #c(., "mechCode_supportType") %>% # DP-472
      unique()
  })

  # Duplicates
  dupes <- purrr::map2(d$sheets[sheets], header_cols,
                      function(x, y) {
                        x %>%
                          dplyr::select(tidyselect::all_of(y)) %>%
                          dplyr::filter_all(dplyr::any_vars(!is.na(.))) %>%
                          dplyr::filter(!is.na(PSNU)) %>% # This is caught by checkInvalidOrgUnits
                          dplyr::filter(duplicated(.))
                      }) %>%
    purrr::keep(~ NROW(.x) > 0)


  if (length(dupes) > 0) {

    dupes %<>%
      purrr::map2(., names(.),
                  function(x, y) {
                    x %>%
                      dplyr::arrange(dplyr::across(tidyselect::everything())) %>%
                      tibble::add_column(sheet = y, .before = 1)
                  })

    ch$lvl <- "ERROR"

    ch$msg <-
      purrr::map2(
        dupes, names(dupes),
        function(x, y) {
          paste0(
            ch$lvl, "! In tab ", y,
            ": DUPLICATE ROWS found. Ensure PSNUs or Age, Sex, KeyPop disaggregates",
            " are not repeated within tabs. This issue may have been caused by inadvertent",
            " or incorrect copying of data from one row to another. -> \n\t",
            paste(capture.output(print(as.data.frame(x), row.names = FALSE)),
                  collapse = "\n\t"),
            "\n")
        })

    ch$result <- dplyr::bind_rows(dupes)
    attr(ch$result, "test_name") <- "Duplicated rows"
    ch$has_error <- TRUE

    if (!quiet) {
      messages <- MessageQueue()

      for (i in seq_along(ch$msg)) {
        messages <- appendMessage(messages, ch$msg[[i]], ch$lvl)
      }

      printMessages(messages)
    }
  }

  return(ch)
}


#' @export
#' @rdname unPackDataChecks
checkMissingCols <- function(sheets, d, quiet = TRUE) {

  ch <- list(result = NULL,
            msg = NULL,
            lvl = NULL,
            has_error = FALSE)

  missing_cols <- d$sheets[sheets] %>%
    purrr::map2_dfr(
      .,
      names(.),
      function(x, y) {
        d$info$schema %>%
          dplyr::filter(sheet_name == y,
                        !indicator_code %in% names(x)) %>%
          dplyr::select(sheet_name, indicator_code)
      })

  # if (sheet == "PSNUxIM") { # DP-472
  #   ## Drop all IM cols (left & right sides)
  #   schema_cols %<>%
  #     dplyr::filter(
  #       col_type != "allocation",
  #       !(col_type == "target"
  #         & (indicator_code %in% c("Not PEPFAR", "12345_DSD", ""))))
  #
  #   ## We don't care to track col issues with blank/NA cols in PSNUxIM
  #   submission_cols %<>%
  #     dplyr::filter(!is.na(indicator_code),
  #                   !indicator_code %in% c("")) %>%
  #     ## Standardize mech/support_type names
  #     dplyr::mutate(
  #       indicator_code =
  #         dplyr::case_when(
  #           stringr::str_detect(indicator_code, "\\d+")
  #           ~ paste0(stringr::str_extract(indicator_code, "\\d+"),
  #                    "_",
  #                    stringr::str_extract(indicator_code, "DSD|TA")),
  #           TRUE ~ indicator_code))
  # }

  if (NROW(missing_cols) > 0) {

    ch$lvl <- "ERROR"

    ch$msg <- unique(missing_cols$sheet_name) %>%
      purrr::set_names() %>%
      purrr::map(
        function(x) {
          paste0(
            ch$lvl, "! In tab ", x,
            ", MISSING COLUMNS: Please ensure no columns have been deleted or renamed from",
            " the original Data Pack you have received. ->  \n\t* ",
            paste(missing_cols$indicator_code[missing_cols$sheet_name == x],
                  collapse = "\n\t* "),
            "\n")
        })

    ch$result <- missing_cols
    attr(ch$result, "test_name") <- "Missing columns"
    ch$has_error <- TRUE

    if (!quiet) {
      messages <- MessageQueue()
      for (i in seq_along(ch$msg)) {
        messages <- appendMessage(messages, ch$msg[[i]], ch$lvl)
      }
      printMessages(messages)
    }
  }

  return(ch)

}


#' @export
#' @rdname unPackDataChecks
checkDupeCols <- function(sheets, d, quiet = TRUE) {

  ch <- list(result = NULL,
            msg = NULL,
            lvl = NULL,
            has_error = FALSE)

  dup_cols <- d$sheets[sheets] %>%
    purrr::map2_dfr(
      .,
      names(.),
      function(x, y) {
        tibble::enframe(names(x), name = NULL, value = "indicator_code") %>%
          dplyr::filter(duplicated(.)) %>%
          dplyr::distinct() %>%
          tibble::add_column(sheet = y, .before = 1)
      }) %>%
    dplyr::left_join(
      d$info$schema %>%
        dplyr::filter(sheet_name %in% sheets) %>%
        dplyr::mutate(critical = !is.na(col)) %>%
        dplyr::select(sheet = sheet_name, indicator_code, critical),
      by = c("sheet", "indicator_code"))

  # if (sheet == "PSNUxIM") { # DP-472
  #   ## Drop all IM cols (left & right sides)
  #   schema_cols %<>%
  #     dplyr::filter(
  #       col_type != "allocation",
  #       !(col_type == "target"
  #         & (indicator_code %in% c("Not PEPFAR", "12345_DSD", ""))))
  #
  #   ## We don't care to track col issues with blank/NA cols in PSNUxIM
  #   submission_cols %<>%
  #     dplyr::filter(!is.na(indicator_code),
  #                   !indicator_code %in% c("")) %>%
  #     ## Standardize mech/support_type names
  #     dplyr::mutate(
  #       indicator_code =
  #         dplyr::case_when(
  #           stringr::str_detect(indicator_code, "\\d+")
  #           ~ paste0(stringr::str_extract(indicator_code, "\\d+"),
  #                    "_",
  #                    stringr::str_extract(indicator_code, "DSD|TA")),
  #           TRUE ~ indicator_code))
  # }

  # if (sheet == "PSNUxIM") { # DP-472
  #   last_im_int <- dup_cols %>%
  #     dplyr::filter(critical) %>%
  #     dplyr::mutate(
  #       jump = submission_order - dplyr::lag(submission_order, default = 0)) %>%
  #     dplyr::filter(jump > 1) %>%
  #     dplyr::pull(submission_order) %>%
  #     min() - 1
  #
  #   # first_im_int <- max(dup_cols$template_order, na.rm = T)
  #   # b4_im <- dup_cols$submission_order[which(dup_cols$template_order == first_im_int)]
  #
  #   dup_cols %<>%
  #     dplyr::mutate(
  #       grp =
  #         dplyr::case_when(
  #           critical ~ "Critical",
  #           submission_order < last_im_int ~ "IM Allocations",
  #           TRUE ~ "IM Targets"))
  # }

  if (NROW(dup_cols) > 0) {

    ch$lvl <- "ERROR"

    ch$msg <- unique(dup_cols$sheet) %>%
      purrr::set_names() %>%
      purrr::map(
        function(x) {
          paste0(
            ch$lvl, "! In tab ", x,
            ", DUPLICATE COLUMNS: The following columns appear multiple times. This",
            " must be resolved in your submission, especially for those columns",
            " noted as [Critical!]. ->  \n\t* ",
            paste(
              dup_cols[dup_cols$sheet == x, ] %>%
                dplyr::mutate(
                  msg_col = paste0(indicator_code,
                                   ifelse(!is.na(critical), " [Critical!]", ""))) %>%
                dplyr::pull(msg_col),
              collapse = "\n\t* "),
            "\n")
        })

    ch$result <- dup_cols
    attr(ch$result, "test_name") <- "Duplicate columns"
    ch$has_error <- TRUE

    if (!quiet) {
      messages <- MessageQueue()
      for (i in seq_along(ch$msg)) {
        messages <- appendMessage(messages, ch$msg[[i]], ch$lvl)
      }
      printMessages(messages)
    }
  }

  return(ch)

}

#' @export
#' @rdname unPackDataChecks
checkOutOfOrderCols <- function(sheets, d, quiet = TRUE) {

  ch <- list(result = NULL,
            msg = NULL,
            lvl = NULL,
            has_error = FALSE)

  out_of_order <- d$sheets[names(d$sheets) %in% sheets] %>%
    purrr::map2_dfr(
      .,
      names(.),
      function(x, y) {
        names(x) %>%
          tibble::enframe(name = "submission_order", value = "indicator_code") %>%
          tibble::add_column(sheet = y, .before = 1)
      }) %>%
    dplyr::right_join(
      d$info$schema %>%
        dplyr::filter(sheet_name %in% sheets) %>%
        dplyr::select(indicator_code, sheet = sheet_name, template_order = col),
      by = c("indicator_code", "sheet")) %>%
    dplyr::filter(submission_order != template_order)

  # if (sheet == "PSNUxIM") { # DP-472
  #   ## Drop all IM cols (left & right sides)
  #   schema_cols %<>%
  #     dplyr::filter(
  #       col_type != "allocation",
  #       !(col_type == "target"
  #         & (indicator_code %in% c("Not PEPFAR", "12345_DSD", ""))))
  #
  #   ## We don't care to track col issues with blank/NA cols in PSNUxIM
  #   submission_cols %<>%
  #     dplyr::filter(!is.na(indicator_code),
  #                   !indicator_code %in% c("")) %>%
  #     ## Standardize mech/support_type names
  #     dplyr::mutate(
  #       indicator_code =
  #         dplyr::case_when(
  #           stringr::str_detect(indicator_code, "\\d+")
  #           ~ paste0(stringr::str_extract(indicator_code, "\\d+"),
  #                    "_",
  #                    stringr::str_extract(indicator_code, "DSD|TA")),
  #           TRUE ~ indicator_code))
  # }

  if (NROW(out_of_order) > 0) {

    ch$lvl <- "WARNING"

    ch$msg <- unique(out_of_order$sheet) %>%
      purrr::set_names() %>%
      purrr::map(
        function(x) {
          paste0(
            ch$lvl, "! In tab ", x,
            ", OUT OF ORDER COLUMNS: While it is permitted to rearrange columns",
            " within your Data Pack as needed, this is not encouraged as it may",
            " introduce unintended formula errors. Please review these columns to",
            " ensure their rearrangement has not caused any issues. -> \n\t* ",
            paste(unique(out_of_order$indicator_code[out_of_order$sheet == x]),
                  collapse = "\n\t* "),
            "\n")
        })

    ch$result <- out_of_order
    attr(ch$result, "test_name") <- "Columns out of order"

    if (!quiet) {
      messages <- MessageQueue()
      for (i in seq_along(ch$msg)) {
        messages <- appendMessage(messages, ch$msg[[i]], ch$lvl)
      }
      printMessages(messages)
    }
  }

  # TODO: Add PSNUxIM check for malformed IM/type headers # DP-472
  # TODO: Add PSNUxIM check for making sure IM appears once in both L & R # DP-472

  return(ch)

}


#' @export
#' @rdname unPackDataChecks
checkNonNumeric <- function(sheets, d, quiet = TRUE) {

  ch <- list(result = NULL,
            msg = NULL,
            lvl = NULL,
            has_error = FALSE)

  non_numeric <- unPackDataPackSheet(d,
                                     sheets,
                                     clean_orgs = FALSE,
                                     clean_disaggs = FALSE,
                                     clean_values = FALSE) %>%
    dplyr::filter(
      !(sheet_name == "Prioritization"
        & stringr::str_sub(PSNU, 1, 9) == "_Military")) %>%
    tidyr::drop_na(value)

  non_numeric <-
    non_numeric[which(is.na(suppressWarnings(as.numeric(non_numeric$value)))), ]

  # if (d$info$tool == "OPU Data Pack") { # DP-472
  #   data %<>%
  #     tidyr::gather(key = "mechCode_supportType",
  #                   value = "value",
  #                   -tidyselect::all_of(header_cols$indicator_code)) %>%
  #     dplyr::select(dplyr::all_of(header_cols$indicator_code),
  #                   mechCode_supportType, value) %>%
  #     tidyr::drop_na(value)
  # }

  # if (d$info$tool == "Data Pack" & sheet == "PSNUxIM" & d$info$cop_year %in% c(2021, 2022)) { # DP-472
  #   data %<>%
  #     tidyr::gather(key = "mechCode_supportType",
  #                   value = "value",
  #                   -tidyselect::all_of(c(header_cols$indicator_code))) %>%
  #     dplyr::select(dplyr::all_of(header_cols$indicator_code), -indicator_code,
  #                   indicator_code = mechCode_supportType, value) %>%
  #     tidyr::drop_na(value)
  # }

  if (NROW(non_numeric) > 0) {

    ch$lvl <- "WARNING"

    ch$msg <- unique(non_numeric$sheet_name) %>%
      purrr::set_names() %>%
      purrr::map(
        function(x) {
          paste0(
            ch$lvl, "! In tab ", x,
            ": NON-NUMERIC VALUES found! Please check the following columns for",
            " possible non-numeric values. ->  \n\t* ",
            paste(
              unique(
                non_numeric$indicator_code[non_numeric$sheet_name == x]),
              collapse = "\n\t* "),
            "\n")
        })

    ch$result <- non_numeric
    attr(ch$result, "test_name") <- "Non-numeric values"

    if (!quiet) {
      messages <- MessageQueue()
      for (i in seq_along(ch$msg)) {
        messages <- appendMessage(messages, ch$msg[[i]], ch$lvl)
      }
      printMessages(messages)
    }
  }

  return(ch)
}


#' @export
#' @rdname unPackDataChecks
checkNegativeValues <- function(sheets, d, quiet = TRUE) {

  ch <- list(result = NULL,
            msg = NULL,
            lvl = NULL,
            has_error = FALSE)

  negative_values <- unPackDataPackSheet(d,
                                         sheets,
                                         clean_orgs = FALSE,
                                         clean_disaggs = FALSE,
                                         clean_values = FALSE) %>%
    #TODO: Keeping this consistent with checkDecimalValues
    #Consider doing the numeric conversion once in unPackDataSheet
    #instead of multiple times in these checks.
    dplyr::mutate(value = suppressWarnings(as.numeric(value))) %>%
    dplyr::filter(value < 0)

  if (NROW(negative_values) > 0) {

    ch$lvl <- "ERROR"

    ch$msg <- unique(negative_values$sheet_name) %>%
      purrr::set_names() %>%
      purrr::map(
        function(x) {
          paste0(
            ch$lvl, "! In tab ", x,
            ": NEGATIVE VALUES found in the following columns! Ensure all values entered",
            " against Targets are whole, positive, numeric values. These will be removed. -> \n\t* ",
            paste(
              unique(
                negative_values$indicator_code[negative_values$sheet_name == x]),
              collapse = "\n\t* "),
            "\n")
        })

    ch$result <- negative_values
    attr(ch$result, "test_name") <- "Negative values"
    ch$has_error <- TRUE

    if (!quiet) {
      messages <- MessageQueue()
      for (i in seq_along(ch$msg)) {
        messages <- appendMessage(messages, ch$msg[[i]], ch$lvl)
      }
      printMessages(messages)
    }
  }

  return(ch)
}



#' @export
#' @rdname unPackDataChecks
checkDecimalValues <- function(sheets, d, quiet = TRUE) {

  ch <- list(result = NULL,
            msg = NULL,
            lvl = NULL,
            has_error = FALSE)

  decimals_allowed <- d$info$schema %>%
    dplyr::filter(sheet_name %in% sheets,
                  col_type == "target",
                  value_type == "percentage") %>%
    dplyr::pull(indicator_code)

  decimal_cols <- unPackDataPackSheet(d,
                                      sheets,
                                      clean_orgs = FALSE,
                                      clean_disaggs = FALSE,
                                      clean_values = FALSE) %>%
    dplyr::mutate(value = suppressWarnings(as.numeric(value))) %>%
    dplyr::filter(value %% 1 != 0
                  & !indicator_code %in% decimals_allowed)

  if (NROW(decimal_cols) > 0) {
    ch$lvl <- "WARNING"

    ch$msg <- unique(decimal_cols$sheet_name) %>%
      purrr::set_names() %>%
      purrr::map(
        function(x) {
          paste0(
            ch$lvl,
            "! In tab ", x,
            ": DECIMAL VALUES found in the following columns that should have only",
            " whole, positive, numeric values. These will be rounded. -> \n\t* ",
            paste(
              unique(
                decimal_cols$indicator_code[decimal_cols$sheet_name == x]),
              collapse = "\n\t* "),
            "\n")
        })

    ch$result <- decimal_cols
    attr(ch$result, "test_name") <- "Decimal values"

    if (!quiet) {
      messages <- MessageQueue()
      for (i in seq_along(ch$msg)) {
        messages <- appendMessage(messages, ch$msg[[i]], ch$lvl)
      }
      printMessages(messages)
    }
  }

  return(ch)
}


#' @export
#' @rdname unPackDataChecks
checkInvalidOrgUnits <- function(sheets, d, quiet = TRUE) {

  ch <- list(result = NULL,
            msg = NULL,
            lvl = NULL,
            has_error = FALSE)

  valid_orgunits_local <- getValidOrgUnits(d$info$cop_year)

  #There may be some variation in the columns between cop years
  cols_to_filter <- switch(as.character(d$info$cop_year),
                           "2021" = c("SNU1", "PSNU", "Age", "Sex"),
                           "2022" = c("SNU1", "PSNU", "Age", "Sex"),
                           "2023" = c("PSNU", "Age", "Sex"),
                           "2024" = c("PSNU", "Age", "Sex"))

  invalid_orgunits <- d$sheets[sheets] %>%
    dplyr::bind_rows(.id = "sheet_name") %>%
    #Reverting this back to the previous logic to filter
    #to ignore any rows which are NA in the columsn to filter.
    dplyr::filter(dplyr::if_any(tidyselect::any_of(cols_to_filter), ~ !is.na(.))) %>%
    #dplyr::filter(dplyr::if_all(tidyselect::all_of(cols_to_filter), ~ !is.na(.x))) %>%
    dplyr::select(sheet_name, PSNU) %>%
    dplyr::distinct() %>%
    dplyr::mutate(snu_uid = extract_uid(PSNU)) %>%
    dplyr::anti_join(valid_orgunits_local, by = c("snu_uid" = "uid"))

  na_orgunits <- invalid_orgunits[is.na(invalid_orgunits$PSNU), ]

  if (NROW(invalid_orgunits) > 0 || NROW(na_orgunits) > 0) {

    ch$lvl <- "ERROR"

    ch$msg <- unique(invalid_orgunits$sheet_name) %>%
      purrr::set_names() %>%
      purrr::map(
        function(x) {
          paste0(
            ch$lvl, "! In tab ", x,
            ", INVALID OR BLANK ORG UNITS: ",
            ifelse(
              NROW(na_orgunits) > 0,
              paste0("There are ", NROW(na_orgunits),
                     " rows where PSNU/DSNU was left blank."),
              ""),
            " Please also review the below PSNUs/DSNUs with invalid or missing org",
            " unit UIDs. (This is an 11-digit alphanumeric code assigned in DATIM to",
            " each organization unit.) If you believe these are valid, confirm in",
            " both DATIM & FACTSInfo that the below are correctly added and active",
            " for the appropriate COP Year. ->  \n\t* ",
            paste(invalid_orgunits$PSNU[!is.na(invalid_orgunits$PSNU)], collapse = "\n\t* "),
            "\n")
        })

    ch$result <- invalid_orgunits
    attr(ch$result, "test_name") <- "Invalid orgunits"
    ch$has_error <- TRUE

    if (!quiet) {
      messages <- MessageQueue()
      for (i in seq_along(ch$msg)) {
        messages <- appendMessage(messages, ch$msg[[i]], ch$lvl)
      }
      printMessages(messages)
    }
  }

  return(ch)
}


#' @export
#' @rdname unPackDataChecks
checkInvalidPrioritizations <- function(sheets, d, quiet = TRUE) {

  ch <- list(result = NULL,
            msg = NULL,
            lvl = NULL,
            has_error = FALSE)

  valid_orgunits_local <- getValidOrgUnits(d$info$cop_year)
  valid_orgunits_local$hierarchy_level  <- unlist(lapply(valid_orgunits_local$ancestors, function(x) NROW(x) + 1L))
  valid_orgunits_local <- valid_orgunits_local[, c("uid", "ou_uid", "country_uid", "hierarchy_level")]


  data <- d$sheets[["Prioritization"]][, c("PSNU", "IMPATT.PRIORITY_SNU.T")]
  names(data)[names(data) == "IMPATT.PRIORITY_SNU.T"] <- "value"
  data <- data[, c("PSNU", "value")]
  data$snu_uid <- extract_uid(data$PSNU)

  data %<>% dplyr::left_join(valid_orgunits_local, by = c("snu_uid" = "uid"))

  dataset_levels_local <- datapackr::dataset_levels %>%
    dplyr::filter(cop_year == d$info$cop_year, ou_uid == d$info$operating_unit$ou_uid) %>%
    dplyr::select(ou_uid, country_uid, prioritization)

  data %<>% dplyr::left_join(dataset_levels_local)

  #
  data <- data %>%
    dplyr::mutate(
      isInvalidPSNU = dplyr::case_when(
        is.na(ou_uid) | is.na(country_uid) ~ TRUE,
        grepl("_Military", PSNU) ~ FALSE,
        hierarchy_level == prioritization ~ FALSE,
        TRUE ~ TRUE
      )
    )

  isInvalidPrioritization <- function(PSNU, value) {

    if (grepl("_Military", PSNU)) {
       value != "M"
    } else {
      !(value %in% prioritization_dict()$value)
    }

  }

  data$isInvalidPrioritization <- mapply(isInvalidPrioritization, data$PSNU, data$value)

  invalid_prioritizations <- data[data$isInvalidPSNU | data$isInvalidPrioritization, ]

  if (NROW(invalid_prioritizations) > 0) {

    inv_pzs_msg <-
      utils::capture.output(
        print(as.data.frame(invalid_prioritizations), row.names = FALSE))

    ch$lvl <- "ERROR"

    ch$msg <-
      paste0(
        ch$lvl, "! In tab Prioritization",
        ": INVALID PRIORITIZATIONS: The following PSNUs have been assigned",
        " invalid or blank prioritizations. Please note that all PSNUs must have",
        " an assigned prioritization, and prioritizations can only be assigned ",
        paste_oxford(prioritization_dict()$value, final = "or"), ". -> \n\t",
        paste(inv_pzs_msg, collapse = "\n\t"),
        "\n")

    ch$result <- invalid_prioritizations
    attr(ch$result, "test_name") <- "Invalid prioritizations"
    ch$has_error <- TRUE

    if (!quiet) {
      messages <- MessageQueue()
      for (i in seq_along(ch$msg)) {
        messages <- appendMessage(messages, ch$msg[[i]], ch$lvl)
      }
      printMessages(messages)
    }
  }

  return(ch)
}


#Extracts grey cells from Row3 for all sheets
getCriticalColumns <- function()  {

  template_file <- system.file("extdata/COP23_Data_Pack_Template.xlsx", package = "datapackr")

  template <- readxl::read_excel(template_file)
  cells <- tidyxl::xlsx_cells(template_file)
  formats <- formats <- tidyxl::xlsx_formats(template_file)

  grey_cells <- which(formats$local$fill$patternFill$fgColor$rgb == "FFFFFFFF")
  critical_cols <- cells$local_format_id %in% grey_cells

  critical_columns <- cells[critical_cols, ] %>%
    dplyr::filter(row == 3)  %>%
    dplyr::select(sheet_name = sheet, col) %>%
    dplyr::mutate(critical = "Y")

  critical_columns
}


#' @export
#' @rdname unPackDataChecks
checkFormulas <- function(sheets, d, quiet = TRUE) {

  ch <- list(result = NULL,
            msg = NULL,
            lvl = NULL,
            has_error = FALSE)

  header_row <- headerRow(tool = d$info$tool, cop_year = d$info$cop_year)

  # Pull in formulas from schema
  formulas_schema <- d$info$schema %>%
    dplyr::filter(
      sheet_name %in% sheets,
      !is.na(formula)) %>%
    # TODO: Maybe use the below example code to add functionality to detect
    # incorrect row reference in formula
    # tidyr::crossing(row = ((header_row+1):max(formulas_datapack$row))) %>%
    # dplyr::select(row, col, indicator_code, formula) %>%
    # dplyr::mutate(
    #   formula =
    #     stringr::str_replace_all(
    #       formula,
    #       pattern = paste0("(?<=[:upper:])", header_row+1),
    #       replacement = as.character((header_row+1):max(formulas_datapack$row))
    #     )
    # )
    dplyr::mutate(
      formula = stringr::str_replace_all(formula,
                                         "(?<=[:upper:])\\d+",
                                         "\\\\d+"))

   if (d$info$cop_year == "2022") {
     formulas_schema %<>% dplyr::mutate(critical =
       dplyr::case_when(
         indicator_code == "ID" | col_type == "target" ~ "Y",
         TRUE ~ "N")) %>%
       dplyr::select(-col)
   }

  if (d$info$cop_year >= "2023") {

    critical_columns <- getCriticalColumns()

    formulas_schema <- formulas_schema %>%
      dplyr::left_join(critical_columns, by = c("sheet_name", "col")) %>%
      dplyr::mutate(critical = dplyr::case_when(is.na(critical) ~ "N",
                                                TRUE ~ critical)) %>%
      dplyr::select(-col)

  }

  formulas_schema %<>% dplyr::select(sheet_num, sheet_name, indicator_code, fx_schema = formula,
                                     critical)
  # Pull in formulas from Data Pack sheet
  formulas_datapack <-
    tidyxl::xlsx_cells(path = d$keychain$submission_path,
                       sheets = sheets,
                       include_blank_cells = TRUE)

  #By default, tidyxl scans forward when include_blank_cells
  #is true is TRUE.
  #Lets try and define where the last row of data is and exclude these blank

  last_row_of_data <- formulas_datapack %>%
    dplyr::filter(data_type != "blank") %>%
    dplyr::pull(row) %>%
    max()

  formulas_datapack %<>%
    dplyr::filter(row <= last_row_of_data) %>%
    # Note that this function won't pick up any cols with blank indicator_code
    dplyr::filter(row >= header_row) %>%
    dplyr::mutate(formula = dplyr::if_else(is.na(formula),
                                           as.character(numeric),
                                           formula),
                  formula = dplyr::if_else(is.na(formula), character, formula)) %>%
    dplyr::select(sheet_name = sheet, row, col, character, formula)

  indicator_codes <- formulas_datapack %>%
    dplyr::filter(row == header_row, !is.na(character)) %>%
    dplyr::select(sheet_name, col, indicator_code = character) %>%
    dplyr::semi_join(formulas_schema, by = c("indicator_code", "sheet_name"))

  formulas_datapack %<>%
    dplyr::left_join(indicator_codes, by = c("col", "sheet_name")) %>%
    dplyr::select(sheet_name, row, indicator_code, formula) %>%
    dplyr::filter(row != header_row,
                  !is.na(indicator_code)) %>%
    # purrr::when( # DP-472
    #   sheet == "PSNUxIM" & d$info$tool == "Data Pack" ~ .,
    #   ~  dplyr::group_by(., row) %>%
    #     dplyr::mutate(occurrence = duplicated(indicator_code)) %>%
    #     dplyr::ungroup() %>%
    #     dplyr::filter(occurrence == FALSE) %>%
    #     dplyr::select(-occurrence) %>%
    #     # Limit to only columns that DUIT cares about
    #     dplyr::filter(indicator_code %in% formulas_schema$indicator_code)
    # ) %>%
    #TODO: Add to catch where referencing wrong row
    dplyr::mutate(
      formula = stringr::str_replace_all(formula,
                                         "(?<=[:upper:])\\d+",
                                         "\\\\d+"))

  # Compare formulas from schema against Data Pack to see diffs
  altered_formulas <- formulas_datapack %>%
    dplyr::anti_join(formulas_schema,
                     by = c("sheet_name" = "sheet_name",
                            "indicator_code" = "indicator_code",
                            "formula" = "fx_schema")) %>%
    dplyr::left_join(formulas_schema,
                     by = c("sheet_name", "indicator_code")) %>%
    # dplyr::left_join( # DP-472
    #   formulas_schema,
    #   by = ifelse(sheet == "PSNUxIM" & d$info$tool == "Data Pack",
    #               c("col" = "col"),
    #               c("indicator_code" = "indicator_code"))) %>%
    # purrr::when(sheet == "PSNUxIM" & d$info$tool == "Data Pack" ~ dplyr::rename(., indicator_code = indicator_code.y),
    #             ~ .) %>%
    dplyr::select(sheet_num, sheet_name, row, indicator_code,
                  correct_fx = fx_schema, submitted_fx = formula, critical) %>%
    dplyr::filter(critical == "Y") #Ignore non-critical formulas

  if (NROW(altered_formulas) > 0) {

    ch$lvl <- "WARNING"

    cols_affected <- altered_formulas %>%
      dplyr::count(sheet_num, sheet_name, indicator_code, critical, name = "count") %>%
      dplyr::mutate(fx_violations = paste0(indicator_code, ":  ", count))

    critical <- cols_affected[cols_affected$critical == "Y", ]

    ch$msg <- unique(cols_affected$sheet_name) %>%
      purrr::set_names() %>%
      purrr::map(
        function(x) {
          paste0(
            ch$lvl, "! In tab ", x, ", ",
            sum(critical$count[critical$sheet_name == x]),
            " CRITICAL ALTERED FORMULAS",
            " Altering formulas in Grey colored columns without DUIT and PPM",
            " approval may lead to programmatic and technical issues in your Data ",
            " Pack. This warning may be triggered by deleting or overwriting a",
            " formula, or a manual fix not being applied. See the provided",
            " Validation Results file for detail on both critical and",
            " non-critical formulas",
            ifelse(
              NROW(critical[critical$sheet_name == x, ]) > 0,
              paste0(", and below for the number of violations",
                     " against critical columns ->  \n\t* ",
                     paste(critical$fx_violations[critical$sheet_name == x],
                           collapse = "\n\t* ")),
              "."),
            "\n")
        })

    altered_formulas %<>%
      dplyr::group_by(sheet_num, sheet_name, indicator_code, correct_fx,
                      submitted_fx, critical) %>%
      dplyr::summarise(affected_rows = formatSetStrings(row),
                       .groups = "drop") %>%
      dplyr::ungroup()

    # Alternative with less detail but more manageably sized: (Keep and check with DUIT)
    # altered_formulas %<>%
    #   dplyr::group_by(sheet_num, sheet_name, indicator_code, correct_fx,
    #                   critical) %>%
    #   dplyr::summarise(affected_rows = formatSetStrings(row),
    #                    .groups = "drop") %>%
    #   dplyr::ungroup()

    ch$result <- altered_formulas
    attr(ch$result, "test_name") <- "Altered Formulas"

    if (!quiet) {
      messages <- MessageQueue()
      for (i in seq_along(ch$msg)) {
        messages <- appendMessage(messages, ch$msg[[i]], ch$lvl)
      }
      printMessages(messages)
    }
  }

  return(ch)
}


#' @export
#' @rdname unPackDataChecks
checkDisaggs <- function(sheets, d, quiet = TRUE) {

  if (any(c("SNU x IM", "PSNUxIM") %in% sheets)) {
    interactive_warning("Sorry! Can't check the PSNUxIM tab with this function.") # DP-472
  }
  sheets <- sheets[sheets != "PSNUxIM"]

  #TODO: Add functionality for PSNUxIM

  ch <- list(result = NULL,
            msg = NULL,
            lvl = NULL,
            has_error = FALSE)

  valid_disaggs <- d$info$schema %>%
    dplyr::filter(sheet_name %in% sheets
                  & col_type == "target") %>%
    dplyr::select(sheet_name, indicator_code,
                  valid_ages, valid_sexes, valid_kps) %>%
    tidyr::unnest(valid_ages, names_sep = ".") %>%
    tidyr::unnest(valid_sexes, names_sep = ".") %>%
    tidyr::unnest(valid_kps, names_sep = ".") %>%
    dplyr::select(sheet_name, indicator_code, Age = valid_ages.name,
                  Sex = valid_sexes.name, KeyPop = valid_kps.name)

  defunct_disaggs <- unPackDataPackSheet(d,
                                         sheets,
                                         clean_orgs = TRUE,
                                         clean_disaggs = FALSE,
                                         clean_values = TRUE) %>%
    dplyr::anti_join(
      valid_disaggs,
      by = c("sheet_name", "indicator_code", "Age", "Sex", "KeyPop")) %>%
    dplyr::select(sheet_name, indicator_code, Age, Sex, KeyPop) %>%
    dplyr::distinct()

  if (NROW(defunct_disaggs) > 0) {

    ch$lvl <- "ERROR"

    ch$msg <- unique(defunct_disaggs$sheet_name) %>%
      purrr::set_names() %>%
      purrr::map(
        function(x) {
          paste0(
            ch$lvl, "! In tab ", x,
            ": INVALID DISAGGS. Please review all tabs flagged by this test to ensure",
            " no Age, Sex, or Key Population disaggregates have been inadvertently or",
            " incorrectly altered. If you believe this has been flagged in error, ",
            " please first refer to MER Guidance to confirm valid disaggregates for",
            " the data element flagged. (Check MER Guidance for correct alternatives.",
            " Also note that single-digit ages should be left-padded with zeros,",
            " e.g., 01-04 instead of 1-4.) -> \n\t",
            paste(
              utils::capture.output(
                print(
                  as.data.frame(
                    defunct_disaggs[defunct_disaggs$sheet_name == x, ]),
                  row.names = FALSE)),
              collapse = "\n\t"),
            "\n")
        })

    ch$result <- defunct_disaggs
    attr(ch$result, "test_name") <- "Defunct disaggs"
    ch$has_error <- TRUE

    if (!quiet) {
      messages <- MessageQueue()
      for (i in seq_along(ch$msg)) {
        messages <- appendMessage(messages, ch$msg[[i]], ch$lvl)
      }
      printMessages(messages)
    }
  }

  return(ch)
}


#' @export
#' @rdname unPackDataChecks
#'
checkToolEmptySheets <- function(d, sheets, quiet = TRUE) {

  if (!quiet) {
    messages <- MessageQueue()
  }

  # Check if all key header columns missing
  header_cols <- purrr::map(sheets, function(x) {
    d$info$schema %>%
      dplyr::filter(sheet_name %in% x,
                    col_type == "row_header", !indicator_code %in% c("SNU1", "ID")) %>%
      dplyr::pull(indicator_code) %>%
      #c(., "mechCode_supportType") %>% # DP-472
      unique()
  })

  has_all_header_columns <-
    purrr::map2(d$sheets[sheets], header_cols,
                function(x, y) {
                  Reduce("+", y %in% names(x)) == length(y)
                }) %>%
    unlist()

  if (any(!has_all_header_columns)) {

    lvl <- "ERROR"

    msg <-
      paste0(
        lvl, "! MISSING KEY COLUMNS: The following sheets are missing critical ",
        "columns, usually PSNU, Age, Sex, and/or KeyPop. This prevents us from ",
        "checking and reading any data from these sheets. -> \n  * ",
        paste0(sheets[!has_all_header_columns], collapse = "\n  * "),
        "\n")

    d$tests$missing_index_columns <- data.frame(sheet_name = sheets[!has_all_header_columns])
    attr(d$tests$missing_index_columns, "test_name") <- "Missing index columns"
    d$info$messages <- appendMessage(d$info$messages, msg, lvl)
    d$info$has_error <- TRUE

    if (!quiet) {
      messages <- appendMessage(messages, msg, lvl)
    }
  }

  # Check if no rows of data
  has_rows_data <-
    purrr::map(d$sheets[sheets],
               function(x) {
                 NROW(x) > 0
               }) %>%
    unlist()

  if (any(!has_rows_data)) {

    lvl <- "INFO"

    msg <-
      paste0(
        lvl, "! SHEETS WITH NO DATA: The following sheets appear to have no ",
        "rows of data. If this is intentional, no need to worry. -> \n  * ",
        paste0(sheets[!has_rows_data], collapse = "\n  * "),
        "\n")

    d$tests$no_rows_data <- data.frame(sheet_name = sheets[!has_rows_data])
    attr(d$tests$no_rows_data, "test_name") <- "No rows of data"
    d$info$messages <- appendMessage(d$info$messages, msg, lvl)

    if (!quiet) {
      messages <- appendMessage(messages, msg, lvl)
    }
  }

  if (!quiet) {
    printMessages(messages)
  }

    return(d)

}


#' @export
#' @rdname unPackDataChecks
checkSheetData <- function(d,
                           sheets = NULL,
                           quiet = TRUE,
                           ...) {

  interactive_print("Checking sheet data...")

  dots <- list(...)

  # Check/Fill in parameters ####
  params <- check_params(cop_year = d$info$cop_year,
                         tool = d$info$tool,
                         schema = d$info$schema,
                         sheets = sheets,
                         all_sheets = dots$all_sheets,
                         psnuxim = dots$psnuxim)

  for (p in names(params)) {
    assign(p, purrr::pluck(params, p))
  }

  rm(params, p)

  sheets <- sheets[!sheets %in% c("KP Validation")]

  # Apply the list of check functions ----
  funs <- list(
    duplicate_rows = checkDupeRows,
    missing_cols = checkMissingCols,
    duplicate_columns = checkDupeCols,
    columns_out_of_order = checkOutOfOrderCols,
    non_numeric = checkNonNumeric,
    negative_values  = checkNegativeValues,
    decimal_values = checkDecimalValues,
    invalid_orgunits = checkInvalidOrgUnits,
    invalid_prioritizations = checkInvalidPrioritizations,
    altered_formulas = checkFormulas,
    defunct_disaggs = checkDisaggs
  )

  data_checks <-  purrr::map(funs, purrr::exec, sheets, d)

  d$tests <-
    append(d$tests,
           purrr::map(data_checks, ~ purrr::pluck(.x, "result"))) %>%
    purrr::discard(is.null)

  msg <- purrr::map(data_checks, ~ Reduce(f = c,
                                              x = purrr::pluck(.x, "msg"))) %>%
    Reduce(f = c, x = .)
  lvl <- purrr::map(data_checks,
                    function(x) {
                      rep(purrr::pluck(x, "lvl"),
                          length(purrr::pluck(x, "msg")))
                    }) %>%
    Reduce(f = c, x = .)

  for (i in seq_along(msg)) {
    d$info$messages <- appendMessage(d$info$messages, msg[i], lvl[i])
  }

  d$info$has_error <-
    purrr::map_lgl(data_checks, function(x) purrr::pluck(x, "has_error")) %>%
    c(., d$info$has_error) %>%
    any()

  # TODO: Make sure all functions note sheet name as column in result
  # TODO: Make sure all functions row bind results


    # TODO: TEST AGYW Tab for missing DSNUs #### This will be addressed in future PR
    # if (sheet == "AGYW") {
    #   DataPack_DSNUs <- d$data$extract %>%
    #     dplyr::select(PSNU, psnu_uid = psnuid) %>%
    #     dplyr::distinct() %>%
    #     dplyr::mutate(DataPack = 1)
    #
    #   DATIM_DSNUs <- datapackr::valid_PSNUs %>%
    #     dplyr::filter(country_uid %in% d$info$country_uids) %>%
    #     add_dp_psnu(.) %>%
    #     dplyr::arrange(dp_psnu) %>%
    #     dplyr::filter(!is.na(DREAMS)) %>%
    #     dplyr::select(PSNU = dp_psnu, psnu_uid, snu1) %>%
    #     dplyr::mutate(DATIM = 1)
    #
    #   DSNU_comparison <- DataPack_DSNUs %>%
    #     dplyr::full_join(DATIM_DSNUs, by = "psnu_uid")
    #
    #   d$tests$DSNU_comparison <- DSNU_comparison
    #   attr(d$tests$DSNU_comparison, "test_name") <- "DSNU List Comparison"
    #
    #   if (any(is.na(DSNU_comparison$DataPack))) {
    #     missing_DSNUs <- DSNU_comparison %>%
    #       dplyr::filter(is.na(DataPack))
    #
    #     msg <- paste0(
    #       "WARNING! In tab ",
    #       sheet,
    #       ": MISSING DREAMS SNUs found! ->  \n\t* ",
    #       paste(missing_DSNUs$PSNU.y, collapse = "\n\t* "),
    #       "\n")
    #
    #     d$info$messages <- appendMessage(d$info$messages, msg, "WARNING")
    #     d$info$missing_DSNUs <- TRUE
    #   }
    #
    #   if (any(is.na(DSNU_comparison$DATIM))) {
    #     invalid_DSNUs <- DSNU_comparison %>%
    #       dplyr::filter(is.na(DATIM))
    #
    #     msg <- paste0(
    #       "WARNING! In tab ",
    #       sheet,
    #       ": INVALID DREAMS SNUs found! ->  \n\t* ",
    #       paste(invalid_DSNUs$PSNU.x, collapse = "\n\t* "),
    #       "\n")
    #
    #     d$info$messages <- appendMessage(d$info$messages, msg, "WARNING")
    #   }
    #
    # }

  return(d)

}
pepfar-datim/datapackr documentation built on April 14, 2024, 10:35 p.m.