R/checkAnalytics.R

Defines functions checkAnalytics analyze_indexpos_ratio analyze_linkage analyze_retention analyze_tbknownpos analyze_pmtctknownpos analyze_vmmc_indeterminate analyze_eid_2mo HTS_POS_Modalities

Documented in analyze_eid_2mo analyze_indexpos_ratio analyze_linkage analyze_pmtctknownpos analyze_retention analyze_tbknownpos analyze_vmmc_indeterminate checkAnalytics

HTS_POS_Modalities <- function(cop_year) {
    #TODO: This function needs a parameter based on COP year.
    #More work further down, so I am not going to fix it
    #at the moment. Each of the checks is being fed a
    # data object, but this object does not seem to contain
    # a reference to the cop year. Since the modalities
    # differ from year to year though, this list needs
    # to be determined based on the year we are dealing with.
    # TODO:

    datapackr::getMapDataPack_DATIM_DEs_COCs(cop_year) %>%
    dplyr::select(indicator_code, hts_modality, resultstatus) %>%
    tidyr::drop_na() %>%
    dplyr::filter(resultstatus %in% c("Newly Tested Positives", "Positive")) %>%
    dplyr::distinct() %>%
    dplyr::pull(indicator_code)
}
#' @export
#' @title Check Data Pack for <90\% PMTCT_EID from ≤02 months
#'
#' @description Check data gathered from Data Pack to identify cases where
#' the targeted percent of infants (<1 yr) born to HIV-positive women tested for
#' HIV (EID) between 0 and 2 months old is less than 90\%.
#'
#' @param data Analytics object to analyze
#'
#' @return a
#'
analyze_eid_2mo <- function(data) {

  a <- NULL

  this_cop_year <- data$cop_year[[1]]

    required_names <- c("PMTCT_EID.D.T",
                        "PMTCT_EID.N.2.T")

  if (any(!(required_names %in% names(data)))) {
    a$test_results <- data.frame(msg = "Missing data.")
    attr(a$test_results, "test_name") <- "PMTCT_EID coverage by 2 months issues"
    a$msg <- "Could not analyze PMTCT EID due to missing data."
    return(a)
  }

    analysis <- data %>%
      dplyr::mutate(
        PMTCT_EID.2mo.rate = PMTCT_EID.N.2.T / PMTCT_EID.D.T
      ) %>%
      dplyr::filter(!is.na(PMTCT_EID.2mo.rate)) %>%
      dplyr::select(
        psnu, psnu_uid, age, sex, key_population,
        PMTCT_EID.N.2.T,
        PMTCT_EID.D.T,
        PMTCT_EID.2mo.rate)

  issues <- analysis %>%
    dplyr::filter(round(PMTCT_EID.2mo.rate, 2) < 0.90)

  if (NROW(issues) > 0) {

    a$test_results <- issues
    attr(a$test_results, "test_name") <- "PMTCT_EID coverage by 2 months issues"

      national_avg <- analysis %>%
        dplyr::select(
          PMTCT_EID.D.T,
          PMTCT_EID.N.2.T) %>%
        dplyr::summarise(
          PMTCT_EID.D.T =
            sum(PMTCT_EID.D.T, na.rm = TRUE),
          PMTCT_EID.N.2.T =
            sum(PMTCT_EID.N.2.T, na.rm = TRUE)) %>%
        dplyr::mutate(
          PMTCT_EID.2mo.rate =
            PMTCT_EID.N.2.T / PMTCT_EID.D.T)

    a$msg <-
      paste0(
        "WARNING! PMTCT_EID coverage by 2 months old < 90%: \n\n\t* ",
        crayon::bold(
          paste0(
            length(unique(issues$psnu_uid)), " of ",
            length(unique(analysis$psnu_uid)))),
        " PSNUs affected.\n\n\t* ",
        "Total rate of EID coverage by 2 months across all PSNUs: ",
        crayon::bold(sprintf("%.1f%%",
                             100 * national_avg$PMTCT_EID.2mo.rate)),
        "\n\n\t* ",
        "Lowest observed rate of EID coverage by 2 months: ",
        crayon::bold(sprintf("%.1f%%",
                             100 * min(issues$PMTCT_EID.2mo.rate))),
        "\n")

  }

  return(a)
}


#' @export
#' @title Check Data Pack data for VMMC_CIRC Indeterminate Rate > 5\%
#'
#' @description Check data gathered from Data Pack to identify cases where
#' the expected number of VMMC_CIRC Indeterminate patients is greater than
#' 5\% of the total number of VMMC_CIRC patients.
#'
#' @param data Analytics object to analyze
#'
#' @return a
#'
analyze_vmmc_indeterminate <- function(data) {

  a <- NULL
  required_names <- c("VMMC_CIRC.Pos.T",
                      "VMMC_CIRC.Neg.T",
                      "VMMC_CIRC.Unk.T")

  if (any(!(required_names %in% names(data)))) {
    a$test_results <- data.frame(msg = "Missing data.")
    attr(a$test_results, "test_name") <- "VMMC Indeterminate rate issues"
    a$msg <- "Could not analyze VMMC_CIRC Indeterminate Rate due to missing data."
    return(a)
  }

  issues <- data %>%
    dplyr::mutate(
      VMMC_CIRC.T =
        (VMMC_CIRC.Pos.T
         + VMMC_CIRC.Neg.T
         + VMMC_CIRC.Unk.T),
      VMMC_CIRC.indeterminateRate =
        (VMMC_CIRC.Unk.T) /
        (VMMC_CIRC.T)) %>%
    dplyr::filter(round(VMMC_CIRC.indeterminateRate, 2) > 0.05, is.na(key_population)) %>%
    dplyr::select(
      psnu, psnu_uid, age, sex, key_population,
      VMMC_CIRC.T,
      VMMC_CIRC.Pos.T,
      VMMC_CIRC.Neg.T,
      VMMC_CIRC.Unk.T,
      VMMC_CIRC.indeterminateRate)

  if (NROW(issues) > 0) {

    a$test_results <- issues
    attr(a$test_results, "test_name") <- "VMMC Indeterminate rate issues"

    national_avg <- data %>%
      dplyr::select(
        VMMC_CIRC.Pos.T,
        VMMC_CIRC.Neg.T,
        VMMC_CIRC.Unk.T) %>%
      dplyr::summarise(
        VMMC_CIRC.Pos.T =
          sum(VMMC_CIRC.Pos.T, na.rm = TRUE),
        VMMC_CIRC.Neg.T =
          sum(VMMC_CIRC.Neg.T, na.rm = TRUE),
        VMMC_CIRC.Unk.T =
          sum(VMMC_CIRC.Unk.T, na.rm = TRUE)) %>%
      dplyr::mutate(
        VMMC_CIRC.indeterminateRate =
          (VMMC_CIRC.Unk.T) /
          (VMMC_CIRC.Pos.T
           + VMMC_CIRC.Neg.T
           + VMMC_CIRC.Unk.T)
      )

    a$msg <-
      paste0(
        "WARNING! VMMC_CIRC Indeterminate > 5% : \n\n\t* ",
        crayon::bold(
          paste0(
            length(unique(issues$psnu_uid)), " of ",
            length(unique(data$psnu_uid)))),
        " PSNUs affected.\n\n\t* ",
        "Highest indeterminate/not tested rate observed: ",
        crayon::bold(sprintf("%.1f%%",
                             100 * max(
                               issues$VMMC_CIRC.indeterminateRate
                               ))),
        "\n\n\t* ",
        "National average indeterminate/not tested rate: ",
        crayon::bold(sprintf("%.1f%%",
                             100 * max(
                               national_avg$VMMC_CIRC.indeterminateRate
                             ))),
        "\n")

  }

  return(a)
}


#' @export
#' @title Check Data Pack data for PMTCT Known Pos Ratio > 75\%.
#'
#' @description Check data gathered from Data Pack to identify cases where
#' PMTCT Known Pos Ratio > 75\%.
#'
#' @param data Analytics object to analyze
#'
#' @return a
#'
analyze_pmtctknownpos <- function(data) {
  a <- NULL

  this_cop_year <- as.character(data$cop_year[1])

  required_names <- switch(this_cop_year,
                           "2023" =  c("PMTCT_STAT.N.New.Pos.T",
                                       "PMTCT_STAT.N.KnownPos.T",
                                       "PMTCT_STAT.N.New.Neg.T"),
                           "2024" =  c("PMTCT_STAT.N.New.Pos.T",
                                       "PMTCT_STAT.N.Known.Pos.T",
                                       "PMTCT_STAT.N.New.Neg.T"),
                           stop("Unsupported COP Year"))

  if (any(!(required_names %in% names(data)))) {
    a$test_results <- data.frame(msg = "Missing data.")
    attr(a$test_results, "test_name") <- "PMTCT Known Pos issues"
    a$msg <- "Could not analyze PMTCT Known Pos issues due to missing data."
    return(a)
  }

  issues <- if (this_cop_year == "2023") {
    data %>%
    dplyr::filter(is.na(key_population)) %>%
    dplyr::mutate(
      PMTCT_STAT.N.Total =
        PMTCT_STAT.N.New.Pos.T
      + PMTCT_STAT.N.KnownPos.T
      + PMTCT_STAT.N.New.Neg.T,
      knownpos_ratio =
        (PMTCT_STAT.N.KnownPos.T / PMTCT_STAT.N.Total)) %>%
    dplyr::select(
      psnu, psnu_uid, age, sex, key_population,
      PMTCT_STAT.N.Total,
      PMTCT_STAT.N.New.Pos.T,
      PMTCT_STAT.N.KnownPos.T,
      PMTCT_STAT.N.New.Neg.T,
      knownpos_ratio
    ) %>%
    dplyr::filter(!is.na(knownpos_ratio)) %>%
    dplyr::filter(
      round(knownpos_ratio, 2) > 0.75
    )
  } else if (this_cop_year == "2024") {
    data %>%
      dplyr::filter(is.na(key_population)) %>%
      dplyr::mutate(
        PMTCT_STAT.N.Total =
          PMTCT_STAT.N.New.Pos.T
        + PMTCT_STAT.N.Known.Pos.T
        + PMTCT_STAT.N.New.Neg.T,
        knownpos_ratio =
          (PMTCT_STAT.N.Known.Pos.T / PMTCT_STAT.N.Total)) %>%
      dplyr::select(
        psnu, psnu_uid, age, sex, key_population,
        PMTCT_STAT.N.Total,
        PMTCT_STAT.N.New.Pos.T,
        PMTCT_STAT.N.Known.Pos.T,
        PMTCT_STAT.N.New.Neg.T,
        knownpos_ratio
      ) %>%
      dplyr::filter(!is.na(knownpos_ratio)) %>%
      dplyr::filter(
        round(knownpos_ratio, 2) > 0.75
      )
    }

  if (NROW(issues) > 0) {

    a$test_results <- issues
    attr(a$test_results, "test_name") <- "PMTCT Known Pos issues"

    a$msg <-
      paste0(
        "WARNING! PMTCT KNOWN POS Ratio > 75%: \n\n\t* ",
        crayon::bold(
          paste0(
            length(unique(issues$psnu_uid)), " of ",
            length(unique(data$psnu_uid)))),
        " PSNUs affected.\n\n\t* ",
        "Highest Known Pos ratio observed: ",
        crayon::bold(sprintf("%.1f%%",
                             100 * max(issues$knownpos_ratio))),
        "\n")
  }

  return(a)
}


#' @export
#' @title Check Data Pack data for TB Known Pos ratio > 75\%.
#'
#' @description Check data gathered from Data Pack to identify cases where
#' TB Known Pos ratio > 75\%
#'
#' @param data Analytics object to analyze
#'
#' @return a
#'
analyze_tbknownpos <- function(data) {
  a <- NULL

  this_cop_year <- as.character(data$cop_year[1])

  required_names <- switch(this_cop_year,
                           "2023" =  c("TB_STAT.N.New.Pos.T",
                                       "TB_STAT.N.KnownPos.T",
                                       "TB_STAT.N.New.Neg.T"),
                           "2024" =  c("TB_STAT.N.New.Pos.T",
                                       "TB_STAT.N.Known.Pos.T",
                                       "TB_STAT.N.New.Neg.T"),
                           stop("Unsupported COP Year"))

  if (any(!(required_names %in% names(data)))) {
    a$test_results <- data.frame(msg = "Missing data.")
    attr(a$test_results, "test_name") <- "TB Known Pos issues"
    a$msg <- "Could not analyze TB Known Pos issues due to missing data."
    return(a)
  }

  issues <- if (this_cop_year == "2023") {
    data %>%
    dplyr::mutate(
      TB_STAT.N.Total =
        TB_STAT.N.New.Pos.T
      + TB_STAT.N.KnownPos.T
      + TB_STAT.N.New.Neg.T,
      knownpos_ratio = TB_STAT.N.KnownPos.T / TB_STAT.N.Total) %>%
    dplyr::select(
      psnu, psnu_uid, age, sex, key_population,
      TB_STAT.N.Total,
      TB_STAT.N.New.Pos.T,
      TB_STAT.N.KnownPos.T,
      TB_STAT.N.New.Neg.T,
      knownpos_ratio) %>%
    dplyr::filter(!is.na(knownpos_ratio)) %>%
    dplyr::filter(
      round(knownpos_ratio, 2) > 0.75)
    } else if (this_cop_year == "2024") {
      data %>%
      dplyr::mutate(
        TB_STAT.N.Total =
          TB_STAT.N.New.Pos.T
        + TB_STAT.N.Known.Pos.T
        + TB_STAT.N.New.Neg.T,
        knownpos_ratio = TB_STAT.N.Known.Pos.T / TB_STAT.N.Total) %>%
      dplyr::select(
        psnu, psnu_uid, age, sex, key_population,
        TB_STAT.N.Total,
        TB_STAT.N.New.Pos.T,
        TB_STAT.N.Known.Pos.T,
        TB_STAT.N.New.Neg.T,
        knownpos_ratio) %>%
      dplyr::filter(!is.na(knownpos_ratio)) %>%
      dplyr::filter(
        round(knownpos_ratio, 2) > 0.75)
  }

  if (NROW(issues) > 0) {

    a$test_results <- issues
    attr(a$test_results, "test_name") <- "TB Known Pos issues"

    a$msg <-
      paste0(
        "WARNING! TB KNOWN POS Ratio > 75%: \n\n\t* ",
        crayon::bold(
          paste0(
            length(unique(issues$psnu_uid)), " of ",
            length(unique(data$psnu_uid)))),
        " PSNUs affected.\n\n\t* ",
        "Highest Known Pos ratio observed: ",
        crayon::bold(sprintf("%.1f%%",
                             100 * max(issues$knownpos_ratio))),
        "\n")
  }

  return(a)
}


#' @export
#' @title Check Data Pack data for retention < 98\% or >100\%
#'
#' @description Check data gathered from Data Pack to identify cases where
#' retention is less than the standard of 98\% or >100\%.
#'
#' @param data Analytics object to analyze
#'
#' @return a
#'
analyze_retention <- function(data) {
  a <- NULL

  this_cop_year <- as.character(data$cop_year[1])

  required_names <- switch(this_cop_year,
                           "2023" =  c("TX_CURR.T",
                                       "TX_CURR.Expected.T_1",
                                       "TX_NEW.T"),
                           "2024" =  c("TX_CURR.T",
                                       "TX_CURR.Expected.T_1",
                                       "TX_NEW.T"),
                           stop("Unsupported COP Year"))


  if (any(!(required_names %in% names(data)))) {
    a$test_results <- data.frame(msg = "Missing data.")
    attr(a$test_results, "test_name") <- "Retention rate issues"
    a$msg <- "Could not analyze Retention rate issues due to missing data."
    return(a)
  }

    analysis <- data %>%
      dplyr::group_by(psnu, psnu_uid, age, sex, key_population, cop_year) %>%
      dplyr::summarise_all(sum) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(
        TX.Retention.T =
          (TX_CURR.T)
        / (TX_CURR.Expected.T_1 + TX_NEW.T)
      ) %>%
      dplyr::filter(!is.na(TX.Retention.T)) %>%
      dplyr::mutate(TX.Retention.T = round(TX.Retention.T, 2))

    issues <- analysis %>%
      dplyr::filter(TX.Retention.T < 0.98 | TX.Retention.T > 1) %>%
      dplyr::select(
        psnu, psnu_uid, age, sex, key_population,
        TX.Retention.T,
        TX_CURR.T,
        TX_CURR.Expected.T_1,
        TX_NEW.T)

  if (NROW(issues) > 0) {

    a$test_results <- issues
    attr(a$test_results, "test_name") <- "Retention rate issues"

      national_avg <- data %>%
        dplyr::select(
          TX_CURR.T,
          TX_CURR.Expected.T_1,
          TX_NEW.T) %>%
        dplyr::summarise(
          TX_CURR.T = sum(TX_CURR.T, na.rm = TRUE),
          TX_CURR.Expected.T_1  = sum(TX_CURR.Expected.T_1, na.rm = TRUE),
          TX_NEW.T = sum(TX_NEW.T, na.rm = TRUE)) %>%
        dplyr::mutate(
          TX.Retention.T =
            (TX_CURR.T)
          / (TX_CURR.Expected.T_1 + TX_NEW.T)
        )

    a$msg <-
      paste0(
        "WARNING! RETENTION RATES <98% OR >100%: \n\n\t* ",
        crayon::bold(
          paste0(
            length(unique(issues$psnu_uid)), " of ",
            length(unique(data$psnu_uid)))),
        " PSNUs affected.\n\n\t* ",
        "Lowest retention rate observed: ",
        crayon::bold(sprintf("%.1f%%",
                             100 * min(issues$TX.Retention.T))),
        "\n\n\t* ",
        "National average retention rate: ",
        crayon::bold(sprintf("%.1f%%",
                             100 * min(national_avg$TX.Retention.T))),
        "\n")

  }

  return(a)
}


#' @export
#' @title Check Data Pack data for linkage < 95\% or >100\%
#'
#' @description Check data gathered from Data Pack to identify cases where
#' linkage rates are less than the standard of 95\% or >100\%.
#'
#' @param data Analytics object to analyze
#'
#' @return a
#'
analyze_linkage <- function(data) {
  a <- NULL


  hts_modalities <- HTS_POS_Modalities(data$cop_year[1])

  required_names <- c("TX_NEW.T", "TX_NEW.KP.T")

  if (any(!(required_names %in% names(data)))) {
    a$test_results <- data.frame(msg = "Missing data.")
    attr(a$test_results, "test_name") <- "Linkage rate issues"
    a$msg <- "Could not analyze Linkage rate issues due to missing data."
    return(a)
  }



  analysis <- data %>%
    dplyr::mutate(age = dplyr::case_when(age %in% c("50-54", "55-59", "60-64", "65+") ~ "50+",
                                         TRUE ~ age)) %>%
    dplyr::mutate(
      HTS_TST_POS.T  = rowSums(dplyr::select(., tidyselect::any_of(hts_modalities))),
      HTS_TST.Linkage.T =
        dplyr::case_when(
          HTS_TST_POS.T == 0 ~ NA_real_,
          TRUE ~
            TX_NEW.T
          / HTS_TST_POS.T
        ),
      HTS_TST.KP.Linkage.T =
        dplyr::case_when(
          HTS_TST.KP.Pos.T == 0 ~ NA_real_,
          TRUE ~
            TX_NEW.KP.T
          / HTS_TST.KP.Pos.T
        )
    )

  issues <- analysis %>%
    dplyr::filter((round(HTS_TST.Linkage.T, 2) < 0.95 | round(HTS_TST.Linkage.T, 2) > 1.0
                   | round(HTS_TST.KP.Linkage.T, 2) < 0.95 | round(HTS_TST.KP.Linkage.T, 2) > 1.0)
  # Need to analyze <01 linkage separately due to EID
                   & (age != "<01" | is.na(age))) %>%
    dplyr::select(psnu, psnu_uid, age, sex, key_population,
                  HTS_TST.Linkage.T, HTS_TST_POS.T, TX_NEW.T,
                  HTS_TST.KP.Linkage.T, HTS_TST.KP.Pos.T, TX_NEW.KP.T)

  if (NROW(issues) > 0) {

    a$test_results <- issues
    attr(a$test_results, "test_name") <- "Linkage rate issues"

    national_avg <- analysis %>%
      dplyr::filter(age != "<01" | is.na(age)) %>%
      dplyr::select(
       HTS_TST_POS.T,
       TX_NEW.T,
       HTS_TST.KP.Pos.T,
       TX_NEW.KP.T) %>%
      dplyr::summarise_all(list(sum), na.rm = TRUE) %>%
      dplyr::mutate(
       HTS_TST.Linkage.T =
         TX_NEW.T / HTS_TST_POS.T,
       HTS_TST.KP.Linkage.T =
         TX_NEW.KP.T / HTS_TST.KP.Pos.T
      )

    a$msg <-
      paste0(
        "WARNING! LINKAGE RATES <95% OR >100%: \n\n\t* ",
        crayon::bold(
          paste0(
            length(unique(issues$psnu_uid)), " of ",
            length(unique(data$psnu_uid)))),
        " PSNUs affected.\n\n\t* ",
        "Lowest linkage rate observed: ",
        crayon::bold(sprintf("%.1f%%",
                             100 * min(issues$HTS_TST.Linkage.T))),
        "\n\n\t* ",
        "National average Total Population linkage rate: ",
        crayon::bold(sprintf("%.1f%%", 100 * min(national_avg$HTS_TST.Linkage.T))),
        "\n\n\t* ",
        "National average Key Population linkage rate: ",
        crayon::bold(sprintf("%.1f%%", 100 * min(national_avg$HTS_TST.KP.Linkage.T))),
        "\n")

  }

  return(a)

}

#' @export
#' @title Check Data Pack data for low representation of HTS_INDEX_POS
#'
#' @description Check data gathered from Data Pack to identify cases where
#' the proportion of HTS_TST_POS represented by HTS_INDEX_POS is too low for the
#' given ART Coverage rate:
#'
#' \tabular{cc}{
#'   \strong{ART Coverage}\tab  \strong{HTS_INDEX_POS \% of HTS_TST_POS}\cr
#'   <70\%\tab  30\%\cr
#'   70\% <= x < 80\%\tab  50\%\cr
#'   >= 80\%\tab  75\%\cr
#' }
#'
#' @param data Analytics object to analyze
#'
#' @return a
#'
analyze_indexpos_ratio <- function(data) {

  a <- NULL

  this_cop_year <- data$cop_year[[1]]

     required_names <- c("HTS.Index.Pos.T",
                         "PLHIV.T",
                         "TX_CURR_SUBNAT.T")


  if (any(!(required_names %in% names(data)))) {
    a$test_results <- data.frame(msg = "Missing data.")
    attr(a$test_results, "test_name") <- "HTS_INDEX_POS Rate Issues"
    a$msg <- "Could not analyze HTS_INDEX_POS Rate Issues due to missing data."
    return(a)
  }

  hts_modalities <- HTS_POS_Modalities(this_cop_year)

    analysis <- data %>%
      dplyr::filter(is.na(key_population)) %>%
      dplyr::select(-age, -sex, -key_population) %>%
      dplyr::group_by(psnu, psnu_uid) %>%
      dplyr::summarise(dplyr::across(dplyr::everything(), sum), .groups = "drop") %>%
      dplyr::mutate(
        HTS_TST_POS.T = rowSums(dplyr::select(., tidyselect::any_of(hts_modalities))),
        HTS_INDEX.total = HTS.Index.Pos.T,
        HTS_TST_POS.index_rate =
          dplyr::case_when(
            HTS_TST_POS.T == 0 ~ NA_real_,
            TRUE ~ HTS_INDEX.total
            / (HTS_TST_POS.T)
          ),
        ART_coverage = dplyr::case_when(
          PLHIV.T == 0 ~ NA_real_,
          TRUE ~ TX_CURR_SUBNAT.T
          / PLHIV.T)) %>%
      dplyr::select(psnu, psnu_uid, TX_CURR_SUBNAT.T, PLHIV.T, ART_coverage,
                    HTS_INDEX.total, HTS_TST_POS.T, HTS_TST_POS.index_rate)

  issues <- analysis %>%
    dplyr::mutate(
      index_issues =
        (round(ART_coverage, 2) < 0.70 & round(HTS_TST_POS.index_rate, 2) < 0.30)
        | (round(ART_coverage, 2) >= 0.70 & round(ART_coverage, 2) < 0.80 & round(HTS_TST_POS.index_rate, 2) < 0.50)
        | (round(ART_coverage, 2) >= 0.80 & round(HTS_TST_POS.index_rate, 2) < 0.75)) %>%
    dplyr::filter(index_issues) %>%
    dplyr::mutate(
      category = dplyr::case_when(
        (ART_coverage > 1.0) | (ART_coverage == 0.0) | is.na(ART_coverage) ~ "Inspect ART Coverage",
        HTS_TST_POS.T < 10.0 ~ "Low baseline HTS_TST_POS",
        TRUE ~ "Possible under-utilization of Index testing"
      )
    )

  if (NROW(issues) > 0) {

    a$test_results <- issues
    attr(a$test_results, "test_name") <- "HTS_INDEX_POS Rate Issues"

    a$msg <-
      paste0(
        "WARNING! HTS_INDEX_POS RATES TOO LOW: \n\n\t* ",
        crayon::bold(
          paste0(
            length(unique(issues$psnu_uid)), " of ",
            length(unique(data$psnu_uid)))),
          " PSNUs affected. \n\n\t* ",
        "Likely root causes: \n\n\t\t- ",
        crayon::bold(length(issues$category[issues$category == "Inspect ART Coverage"])),
          " cases possibly due to faulty ART Coverage statistics",
        "\n\n\t\t- ",
        crayon::bold(length(issues$category[issues$category == "Low baseline HTS_TST_POS"])),
          " cases possibly due to low baseline HTS_TST_POS",
        "\n\n\t\t- ",
        crayon::bold(length(issues$category[issues$category == "Possible under-utilization of Index testing"])),
          " cases possibly due to actual HTS_INDEX_POS rate issue",
        "\n")
  }

  a

}


#' @export
#' @title Check Data Pack data for analytics concerns
#'
#' @description Check data gathered from Data Pack to identify
#' validation concerns at the PSNU level.
#'
#' @param d datapackr object
#' @param model_data_path Filepath to model data produced from most recent DATIM
#' sync.
#' @param d2_session R6 datimutils object which handles authentication with DATIM
#'
#' @return d
#'
checkAnalytics <- function(d,
                           model_data_path,
                           d2_session = dynGet("d2_default_session",
                                               inherits = TRUE)) {

  # Start running log of all warning and information messages ####
  d$keychain$model_data_path <- model_data_path
  d$info$analytics_warning_msg <- NULL
  d$info$has_analytics_error <- FALSE

  # Prepare analytics data ####
  data <- d$data$analytics %>%
    dplyr::select(psnu, psnu_uid,
                  indicator_code, age, sex, key_population, value = target_value) %>%
    dplyr::group_by(dplyr::across(-value)) %>%
    dplyr::summarise(value = sum(value)) %>%
    dplyr::ungroup()

  #Special analytics indicators needed for some checksin COP23
  if (d$info$cop_year >= "2023") {
    pmtct_eid_d <- extractRawColumnData(d, "EID", "PMTCT_EID.D.T")
    tx_curr_expected <- extractRawColumnData(d, "Cascade", c("Age", "Sex", "TX_CURR.Expected.T_1"))
    data <- data %>%
      dplyr::bind_rows(pmtct_eid_d, tx_curr_expected)
  }


  # Prepare model data ####
  #TODO: Generalize this as function
  model_data <- readRDS(d$keychain$model_data_path)

  if (!all(d$info$country_uids %in% names(model_data))) {
    missing <- country_uids[!d$info$country_uids %in% names(model_data)]
    analytics_warning_msg <-
      paste0(
        "Model data file does not have data for the following country_uids: \r\n\t* ",
        paste(missing, collapse = "\r\n\t* ")
      )

    d$info$analytics_warning_msg <- append(d$info$analytics_warning_msg,
                                           analytics_warning_msg)
  }

  category_options <- datimutils::getMetadata(end_point = "categoryOptions",
                                              "categories.id:ne:SH885jaRe0o",
                                              d2_session = d2_session)

  model_data_country <- model_data[d$info$country_uids] %>%
    dplyr::bind_rows() %>%
    tidyr::drop_na(value) %>%
    dplyr::left_join(
      getValidOrgUnits(d$info$cop_year) %>%
        dplyr::filter(country_uid %in% d$info$country_uids) %>%
        dplyr::select(psnu = name, psnu_uid = uid),
      by = c("psnu_uid" = "psnu_uid")
    ) %>%
    dplyr::left_join(dplyr::rename(category_options, age = name),
                     by = c("age_option_uid" = "id")) %>%
    dplyr::left_join(dplyr::rename(category_options, sex = name),
                     by = c("sex_option_uid" = "id")) %>%
    dplyr::left_join(dplyr::rename(category_options, key_population = name),
                     by = c("kp_option_uid" = "id")) %>%
    # Special handling for certain category options which
    # have leading zeros in the Datapack
    dplyr::mutate(age = dplyr::case_when(age == "5-9" ~ "05-09",
                                         age == "1-4" ~ "01-04",
                                         age == "1-9" ~ "01-09",
                                         age == "<1" ~ "<01",
                                         TRUE ~ age)) %>%
    dplyr::select(names(data))

  # Add model_data to analytics dataset ####
  data %<>%
    dplyr::bind_rows(model_data_country) %>%
    dplyr::arrange(dplyr::across(-value)) %>%
    tidyr::pivot_wider(names_from = indicator_code,
                       values_from = value) %>%
    addcols((d$info$schema %>%
                dplyr::filter(col_type %in% c("target", "past"),
                              sheet_name != "PSNUxIM") %>%
                dplyr::pull(indicator_code) %>%
               unique(.)),
            type = "numeric") %>%
    dplyr::mutate(dplyr::across(c(-psnu, -psnu_uid, -age, -sex, -key_population),
                     ~tidyr::replace_na(.x, 0))) %>%
    dplyr::mutate(cop_year = d$info$cop_year)

  #Apply the list of analytics checks functions
  funs <- list(
    retention = analyze_retention,
    linkage = analyze_linkage,
    index_rate = analyze_indexpos_ratio,
    pmtctknownpos_issues = analyze_pmtctknownpos,
    tbknownpos_issues = analyze_tbknownpos,
    vmmc_indeterminate_rate = analyze_vmmc_indeterminate,
    eid_coverage_2mo  = analyze_eid_2mo
  )

  analytics_checks <-  purrr::map(funs, purrr::exec, data)

  d$info$analytics_warning_msg <-
    append(
      d$info$analytics_warning_msg,
      purrr::map(analytics_checks,
                 function(x) purrr::pluck(x, "msg"))) %>%
    purrr::discard(is.null)


  d$tests <-
    append(d$tests,
           purrr::map(analytics_checks,
                      function(x) purrr::pluck(x, "test_results"))) %>%
    purrr::discard(is.null)

  # If warnings, show all grouped by sheet and issue ####
  if (!is.null(d$info$analytics_warning_msg) && interactive()) {
    options(warning.length = 8170)

    messages <-
      paste(
        paste(
          seq_len(NROW(d$info$analytics_warning_msg)),
          ": ", d$info$analytics_warning_msg
          # stringr::str_squish(gsub("\n", "", d$info$analytics_warning_msg))
        ),
        sep = "",
        collapse = "\r\n")

    key <- paste0(
      "*********************\r\n",
      "KEY:\r\n",
      "- WARNING!: Problematic, but doesn't stop us from processing your tool.\r\n",
      "- ERROR!: You MUST address these issues and resubmit your tool.\r\n",
      "*********************\r\n\r\n")

    cat(crayon::red(crayon::bold("ANALYTICS ISSUES: \r\n\r\n")))
    cat(crayon::red(key))
    cat(crayon::red(messages))
  }

  return(d)
}
pepfar-datim/datapackr documentation built on April 14, 2024, 10:35 p.m.