R/check_targets.R

#' Check the targets and seed for compatibility before running \code{ip_fit()}.
#'
#' In IPFs, common subtotals in each target must be equal, otherwise the IPF will not converge on a single result.
#' This function checks the common dimensions over each target combination for compatibility.
#' Further, this function checks the IPF seed, if provided, for compabitibilty with the targets.
#' If the seed, when summed over each dimension in a given target, contains any missing values or zero values, the target must also have a zero or missing value.
#'
#' @param targets A list of data frames containing subtotal targets. All totals in each target should be equal.
#' Series supplied in each target will shape the final data frame.
#' @param seed An optional data frame of seed values containing all final dimensions expected as output of \code{ip_fit()}.
#' Leave as \code{NULL} if the targets will be run using \code{ip_expand()} or will be used with a generic seed.
#' @param target.value.names The names of the series in \code{targets} containing subtotals to scale. Can be string or array of strings. Defaults to "value".
#' @param seed.value.name The name of the series in \code{seed} containing values. Defaults to "value".
#' @param max.error The maximum total absolute difference allowed between final scaled values and targets.
#' @return Messages in the console and an output list of  data frames containing any incompitable line items in the seed and targets.
#' @examples
#' tar1 <- data.frame(x = letters[1:2], value = c(50, 50))
#' tar2 <- data.frame(y = letters[3:5], value = c(20, 40, 40))
#' tar3 <- data.frame(x = c(rep(letters[1], 5), rep(letters[2], 5)),
#'                    z = rep(letters[6:10], 2),
#'                    value = c(5, 10, 15, 20, 0, 10, 10, 10, 10, 10))
#'
#' tar.list <- list(tar1, tar2, tar3)
#' check_targets(tar.list)
#'
#' #This will find errors with the targets
#' tar3b <- data.frame(x = c(rep(letters[1], 5), rep(letters[2], 5)),
#'                    z = rep(letters[6:10], 2),
#'                    value = c(0, 10, 15, 20, 0, 10, 10, 10, 10, 10))
#'
#' tar.list <- list(tar1, tar2, tar3b)
#' seed <- ipfitr::ip_create_seed(tar.list)
#' check_targets(tar.list, seed)
#'
#' #This will find errors with the targets and the seed
#' seed2 <- seed %>% mutate(value = ifelse(x == "a" & z == "g", 0, value))
#' check_targets(tar.list, seed2)
#'
#' @export
check_targets <- function(targets, seed = NULL,
                          target.value.names = "value", seed.value.name = "value",
                          max.error = 0.01) {

  #Warnings
  if(is.null(targets) | !is.list(targets) | !is.data.frame(targets[[1]])) {
    stop("Targets must be a list of data frames.")
  }

  #Initialize
  if(length(targets) == 1 & is.null(seed)) {
    stop("Check_targets() looks for compatibility issues between targets and the seed. If only one target is supplied, you must also supply a seed.
         Otherwise, no need to check. :)")
  } else if(length(targets) == 1) {
    message("Beginning check... one target and seed provided")
  } else {
    message(paste0("Beginning check... ", length(targets), " targets provided",
                   if(is.null(seed)){"."} else {", as well as a seed."}))
  }

  ###
  # Check the targets against each other ----
  ###
  num.tars <- length(targets)
  tar.list <- targets

  names(tar.list) <- paste0("Tar", 1:num.tars)

  if(length(tar.list) <= 1){
    target.checks.op <- list()
  } else {
    tar.combo <- data.frame(TarA = c(), TarB = c(), stringsAsFactors = FALSE)
    for(i in num.tars:2){
      for(j in (i-1):1){
        tar.combo <- tar.combo %>% bind_rows(
          data.frame(TarA = paste0("Tar", j), TarB = paste0("Tar", i), stringsAsFactors = FALSE)
        )
      }
    }
    tar.combo <- tar.combo %>%
      arrange(TarA, TarB)

    combine_tars_a <- function(tara, tarb, list_of_tars){
      TarA <- list_of_tars[[tara]]
      TarB <- list_of_tars[[tarb]]

      common.dims <- names(TarA)[names(TarA) %in% names(TarB)]
      common.dims <- common.dims[!(common.dims %in% target.value.names)]

      names(TarA)[names(TarA) %in% target.value.names] <- ".value"
      names(TarB)[names(TarB) %in% target.value.names] <- ".value"

      combo.tar <- TarA %>%
        mutate(.dftype = "Checker") %>% #This cleans up joins for targets with no common series
        group_by_(.dots = as.list(c(".dftype", common.dims))) %>%
        summarize(TarA = sum(.value, na.rm=TRUE)) %>%
        ungroup() %>%
        full_join(
          TarB %>%
            mutate(.dftype = "Checker")  %>%
            group_by_(.dots = as.list(c(".dftype", common.dims))) %>%
            summarize(TarB = sum(.value, na.rm=TRUE)) %>%
            ungroup() ,
          by = c(".dftype", common.dims)
        ) %>%
        mutate(Check_value = (TarA - TarB),
               Check_trigger = abs(Check_value) > max.error,
               Check_trigger = ifelse(is.na(Check_trigger), FALSE, Check_trigger),
               Mismatch_trigger = is.na(TarA) | is.na(TarB)) %>%
        select(-.dftype)

      names(combo.tar)[names(combo.tar) == "TarA"] <- tara
      names(combo.tar)[names(combo.tar) == "TarB"] <- tarb

      return(combo.tar)
    }

    target.checks <- purrr::pmap(list(a = tar.combo$TarA, b = tar.combo$TarB),
                                 function(a, b){combine_tars_a(a, b, tar.list)})

    target.checks.names <- tar.combo %>% mutate(.name = paste(TarA, " & ", TarB)) %>% pull
    names(target.checks) <- target.checks.names

    #Only keep dfs with violations
    target.checks.op <- target.checks[purrr::map_lgl(target.checks, function(x){any(x$Check_trigger, na.rm=TRUE)})]

    #Look for targets with values in some but not others
    target.mismatch.op <- target.checks[purrr::map_lgl(target.checks, function(x){any(x$Mismatch_trigger, na.rm=TRUE)})]

    if(length(target.checks.op) == 0 && length(target.mismatch.op) == 0) {
      message("\nTargets are good! No issues here.\n===================================")
    } else {
      message("\nAt least one violation has been found within the targets. See output.\n===================================")
    }
  } #End target check


  ####
  # Check the targets against the seed ----
  ####

  if(is.null(seed)){ seed.checks.op <- list()} else {

    message("Checking each target against the seed... This will look for 0 or NA values over seed subtotals and compare them with the targets.\nIf the seed has a 0 subtotal, then the matching target should also be 0 (IPF cannot scale zero to a non-zero).")

    check_seed_a <- function(TarA, SeedA){
      dims.in.tar <- names(TarA)
      dims.in.tar <- dims.in.tar[!(dims.in.tar %in% c(target.value.names))]

      names(SeedA)[names(SeedA) == seed.value.name] <- ".value"
      names(TarA)[names(TarA) %in% target.value.names] <- ".target"

      seed.collapse <- SeedA %>%
        mutate_if(is.factor, as.character) %>%
        group_by_(.dots = as.list(dims.in.tar)) %>%
        summarize(.seed = sum(.value, na.rm = TRUE)) %>%
        ungroup() %>%
        full_join(TarA %>% mutate_if(is.factor, as.character), by = dims.in.tar) %>%
        mutate(.seed = round(.seed, 3),
               .target = round(.target, 3)) %>%
        mutate(Check_trigger = (.seed == 0 | is.na(.seed)) & (.target > 0 | !is.na(.target)),
               Mismatch_trigger = is.na(.seed) | is.na(.target))

    }

    seed.checks <- purrr::map(tar.list, function(x){check_seed_a(x, seed)})

    names(seed.checks) <- paste(names(seed.checks), "& Seed")

    #Only keep dfs with violations
    seed.checks.op <- seed.checks[purrr::map_lgl(seed.checks, function(x){any(x$Check_trigger)})]
    seed.mismatch.op <- seed.checks[purrr::map_lgl(seed.checks, function(x){any(x$Mismatch_trigger)})]

    #Output message
    if(length(seed.checks.op) == 0 && length(seed.mismatch.op) == 0){
      message("\nThe seed and targets line up! No issues here.\n===================================")
    } else {
      message("\n!!! Zero subtotals found in seed where targets have values. IPF will not converge. See output.\n===================================")
    }

  } #End seed check

  check.op <- purrr::map(c(target.checks.op, target.mismatch.op, seed.checks.op), function(x){
    x %>% filter(Check_trigger | Mismatch_trigger) %>% select(-Check_trigger, -Mismatch_trigger)
  })

  return(check.op)

}
ryantimpe/ipfitr documentation built on May 14, 2019, 8:55 a.m.