R/ggcc_preprocess.R

#' @title ggcc_preprocess
#'
#' @description Function to parse data for use by ggcc.
#'
#' @param df Dataframe to parse.
#' @param filter_str String to filter df.
#' @param clre Bool to indicate if CC is for typical CLRE. Defaults to TRUE.
#' @return Returns parsed dataframe.
#'
#' @export
ggcc_preprocess <- function(
  df,
  filter_str,
  clre = TRUE
) {

  ccdf <- df %>%
    dplyr::filter_(filter_str) %>%
    dplyr::mutate(
      monyr = as.character(YYYYMM)
    ) %>%
    {if (clre) add_clre_tech(., write = FALSE) else .} %>%
    {if (clre) join_clre_limits(.) else .} %>%
    {if (clre) dplyr::group_by(., monyr, alert_limit) else dplyr::group_by(., monyr)} %>%
    dplyr::summarise(complaints = n()) %>%
    dplyr::right_join(
      y = data.frame(
        monyr = strftime(c(Sys.time() - months(13:2), last_month()),
                         format = '%Y%m'),
        stringsAsFactors = FALSE),
      by = 'monyr') %>%
    dplyr::ungroup() %>%
    {if (clre) dplyr::arrange(., alert_limit) else .} %>%
    {if (clre) {
      dplyr::mutate(
        .,
        alert_limit = zoo::na.locf(alert_limit),
        complaints = replace(complaints, is.na(complaints), 0)
      )
      } else {
        dplyr::mutate(
          .,
          complaints = replace(complaints, is.na(complaints), 0)
        )
      }
    } %>%
    dplyr::arrange(monyr) %>%
    dplyr::mutate(range = c(NA, abs(diff(complaints))))

  avgr <- round(mean(ccdf$range, na.rm = TRUE), digits = 1)
  lclr <- 0
  uclr <- round(3.267*avgr, digits = 1)

  pseudo_sd <- avgr / 1.128
  avg <- round(mean(ccdf$complaints), digits = 1)
  lcl <- round(mean(ccdf$complaints) - 3*pseudo_sd, digits = 1)
  ucl <- round(mean(ccdf$complaints) + 3*pseudo_sd, digits = 1)
  lzb <- round(mean(ccdf$complaints) - 2*pseudo_sd, digits = 1)
  uzb <- round(mean(ccdf$complaints) + 2*pseudo_sd, digits = 1)
  lzc <- round(mean(ccdf$complaints) - pseudo_sd, digits = 1)
  uzc <- round(mean(ccdf$complaints) + pseudo_sd, digits = 1)

  # test2_func <- function(x) {
  #   side <- (x - avg) > 0
  #   nums <- rle(side)
  #   if (any(nums$lengths >= 9)) {
  #     ind <- c(1:length(nums$lengths))[nums$lengths >= 9]
  #     run <- nums$lengths[ind]
  #     if (ind == 1) {
  #       return(rep(c(0, 2, 0), c(8, 1, length(x) - 9)))
  #     } else {
  #       before <- sum(nums$lengths[1:(ind - 1)])
  #       return(rep(c(0, 2, 0), c(run + before - 1, 1, (length(x) - run - before))))
  #     }
  #   } else {
  #     return(rep(0, length(x)))
  #   }
  # }
  #
  # test3_func <- function(x) {
  #   diffs <- diff(x) > 0
  #   nums <- rle(diffs)
  #   if (any(nums$lengths >= 6)) {
  #     ind <- c(1:length(nums$lengths))[nums$lengths >= 6]
  #     run <- nums$lengths[ind]
  #     if (ind == 1) {
  #       return(rep(c(0, 3, 0), c(5, 1, length(x) - 6)))
  #     } else {
  #       before <- sum(nums$lengths[1:(ind - 1)])
  #       return(rep(c(0, 3, 0), c(before+run, 1, (length(x) - run - before - 1))))
  #     }
  #   } else {
  #     return(rep(0, length(x)))
  #   }
  # }
  #
  # test5_func <- function(x) {
  #   ret <- rep(0, length(x))
  #   for(i in c(3:length(x))) {
  #     test5 <- abs(x[(i-2):i] - avg) >= 2*pseudo_sd
  #     if (sum(test5) >= 2 & (all((x[(i-2):i] - avg)[test5] > 0) | all((x[(i-2):i] - avg)[test5] < 0))) {
  #       ret[(i-2):i][test5] <- 5
  #     }
  #   }
  #   return(ret)
  # }
  #
  # test6_func <- function(x) {
  #   ret <- rep(0, length(x))
  #   for(i in c(5:length(x))) {
  #     test6 <- abs(x[(i-4):i] - avg) >= pseudo_sd
  #     if (sum(test6) >= 4 & (all((x[(i-4):i] - avg)[test6] > 0) | all((x[(i-4):i] - avg)[test6] < 0))) {
  #       ret[(i-4):i][test6] <- 6
  #     }
  #   }
  #   return(ret)
  # }
  #
  # test8_func <- function(x) {
  #   ret <- rep(0, length(x))
  #   for (i in c(8:length(x))) {
  #     test8 <- sum(abs(x[(i-7):i] - avg) >= 2 * pseudo_sd)
  #     if (test8 == 8) {
  #       ret[i] <- 8
  #     }
  #   }
  #   return(ret)
  # }
  #
  # get_test <- function(x) {
  #   start <- ifelse(clre, 5, 4)
  #   inds <- as.numeric(x[start:length(x)]) > 0
  #   if (sum(inds) > 0) {
  #     test <- min(as.numeric(x[4:length(x)][inds]), na.rm = TRUE)
  #   } else {
  #     test <- 0
  #   }
  #
  #   return(test)
  # }
  #
  # red_lines <- function(x) {
  #   range <- data.frame(test = c(1,2,3,5,6,8), range = c(0, 0, 5, 2, 4, 7))
  #   reds <- rep(0, length(x))
  #   if (x[1] > 0) {
  #     reds[1] <- 1
  #   }
  #   for (i in 2:length(x)) {
  #     if (x[i] > 0) {
  #       back <- range$range[range$test == x[i]]
  #       # reds[(i-back):i] <- 1
  #       reds[i] <- 1
  #     }
  #   }
  #
  #   x <- rle(reds)
  #   for (i in 1:length(x$values)) {
  #     if (x$values[i] == 1) {
  #       x$values[i] <- i
  #     }
  #   }
  #   reds <- rep(x$values, x$lengths)
  #   return(reds)
  # }

  ccdf %<>%
    # dplyr::mutate(
    #   test1 = ifelse(abs(complaints - avg) >= 3 * pseudo_sd, 1, 0)
    # ) %>%
    # dplyr::mutate(
    #   test = apply(., 1, get_test)
    # ) %>%
    {if (clre) {
      dplyr::mutate(
        .,
        # count_label = ifelse(test > 0, 1, 0),
        # reds = red_lines(test),
        alert_lim = alert_limit
      )
      } else {
      # dplyr::mutate(
      #   .,
      #   count_label = ifelse(test > 0, 1, 0),
      #   reds = red_lines(test)
      # )
        .
      }
    } %>%
    {if (clre) {
      dplyr::select(
        .,
        monyr,
        complaints,
        range,
        # test,
        # count_label,
        # reds,
        alert_lim
      )
    } else {
      dplyr::select(
        .data = .,
        monyr,
        complaints,
        range
        # test,
        # count_label,
        # reds
      )
    }}

  return(ccdf)
}
kimjam/srms documentation built on May 20, 2019, 10:21 p.m.