R/cp_functions.R

Defines functions find_cp_linreg find_cp_cusum find_cp_pettitt find_cp_loss_fun

Documented in find_cp_cusum find_cp_linreg find_cp_loss_fun find_cp_pettitt

####################################################
#These are new, alternative changepoint functions
#to use when identifying a changepoint and the
#expected number of missed visits.They each take
#as an input the output of an object from
#count_prior_events_truven, and output a summary
#of the model fit, miss_bins, and other info
###################################################

#' Identify change point using a loss function method and find expected SSD visits/calculate misses. This is done
#' by fitting a linear model for data prior to i, with i ranging from period 0 to max period - 2. For a specified loss function,
#' the first local minima is used used to identifiy the optimal change point
#'
#' @title find_cp_loss_fun
#' @param data A dataframe output by count_prior_events_truven
#' @param var_name A character string of outcome for which to apply analysis
#' @param return_miss_only Logical to only return miss information
#' @param week_period Logical to incorporate a "day of the week" effect into
#' the linear model. Note this is only sensible for one-day period aggregation.
#' @param loss_function The loss function used to identify the change point (mean squared error (MSE), root mean squared error (RMSE),
#' mean absolute error (MAE), mean squared log error (MSLE), or root mean squared log error (RMSLE)
#' @param return_loss_fun_tab_only Logical to only return loss function table that includes estimates for all loss functions available
#' @param specify_cp Set a specific change point you want to use instead of searching for optimal change point. Enter a postive integer value
#' repersenting the days before the index on which you you want to specify the change point. (e.g. 100 would be 100 days before the index)
#' @return A list containing miss information, changepoint information, predictions,
#' the model itself, and a plot of the middle finger curve and model.
#'
#' @examples
#' cp_result_pettit <- final_time_map %>%
#' filter(days_since_dx >= -180) %>%
#' count_prior_events_truven(event_name = "any_ssd", start_day = 1, by_days = 1) %>%
#' find_cp_loss_fun(var_name = "n_miss_visits", return_miss_only = FALSE, week_period=TRUE, loss_function = "MAE")
#'
#' @export
#'
find_cp_loss_fun <- function(data, var_name = "n_miss_visits", return_miss_only = FALSE, return_loss_fun_tab_only = FALSE,
                            week_period=FALSE, specify_cp = NULL, loss_function = "RMSE"){

  #Require some necessary packages
  require(tidyverse)

  if (!loss_function %in% c("MSE", "RMSE", "MAE", "MSLE", "RMSLE")){
    stop('The method selected is not avaiable, please select from one of the following:
         "MSE", "RMSE", "MAE", "MSLE", or "RMSLE"')
  }

  #Reorder data for easy time series usage
  cp_out <- arrange(data, -period)

  #Create a dummy column that is variable of interest
  cp_out$var_name <- cp_out[[var_name]]

  # fit linear model by period from 0 to max period - 2
  loss_fun_table <- tibble()
  data <- cp_out %>% arrange(period) %>%
    mutate(week_period = as.factor(period %% 7))
  for (i in data %>% filter(period < max(period) -1) %>% .$period){
    dataset <- data %>% filter(period > i)

    if(week_period){
      pred <- predict.lm(lm(var_name ~ period + week_period, data = dataset))
    } else {
      pred <- predict.lm(lm(var_name ~ period, data = dataset))
    }

    out <- dataset %>% mutate(pred = pred)

    suppressWarnings(
    out1 <- tibble(period = i,
                   RMSE = RMSE(out$pred, out$var_name),
                   MSE = MSE(out$pred, out$var_name),
                   MAE = MAE(out$pred, out$var_name),
                   MSLE = MSLE(out$pred, out$var_name),
                   RMSLE = RMSLE(out$pred, out$var_name))
    )

    loss_fun_table <- bind_rows(loss_fun_table, out1)
  }

  if (return_loss_fun_tab_only){
   return(loss_fun_table)
  }

  #Identify CP and find which period it corresponds to
  if (is.null(specify_cp)){
    local_min <- which(diff(sign(diff(loss_fun_table[[loss_function]])))==-2)+1
    cp <- local_min[1]
  } else {
    cp <- specify_cp
  }

  #Extract data for the model, all periods after cp
  model_data <- cp_out %>% filter(period > cp) %>% mutate(period_neg=-1*period) %>%
    mutate(week_period = as.factor(period %% 7))

  #Fit model, different if request periodicity
  if(week_period){
    model <- lm(var_name ~ period_neg + week_period, data = model_data)
  } else{
    model <- lm(var_name ~ period_neg, data = model_data)
  }

  #Get prediction covariates and make predictions
  if(week_period){
    pred_vars <- cp_out %>% mutate(period_neg = -1*period) %>%
      mutate(week_period = as.factor(period %% 7)) %>% select(var_name, period_neg, week_period)
  } else{
    pred_vars <- cp_out %>% mutate(period_neg = -1*period) %>% select(var_name, period_neg)
  }

  model_pred_intervals <- predict.lm(model, pred_vars, interval = "prediction", level = 0.90)

  #Collect all data needed for cp_out in this function

  #First get miss bins and statistics. Hard code num_miss and num_miss_upper_int to be
  #floored at 0
  miss_bins <- data.frame(period=cp_out$period,
                          Y=cp_out$var_name,
                          pred1= model_pred_intervals[, "fit"],
                          lower_int_pred1 = model_pred_intervals[, "lwr"],
                          upper_int_pred1 = model_pred_intervals[, "upr"],
                          pred=cp_out$var_name,
                          num_miss = cp_out$var_name - model_pred_intervals[, "fit"],
                          num_miss_upper_int = cp_out$var_name - model_pred_intervals[, "upr"]) %>%
    mutate(num_miss = num_miss*(num_miss>=0)) %>%
    mutate(num_miss_upper_int = num_miss_upper_int*(num_miss_upper_int>=0))

  #Filter to only times beyond CP
  miss_bins <- miss_bins %>% filter(period <= cp)
  if (return_miss_only){
    return(miss_bins)
  }

  #Output data about the changepoint itself
  change_point <- data.frame(Y = cp_out %>% filter(period == cp) %>% .$var_name,
                               t = which(cp_out$period == cp),
                               period = cp)

  #Output data about predictions
  pred <- data.frame(period=cp_out$period,
                     Y=cp_out$var_name,
                     t = 1:nrow(cp_out),
                     pred1= model_pred_intervals[, "fit"],
                     lower_int_pred1 = model_pred_intervals[, "lwr"],
                     upper_int_pred1 = model_pred_intervals[, "upr"],
                     pred=cp_out$var_name,
                     num_miss = cp_out$var_name - model_pred_intervals[, "fit"],
                     num_miss_upper_int = cp_out$var_name - model_pred_intervals[, "upr"])

  cp_plot <- pred %>% mutate(t = t-max(t)) %>% ggplot2::ggplot(aes(t, pred)) +
    ggtitle(paste0("Loss function = ", loss_function, " & week effect = ", week_period))+
    ggplot2::geom_line(aes(y = pred1), color = "red",size=.8) +
    geom_ribbon(aes(ymin = lower_int_pred1, ymax = upper_int_pred1), fill = "red", alpha = 0.2)+
    ggplot2::geom_line(size=.8) +
    ggplot2::geom_point(aes(t,Y),size=.8) +
    ggplot2::theme_light() +
    ggplot2::geom_vline(xintercept = change_point$period*-1 , color="blue", size=.8)

  #Compile output
  cp_out <- list(miss_bins=miss_bins,
                 change_point=change_point,
                 pred=pred,
                 model=model,
                 cp_plot=cp_plot)


  return(cp_out)

}


#A function to identify the changepoint using the Pettitt method. Requires:
#data: an output of count_prior_events_truven
#var_name: a character string of the var we are modeling for
#return_miss_only: a logical to only return miss visit counts
#week_period: a logical to fit the linear before model with factors for days until index mod 7,
#i.e. to have a crude form of week periodicity

#' Identify changepoint using pettitt method, and find expected SSD visits/calculate misses
#' by fitting a linear model before the changepoint
#'
#' @param data A dataframe output by count_prior_events_truven
#' @param var_name A character string of outcome for which to apply analysis
#' @param return_miss_only Logical to only return miss information
#' @param week_period Logical to incorporate a "day of the week" effect into
#' the linear model. Note this is only sensible for one-day period aggregation.
#' @param specify_cp Set a specific change point you want to use instead of searching for optimal change point. Enter a postive integer value
#' repersenting the days before the index on which you you want to specify the change point. (e.g. 100 would be 100 days before the index)
#' @param auto_reg Logical that determines whether expected counts use a time-series framework that incorporates autoregression.
#' If week_period is FALSE, will use a 7-day seasonality component. If week_period is TRUE, will use an additive indicator
#' @return A list containing miss information, changepoint information, predictions,
#' the model itself, and a plot of the middle finger curve and model.
#' @examples
#' cp_result_pettit <- final_time_map %>%
#' filter(days_since_dx >= -180) %>%
#' count_prior_events_truven(event_name = "any_ssd", start_day = 1, by_days = 1) %>%
#' find_cp_pettitt(var_name = "n_miss_visits", return_miss_only = FALSE, week_period=TRUE)
#' @export
find_cp_pettitt <- function(data, var_name = "n_miss_visits", return_miss_only = FALSE,
                            week_period=FALSE, specify_cp = NULL, auto_reg = FALSE){

  #Require some necessary packages
  require(trend)
  require(changepoint)
  require(tidyverse)
  require(forecast)

  #Reorder data for easy time series usage
  cp_out <- arrange(data, -period)

  #Create a dummy column that is variable of interest
  cp_out$var_name <- cp_out[[var_name]]


  #Convert to a time series object
  t_series <- ts(cp_out$var_name, start = min(-1*cp_out$period),
                   frequency = 1)

  if (is.null(specify_cp)){
    #Identify CP and find which period it corresponds to
    cp_est <- pettitt.test(t_series)$estimate[[1]]
    cp <- cp_out$period[pettitt.test(t_series)$estimate[[1]]]
  } else {
    cp <- specify_cp
  }

  #Extract data for the model, all periods after cp
  model_data <- cp_out %>% filter(period > cp) %>% mutate(period_neg=-1*period) %>%
    mutate(week_period = as.factor(period %% 7))

  #Fit model, different if request periodicity or autoregressive

  if(auto_reg){
    if(week_period){
      #Convert to time series object
      t_series <- ts(model_data$var_name,
                     frequency = 7)
      #See how far out we need to go in forecasting
      h <- nrow(cp_out) - nrow(model_data)

      #Set up covariates to use for prediction
      pred_data <- cp_out %>% filter(period <= cp) %>% mutate(period_neg=-1*period) %>%
        mutate(week_period = as.factor(period %% 7))

      #Calculate xreg matrices by hand, as function can't handle factors
      model_xreg <- model_data %>% mutate(num_period = period %% 7) %>%
        mutate(day1 = as.numeric(num_period == 1)) %>%
        mutate(day2 = as.numeric(num_period == 2)) %>%
        mutate(day3 = as.numeric(num_period == 3)) %>%
        mutate(day4 = as.numeric(num_period == 4)) %>%
        mutate(day5 = as.numeric(num_period == 5)) %>%
        mutate(day6 = as.numeric(num_period == 6)) %>%
        mutate(trend = period_neg) %>%
        select(day1,day2,day3,day4,day5,day6,trend) %>%
        as.matrix()
      pred_xreg <- pred_data %>% mutate(num_period = period %% 7) %>%
        mutate(day1 = as.numeric(num_period == 1)) %>%
        mutate(day2 = as.numeric(num_period == 2)) %>%
        mutate(day3 = as.numeric(num_period == 3)) %>%
        mutate(day4 = as.numeric(num_period == 4)) %>%
        mutate(day5 = as.numeric(num_period == 5)) %>%
        mutate(day6 = as.numeric(num_period == 6)) %>%
        mutate(trend = period_neg) %>%
        select(day1,day2,day3,day4,day5,day6,trend) %>%
        as.matrix()

      #Fit Arima model with additive effect for week
      model <- Arima(t_series, c(1,0,1), xreg=model_xreg, method = "ML")
      #Get forecast
      pred <- forecast(model, h=h, level = .9, xreg = pred_xreg)
      pred_mean <- c(pred$fitted,pred$mean)
      pred_upper <- c(fitted(model) + 1.96*sqrt(model$sigma2),pred$upper)
      pred_lower <- c(fitted(model) - 1.96*sqrt(model$sigma2),pred$lower)


    } else{
    #Convert to time series object
    t_series <- ts(model_data$var_name,
                   frequency = 7)
    #See how far out we need to go in forecasting
    h <- nrow(cp_out) - nrow(model_data)

    #Fit Arima model
    model <- Arima(t_series, c(1,0,1), seasonal = list(order = c(1,0,0)), method = "ML")
    #Get forecast
    pred <- forecast(model, h=h, level = .9)
    pred_mean <- c(pred$fitted,pred$mean)
    pred_upper <- c(fitted(model) + 1.96*sqrt(model$sigma2),pred$upper)
    pred_lower <- c(fitted(model) - 1.96*sqrt(model$sigma2),pred$lower)
    }



  } else{
    if(week_period){
      model <- lm(var_name ~ period_neg + week_period, data=model_data)
    } else{
      model <- lm(var_name ~ period_neg, data=model_data)
    }

    #Get prediction covariates and make predictions
    if(week_period){
      pred_vars <- cp_out %>% mutate(period_neg=-1*period) %>%
        mutate(week_period = as.factor(period %% 7)) %>% select(var_name,period_neg,week_period)
    } else{
      pred_vars <- cp_out %>% mutate(period_neg=-1*period) %>% select(var_name,period_neg)
    }

    model_pred_intervals <- predict.lm(model, pred_vars, interval = "prediction", level = 0.90)

    #Extract data
    pred_mean <- model_pred_intervals[, "fit"]
    pred_lower <- model_pred_intervals[, "lwr"]
    pred_upper <- model_pred_intervals[, "upr"]

  }

  #Collect all data needed for cp_out in this function

  #First get miss bins and statistics. Hard code num_miss and num_miss_upper_int to be
  #floored at 0
  miss_bins <- data.frame(period=cp_out$period,
                          Y=cp_out$var_name,
                          pred1= pred_mean,
                          lower_int_pred1 = pred_lower,
                          upper_int_pred1 = pred_upper,
                          pred=cp_out$var_name,
                          num_miss = cp_out$var_name - pred_mean,
                          num_miss_upper_int = cp_out$var_name - pred_upper) %>%
    mutate(num_miss = num_miss*(num_miss>=0)) %>%
    mutate(num_miss_upper_int = num_miss_upper_int*(num_miss_upper_int>=0))

  #Filter to only times beyond CP
  miss_bins <- miss_bins %>% filter(period<=cp)
  if (return_miss_only){
    return(miss_bins)
  }

  if (is.null(specify_cp)){
    #Output data about the changepoint itself
    change_point <- data.frame(Y = cp_out$var_name[cp_est],
                               t = cp_est,
                               period = cp)
  } else {
    #Output data about the changepoint itself
    change_point <- data.frame(Y = cp_out %>% filter(period == cp) %>% .$var_name,
                               t = which(cp_out$period == cp),
                               period = cp)
  }

  #Output data about predictions
  pred <- data.frame(period=cp_out$period,
                     Y=cp_out$var_name,
                     t = 1:nrow(cp_out),
                     pred1= pred_mean,
                     lower_int_pred1 = pred_lower,
                     upper_int_pred1 = pred_upper,
                     pred=cp_out$var_name,
                     num_miss = cp_out$var_name - pred_mean,
                     num_miss_upper_int = cp_out$var_name - pred_upper)


  cp_plot <- pred %>% mutate(t = t-max(t)) %>% ggplot2::ggplot(aes(t, pred)) +
    ggtitle(paste0("Method = Pettitt, week effect = ", week_period,", auto_reg=",auto_reg))+
    ggplot2::geom_line(aes(y = pred1), color = "red",size=.8) +
    geom_ribbon(aes(ymin = lower_int_pred1, ymax = upper_int_pred1), fill = "red", alpha = 0.2)+
    ggplot2::geom_line(size=.8) +
    ggplot2::geom_point(aes(t,Y),size=.8) +
    ggplot2::theme_light() +
    ggplot2::geom_vline(xintercept = change_point$period*-1 , color="blue", size=.8)


  #Compile output
  cp_out <- list(miss_bins=miss_bins,
                 change_point=change_point,
                 pred=pred,
                 model=model,
                 cp_plot=cp_plot)

  return(cp_out)


}


#A function to identify the changepoint using the CUSUM method. Requires:
#data: an output of count_prior_events_truven
#var_name: a character string of the var we are modeling for
#return_miss_only: a logical to only return miss visit counts
#week_period: a logical to fit the linear before model with factors for days until index mod 7,
#i.e. to have a crude form of week periodicity


#' Identify changepoint using CUSUM method, and find expected SSD visits/calculate misses
#' by fitting a linear model before the changepoint
#'
#' @param data A dataframe output by count_prior_events_truven
#' @param var_name A character string of outcome for which to apply analysis
#' @param return_miss_only Logical to only return miss information
#' @param week_period Logical to incorporate a "day of the week" effect into
#' the linear model. Note this is only sensible for one-day period aggregation.
#' @param specify_cp Set a specific change point you want to use instead of searching for optimal change point. Enter a postive integer value
#' repersenting the days before the index on which you you want to specify the change point. (e.g. 100 would be 100 days before the index)
#' @param auto_reg Logical that determines whether expected counts use a time-series framework that incorporates autoregression.
#' If week_period is FALSE, will use a 7-day seasonality component. If week_period is TRUE, will use an additive indicator
#' @return A list containing miss information, changepoint information, predictions,
#' the model itself, and a plot of the middle finger curve and model.
#' @examples
#' cp_result_cusum <- final_time_map %>%
#' filter(days_since_dx >= -180) %>%
#' count_prior_events_truven(event_name = "any_ssd", start_day = 1, by_days = 1) %>%
#' find_cp_cusum(var_name = "n_miss_visits", return_miss_only = FALSE, week_period=TRUE)
#' @export
find_cp_cusum <- function(data, var_name = "n_miss_visits", return_miss_only = FALSE,
                            week_period=FALSE, specify_cp = NULL, auto_reg = FALSE){

  #Require some necessary packages
  require(trend)
  require(changepoint)
  require(tidyverse)
  require(forecast)

  #Reorder data for easy time series usage
  cp_out <- arrange(data, -period)

  #Create a dummy column that is variable of interest
  cp_out$var_name <- cp_out[[var_name]]

  #Convert to a time series object
  t_series <- ts(cp_out$var_name, start = min(-1*cp_out$period),
                 frequency = 1)

  if (is.null(specify_cp)){
  #Identify CP and find which period it corresponds to
    cp_est <- suppressWarnings( cpts(cpt.mean(t_series,pen.value=1,penalty='None',test.stat='CUSUM')) )
    cp <- cp_out$period[cp_est]
  } else {
    cp <- specify_cp
  }

  #Extract data for the model, all periods after cp
  model_data <- cp_out %>% filter(period > cp) %>% mutate(period_neg=-1*period) %>%
    mutate(week_period = as.factor(period %% 7))


  #Fit model, different if request periodicity or autoregressive

  if(auto_reg){
    if(week_period){
      #Convert to time series object
      t_series <- ts(model_data$var_name,
                     frequency = 7)
      #See how far out we need to go in forecasting
      h <- nrow(cp_out) - nrow(model_data)

      #Set up covariates to use for prediction
      pred_data <- cp_out %>% filter(period <= cp) %>% mutate(period_neg=-1*period) %>%
        mutate(week_period = as.factor(period %% 7))

      #Calculate xreg matrices by hand, as function can't handle factors
      model_xreg <- model_data %>% mutate(num_period = period %% 7) %>%
        mutate(day1 = as.numeric(num_period == 1)) %>%
        mutate(day2 = as.numeric(num_period == 2)) %>%
        mutate(day3 = as.numeric(num_period == 3)) %>%
        mutate(day4 = as.numeric(num_period == 4)) %>%
        mutate(day5 = as.numeric(num_period == 5)) %>%
        mutate(day6 = as.numeric(num_period == 6)) %>%
        mutate(trend = period_neg) %>%
        select(day1,day2,day3,day4,day5,day6,trend) %>%
        as.matrix()
      pred_xreg <- pred_data %>% mutate(num_period = period %% 7) %>%
        mutate(day1 = as.numeric(num_period == 1)) %>%
        mutate(day2 = as.numeric(num_period == 2)) %>%
        mutate(day3 = as.numeric(num_period == 3)) %>%
        mutate(day4 = as.numeric(num_period == 4)) %>%
        mutate(day5 = as.numeric(num_period == 5)) %>%
        mutate(day6 = as.numeric(num_period == 6)) %>%
        mutate(trend = period_neg) %>%
        select(day1,day2,day3,day4,day5,day6,trend) %>%
        as.matrix()

      #Fit Arima model with additive effect for week
      model <- Arima(t_series, c(1,0,1), xreg=model_xreg, method = "ML")
      #Get forecast
      pred <- forecast(model, h=h, level = .9, xreg = pred_xreg)
      pred_mean <- c(pred$fitted,pred$mean)
      pred_upper <- c(fitted(model) + 1.96*sqrt(model$sigma2),pred$upper)
      pred_lower <- c(fitted(model) - 1.96*sqrt(model$sigma2),pred$lower)


    } else{

      #Convert to time series object
      t_series <- ts(model_data$var_name, #start = min(-1*model_data$period),
                     frequency = 7)
      #See how far out we need to go in forecasting
      h <- nrow(cp_out) - nrow(model_data)

      #Fit Arima model
      model <- Arima(t_series, c(1,0,1), seasonal = list(order = c(1,0,0)), method = "ML")
      #Get forecast
      pred <- forecast(model, h=h, level = .9)
      pred_mean <- c(pred$fitted,pred$mean)
      pred_upper <- c(fitted(model) + 1.96*sqrt(model$sigma2),pred$upper)
      pred_lower <- c(fitted(model) - 1.96*sqrt(model$sigma2),pred$lower)
    }



  } else{
  if(week_period){
    model <- lm(var_name ~ period_neg + week_period, data=model_data)
  } else{
    model <- lm(var_name ~ period_neg, data=model_data)
  }

  #Get prediction covariates and make predictions
  if(week_period){
    pred_vars <- cp_out %>% mutate(period_neg=-1*period) %>%
      mutate(week_period = as.factor(period %% 7)) %>% select(var_name,period_neg,week_period)
  } else{
    pred_vars <- cp_out %>% mutate(period_neg=-1*period) %>% select(var_name,period_neg)
  }

  model_pred_intervals <- predict.lm(model, pred_vars, interval = "prediction", level = 0.90)

  #Extract data
  pred_mean <- model_pred_intervals[, "fit"]
  pred_lower <- model_pred_intervals[, "lwr"]
  pred_upper <- model_pred_intervals[, "upr"]

  }

  #Collect all data needed for cp_out in this function

  #First get miss bins and statistics. Hard code num_miss and num_miss_upper_int to be
  #floored at 0
  miss_bins <- data.frame(period=cp_out$period,
                          Y=cp_out$var_name,
                          pred1= pred_mean,
                          lower_int_pred1 = pred_lower,
                          upper_int_pred1 = pred_upper,
                          pred=cp_out$var_name,
                          num_miss = cp_out$var_name - pred_mean,
                          num_miss_upper_int = cp_out$var_name - pred_upper) %>%
    mutate(num_miss = num_miss*(num_miss>=0)) %>%
    mutate(num_miss_upper_int = num_miss_upper_int*(num_miss_upper_int>=0))

  #Filter to only times beyond CP
  miss_bins <- miss_bins %>% filter(period<=cp)
  if (return_miss_only){
    return(miss_bins)
  }

  if (is.null(specify_cp)){
    #Output data about the changepoint itself
    change_point <- data.frame(Y = cp_out$var_name[cp_est],
                               t = cp_est,
                               period = cp)
  } else {
    #Output data about the changepoint itself
    change_point <- data.frame(Y = cp_out %>% filter(period == cp) %>% .$var_name,
                               t = which(cp_out$period == cp),
                               period = cp)
  }

  #Output data about predictions
  pred <- data.frame(period=cp_out$period,
                     Y=cp_out$var_name,
                     t = 1:nrow(cp_out),
                     pred1= pred_mean,
                     lower_int_pred1 = pred_lower,
                     upper_int_pred1 = pred_upper,
                     pred=cp_out$var_name,
                     num_miss = cp_out$var_name - pred_mean,
                     num_miss_upper_int = cp_out$var_name - pred_upper)


  cp_plot <- pred %>% mutate(t = t-max(t)) %>% ggplot2::ggplot(aes(t, pred)) +
    ggtitle(paste0("Method = CUSUM, week effect = ", week_period,", auto_reg=",auto_reg))+
    ggplot2::geom_line(aes(y = pred1), color = "red",size=.8) +
    geom_ribbon(aes(ymin = lower_int_pred1, ymax = upper_int_pred1), fill = "red", alpha = 0.2)+
    ggplot2::geom_line(size=.8) +
    ggplot2::geom_point(aes(t,Y),size=.8) +
    ggplot2::theme_light() +
    ggplot2::geom_vline(xintercept = change_point$period*-1 , color="blue", size=.8)


  #Compile output
  cp_out <- list(miss_bins=miss_bins,
                 change_point=change_point,
                 pred=pred,
                 model=model,
                 cp_plot=cp_plot)

  return(cp_out)

}


#This function is the old find_change_point function, that finds changepoint using
#linear regression with a changepoint

#' Find the change point in count data using linear regression models
#'
#' @param data A dataset of visit counts
#' @param var_name The name of the count variable to find the change-point for
#' @param method The method used to fit curves before and after the changepoint. Options include "lm",
#' "lm_quad", "lm_cube", "quad", "cube", "exp", "spline"
#' @param eval_criteria The evaluation criteria used to find change points
#' @param return_miss_only Logical argument to only return the tibbles of miss visit counts
#' @param specify_cp Set a specific change point you want to use instead of searching for optimal change point. Enter a postive integer value
#' repersenting the days before the index on which you you want to specify the change point. (e.g. 100 would be 100 days before the index)
#' @param week_period Logical to incorporate a "day of the week" effect into
#' the linear model. Note this is only sensible for one-day period aggregation.
#' @examples
#' cp_result_original <- final_time_map %>%
#' count_prior_events_truven(event_name = "any_ssd", start_day = 1, by_days = 1) %>%
#' find_cp_linreg(var_name="n_miss_visits", method="lm_cube")
#'
#' @export
find_cp_linreg <- function(data,var_name="n_miss_visits",method="lm",eval_criteria="AIC", return_miss_only=FALSE,
                           specify_cp = NULL, week_period=FALSE){

  data$var_name <- data[[var_name]]

  data <- data %>%
    dplyr::arrange(dplyr::desc(period)) %>%
    dplyr::mutate(Y=var_name,
                  t=dplyr::row_number())

  if (is.null(specify_cp)) {
    if (method=="spline"){
      fits = tibble::tibble(cp=3:max(data$t)) %>%
        dplyr::mutate(res=purrr::map(cp,
                                     ~fit_cp_spline(data = data, x=.) )) %>%
        tidyr::unnest(res)
    }

    if (method=="lm"){
      fits = tibble::tibble(cp=2:max(data$t)) %>%
        dplyr::mutate(res=purrr::map(cp, ~fit_cp_lm(data = data, x=.) )) %>%
        tidyr::unnest(res)
    }

    if (method=="lm_cube"){
      fits = tibble::tibble(cp=2:max(data$t)) %>%
        dplyr::mutate(res=purrr::map(cp, ~fit_cp_lm_cube(data = data, x=., periodicity=week_period) )) %>%
        tidyr::unnest(res)
    }

    if (method=="cube"){
      fits = tibble::tibble(cp=2:max(data$t)) %>%
        dplyr::mutate(res=purrr::map(cp, ~fit_cp_cube(data = data, x=.) )) %>%
        tidyr::unnest(res)
    }

    if (method=="quad"){
      fits = tibble::tibble(cp=2:max(data$t)) %>%
        dplyr::mutate(res=purrr::map(cp, ~fit_cp_quad(data = data, x=.) )) %>%
        tidyr::unnest(res)
    }

    if (method=="lm_quad"){
      fits = tibble::tibble(cp=2:max(data$t)) %>%
        dplyr::mutate(res=purrr::map(cp, ~fit_cp_lm_quad(data = data, x=.) )) %>%
        tidyr::unnest(res)
    }

    if (method=="exp"){
      fits = tibble::tibble(cp=2:max(data$t)) %>%
        dplyr::mutate(res=purrr::map(cp, ~fit_cp_exp(data = data, x=.) )) %>%
        tidyr::unnest(res)
    }

    if (eval_criteria %in% c("r.squared","adj.r.squared")){
      change_t <- fits$cp[fits[eval_criteria]==max(fits[eval_criteria])]
    } else {
      change_t <- fits$cp[fits[eval_criteria]==min(fits[eval_criteria])]
    }
  } else{
    change_t <- data$t[data$period == specify_cp]

    if (method=="spline"){
      fits = fit_cp_spline(data = data, x=change_t)
    }

    if (method=="lm"){
      fits = fit_cp_lm(data = data, x=change_t)
    }

    if (method=="lm_cube"){
      fits = fit_cp_lm_cube(data = data, x=change_t, periodicity=week_period)
    }

    if (method=="cube"){
      fits = fit_cp_cube(data = data, x=change_t)
    }

    if (method=="quad"){
      fits = fit_cp_quad(data = data, x=change_t)
    }

    if (method=="lm_quad"){
      fits = fit_cp_lm_quad(data = data, x=change_t)
    }

    if (method=="exp"){
      fits = fit_cp_exp(data = data, x=change_t)
    }
  }

  if (method=="spline"){
    out <- fit_cp_spline(data = data, x=change_t,return_all = TRUE)
  }

  if (method=="quad"){
    out <- fit_cp_quad(data = data, x=change_t,return_all = TRUE)
  }

  if (method=="cube"){
    out <- fit_cp_cube(data = data, x=change_t,return_all = TRUE)
  }

  if (method=="lm"){
    out <- fit_cp_lm(data = data, x=change_t,return_all = TRUE)
  }

  if (method=="lm_quad"){
    out <- fit_cp_lm_quad(data = data, x=change_t,return_all = TRUE)
  }

  if (method=="lm_cube"){
    out <- fit_cp_lm_cube(data = data, x=change_t,return_all = TRUE, periodicity=week_period)
  }

  if (method=="exp"){
    out <- fit_cp_exp(data = data, x=change_t,return_all = TRUE)
  }


  change_point <- data %>%
    dplyr::filter(t==change_t) %>%
    dplyr::select(Y,t,period)

  cp_plot <- out$pred %>% mutate(t = t-max(t)) %>% ggplot2::ggplot(aes(t, pred)) +
    ggtitle(paste0("Change point method = ", method)) +
    ggplot2::geom_line(aes(y = pred1), color = "red", size=.8) +
    ggplot2::geom_line(size=.8) +
    ggplot2::geom_point(aes(t,Y), size=.8) +
    ggplot2::theme_light() +
    ggplot2::geom_vline(xintercept = change_point$period*-1 , color="blue", size=.8) +
    ggplot2::geom_ribbon(aes(x=t, ymin = pred_low, ymax = pred_high),
                         fill = "red", alpha = 0.2, inherit.aes = FALSE)

  miss_bins <- out$pred  %>%
    dplyr::mutate(num_miss=pred-pred1) %>%
    dplyr::filter(num_miss>0) %>%
    dplyr::select(period,Y,pred,pred1,num_miss)

  miss_stats <- miss_bins %>%
    dplyr::summarise(miss_visits_est=sum(num_miss),
                     miss_visits_obs=sum(Y-pred1))

  if (return_miss_only==TRUE){
    out <- miss_bins
  } else {
    out <- c(list(change_point = change_point,
                  cp_plot = cp_plot,
                  cp_fits = fits),
             out,
             list(miss_bins = miss_bins,
                  miss_stats = miss_stats))
  }

  return(out)

}

#Wrapper function for all of these, so that they can be called easier. Should encompass
#all options past and present, and be 100% backward compatible


#' Find the change point in count data. This is a backwards-compatible wrapper function to
#' find the changepoint, which calls other methods.
#'
#' @param data A dataset of visit counts, output by count_prior_events_truven
#' @param var_name The name of the count variable to find the change-point for
#' @param method The method used to find changepoint. Options include "lm",
#' "lm_quad", "lm_cube", "quad", "cube", "exp", "spline", "pettitt", "cusum",
#' "MSE", "RMSE", "MAE", "MSLE", "RMSLE"
#' @param eval_criteria The evaluation criteria used to find change points, if using a
#' linear regression method
#' @param week_period Logical to incorporate a "day of the week" effect into the linear mode.
#' Note this is only sensible for one-day period aggregation
#' @param return_miss_only Logical argument to only return the tibbles of miss visit counts
#' @param specify_cp Set a specific change point you want to use instead of searching for optimal change point. Enter a postive integer value
#' repersenting the days before the index on which you you want to specify the change point. (e.g. 100 would be 100 days before the index)
#' @param auto_reg Logical that determines whether expected counts use a time-series framework that incorporates autoregression.
#' Will automatically fit periodicity, automatically setting week_period to TRUE. Only relevant for cusum and pettitt methods
#' @return A list containing tibbles of information about missed visits. These tibbles change
#' depending on the method used, but all contain miss predictions and a plot
#'
#' @examples
#'
#' cp_result_original <- final_time_map %>%
#' count_prior_events_truven(event_name = "any_ssd", start_day = 1, by_days = 1) %>%
#' find_change_point(var_name="n_miss_visits", method="lm_cube")
#'
#' @export
find_change_point <- function(data,var_name="n_miss_visits",method,eval_criteria="AIC",
                              return_miss_only=FALSE, week_period=FALSE, specify_cp = NULL,
                              auto_reg=FALSE){

  if(method %in% c("lm","lm_quad","lm_cube", "quad", "cube", "exp", "spline")){
    output <- find_cp_linreg(data, var_name=var_name,method=method,eval_criteria = eval_criteria,
                             return_miss_only = return_miss_only,
                             specify_cp = specify_cp, week_period=week_period)
    return(output)
  } else if(method=="pettitt"){
    output <- find_cp_pettitt(data, var_name=var_name, return_miss_only = return_miss_only,
                              week_period = week_period, specify_cp = specify_cp, auto_reg = auto_reg)
    return(output)
  } else if(method=="cusum"){
    output <- find_cp_cusum(data, var_name=var_name, return_miss_only = return_miss_only,
                            week_period = week_period, specify_cp = specify_cp, auto_reg = auto_reg)
    return(output)
  } else if(method %in% c("MSE", "RMSE", "MAE", "MSLE", "RMSLE")){
    output <- find_cp_loss_fun(data, var_name = var_name, return_miss_only = return_miss_only,
                               loss_function = method,
                               return_loss_fun_tab_only = FALSE,
                               week_period = week_period,
                               specify_cp = specify_cp)
    return(output)
  } else{
    cat("Error: No valid method was supplied. Returning ")
    return(NULL)
  }

}
aarmiller/delayDX documentation built on July 11, 2021, 4:01 p.m.