R/define_var_period.R

#' Define a tibble period variable (in days)
#'
#' @param event_start_tibble A tibble with a date column representing the start of a period and \code{population_member_id} column.
#' @param event_stop_tibble A tibble with a date column representing the stop of a period and \code{population_member_id} column.
#' @param event_start_var The name of the date column in \code{event_start_tibble}.
#' @param event_stop_var The name of the date column in \code{event_stop_tibble}.
#' @param population_member_id A unique identifier for each 'member' of the population of interest. Must be present in both tibbles.
#' @param period_name The desired name for the period column to be created in the new tibble.
#' @param exclusions A vector of \code{Interval} objects, typically created from \code{list_holidays_and_weekends()}. Defaults is a call to \code{list_holidays_and_weekends()}.
#' @param period_target A numeric value to serve as a threshold value. If provided, a second tibble will be calculated indicating whether or not the period value is less than or equal to the threshold value. Defaults is \code{NA}.
#' @param overwrite_negative A binary value indicating whether or not negative interval values should be set to \code{NA}. Defaults is \code{TRUE}.
#' @export

define_var_period <- function(event_start_tibble
                                 ,event_stop_tibble
                                 ,event_start_var
                                 ,event_stop_var
                                 ,id
                                 ,period_name
                                 ,exclusions = list_holidays_and_weekends()
                                 ,period_target = NA
                                 ,overwrite_negative = TRUE
                              ,jitter = Sys.getenv("OLIVER_REPLICA_JITTER")){

  # set names for the tibble
  dots1 <- setNames(list(lazyeval::interp(~ lubridate::interval(x, y)
                                          ,x = as.name(event_start_var)
                                          ,y = as.name(event_stop_var)))
                    ,'interval_raw')

  # initialize period_dat tibble by joining the two event tibbles together
  suppressMessages(
  period_dat <- inner_join(event_start_tibble
                           ,event_stop_tibble) %>%
    select_(id
            ,event_start_var
            ,event_stop_var) %>%
    mutate_(., .dots = dots1) %>%
    as_data_frame()
  )

  # add a column to period_dat based on the exclusions param
  period_dat$interval_delta <- NA

  suppressMessages(
  for (i in 1:nrow(period_dat)){
    period_dat$interval_delta[i] <- sum(exclusions %within% period_dat$interval_raw[i])
  }
  )

  # set names for the second table
  dots2 <- setNames(list(lazyeval::interp(~ x
                                ,x = quote(period_all))
                         ,lazyeval::interp(~ y
                                 ,y = quote(period_days)))
                    ,c(period_name, paste0(period_name, '_days'))
  )

  # put everything together
  suppressMessages(
  period_dat_value <- period_dat %>%
    mutate(period_all = as.period(interval_raw, unit = 'days') - days(interval_delta)
           ,period_days = as.numeric(period_all, 'days')
           ,met_target = if (is.na(period_target)) TRUE else period_days <= period_target
           ,negative_interval = int_length(interval_raw) < 0
    ) %>%
    mutate_(., .dots = dots2) %>%
    # jitter the values if TRUE
    # this will also jitter the values below
    mutate(period_days = ifelse(overwrite_negative & negative_interval, NA, period_days)
           ,period_days = if(jitter){period_days + rbinom(n = n()
                                                          ,size = period_target
                                                          ,prob = runif(1))} else period_days) %>%
    select_(id
            ,quote(period_days))
  )

  suppressMessages(
  period_dat_target <- period_dat %>%
    mutate(period_all = as.period(interval_raw, unit = 'days') - days(interval_delta)
           ,period_days = as.numeric(period_all, 'days')
           ,met_target = if (is.na(period_target)) TRUE else period_days <= period_target
           ,negative_interval = int_length(interval_raw) < 0
    ) %>%
    mutate_(., .dots = dots2) %>%
    mutate(met_target = ifelse(overwrite_negative & negative_interval, NA, met_target)) %>%
    select_(id
            ,quote(met_target)
    )
  )

  period_dat_quality <- period_dat_value %>%
    mutate(valid_data = ifelse(is.na(period_days), FALSE, TRUE)) %>%
    select(-period_days)

  return(list(period_dat_target
              ,period_dat_value
              ,period_dat_quality)
  )

}
mienkoja/oliveR documentation built on May 6, 2019, 6:01 p.m.