SvenR: Tools for exploration

Documented in twa

#' Time Weighted Averages
#' @description This function computes time weighted averages of \code{value_var} across zero or more groups.
#' Averages can be computed by trapezoids or left/right endpoints. Time weighted averages can be computed
#' with raw values or relative to a reference. Calculations can be above below, or about the supplied
#' reference value. By default, they are computed as raw.

#' @details If multiple rows have the same grouping variables and time, the median value will be selected.
#' Rows with missing. If only one value within a grouping level is supplied, that value will be returned.d
#' \code{twa} also computes the total time elapsed, the largest and smallest gap between consecutive points,
#' and the number of values received, used, and omitted. When \code{time_var} is \code{POSIXct}, \code{twa}
#' will default to minutes.
#' @param df a data frame, assumed to be in a 'long form' with time values in a single column
#' @param value_var the value to be avaraged, a column within df
#' @param time_var the time variable for weighting. Either \code{POSIXct} or numeric.
#' @param ... grouping variables within df
#' @param method method to compute TWA, one of \code{c('trapezoid', 'left', 'right')}
#' @param ref a value to compute the TWA relative to
#' @param ref_dir the direction to compute the average relative to ref, one of \code{c('raw', 'above', 'below', 'about')}
#' @return a data frame containing any grouping variables, the computed twa, and some other summary statistics
#' @note Need to check out this behavior as the behavior of summarize on grouped dataframes changed with dplyr 1.0 >:(
#' @examples
#' start_date = lubridate::ymd_hms('2019-01-01 00:00:00')
#' time_dat = tibble::tibble(id = c(1, 1, 1, 1, 2, 2),
#'                           val = c(4, 6, 8, 6, 1, NA),
#'                           t = lubridate::minutes(c(0, 10, 15, 45, 0, 10)) + start_date,
#'                           t2 = 1:6)
#' twa(df = time_dat, value_var = val, time_var = t, id)
#' twa(df = time_dat, value_var = val, time_var = t2, method = 'left', ref = 7, ref_dir = 'below')
#' @author Sven Halvorson
#' @export
twa = function(df, value_var, time_var, ...,
               method = 'trapezoid', ref = 0, ref_dir = 'raw'){

  # capture the potential NSE
  value_var = dplyr::enquo(value_var)
  value_var_s = dplyr::quo_name(value_var)
  time_var = dplyr::enquo(time_var)
  time_var_s = dplyr::quo_name(time_var)
  grouping_vars = dplyr::enquos(...)
  grouping_vars_s = grouping_vars %>%
    purrr::map_chr(dplyr::quo_name)

  # Check whether we have the right datatypes
  if(!is.data.frame(df)){
    stop('df must be a data frame')
  }
  if(any(!c(value_var_s, time_var_s, grouping_vars_s) %in% colnames(df))){
    stop('supplied columns not found in df')
  }
  if(!(is.numeric(df[[time_var_s]]) | lubridate::is.POSIXct(df[[time_var_s]]))){
    stop('time_var must be numeric or POSIXct')
  }
  if(!is.numeric(df[[value_var_s]])){
    stop('value_var must be numeric')
  }
  if(length(ref) != 1 | !is.numeric(ref)){
    stop('ref incorrectly specified')
  }
  if(length(ref_dir) != 1 | !ref_dir %in% c('raw', 'above', 'below', 'about')){
    stop('ref_dir incorrectly specified')
  }
  if(ref_dir == 'raw' & ref != 0){
    stop('ref must be zero if ref_dir == "raw"')
  }



  # clean time points -------------------------------------------------------
  # so we'll first start out by making sure that:

  # 1) We're sorted & grouped properly
  df = df %>%
    dplyr::arrange(!!time_var)
  if(length(grouping_vars) != 0){
    df = df %>%
      dplyr::group_by(!!!grouping_vars) %>%
      dplyr::arrange(!!!grouping_vars)
  }

  # 2) If there are missing outputs or times, record and then delete them.
  original_readings = df %>%
    dplyr::summarize(n_meas = dplyr::n())

  miss_output = df %>%
    dplyr::mutate(missing_either = pmax(is.na(!!time_var), is.na(!!value_var))) %>%
    dplyr::summarize(n_na = sum(missing_either))
  df = df %>%
    dplyr::filter(!is.na(!!value_var),
                  !is.na(!!time_var))

  # 3) We have only one reading per time point*group
  df = df %>%
    dplyr::group_by(!!time_var, .add = TRUE) %>%
    dplyr::summarize(!!value_var := median(!!value_var)) %>%
    (dplyr::ungroup)

  # regroup if needed to count actual data used:
  if(length(grouping_vars) != 0){
    df = df %>%
      dplyr::group_by(!!!grouping_vars) %>%
      dplyr::arrange(!!!grouping_vars)
  }
  used_readings = df %>%
    dplyr::summarize(n_used = dplyr::n())


  # Compute TWA -------------------------------------------------------------

  # Now we'll go ahead and compute the TWA. We'll need to determine
  # which method of computing it we'll use and whether we're using
  # a reference value
  multiplier = ifelse(ref_dir == 'below', -1, 1)
  lead_lag = ifelse(method == 'right', dplyr::lag, dplyr::lead)
  difference_fun = ifelse(lubridate::is.POSIXct(df[[time_var_s]]),
                          function(x, y){difftime(x, y, units = 'mins')},
                          function(x, y){x - y})

  # Now create the times and values:
  df = df %>%
    dplyr::mutate(method = method,
                  dir = ref_dir,
                  time_shift = lead_lag(!!time_var),
                  time_diff = difference_fun(!!time_var, time_shift),
                  time_diff = abs(as.numeric(time_diff)),
                  lead_val = lead(!!value_var),
                  new_val = dplyr::case_when(method == 'trapezoid' ~
                                               multiplier*(0.5*(lead_val + !!value_var) - ref),
                                             dir == 'about' ~ abs(!!value_var - ref),
                                             TRUE ~ multiplier*(!!value_var - ref)),
                  weighted_val = new_val*time_diff,
                  weighted_val = dplyr::case_when(dir == 'raw' ~ weighted_val,
                                                  TRUE ~ pmax(weighted_val, 0))) %>%
    # now summarize the times and values. We'll use a maximum
    # in the case that there is only one reading
    dplyr::summarize(total_time = sum(time_diff, na.rm = TRUE),
                     total_weight = sum(weighted_val, na.rm = TRUE),
                     max_measure = max(!!value_var),
                     max_gap = max(time_diff, na.rm = TRUE),
                     max_gap = replace(max_gap, is.infinite(max_gap), 0),
                     min_gap = min(time_diff, na.rm = TRUE),
                     min_gap = replace(min_gap, is.infinite(min_gap), 0)) %>%
    dplyr::mutate(twa = total_weight/total_time,
                  twa = dplyr::case_when(total_time > 0 ~ twa,
                                         TRUE ~ max_measure)) %>%
    # nifty trick to change the select based on whether grouping_vars were supplied
    purrr::when(length(grouping_vars) == 0 ~ dplyr::select(., twa, total_time, max_gap, min_gap),
                ~ dplyr::select(., !!!grouping_vars, twa, total_time, max_gap, min_gap))

  # attach other measures and export ----------------------------------------

  # if we have no grouping variables:
  bind_fun = ifelse(length(grouping_vars) == 0, dplyr::bind_cols, function(x,y){suppressMessages(dplyr::full_join(x,y))})

  # join summaries and return
  df  %>%
    bind_fun(original_readings) %>%
    bind_fun(used_readings) %>%
    bind_fun(miss_output) %>%
    dplyr::ungroup()

}