hisreportr: hisreportr: Automating MSF OCA EMR reporting

Documented in merge_to_episode

#' Merge to episode
#' 
#' Merge information back to treatment episodes based on patient ID numbers and dates.
#'
#' @param df_episode data frame containing episode data - each row is expected to have
#' a unique treatment episode
#' @param df_info data frame containing baseline information related to specific 
#' treatment episodes. Each row should have a unique collection date for each patient
#' @param key_var bare name of the key variable linking the two data frames - normally patient_id
#' @param start_date bare name of start episode variable in df_episode
#' @param eval_date bare name of collection date variable in df_info
#' @param post_start boolean flag to indicate whether information is entered after start
#' of episode - e.g. treatment outcome. 
#' 
#' @importFrom assertthat assert_that
#' @importFrom dplyr left_join mutate filter select arrange group_by ungroup
#' @importFrom rlang .data
#'
#' @return data frame with information attached to each treatment episode. Information
#' is selected based on date of collection such that the closest information prior to 
#' the episode starting is selected
#'
#' @examples
#' \dontrun{
#' merge_to_episode(df_episode = start,
#' df_info = reg, 
#' key_var = id, 
#' start_date = sdate,
#' eval_date = reg_date)
#' }
#' 

merge_to_episode <- function(df_episode,
                             df_info,
                             key_var,
                             start_date,
                             eval_date, 
                             post_start = FALSE) {

  . <- NULL
  
  # check args
  assertthat::assert_that(is.data.frame(df_episode),
                          is.data.frame(df_info), 
                          is.logical(post_start))
  
# convert specific function args to strings for left_join
  by_var <- deparse(substitute(key_var))
  start_by_var <- deparse(substitute(start_date))
  
# remove additional variables from df_episode to avoid duplication during
  # final merge
  selected_episode <- df_episode[c(by_var, start_by_var)]

# merge two data frames  
    m <- dplyr::left_join(selected_episode, 
                        df_info,
                        by = by_var) 
    
    # filter any information if before or after start date
 
    if (post_start) {
      m <- dplyr::filter(m, {{ start_date }} <= {{ eval_date }})
    } else {
      m <- dplyr::filter(m, {{ start_date }} >= {{ eval_date }})   
    }
    
    m <- m %>% 
      # count how many days prior to start of episode information collected
    dplyr::mutate(days = abs(as.numeric( {{ start_date}} - {{ eval_date }}))) %>% 
      
    dplyr::arrange({{ key_var }}, {{ start_date }}, .data$days) %>% 
      
    dplyr::group_by( {{ key_var }}, {{ start_date }}) %>% 
      # keep information which is closest to episode start
    dplyr::slice(1) %>% 
    dplyr::ungroup() %>% 
      # merge remaining information back to original episode data
    dplyr::left_join(df_episode, ., by = c(by_var, start_by_var)) %>% 
    dplyr::select(-.data$days)

# check that output has same rows as df_episode
    assertthat::assert_that(nrow(m) == nrow(df_episode))
  
# check that no new rows have been generated
    no_new_rows <- identical(unique(m[c(by_var, start_by_var)]),
                unique(df_episode[c(by_var, start_by_var)]))
    
    # generate warning if episode details don't match between input and output
    if (! no_new_rows) {
      warning("Treatment episode (key_var and start date) are different between input and output")
    }
    
    m
}