R/persistence.R

#' Calculate Persistence
#'
#' @description  The persistence rate is defined as the percentage of units (ID's)
#'  in a given period that are also found in the subsequent period. This
#'  function will either:
#'
#'  1) Calculate the persitence rate for each period
#'
#'  2) Create an indicator variable on the original dataframe that
#'   identifies whether the ID persisted (1) or not (0)
#'
#'@param df A dataframe
#'@param id Unique ID variable
#'@param rank Numeric or Ordered Factor variable that indicates the sequence of periods
#'@param period Optional pretty name of the rank variable
#'@param ... Optional grouping variables
#'@param overall Logical variable to include overall persistence rate
#'@param calculate Logical variable to indicate calculating persistence
#'    rates or indicator variable
#'
#'@return Returns either a summarised dataframe or the original dataframe
#'    with an extra column.
#'
#'@details If \code{calculate==TRUE} \code{persistence()} removes the last
#'    period from the output,
#'
#'@examples
#'
#'dataFrame <- data.frame(ID = c("A", "B", "C", "A", "B", "D", "A", "D"),
#'                        RANK = c(1, 1, 1, 2, 2, 2, 3, 3),
#'                      PERIOD = c("P1", "P1", "P1", "P2", "P2", "P2", "P3", "P3"),
#'                      GROUP = c("G1", "G2", "G1", "G1", "G2", "G3", "G1", "G3"),
#'                      stringsAsFactors = FALSE)
#'# Calculate == TRUE
#'persistence(dataFrame, ID, RANK, PERIOD)
#'
#'# Calculate == FALSE
#'dataFrame <- persistence(dataFrame, ID, RANK, PERIOD, calculate = FALSE)
#'
#'@export
persistence <- function(df, id, rank, period, ..., overall = TRUE, calculate = TRUE){

  stopifnot(!missing(df), !missing(id), !missing(rank))
  period_missing <- missing(period)

  enq_id <- enquo(id)
  enq_rank <- enquo(rank)
  enq_period <- enquo(period)
  enq_group_var <- quos(...)

  valid_rank_type <- is.numeric(rlang::eval_tidy(enq_rank, df)) | is.ordered(rlang::eval_tidy(enq_rank, df))


  if(!valid_rank_type){
    stop("Argument \"rank\" must be numeric or ordered factor")
  }

  if(is.logical(calculate)){
    calculate <- calculate
  } else {
    stop("Argument \"calculate\" must be logical (TRUE/FALSE)")
  }

  if(is.logical(overall)){
    overall <- overall
  } else{
    stop("Argument \"overall\" must be logical (TRUE/FALSE)")
  }

  df <- df %>%
        ungroup() %>%
        mutate(denseRank = dense_rank(UQ(enq_rank)))%>%
        group_by(UQ(enq_id))%>%
        arrange(denseRank)%>%
        mutate(nextrank = lead(denseRank))

  if(calculate == FALSE){

        out <- df %>%
          mutate(persistence_indicator = case_when(nextrank == denseRank + 1 ~ 1,
                                              TRUE ~ 0))%>%
          ungroup()%>%
          select(-nextrank, -denseRank)

          return(out)

  } else if (calculate == TRUE) {

        if(period_missing){
          out <- df %>%
                group_by(UQS(enq_group_var), UQ(enq_rank), denseRank)
        } else if(!period_missing){
          out <- df %>%
                group_by(UQS(enq_group_var), UQ(enq_rank), UQ(enq_period), denseRank)
        }


         out <- out %>%
                summarize(persistence_rate = sum(nextrank == (denseRank+1), na.rm = TRUE)/n(),
                          count = n()) %>%
                ungroup()%>%
                filter(denseRank != max(denseRank))%>%
                arrange(denseRank) %>%
                select(-denseRank)

         if(overall == TRUE){
          total <- df %>%
                    ungroup()%>%
                    filter(denseRank != max(denseRank))%>%
                    summarize(persistence_rate=sum(nextrank == (denseRank + 1), na.rm = TRUE)/n())%>%
                    as.numeric()

          out <- out %>%
                  mutate(overall = total)
         }
    return(out)
  }
}
christian-million/researchR documentation built on May 15, 2019, 12:45 p.m.