R/summarise_data.r

# Tools for translating extracted data into plottable formats.

#' Calculate the number of active users in a set of date ranges.
#' 
#' @param sesh_dur_data A dataframe of user_ids and the dates when they took a
#' platform action.
#' @param range_beginning A data.table with a single column of dates which 
#' denote the first days of each date range.
#' @return A data frame showing how many active users there were in each date
#' range.
#' @import data.table

calculate_active_users <- function(sesh_dur_data
                                   , range_beginning){
  copy_sesh_dur_data <- data.table::copy(sesh_dur_data)
  copy_sesh_dur_data[, rollDate:= platform_action_date]
  data.table::setkey(copy_sesh_dur_data, rollDate)

  range_beginning[, rollDate:= range_beginning_date]
  data.table::setkey(range_beginning, rollDate)

  range_beginning[
    copy_sesh_dur_data
    , .(user_id, platform_action_date, range_beginning_date)
    , roll = T
  ][

    , .(number_of_active_users = length(unique(user_id)))
    , by = range_beginning_date
  ]
}

#' Loop calculate_active_users over a list of date ranges.
#'
#' @param date_ranges A list of date ranges, of the form generated by
#' standardmetrics::define_date_ranges.
#' @return A list of data frames that show the number of active users that fall
#' within each date range.
#' @import data.table
#' @export

loop_calculate_active_users <- function(date_ranges
                                        , ...){
  lapply(date_ranges
         , FUN = function(r){
           calculate_active_users(..., r)
         })
}

#' Calculate DAU to MAU ratio by month.
#'
#' @param MAU_count_by_month A data.table: (month_beginning MAU_count)
#' @param DAU_count_by_day A data.table: (day DAU_count)  
#' @return A data frame showing the average DAU to MAU ratio for each month
#' specified by months_to_calc.
#' @import data.table
#' @export

calculate_DAU_MAU_ratio <- function(MAU_count_by_month
                                    , DAU_count_by_day){
  copy_MAU_count_by_month <- data.table::copy(MAU_count_by_month)
  copy_DAU_count_by_day <- data.table::copy(DAU_count_by_day)

  copy_MAU_count_by_month[, c('MAU_count'
                              , 'month_beginning_date'
                              , "rollMonth")
                            := 
                            .(number_of_active_users
                              , range_beginning_date
                              , range_beginning_date)]
  data.table::setkey(copy_MAU_count_by_month, rollMonth)

  copy_DAU_count_by_day[, c('DAU_count'
                            , 'day_beginning_date'
                            , "rollDate")
                          := 
                          .(number_of_active_users
                            , range_beginning_date
                            , range_beginning_date)]
  data.table::setkey(copy_DAU_count_by_day, rollDate)

  copy_MAU_count_by_month[
    copy_DAU_count_by_day 
    , .(month_beginning_date, day_beginning_date, MAU_count, DAU_count)
    , roll = T
  ][

    , .(DAU_to_MAU_ratio = mean(DAU_count)/mean(MAU_count))
    , by = month_beginning_date 
  ]
}

#' Calculate overall DAU to MAU ratio for a time interval.
#'
#' @param months_beginning A data.table (range_beginning_date) of months to
#' include in the calculation.
#' @param max_date The largest date to use in the calculation. Any of the
#' range_beginning_dates in months_beginning which is less than one month
#' before max_date is dropped.
#' @param sesh_dur_data A data.table (user_id, platform_action_date).
#' @return A number representing the overall DAU/MAU ratio for the time
#' interval (min_date, max_date). The ratio is only calculated for complete
#' months within the time interval. So for example, (jan 3, march 20) gets
#' rounded to (feb 1, feb 28).
#' @import data.table
#' @export

calculate_overall_ratio <- function(months_beginning
                                    , max_date = Sys.Date()
                                    , sesh_dur_data = sesh_dur_date){
  monthdiff <- function(date1,date2){
    length(seq(from=date1, to=date2, by='month')) - 1
  }

  monthdiff2 <- Vectorize(FUN = monthdiff, vectorize.args = "date1")

  months_beginning_copy <- data.table::copy(months_beginning)[
    , ':='(rollDate = range_beginning_date
           , month_length = c(diff(range_beginning_date), 0)
      )
  ]

  data.table::setkey(months_beginning_copy, rollDate)
  
  sesh_dur_data_copy <- data.table::copy(sesh_dur_data)[
    , rollDate := platform_action_date
  ]

  data.table::setkey(sesh_dur_data_copy, rollDate)

  months_beginning_copy[
    sesh_dur_data_copy
    , .(month_length
        , range_beginning_date
        , platform_action_date
        , user_id
      )
    , roll = T
  ][
    !is.na(month_length)
    , .(days_active = length(unique(platform_action_date))
        , month_days_possible = mean(month_length)
      )
    , by = c('user_id', 'range_beginning_date')
  ][
    , .(overall_ratio = sum(days_active)/sum(as.numeric(month_days_possible)))
  ]
}

#' Count signups per week (as total, and as perentage of signups).
#'
#' @param relative_pa_datetimes A data frame: (user_id, datetime) giving the
#' moments when a user took a platform action.
#' @param user_oneD7 A data frame: (user_id oneD7) that tells us which users
#' are 1D7s.
#' @param range_beginning A data.table with a single column of dates which 
#' denote the first days of each date range.
#' @param user_set A subset of users to restrict the calculation to. If not set
#' (ie NULL) then all users available in relative_pa_datetimes are used.
#' @return A data frame: (week_beginning, number_of_signups)
#' @import data.table
#' @export

count_signups <- function(relative_pa_datetimes
                          , user_oneD7
                          , range_beginning
                          , user_set = NULL){
  rpd <- data.table::copy(relative_pa_datetimes)
  u1 <- data.table::copy(user_oneD7)

  if(!is.null(user_set)){
    rpd <- rpd[user_id %in% user_set,,]
    u1  <- u1[user_id %in% user_set,,]
  }
 
  user_signup_date <- rpd[

    , .(signup_date = min(signup_date))
    , by = user_id
  ]

  data.table::setkey(user_signup_date, user_id)
  data.table::setkey(u1, user_id)

  range_beginning[, rollDate := range_beginning_date]
  data.table::setkey(range_beginning, rollDate)

  user_signup_week <- u1[user_signup_date][, rollDate := signup_date]

  data.table::setkey(user_signup_week, rollDate)

  range_beginning[user_signup_week, roll = T][
    
    , .(
        number_of_signups = length(unique(user_id))  
        , number_of_1d7_signups = length(unique(user_id[oneD7])) 
      )
    , by = range_beginning_date
  ][

  , c('range_beginning_date'
      , 'number_of_signups'
      , 'number_of_1d7_signups'
      , 'percent_of_1d7_signups'
    ) :=
    .(
       range_beginning_date
       , number_of_signups
       , number_of_1d7_signups
       , number_of_1d7_signups/number_of_signups
    )
  ]
}

#' Count signups per week (user classes > 2)
#'
#' @param relative_pa_datetimes A data frame: (user_id, datetime) giving the
#' moments when a user took a platform action.
#' @param user_classes A data frame: (user_id, user_class) that groups users into
#' classes.
#' @param range_beginning A data.table with a single column of dates which 
#' denote the first days of each date range.
#' @param user_set A subset of users to restrict the calculation to. If not set
#' (ie NULL) then all users available in relative_pa_datetimes are used.
#' @return A data frame: (week_beginning, number_of_signups)
#' @import data.table
#' @export

count_signups_multiclass <- function(relative_pa_datetimes
                                     , user_classes
                                     , range_beginning
                                     , user_set = NULL){

  rpd <- data.table::copy(relative_pa_datetimes)
  u1 <- data.table::copy(user_classes)

  if(!is.null(user_set)){
    rpd <- rpd[user_id %in% user_set,,]
    u1  <- u1[user_id %in% user_set,,]
  }

  user_signup_date <- rpd[

    , .(signup_date = min(signup_date))
    , by = user_id
  ]

  data.table::setkey(user_signup_date, user_id)
  data.table::setkey(u1, user_id)

  range_beginning[, rollDate := range_beginning_date]
  data.table::setkey(range_beginning, rollDate)

  user_signup_week <- u1[user_signup_date][, rollDate := signup_date]

  data.table::setkey(user_signup_week, rollDate)

  range_beginning[user_signup_week, roll = T][
    
    , .(
        number_of_signups = length(unique(user_id))  
      )
    , by = .(range_beginning_date, user_class)
  ]
}

#' Get user retention data (1d7s vs all).
#'
#' @param relative_pa_datetimes A data frame: (user_id, datetime) giving the
#' moments when a user took a platform action.
#' @param user_oneD7 A data frame: (user_id oneD7) that tells us which users
#' are 1D7s.
#' @param max_date The most recent date of activity that appears in
#' sesh_dur_date.
#' @return A data frame: (weeks_since_signup, pct_users_active)
#' @import data.table
#' @export

get_user_retention_data <- function(relative_pa_datetimes
                                    , user_oneD7
                                    , max_date = max(sesh_dur_date$platform_action_date)){
  rpd <- data.table::copy(relative_pa_datetimes)
  u1 <- data.table::copy(user_oneD7)

  data.table::setkey(rpd, user_id)
  data.table::setkey(u1, user_id)

  x <- rpd[
    , c('weeks_since_signup'
        , 'weeks_eligible'
      ) :=
      .(floor(as.numeric(relative_date)/7)
        , floor(as.numeric(max_date - signup_date)/7)  
      )
  ][u1]
  
  active_user_count <- x[

    , .(number_of_active_users = length(unique(user_id)) )
    , by = .(weeks_since_signup, oneD7)
  ]

  eligible_user_count <- x[

    , .(number_of_eligible_users = length(unique(user_id)) )
    , by = .(weeks_eligible, oneD7)
  ][
    order(desc(weeks_eligible))
    , cumulative_eligible_users := cumsum(number_of_eligible_users)
    , by = oneD7
  ]

  data.table::setkey(active_user_count, weeks_since_signup, oneD7)
  data.table::setkey(eligible_user_count, weeks_eligible, oneD7)

  active_user_count[
    eligible_user_count
    , .(percent_active = number_of_active_users/cumulative_eligible_users
        , number_of_active_users
        , cumulative_eligible_users
        , oneD7
        , weeks_since_signup
      )
  ]
}
johnchower/standardmetrics documentation built on May 19, 2019, 4:21 p.m.