R/hyfe_summarize.R

Defines functions hyfe_summarize

Documented in hyfe_summarize

#' Summarize Hyfe data
#'
#' This function produces summary tables for your `hyfe` object.
#'
#' @param ho A `hyfe` object, which is generated by `process_hyfe_data()`.
#' See full details and examples in the [package vignette](https://hyfe-ai.github.io/hyfer/#hyfe_object).
#' @param cutoff_hourly This function uses sample size cutoffs to ensure that rates are
#' not swung to extremes due to insufficient monitoring. For example,
#' an hour of day with 1 cough detection but only 1 minute of monitoring would
#' produce an hourly cough rate estimate of 60 coughs per hour. Those scenarios should be avoided.
#' The first cutoff used is `cutoff_hourly`: by default, at least 30 minutes of monitoring must occur within a hour-long window of the day
#' in order for that hour to contribute to the estimation of the hourly cough rate.
#' @param cutoff_daily The second cutoff used; the default is that at least 4 hours of monitoring must occur within a day in order
#' for that day to count toward the daily cough rate.
#' @details If your `hyfe` object was processed by aggregating all users together
#' (i.e., your call `process_hyfe_data()` included the argument `by_user = FALSE`,
#' which is the function's default), the cough rates reported should be treated with caution:
#' these rates are going to be biased by users with
#' (1) a lot of monitoring time and (2) a lot of coughs.
#' To summarize Hyfe data from multiple users in a way that is truly balanced,
#' in which each user is weighted equally, you should use a `hyfe` object processed `with by_user = TRUE`.
#'
#' @details Note that the cumulative counts are unaffected by these cutoffs, only the rates.
#'
#' @return A list with two slots: `overall`, which summarizes the entire dataset by pooling users together,
#' and `users`, which summarizes each user individually.
#' The `users` slot is `NULL` when the `hyfe` object provided was processed with `by_user = FALSE`.
#' That `users` table, if available, is used to build the `overall` slot in which the mean rates (i.e., `hourly_rate` and `daily_rate`) are the average of each user’s mean rates,
#' and – importantly – the variability metrics (`hourly_var`, `hourly_sd`, `daily_var`, `daily_sd`)
#' now pertain to the variability *among users*.
#'
#' @export
#'
hyfe_summarize <- function(ho,
                           cutoff_hourly = 30,
                           cutoff_daily = 4){
  if(FALSE){
    library(hyfer)
    library(dplyr)
    data(hyfe_data)
    ho <- process_hyfe_data(hyfe_data)
    ho <- process_hyfe_data(hyfe_data, by_user = TRUE)

    cutoff_hourly = 30
    cutoff_daily = 4

  }

  summarize_core <- function(ho, cutoff_hourly, cutoff_daily){
    hoi <- ho
    #hoi <- useri

    hours <- hoi$hours
    days <- hoi$days

    if('sessions' %in% names(hoi)){
      users <- hoi$sessions$uid %>% unique %>% length
    }else{
      users <- 1
    }

    # Basic summaries
    summi <-
      data.frame(users = users,
                 seconds = hours$session_seconds %>% as.numeric %>% sum)

    summi$hours <- summi$seconds / 3600
    summi$days <- summi$hours / 24
    summi$years <- summi$days / 365
    summi$sounds = hours$peaks %>% sum
    summi$coughs = hours$coughs %>% sum

    # Rates
    hours <- hours %>% filter(session_seconds/n_uid >= cutoff_hourly*60)
    hours %>% nrow

    days <- days %>% filter(session_hours/n_uid >= cutoff_daily)
    days %>% nrow

    summi$hourly_n = hours %>% nrow
    summi$hourly_rate = hours$cough_rate %>% mean(na.rm=TRUE)
    summi$hourly_var = hours$cough_rate %>% var(na.rm=TRUE)
    summi$hourly_sd = hours$cough_rate %>% sd(na.rm=TRUE)
    summi$hourly_max = hours$cough_rate %>% max(na.rm=TRUE)

    summi$daily_n = days %>% nrow
    summi$daily_rate = days$cough_rate %>% mean(na.rm=TRUE)
    summi$daily_var = days$cough_rate %>% var(na.rm=TRUE)
    summi$daily_sd = days$cough_rate %>% sd(na.rm=TRUE)
    summi$daily_max = days$cough_rate %>% max(na.rm=TRUE)

    summi
    return(summi)
  }

  if('user_summaries' %in% names(ho)){
    # ho by user
    users <- data.frame()
    i=147
    for(i in 1:length(ho$user_summaries)){
      useri <- ho$user_summaries[[i]]
      names(useri)
      summi <- summarize_core(useri,
                              cutoff_hourly = cutoff_hourly,
                              cutoff_daily = cutoff_daily)
      summi
      idkey <- useri$id_key
      if(nrow(idkey)==0){
        idkey <- data.frame(uid=NA,name=NA,email=NA,alias=NA,cohort_id=NA)
      }
      summi <- data.frame(idkey, summi)
      users <- rbind(users, summi)
    }
    users
    users <- users[!is.na(users$uid),]
    nrow(users)

    overall <- data.frame(users = nrow(users),
                          seconds = sum(users$seconds),
                          hours = sum(users$hours),
                          days = sum(users$days),
                          years = sum(users$years),
                          sounds = sum(users$sounds),
                          coughs = sum(users$coughs),
                          hourly_n = mean(users$hourly_n,na.rm=TRUE),
                          hourly_rate = mean(users$hourly_rate, na.rm=TRUE),
                          hourly_var = var(users$hourly_rate, na.rm=TRUE),
                          hourly_sd = sd(users$hourly_rate, na.rm=TRUE),
                          hourly_max = max(users$hourly_rate, na.rm=TRUE),
                          daily_n = mean(users$daily_n, na.rm=TRUE),
                          daily_rate = mean(users$daily_rate, na.rm=TRUE),
                          daily_var = var(users$daily_rate, na.rm=TRUE),
                          daily_sd = sd(users$daily_rate, na.rm=TRUE),
                          daily_max = max(users$daily_rate, na.rm=TRUE))
    overall

  }else{
    # Simply ho object
    users <- NULL
    overall <- summarize_core(ho,
                              cutoff_hourly = cutoff_hourly,
                              cutoff_daily = cutoff_daily)
  }

  return(list(overall = overall,
              users = users
              ))

}
hyfe-ai/hyfer documentation built on Dec. 20, 2021, 5:53 p.m.