hyfer: R Utilities for Interacting with Hyfe Data

Documented in cough_rate_distribution

#' Cough rate distribution
#'
#' Summarize cough rates and retrieve hourly cough counts for use in
#' histogram-production and cough distribution modeling.
#'
#' @param ho A `hyfe` object, which is generated by `process_hyfe_data()`.
#' See full details and examples in the [package vignette](https://hyfe-ai.github.io/hyfer/#hyfe_object).
#' @param min_session allows you to define the minimum amount of monitoring required
#' during a single hour in order for that hour to be included in the cough rate estimation.
#' For example, sometimes an hour of day contains only a few minutes of monitoring for a user;
#' that makes for a pretty poor estimate of that hour's cough rate.
#' The default `min_session` is `0.5` hours, or 30 minutes of monitoring within an hour.
#'
#' @details This function can take both aggregated data (`ho`) and
#' user-separated data (`ho_by_user`), but it does best with the latter.
#' It returns metrics about hourly cough rates based on an hour-by-hour analysis.
#' Similar to the inputs in `hyfe_summarize()`, the argument `min_session`
#' sets the minimum monitoring time an hour must contain in order to be
#' included in the analysis.
#'
#' A `hyfe` object is treated as user-separated when it contains a
#' `user_summaries` slot; in that case the `hours` table of each user is
#' labelled with the first `uid` of that user's `id_key`. Otherwise the
#' top-level `hours` table is used and every row is labelled with the
#' `uid` `"aggregate"`.
#'
#' @return A list with named slots:
#'
#' The slot `$overall` contains a dataframe with the mean and SD of hourly
#' cough rate for the entire dataset.
#' These metrics are based on the mean/variance for each individual user, i.e.,
#' `mean_of_mean` is the average of mean cough rates across users.
#' When using a `hyfe` object prepared with `by_user=TRUE`,
#' this means that each user is weighted equally in the summary statistics.
#' When using a `hyfe` object in which all user data are aggregated together,
#' users will be weighted according to their session time.
#'
#' The slot `$users` contains a dataframe with the mean and variance of
#' hourly cough rate, and the number of hours included, for each user.
#'
#' The slot `$rates` returns a numeric vector of hourly cough rates that
#' satisfy the minimum monitoring threshold.
#'
#' The slot `$details` returns a dataframe with all details you might need to
#' analyze these rates (essentially the hours table from a `hyfe` object,
#' restricted to the hours that satisfy the minimum monitoring threshold).
#'
#' @export
#'
cough_rate_distribution <- function(ho,
                                    min_session = 0.5){

  #=============================================================================
  # for debugging only -- not run!
  if(FALSE){
    data(hyfe_data)
    ho <- process_hyfe_data(hyfe_data)
    ho_by_user <- process_hyfe_data(hyfe_data, by_user = TRUE)
    hoi <- ho_by_user

    min_session <- .5

    cough_rate_distribution(ho)
    cough_rate_distribution(ho_by_user)$users %>% head(20)
  }
  #=============================================================================

  hoi <- ho # make safe copy of input

  # Shared by both branches below so that the filter threshold and the
  # retained columns cannot drift apart. Expects `uid` to already be a
  # column of `hours`.
  prep_hours <- function(hours){
    hours %>%
      dplyr::filter(session_hours >= min_session) %>%
      dplyr::select(uid, timestamp:n_uid, session_hours:cough_rate)
  }

  # Test to see if `ho` is user-separated
  this_by_user <- 'user_summaries' %in% names(hoi)
  if(this_by_user){
    # Data were processed separately for each user
    if(length(hoi$user_summaries) == 0){
      stop("`ho$user_summaries` is empty: there are no user data to summarize.",
           call. = FALSE)
    }

    # Build one filtered hours table per user, then bind once
    # (avoids re-copying the accumulated table on every iteration).
    hourlies <-
      lapply(hoi$user_summaries, function(useri){
        user_uid <- useri$id_key$uid[1]
        useri$hours %>%
          # mutate() (rather than `$<-`) also works for zero-row tables
          dplyr::mutate(uid = user_uid) %>%
          prep_hours()
      }) %>%
      dplyr::bind_rows()

  }else{
    # Data were processed in aggregate
    if(is.null(hoi$hours)){
      stop("`ho` has neither a `user_summaries` nor an `hours` slot; ",
           "is it a `hyfe` object from `process_hyfe_data()`?",
           call. = FALSE)
    }

    hourlies <-
      hoi$hours %>%
      dplyr::mutate(uid = 'aggregate') %>%
      prep_hours()
  }

  # user summaries: one row per uid
  user_summaries <-
    hourlies %>%
    dplyr::group_by(uid) %>%
    dplyr::summarize(rate_mean = mean(cough_rate, na.rm = TRUE),
                     rate_variance = stats::var(cough_rate, na.rm = TRUE),
                     n_hours = dplyr::n(),
                     n_uid = length(unique(uid)))

  # overall summary: statistics across the per-user rows
  overall_summary <-
    user_summaries %>%
    dplyr::summarize(mean_of_mean = mean(rate_mean, na.rm = TRUE),
                     sd_of_mean = stats::sd(rate_mean, na.rm = TRUE),
                     mean_of_variance = mean(rate_variance, na.rm = TRUE),
                     sd_of_variance = stats::sd(rate_variance, na.rm = TRUE),
                     n_hours_tot = sum(n_hours),
                     n_hours_mean = mean(n_hours, na.rm = TRUE),
                     n_uid = dplyr::n())

  # Prep return
  return_list <- list(overall = overall_summary,
                      users = user_summaries,
                      rates = as.numeric(hourlies$cough_rate),
                      details = hourlies)

  return(return_list)
}