R/TURN.R

Defines functions clean_TURN_output calc_det_rate calc_path_TURN calc_TURN calc_n_sites FA1.5_adjust calc_path_Y_3wma add_missing_dates_2_site_path calc_Y_2prime calc_n_active_adj arrange_epidate dly_2_epi_path dly_2_epi site_2_dly_path site_2_dly mean_weighted_TUR create_epidates

Documented in calc_det_rate calc_n_sites calc_path_TURN calc_TURN clean_TURN_output

# Script started 3/25/2020 by martin holdrege

# functions to support TURN calculation (mostly used in the 02_TUR...script)

# other functions are found in the "functions.R" script



# namespace imports for use by roxygen2 -----------------------------------
# functions from these packages freely available for functions in turnr
# to use

#' @import dplyr
#' @importFrom stats weighted.mean
#' @importFrom utils data


create_epidates <- function(date_vec) {
  # args:
  #   date_vec--date vector the covers whole length of time of interest
  # returns:
  #   dataframe with epiweek, epiyear, and epidate (date of middle of epiweek)
  #   covers date range of date_vec (plus a week on either side)
  stopifnot(
    class(date_vec) == "Date"
  )

  # slightly expanded range of dates
  # using longer range of dates so have epidate be middle date of week for even
  # partial epiweeks at beginning and end
  dates_longer <- tibble(date = seq(from = min(date_vec)-14,
                                    to = max(date_vec) +14, by = 1))
  epi_dates <- dates_longer %>%
    mutate(epiweek = lubridate::epiweek(date),
           epiyear = lubridate::epiyear(date)) %>%
    group_by(.data$epiyear, .data$epiweek) %>%
    summarize(epidate = mean(.data$date)) %>%
    ungroup()
  # get rid of first/last week b/ those may be partial (ie epidate not
  # actually middle of the epiweek)
  out <- epi_dates %>%
    filter(epidate != min(epidate), epidate != max(epidate))
  out
}

#  mean weighted by TUR -------------------------------------------------------

mean_weighted_TUR <- function(x, wt, na.rm = TRUE) {
  # args:
  #   x- numeric vector to take weighted mean of
  #   wt--numeric vector (to be used as weight)--usually TUR or similar
  # returns:
  #   weighted mean
  wt <- wt/sum(wt, na.rm = TRUE)
  out <- weighted.mean(x = x, w = wt, na.rm = na.rm)
  out
}

# testing
if (FALSE) {
  x <- 1:10
  wt1 <- rep(2, 10)
  wt2 <- c(rep(0, 9), 1)
  # should be true
  mean_weighted_TUR(x, wt1) == 5.5
  mean_weighted_TUR(x, wt2) == 10
}



# site to daily TUR ---------------------------------------------------------

site_2_dly <- function(df, group_vars) {
  # args:
  #   df--site level daily data, with columns including "date",
  #       "InstrumentVersion", "daily_TUR_rp", "daily_TUR_all", "n_active"
  #   group_vars--un-quoted names of columns to group by e.g
  #       group_vars = vars(date, InstrumentVersion)
  # returns:
  #   data frame with daily_TUR_rp, daily_TUR_all and n_seerial_3mo summed
  #   to the level of the grouping variable
  required_cols <- c("date", "InstrumentVersion", "daily_TUR_rp",
                     "daily_TUR_all", "n_active")

  check_cols(df, required_cols)
  check_quosures(group_vars)

  out <- df %>%
    # this right join may not be necessary--just precaution so no missing dates
    group_by(!!!group_vars) %>%
    summarize(daily_TUR_rp = sum(daily_TUR_rp, na.rm = TRUE), # rp only
              # rp and non:
              daily_TUR_all = sum(daily_TUR_all, na.rm = TRUE),
              n_active = sum(n_active, na.rm = TRUE)) %>%
    ungroup()
  out
}

# site to daily pathogens ----------------------------------------------------

site_2_dly_path <- function(df, group_vars) {
  # args:
  #   df--site level daily data of pathogens, with columns including "date",
  #        "InstrumentVersion", "TargetName",  "daily_count"
  #   group_vars--un-quoted names of columns to group by e.g
  #       group_vars = vars(date, InstrumentVersion)
  # returns:
  #   data frame with daily_count which is the sum of daily count within
  #   groups
  required_cols <- c("date", "InstrumentVersion", "TargetName",
                     "daily_count")

  check_cols(df, required_cols)
  check_quosures(group_vars)


  out <- df %>%
    # this right join may not be necessary--just precaution so no missing dates
    group_by(!!!group_vars) %>%
    summarize(daily_count = sum(daily_count, na.rm = TRUE)) %>%
    ungroup()
  out
}


# -------------------------------------------------------------------------

dly_2_epi <- function(df, group_vars) {
  # args:
  #   df--dataframe of daily TUR data that
  #   group_vars--unquoted names of columns to group by wrapped in vars() function
  # returns:
  #   df with "daily_TUR_rp", "daily_TUR_all", "n_active" cols aggregated to epiweek
  required_cols <- c("daily_TUR_rp", "daily_TUR_all", "n_active", "date")

  check_cols(df, required_cols)
  check_quosures(group_vars)

  epi_dates <- create_epidates(df$date) # df of epidates/weeks
  out <- df %>%
    mutate(epiweek = lubridate::epiweek(date),
           epiyear = lubridate::epiyear(date)) %>%
    group_by(!!!group_vars) %>%
    summarize(epi_TUR_rp = sum(daily_TUR_rp, na.rm = TRUE),
              epi_TUR_all = sum(daily_TUR_all, na.rm = TRUE),
              n_active = mean(n_active, na.rm = TRUE),
              epi_n_days = lu(date)) %>%
    ungroup() %>%
    left_join(epi_dates, by = c("epiweek", "epiyear"))
  out
}

# daily to epi week pathogen data---------------------------------------------

dly_2_epi_path <- function(df, group_vars) {
  # args:
  #   df--dataframe of daily counts by pathogens (daily_count)
  #   group_vars--unquoted names of columns to group by wrapped in vars() function
  # returns:
  #   df with epicount (replacing input of dail_count). which is aggregated to epiweek
  required_cols <- c("date", "InstrumentVersion", "TargetName",
                     "daily_count")

  check_cols(df, required_cols)
  check_quosures(group_vars)

  epi_dates <- create_epidates(df$date) # df of epidates/weeks
  out <- df %>%
    mutate(epiweek = lubridate::epiweek(date),
           epiyear = lubridate::epiyear(date)) %>%
    group_by(!!!group_vars) %>%
    summarize(epi_count = sum(daily_count, na.rm = TRUE)) %>%
    ungroup()%>%
    left_join(epi_dates, by = c("epiweek", "epiyear"))
  out
}

# arrange by epidate ------------------------------------------------------


arrange_epidate <- function(df) {
  # args:
  #   df--dataframe (often grouped), with epidate column
  # returns:
  #   df sorted by epidate (within each group), throws an error if epidates
  #   are not all 7 days apart (e.g a week missing).
  #   for use in other functions
  stopifnot(
    is.data.frame(df),
    "epidate" %in% names(df)
  )
  out <- arrange(df, epidate, .by_group = TRUE) # sorting

  check <- out %>%
    summarize(is_week_interval = all(diff(epidate)==7))

  if(!all(check$is_week_interval)) {
    stop("non-consecutive epidates present")
  }
  out
}

# calc_n_active_adj -------------------------------------------------------

calc_n_active_adj <- function(df, group_vars, width = 52) {
  # args:
  #   df
  #
  # returns:
  #
  required_cols <- c("epidate", "epi_TUR_rp", "epi_TUR_all", "n_active")

  check_cols(df, required_cols)
  check_quosures(group_vars)

  out <- df %>%
    group_by(!!!group_vars) %>%
    arrange_epidate() %>%
    mutate(
      sum_rp_long = zoo::rollapply(epi_TUR_rp, width = width, FUN = sum,
                                   na.rm = TRUE, partial = TRUE),
      sum_all_long = zoo::rollapply(epi_TUR_all, width = width, FUN = sum,
                                    na.rm = TRUE, partial = TRUE),
      prop_rp = sum_rp_long/sum_all_long,
      #  this next line is necessary when doing by region
      # otherwise subsequent weighted averages are thrown off
      prop_rp = ifelse(epi_TUR_rp == 0, NA, prop_rp),
      n_active_adj = prop_rp*n_active, # Adjust number of active instruments
      #Week X RPTests/(Active Instruments week x prop_rp):
      Y = epi_TUR_rp/n_active_adj)
  out
}


# calculate Y prime -------------------------------------------------------

calc_Y_2prime <- function(df, group_vars, means, Y, Y_prime) {
  # args:
  #   df--dataframe with for which Y` and Y" (all_adj) needs to be calculated
  #   group_vars--variables to group by when taking weighted means to calculate Y"
  #   means--names vector of means of Y of instrument versions
  #   Y--string, name of column containg Y prime variable
  #   Y_prime--string, name to give the Y_prime variable
  # returns:
  #   df new column with Y prime, and new instrument version (all_adj) for which
  #   Y_prime col is actually Y"
  required_cols <- c("InstrumentVersion", "epi_n_days", "n_active")

  check_cols(df, required_cols)

  stopifnot(
    check_quosures(group_vars),
    is.numeric(means),
    is.character(Y),
    is.character(Y_prime)
  )

  # calculating Y prime
  df[[Y_prime]] <- FA1.5_adjust(x = df[[Y]], InstrumentVersion = df$InstrumentVersion,
                                FA1.5_mean = means["FA1.5"],
                                FA2.0_mean = means["FA2.0"],
                                Torch_mean = means["Torch"])
  # calculating Y double prime--still called Y_prime but under
  # instrumentversion = "all_adj"
  df_adj <- df %>%
    filter(InstrumentVersion != "all") %>%
    group_by(!!!group_vars) %>%
    # epi_n_days same for all inst version--but just want to keep for late ruse
    summarize(epi_n_days = mean(epi_n_days),
              !!Y_prime := weighted.mean(x = .data[[Y_prime]],
                                         wt = n_active, na.rm = TRUE),
              InstrumentVersion = "all_adj") %>%
    ungroup()
  # adding back together
  out <- bind_rows(df, df_adj)
  out
}


# add_missing_dates_2_site_path -------------------------------------------

add_missing_dates_2_site_path <- function(df) {
  # args:
  #   df--data frame of daily counts of by TargetName, instrumentversion and SiteID
  # returns:
  #   dataframe with same columns as input but with no missing dates within the
  #   range of dates found for each SiteID/instrumentversion combination
  #   also TargeName now ordered factor
  required_cols <- c("date", "SiteID", "daily_count", "TargetName",
                     "InstrumentVersion")
  check_cols(df, required_cols)


  all_pathogens <- sort(unique(df$TargetName))
  split_list1 <- split(df, df$SiteID) # split by site id

  # iterate over sites
  df_split2 <- purrr::map(split_list1, function(df) {
    site_instrument <- split(df, df$InstrumentVersion)

    # "loop" of instrument versions for the site
    site_instrument2 <- purrr::map2(site_instrument, names(site_instrument), function(df2, inst){
      dates <- seq(from = min(df2$date), to = max(df2$date), by = "1 day")
      # all combos of date/pathogen
      dates_paths <- expand.grid(date = dates,
                                 TargetName = all_pathogens,
                                 stringsAsFactors = FALSE)
      out <- left_join(dates_paths, df2, by = c("date", "TargetName"))
      # left join leaves NAs--filling back in:
      out$InstrumentVersion <- inst
      id <- unique(out$SiteID)
      out$SiteID <- id[!is.na(id)]
      out
    }) %>%
      bind_rows()
  })
  out <- df_split2 %>%
    bind_rows() %>%
    as_tibble()
  # change TargetName to a factor--want specific ordering, and
  # code setup like this so runs w/ and w/o co-detection category
  others <- sort(all_pathogens[all_pathogens %in% c("negative", "co-detection")],
                 decreasing = TRUE)
  all_pathogens <- c(others, sort(all_pathogens[!all_pathogens %in% others],
                                  decreasing = TRUE))
  out$TargetName <- factor(out$TargetName,
                           levels = all_pathogens)
  out
}


# calc_path_3wma ----------------------------------------------------------

calc_path_Y_3wma <- function(df,
                             means,
                             group_2prime,
                             group_3wma,
                             add_path_dates = FALSE
                             ) {
  # args:
  #   df with epi_count (count per epiweek) by TargetName.
  #   means--named vector of means of the 3 instrument versions
  #   group_2prime--names of columns wrapped in vars() to group by when calculating
  #     Y", passed on to calc_Y_2prime function
  #   group_3wma--names of columns wrapped in vars() to group by when calculating
  #     three week moving average
  #   add_path_dates--logical whether to fill in date gaps with NA when smoothing Y".
  #     may slow down the function. Can be needed when if get, error for non
  #     continuous epidates (there are some cases when data from
  #     some instrument versions has been removed where this is needed)
  # returns:
  #   dataframe with path_Y, path_Y_prime and path_Y_prime_3wma columns, ie
  #       this is Y, Y' and Y'' for each pathogen. Y'' is "path_Y_prime" column
  #       when InstrumentVersion = all_adj
  required_cols <- c("epidate", "epi_count", "epi_n_days", "n_active_adj")

  check_cols(df, required_cols, "calc_path_Y_3wma")

  stopifnot(
    is.numeric(means),
    check_quosures(group_3wma)
  )

  df2 <- df %>%
    # count/instrument
    mutate(path_Y = epi_count/n_active_adj) %>%
    calc_Y_2prime(group_vars = group_2prime,
                  means = means, Y = "path_Y", Y_prime = "path_Y_prime")

  # solving a problem of non-continuous epidates, occurs when
  # instrument versions do not overlap (problem for ID) in dates when
  # smoothing Y"--makes code substantially slower I think.
  if (add_path_dates) {
    stopifnot(c("epidate", "TargetName", "InstrumentVersion") %in% names(df2))

    cols2expand <- unique(c("epidate", "TargetName", quo2char(group_3wma)))

    combo <- expand_cols(df2, cols = cols2expand)

    df2 <- right_join(df2, combo, by = cols2expand) %>%
      # adding epiweek/epiyear back in
      mutate(epiweek = lubridate::epiweek(.data$epidate),
             epiyear = lubridate::epiyear(.data$epidate))

  }

  out <- df2 %>%
    group_by(!!!group_3wma) %>%
    arrange_epidate() %>%
    # 3 week moving average
    mutate(path_Y_prime_3wma = rollapply_epi(x = .data[["path_Y_prime"]],
                                             n_days = .data[["epi_n_days"]],
                                             na.rm = TRUE),
           # b/na.rm = TRUE can calculate moving windows when that week
           # is missing, want to switch those back to NA
           path_Y_prime_3wma = ifelse(is.na(.data$path_Y_prime), NA,
                                      .data$path_Y_prime_3wma)) %>%
    ungroup()



  out
}

# FA1.5 adjust ------------------------------------------------------------

FA1.5_adjust <- function(x, InstrumentVersion, FA1.5_mean, FA2.0_mean,
                         Torch_mean) {
  # args:
  #   x--numeric vector (TUR per instrument)
  #   InstrumentVersion--character vector
  #   FA1.5_mean--mean values (TUR/instrument) of 1.5 instruments (reference)
  #   FA2.0_mean--mean of 2.0 instruments
  #   Torch_mean--mean of torch instruments
  # returns:
  #   adjust vector so that 2.0 and Torch has same mean as the 1.5, doesn't
  #   adjust all or the 1.5

  stopifnot(InstrumentVersion %in% c("FA1.5", "FA 1.5", "FA2.0", "FA 2.0",
                                     "Torch", "all"),
            length(InstrumentVersion) == length(x))
  # switched to warning b/ in some cases may have a panel that isn't
  # present in particular instrument version
  if(is.na(FA1.5_mean)) {
    warning("FA1.5_mean is NA, now will adjust to grand mean instead of to FA1.5")
    FA1.5_mean <- mean(FA2.0_mean, Torch_mean)
  }
  if(is.na(FA2.0_mean)) {
    warning("FA2.0_mean is NA")
  }
  if(is.na(Torch_mean)) {
    warning("Torch_mean is NA")
  }

  out <- ifelse(InstrumentVersion %in% c("FA2.0", "FA 2.0") , x*FA1.5_mean/FA2.0_mean,
                ifelse(InstrumentVersion == "Torch", x*FA1.5_mean/Torch_mean, x))

  out

}


# number of sites in a given epiweek --------------------------------------

#' Calculate number of unique SiteIDs per epiweek
#'
#' Given daily site level data, calculate the number of unique SiteIDs
#' for each epiweek.
#'
#' @param df This is a dataframe containing daily test utilization data
#'   must have columns of "daily_TUR_rp","SiteID", "date", as well as
#'   columns to group by
#' @param group_vars columns to group by (e.g. region). Unquoted names of columns enclosed in vars(),
#'    or NULL
#' @param window Width (in weeks) of window over which to count how many
#'  sites are present.
#' @return dataframe with SiteID, date, epiweek, epiyear, epidate, n_sites
#'  (number of sites that week), and other grouping variables
#' @examples
#' # see ?TUR_dat
#' calc_n_sites(TUR_dat)
#' calc_n_sites(TUR_dat, group_vars = vars(InstrumentVersion))
#' @export
calc_n_sites <- function(df,
                         group_vars = NULL,
                         window = 3) {

  required_cols <- c("daily_TUR_rp","SiteID", "date")

  check_cols(df, required_cols)

  stopifnot(is.null(group_vars) || check_quosures(group_vars))

  epidates <- create_epidates(df$date)

  group_vars2 <- c(vars(epiweek, epiyear), group_vars)

  df2 <- df %>%
    filter(!is.na(daily_TUR_rp) & daily_TUR_rp > 0) %>%
    mutate(epiweek = lubridate::epiweek(date),
           epiyear = lubridate::epiyear(date)) %>%
    group_by(!!!group_vars2)


  df2expand <- right_join(df2, epidates, by = c("epiweek", "epiyear"))

  if (is.null(group_vars)) {
    cols2expand <- "epidate"
  } else {

    cols2expand <- quo2char(group_vars)

    # epiweek/epiyear shouldn't be included in group_vars
    if(any(c("epiweek", "epiyear", "epidate") %in% quo2char(group_vars))) {
      stop("group_vars shouldn't included epiweek, epiyear or epidate")
    }

    # want to use epidate so don't all combinations of year and week
    # can't have gaps in dates for rolling window
    cols2expand <- c(cols2expand, "epidate")
  }


  # for joining so gaps don't exist
  df2join <- expand_cols(df2expand, cols = cols2expand)


  out <- df2 %>%
    nest() %>%
    mutate(SiteIDs = purrr::map(.data$data, function(x) unique(x$SiteID))) %>%
    select(-.data$data) %>%
    left_join(epidates, by = c("epiweek", "epiyear")) %>%
    ungroup() %>%
    select(-c("epiweek", "epiyear")) %>%
    right_join(df2join, by = cols2expand) %>%
    group_by(!!!group_vars) %>%
    arrange_epidate() %>%
    mutate(sites_list = roll_list(.data$SiteIDs, window = window, align = "center"),
           # number of unique serial numbers within 3 months
           n_sites = purrr::map_dbl(.data$sites_list, function(x) lu(unlist(x))),
           epiweek = lubridate::epiweek(.data$epidate),
           epiyear = lubridate::epiyear(.data$epidate)
    ) %>%
    select(-c("sites_list", "SiteIDs")) %>%
    ungroup()

  out
}


# calc_TURN ---------------------------------------------------------------



#' Calculate TURN
#'
#' Calculates test utilization rate normalization (Y''), using daily site level
#' data. Y (tests/adjusted active instruments) is calculated for each
#' InstrumentVersion, then Y' prime is calculated to adjust all means to FA1.5.
#' The weighted average of these Y' is taken, and three week moving average
#' applied.
#'
#' @param df Dataframe with columns of "date", "InstrumentVersion",
#'  "daily_TUR_rp" (number of RP tests that day),
#'  "daily_TUR_all" (number of all tests that day),
#'  "n_active" (active instruments).
#' @param group_vars columns to group by.  Unquoted names of columns
#'   enclosed in vars(). Must at least include InstrumentVersion (which would then
#'   result in a national level TURN)
#' @param site_info NULL or dataframe (if additonal group_vars are needed).
#'    must contain a SiteID column, so that it can be merged with df. Additional
#'    columns (e.g. Region) need to be present in the site_info dataframe
#'    to be used as group_vars.
#' @param means NULL or named vector of means. The vector needs to be length of
#'   three giving the means of Y (tests/adjusted active instrument) for the
#'   FA1.5, FA2.0 and Torch system (with those names provided as names of
#'   elements in the vector). Leave as NULL when calculating National
#'   level TURN, because then the means are calculated internally in this
#'   function.
#' @param return_means logical, whether to return vector of means of Y
#'   (tests/adjusted active instrument). If true, output is a list with
#'   first element being the output dataframe, the second being named
#'   vector of means (which can then be used as an input to this function
#'   when calculating regional TURN)
#' @return Dataframe or list. The dataframe contains the following columns:
#'  epiweek, epiyear,
#'  InstrumentVersion (the instrument version, or 'all' for all instruments
#'  combined, or all_adj  for which the Y_prime column is the weighted average
#'  of all the Y_prime for the other instruments or what we are calling Y''),
#'  epi_TUR_rp (number of RP test that week),
#'  epi_TUR_all (number of all tests that week) ,
#'  n_active (number of active instruments),
#'  epi_n_days (number of days that week), epidate (calander date of the middle
#'  of the epidemiological week), sum_rp_long (number of rp tests within 1 year),
#'  sum_all_long (number of all tests within 1 year),
#'  prop_rp (proportion RP tests = sum_rp_long/sum_all_long),
#'  n_active_adj (number of active instruments*prop_rp), Y
#'  (rp tests/adjust active instruments), Y_prime (Y adjusted to mean of FA1.5,
#'  or when InstrumentVersion == all_adj, this is the weighted average
#'  of Y prime for the other instruments), Y_prime_3wma (3 week moving average
#'  applied to Y_prime).
#'
#'  If a list is returned (i.e. return_means = TRUE) the first element is the
#'  aformentioned dataframe, the second element is the vector of mean Y for
#'  each instrument version.
#' @examples
#' l <- calc_TURN(TUR_dat, return_means = TRUE)
#' # mean of Y for each instrument version
#' means <- l$means
#' # data frame containg national turn
#' national_turn <- l$df
#' # plot of final TURN curve
#' # plot(Y_prime_3wma ~ epidate,
#' #      data = national_turn[national_turn$InstrumentVersion == "all_adj", ],
#' #      type = "l")
#' # calculating TURN by region
#' state_turn <- calc_TURN(TUR_dat,
#'                         group_vars = dplyr::vars(InstrumentVersion, Region),
#'                         site_info = get_site_info(rp_raw),
#'                         means = means)
#' @export
calc_TURN <- function(df,
                      group_vars = vars(InstrumentVersion),
                      site_info = NULL,
                      means = NULL,
                      return_means = FALSE){

  check_quosures(group_vars)
  required_cols <- c("date", "InstrumentVersion", "daily_TUR_rp",
                     "daily_TUR_all", "n_active")
  check_cols(df, required_cols, "calc_TURN()")

  stopifnot(is.null(site_info) | is.data.frame(site_info),
            is.logical(return_means))

  # I'm not sure if having redundant group_vars causes problems
  # so I'm not allowing
  not_allowed_groups <- c("date", "epiweek", "epiyear")

  group_char <- quo2char(group_vars)

  stopifnot("InstrumentVersion" %in% group_char)

  if(any(not_allowed_groups %in% group_char)) {
    stop(paste0("group_vars should not include any of\n",
                paste(not_allowed_groups, collapse = "\n")))

  }

  if(is.null(means) & length(group_char) != 1) {
    stop("named vector of means for each instrument version must be supplied
             when additional group_var variables are used")
  }

  # grouping by date, and other variables provided by user
  dly_group_vars <- c(vars(date), group_vars)

  if (!is.null(site_info)) {
    if(!"SiteID" %in% names(df) | !"SiteID" %in% names(site_info)) {
      stop("SiteID column must be in df and site_info dataframes")
    }

    df <- df %>%
      left_join(site_info, by = "SiteID")
  }

  if(!all(group_char %in% names(df))) {
    stop("all group_vars must be included in either df or site_info")
  }


  dly <- site_2_dly(df, group_vars = dly_group_vars)

  # dataframe with every combination of date and group variables
  # so no holes in the dates
  df2join <- expand_cols(dly, cols = c("date", group_char))

  # aggregating to epiweek
  epi1 <- dly %>%
    # this right join may not be necessary--just precaution so no missing dates
    right_join(df2join, by = c("date", group_char)) %>%
    dly_2_epi(group_vars = c(vars(epiweek, epiyear), group_vars))

  # adjusted active instruments
  epi2 <- calc_n_active_adj(epi1, group_vars = group_vars)

  # calculate means by instrument version (when just aggregating to national
  # level)
  if(is.null(means)) {
    # it might be possible to also calculate just calculate means
    # on the fly when other grouping vars included but playing it safe
    mean_epi <- epi2  %>%
      filter(.data$InstrumentVersion != "all") %>%
      group_by(.data$InstrumentVersion) %>%
      summarize(mean = mean(.data$Y, na.rm = TRUE))

    means <- mean_epi$mean
    names(means) <- mean_epi$InstrumentVersion
  }


  # adding in grouping variables (except InstrumentVersion--don't want'
  # to group by instrument version when taking weighted mean)
  group_vars_2prime <- c(vars(epiyear, epiweek, epidate),
                         group_vars[which(group_char != "InstrumentVersion")])

  epi4 <- calc_Y_2prime(epi2, group_vars = group_vars_2prime,
                        means = means, Y = "Y", Y_prime = "Y_prime")

  # 3 week moving averages
  epi5 <- epi4 %>%
    group_by(!!!group_vars) %>%
    arrange_epidate() %>%
    # 3 week moving average
    mutate(Y_prime_3wma = rollapply_epi(x = .data$Y_prime,
                                        n_days = .data$epi_n_days,
                                        width = 3,
                                        na.rm = TRUE),
           # b/na.rm = TRUE can calculate moving windows when that week
           # is missing, want to switch those back to NA
           Y_prime_3wma = ifelse(is.na(.data$Y_prime), NA, .data$Y_prime_3wma)) %>%
    ungroup()

  if(return_means) {
    out <- list("df" = epi5, means = means)
  } else {
    out <- epi5
  }

  out
}


# calc_path_TURN ----------------------------------------------------------




# next: make examples. make documentation.
# add checks--ie sums should add, and should work missing dates
# try tricky version where add_path_dates is required.
# add optional check for sum count vs TURN

#' Calculate TURN by pathogen
#'
#' Calculates test utilization rate normalization (Y'') for each pathogen,
#' using daily site level data. See \code{?calc_TURN} for details on how TURN
#' is calculated (that function is for caclulating overall/total TURN--not for
#'  a specific pathogen).
#'
#' @param df dataframe of daily pathogen data. Columns must include "date",
#'   "InstrumentVersion", "daily_count", "TargetName", "SiteID"
#' @param TURN_df dataframe created by \code{calc_TURN()}. Must include
#'   columns of "epidate", "n_active", "n_active_adj", "epi_n_days", "Y",
#'   "Y_prime", "Y_prime_3wma", as well as columns listed in group_vars
#'   argument.
#' @param means named vector of means. The vector needs to be length of
#'   three giving the means of Y (tests/adjusted active instrument) for the
#'   FA1.5, FA2.0 and Torch system (with those names provided as names of
#'   elements in the vector). This should be created by the \code{calc_TURN()}
#'    function when aggregating to the national level.
#' @param group_vars columns to group by.  Unquoted names of columns
#'   enclosed in vars(). Must at least include InstrumentVersion (which would then
#'   result in a national level TURN)
#' @param site_info NULL or dataframe (if additonal group_vars are needed).
#'    must contain a SiteID column, so that it can be merged with df. Additional
#'    columns (e.g. Region) need to be present in the site_info dataframe
#'    to be used as group_vars.
#' @param add_path_dates logical whether to fill in date gaps with NA when
#'    smoothing Y".
#'     May slow down the function. Can be needed when if get error for non
#'     continuous epidates (there are some cases when data from
#'     some instrument versions has been removed where this is needed)
#' @param run_check logical, whether to run a check to see if the sum of individual
#'   pathogens TURNs equals the total TURN, for each epiweek. This check
#'   is only relevant when co-detections and negatives are pathogen categories,
#'   in which case you would expect the check to not throw an error.
#' @return Dataframe. The dataframe contains the following columns:
#'  epiweek, epiyear, epidate (date of middle of epiweek), epi_count
#'  (count of number of detections of pathogen that epiweek),
#'  TargetName (pathogen), path_Y
#'  (positives for that pathogen/adjust active instruments),
#'  path_Y_prime (path_Y adjusted to mean of FA1.5,
#'  or when InstrumentVersion == all_adj, this is the weighted average
#'  of path Y prime for the other instruments).
#'  Y_prime_3wma (3 week moving average
#'  applied to path_Y_prime, this is TURN for pathogen when
#'  InstrumentVersion == 'all_adj'). Other columns as supplied in TURN_df
#'  and site_info
#' @examples
#' # list of national turn and means by instrument (means should be
#' #    calculated at national level)
#' national_list <- calc_TURN(TUR_dat, return_means = TRUE)
#' means <- national_list$means
#' # national TURN
#' TURN_national <- national_list$df
#'
#' # regional (state) TURN
#' site_info <- get_site_info(rp_raw)
#' TURN_region <- calc_TURN(TUR_dat,
#'                          group_vars = vars(InstrumentVersion, Region),
#'                          site_info = site_info,
#'                          means = means)
#'
#' # calculate TURN for each pathogen -- national
#' calc_path_TURN(df = path_dat,
#'                TURN_df = TURN_national,
#'                means = means)
#'
#' # calculate TURN for each pathogen -- regional
#' calc_path_TURN(df = path_dat,
#'                TURN_df = TURN_region,
#'                means = means,
#'                group_vars = vars(InstrumentVersion, Region),
#'                site_info = site_info)
#'
# TURN by pathogen--national level
#' @export
calc_path_TURN <- function(df, TURN_df, means,
                           group_vars = vars(InstrumentVersion),
                           site_info = NULL,
                           add_path_dates = FALSE,
                           run_check = FALSE){

  # checks on inputs ~~~~~

  required_cols <- c("date", "InstrumentVersion", "daily_count", "TargetName",
                     "SiteID")

  check_cols(df, required_cols)

  stopifnot(is.null(site_info) | is.data.frame(site_info),
            is.numeric(means),
            is.logical(add_path_dates),
            is.logical(run_check))

  check_quosures(group_vars)

  group_char <- quo2char(group_vars)

  check_cols(TURN_df,
             # prob don't need to require Y Y_prime and Y_prime_3wma
             # but keeping the requirement for now, b/ joined
             # for output
             required_cols = c("epidate", "n_active", "n_active_adj",
                               "epi_n_days", "Y", "Y_prime", "Y_prime_3wma",
                               "epi_TUR_rp", group_char),
             name = "TURN_df")

  # I'm not sure if having redundant group_vars causes problems
  # so I'm not allowing
  not_allowed_groups <- c("date", "epiweek", "epiyear", "TargetName")

  stopifnot("InstrumentVersion" %in% group_char)

  if(any(not_allowed_groups %in% group_char)) {
    stop(paste0("group_vars should not include any of\n",
                paste(not_allowed_groups, collapse = "\n")))

  }

  if(is.null(means) & length(group_char) != 1) {
    stop("named vector of means for each instrument version must be supplied
             when additional group_var variables are used")
  }

  # end checks on inputs ~~~~

  # rolling windows don't work when missing dates aren't filled in
  df2 <- add_missing_dates_2_site_path(df)

  # join in site info
  if (!is.null(site_info)) {
    if(!"SiteID" %in% names(df2) | !"SiteID" %in% names(site_info)) {
      stop("SiteID column must be in df and site_info dataframes")
    }

    df2 <- left_join(df2, site_info, by = "SiteID")
  }

  if(!all(group_char %in% names(df2))) {
    stop("all group_vars must be included in either df or site_info")
  }



  # daily pathogen counts-
  path_dly <- site_2_dly_path(df2, c(vars(date, TargetName), group_vars))

  # pathogens counts at epi week level
  path_epi1 <- dly_2_epi_path(
    path_dly, c(vars(epiyear, epiweek, TargetName), group_vars))


  # adding in active instruments
  path_epi2 <- TURN_df %>%
    # We need to make new all_adj based on individual pathogens, so deletiing here
    filter(.data$InstrumentVersion != "all_adj") %>%
    # needed columns
    select(c("epidate", "n_active", "n_active_adj", "epi_n_days", "epi_TUR_rp",
             group_char)) %>%
    right_join(path_epi1,  by = c("epidate", group_char))

  # check join integrity
  # i think it is ok to have epi_count = 0 when n_active_adj is NA
  if (any(is.na(path_epi2$n_active_adj) &
          !(is.na(path_epi2$epi_count) | path_epi2$epi_count == 0))
  ){
    stop("n_active_adj is NA when number expected")
  }

  # ok to do only b/ of above check (0s can happen when summing across all NAs)
  path_epi2 <- path_epi2 %>%
    mutate(epi_count = ifelse(is.na(.data$n_active_adj),
                              NA, .data$epi_count))

  path_epi5a <- calc_path_Y_3wma(
    df = path_epi2,
    means = means,
    # don't want to group by instrument version here
    group_2prime = c(vars(epiyear, epiweek, epidate, TargetName),
                     group_vars[group_char != "InstrumentVersion"]),
    group_3wma = c(vars(TargetName), group_vars),
    add_path_dates = add_path_dates)

  # adding in y_prime now so that have it for all_adj
  out <- TURN_df %>%
    select("epidate","Y", "Y_prime", "Y_prime_3wma", group_char) %>%
    right_join(path_epi5a,  by = c("epidate", group_char))

  # final checks
  cols2check <- c("epidate", "TargetName", group_char)
  if(sum(is.na(out[, cols2check])) != 0) {
    stop("At least one of following columns has missing values:\n",
         paste(cols2check, collapse = "\n"),
         "\nensure that no levels of group_vars in TURN_df are missing")
  }
  # check that path turns sum to total turn
  if(run_check) {
    check_path_TURN(out, group_vars = group_vars)
  }

  out

}


# calculate detection rate ------------------------------------------------

#' Calculate detection rate of pathogens
#'
#'  Calculate the percentage of total tests that are of a specific pathogen
#'  using the output from the \code{calc_path_TURN} function. It is calculated
#'  as Y for the pathogen divided by total Y. Since the Ys are calculate with
#'  the same denominator this is equivelant to tests for that pathogen/total tests.
#'  A three week moving average is applied to the numerator and denominator
#'  prior to calculation.
#'
#' @param df dataframe with at least columns of "InstrumentVersion", "epi_n_days",
#'  "Y", "path_Y", "epidate", and any other columns included in group_vars.
#' @param group_vars names of columns to group by, wrapped in vars(). Must
#'  included TargetName and can also include other columns to group by (e.g. a
#'  column defining regions).
#' @param return_week logical whether output should include epiweek and
#'  epiyear columns
#' @param window width of the window over which to apply rolling window sum
#'  to calculate detection rate.
#' @return data frame that includes a column 'det_rate', which is the detection
#'  rate (in percent), and TUR_mws which is the moving window sum of TUR
#' @examples
#' ###############################################
#' ############## Generating data ################
#' national_list <- calc_TURN(TUR_dat, return_means = TRUE)
#' means <- national_list$means
#' # national TURN
#' TURN_national <- national_list$df
#'
#' # regional (state) TURN
#' site_info <- get_site_info(rp_raw)
#' TURN_region <- calc_TURN(TUR_dat,
#'                          group_vars = vars(InstrumentVersion, Region),
#'                          site_info = site_info,
#'                          means = means)
#'
#' # calculate TURN for each pathogen -- national
#' path_turn_national <- calc_path_TURN(df = path_dat,
#'                TURN_df = TURN_national,
#'                means = means)
#'
#' # calculate TURN for each pathogen -- regional
#' path_turn_region <- calc_path_TURN(df = path_dat,
#'                TURN_df = TURN_region,
#'                means = means,
#'                group_vars = vars(InstrumentVersion, Region),
#'                site_info = site_info)
#'
#' ###############################################
#' ############## detection rate ################
#'
#' # calculate detection rate--national
#' calc_det_rate(path_turn_national)
#' # calculate detection rate--regional
#' calc_det_rate(path_turn_region, group_vars = vars(TargetName, Region))
#' @export
calc_det_rate <- function(df, group_vars = vars(TargetName), return_week = TRUE,
                          window = 3) {

  check_quosures(group_vars)
  required_cols <- c("InstrumentVersion", "epi_count", "epi_TUR_rp",
                     "epidate", quo2char(group_vars))

  check_cols(df, required_cols, "calc_det_rate()")

  if (!"TargetName" %in% quo2char(group_vars)) {
    warning("group_vars should include TargetName")
  }

  out <- df %>%
    filter(.data$InstrumentVersion == "all") %>%
    group_by(!!!group_vars) %>%
    arrange_epidate() %>%
    # moving window sum of count
    mutate(count_mws = zoo::rollapply(epi_count, width = window, FUN= sum,
                                     na.rm = TRUE, partial = TRUE),
           # TUR moving window sum
           TUR_mws = zoo::rollapply(epi_TUR_rp, width = window, FUN= sum,
                                    na.rm = TRUE, partial = TRUE),
           det_rate = count_mws/TUR_mws*100) %>%
    select(c("epidate", quo2char(group_vars), "det_rate", "TUR_mws")) %>%
    ungroup()

  if(return_week) {
    out <- out %>%
      mutate(epiweek = lubridate::epiweek(.data$epidate),
             epiyear = lubridate::epiyear(.data$epidate))
  }
  out
}



# clean TURN output -------------------------------------------------------

#'  clean TURN dataframe
#'
#'  Output a simpler (fewer columns) dataframe, given an input dataframe
#'  made by either \code{calc_TURN()} or \code{calc_path_TURN()} functions.
#'  New TURN column is returned, which is the the Y_prime_3wma input column, where
#'  InstrumentVersion == 'all_adj'.
#'
#' @param df dataframe with at least columns of "InstrumentVersion", "epi_n_days",
#'  "Y", "path_Y", "epidate", and any other columns included in group_vars.
#' @param extra_cols character vector of additional columns to keep in output
#' @param is_path logical, whether dataframe is of pathogen level data
#' (i.e. created in \code{calc_path_TURN()}).
#' @return dataframe that includes TURN column (final normalization). It also
#' includes path_TURN (the TURN for individual pathogens, when the
#' is_path argument is TRUE)
#' @examples
#'
#' ############## Generating data ################
#' national_list <- calc_TURN(TUR_dat, return_means = TRUE)
#' means <- national_list$means
#' # national TURN
#' TURN_national <- national_list$df
#'
#' # calculate TURN for each pathogen -- national
#' path_turn_national <- calc_path_TURN(df = path_dat,
#'                TURN_df = TURN_national,
#'                means = means)
#'
#' ##### creating cleaner/simple dataframe #######
#'
#' clean_TURN_output(TURN_national)
#' clean_TURN_output(path_turn_national, is_path = TRUE)
#' @export
clean_TURN_output <- function(df, extra_cols = NULL,  is_path = FALSE) {

  required_cols <- c("epidate", "epiweek", "epiyear", "Y_prime_3wma",
                     "InstrumentVersion")
  check_cols(df, required_cols, "clean_TURN_output()")

  df2 <- filter(df, .data$InstrumentVersion  == "all_adj")

  df2 <- rename(df2, "TURN" = "Y_prime_3wma")

  cols2keep <- c("epidate", "epiweek", "epiyear", extra_cols, "TURN")
  if (is_path){
    cols2keep <- c(cols2keep, "path_TURN", "TargetName")
    df2 <- rename(df2, "path_TURN" = "path_Y_prime_3wma")
  }
  out <- df2[, cols2keep]
  out
}
MartinHoldrege/turnr documentation built on May 16, 2020, 10:39 a.m.