#' Incidence estimator using rs2014 method implemented using daily agregatation
#'
#' @details Input:
#' Input data are expected to have some columns & some fixes. They are loaded and prepared using \code{\link{load_results_for_incidence}}
#'
#' @details Parameters:
#'
#' \describe{
#' \item{ignore.first.delay}{Number of days form his first survey befotre to include a participant (not compatible with ignore.first)}
#' \item{ignore.first.only.new}{Ignore first rule only for new participant (uses intake$first.season column}
#' \item{exclude.same}{should exclude syndrom if same episode is on (partcipant still active)}
#' \item{exclude.same.delay}{max delay to exclude syndrom if same (not excluded if over this delay)}
#' \item{active.max.freq}{max delay between 2 surveys (in weeks)}
#' \item{active.day.before}{Number of week before each computed week to count active participants}
#' \item{active.day.after}{Number of week after each computed week to count active participants}
#' \item{active.min.surveys}{min number of surveys for each participant (not active if less)}
#' \item{active.use.mean}{Use mean number of active participant over the windows [active.day.before, active.day.after]}
#' }
#' @export
IncidenceDailyRS2014 = R6Class("IncidenceDailyRS2014", public = list(
#' @field weekly weekly data (loaded using \code{\link{load_results_for_incidence}})
weekly = NULL,
#' @field intake intake data (loaded using \code{\link{load_results_for_incidence}})
intake = NULL,
#' @field participant data.frame() with all available participants and commputed criterias used during computation
participant = NULL,
#' @field params parameters for computation
params = NULL,
#' @field syndromes character vector of column names containing syndromes classification for each weekly
syndromes = NULL,
#' @field profiler profiler
profiler = NULL,
#' @field verbose logical show verbose message
verbose = FALSE,
#' @field design design stratification from \code{\link{design_incidence}}
design = NULL,
#' @field output vector of character, see \code{\link{IncidenceRS2014}}
output = NULL,
#' @description
#' instanciate object
#' @param weekly data
#' @param intake intake data
#' @param syndromes syndromes column names in weekly
#' @param params list of parameters for computing
#' @param design design_incidence
#' @param output list of output types to compute
#' @param syndroms compatibility
initialize = function(weekly, intake, syndromes, params, design, output=c('inc')) {
requireNamespace("dplyr", quietly = FALSE)
requireNamespace("epitools", quietly = FALSE)
if( is.null(output) ) {
output = eval(formals()$output)
}
# params default values
def = list(
ignore.first.delay = NA, # Number of days form his first survey befotre to include a participant (not compatible with ignore.first)
ignore.first.only.new = F, # Ignore first rule only for new participant (uses intake$first.season column @see load_results_for_incidence)
exclude.same = F, # should exclude syndrom if same episode is on (partcipant still active)
exclude.same.delay=15, # max delay to exclude syndrom if same (not excluded if over this delay)
active.max.freq=NA, # max delay between 2 surveys (in weeks)
active.day.before=NA, # Number of week before each computed week to count active participants
active.day.after=NA, # Number of week after each computed week to count active participants
active.min.surveys=NA, # min number of surveys for each participant (not active if less)
active.use.mean=FALSE
)
params = default.params(params, def)
self$params = params
self$output = output
self$weekly = weekly
self$intake = intake
self$design = design
self$syndromes = syndromes
},
#' @description
#' Prepare data before to compute
#' Prepare participants data to be able to apply selection rules
prepare = function() {
weekly = self$weekly
syndromes = self$syndromes
ignore.first.delay = self$params$ignore.first.delay
ignore.first.only.new = self$params$ignore.first.only.new
exclude.same = self$params$exclude.same
exlude.same.delay = self$params$exclude.same.delay
active.max.freq = self$params$active.max.freq
active.min.surveys = self$params$active.min.surveys
weekly$date = as.Date(trunc(weekly$timestamp, "day"))
if(exclude.same) {
weekly = weekly[ order(weekly$person_id, weekly$timestamp), ]
weekly$delay.date = calc_weekly_delay(weekly, "date")
same = !is.na(weekly$same.episode) & weekly$same.episode == YES & (is.na(weekly$delay.date) | weekly$delay.date <= exlude.same.delay) # recoded value !
for(ss in syndromes) {
# Cancel a syndrom report if flagged as same episode as previous
i = !is.na(weekly[, ss]) & weekly[, ss] > 0L
weekly[ same & i, ss ] = 0L
}
}
if(!hasName(weekly, "onset")) {
stop("Weekly data should have onset column defined")
}
# @TODO
# Actual strategy will remove aberrant data (if fever.start or sympt.start are too far)
# Depending on params the participant will be counted as active or not
# Now aggregate to the week and person
weekly = aggregate(as.list(weekly[, syndromes]), list(person_id=weekly$person_id, onset=weekly$onset), sum, na.rm=TRUE)
weekly[, syndromes] = weekly[, syndromes] > 0L # Only one syndrom report by participant by day
weekly = weekly[ order(weekly$person_id, weekly$onset), ]
weekly = weekly %>% dplyr::group_by(person_id) %>% dplyr::mutate(delay.previous=c(NA, diff(onset)))
#weekly$delay.previous = calc_weekly_delay(weekly, time.col='onset')
participant = weekly %>% dplyr::group_by(person_id) %>% dplyr::summarise(date.first=min(onset))
if( !is.na(ignore.first.delay) ) {
if( isTRUE(ignore.first.only.new) ) {
if( is.null(self$intake$first.season) ) {
stop("ignore.first.only.new option requires intake to have a 'first.season' column indicating if participant is new for the season")
}
participant = merge(participant, self$intake[, c('person_id', 'first.season')], by='person_id', all.x=TRUE)
}
}
date.last = weekly %>% dplyr::group_by(person_id) %>% dplyr::summarise(date.last=max(onset))
participant = merge(participant, date.last, by='person_id')
if( !is.na(active.min.surveys) ) {
weekly$nb = 1L
nb.survey = weekly %>% dplyr::group_by(person_id) %>% dplyr::summarise(nb=sum(nb))
participant = merge(participant, nb.survey, by='person_id')
}
if(!is.na(active.max.freq)) {
dd = weekly %>% dplyr::group_by(person_id) %>% dplyr::summarise(delay.previous=max(delay.previous, na.rm=TRUE))
participant = merge(participant, dd, by='person_id')
}
self$weekly = weekly
self$participant = participant
},
#' @description
#' Internal function
#' Estimate incidence with rs2014 method for a given date
#' @param date date to compute
compute_date = function(date)
{
active.day.before = self$params$active.day.before
active.day.after = self$params$active.day.after
active.max.freq = self$params$active.max.freq # max delay between 2 surveys (in days)
active.min.surveys = self$params$active.min.surveys
active.use.mean = self$params$active.use.mean
ignore.first.delay = self$params$ignore.first.delay
ignore.first.only.new = self$params$ignore.first.only.new # only ignore first survey for new participant for this season
# Track count of participants (for introspection)
track = create_step_tracker()
participant = self$participant
weekly = self$weekly
syndromes = self$syndromes
###
# Active participants
####
track$add("base", nrow(participant))
# Should have first survey before considered date
participant = participant[ participant$date.first <= date, ]
track$add("first_after_week", nrow(participant))
# Keep participant when first survey is more than $ignore.first.delay days, remove otherwise
if( !is.na(ignore.first.delay) ) {
d = floor(date - participant$date.first) >= ignore.first.delay
if(ignore.first.only.new) {
# Take into account of the first.season, only participant with first.season=T can be removed
# if first.season=NA, first.season is censored (cannot be determined), keep the participant regardless delay
d = ifelse(!is.na(participant$first.season) & participant$first.season, d, T) # always keep if not first season for a participant
}
participant = participant[ d, ]
}
track$add("ignore_first_delay", nrow(participant))
if( !is.na(active.max.freq) ) {
# Keep only participants with a minimal frequency of surveys
participant = participant[ participant$delay.previous <= active.max.freq, ]
track$add("max_freq", nrow(participant))
}
if( !is.na(active.min.surveys) ) {
# Keep only participants with a minimal frequency of surveys
participant = participant[ participant$nb >= active.min.surveys, ]
track$add("min.surveys", nrow(participant))
}
# Should have at least one survey in a time window defined by active.week.after & active.week.before
if( !is.na(active.day.after) ) {
active.last.day = date + active.day.after
} else {
active.last.day = max(weekly$onset)
}
if( !is.na(active.day.before) ) {
active.first.day = date - active.day.before
} else {
active.first.day = min(weekly$onset)
}
a = weekly %>% dplyr::filter(dplyr::between(onset, active.first.day, active.last.day)) %>% dplyr::select(person_id)
a = unique(a$person_id)
participant = participant[ participant$person_id %in% a, ]
track$add("in_active_window", nrow(participant))
track$add("final", nrow(participant))
if(nrow(participant) == 0L) {
r = list()
attr(r, "select.count") <- track$get_steps()
return(r)
}
strata = self$design$strata
if( is.null(strata) ) {
# Use dummy strata column
# It more costly than avoid aggregation but this garantees that the same procedure is used to compute regardless design
self$intake$dummy = 1L
strata = 'dummy'
}
participant$active = 1L
# active participant = having at least one survey on 3 weeks
participant = merge(participant, self$intake[, c('person_id', strata)], by='person_id', all.x=T) # get the geo code
# cat("Strata: ", paste(strata,collapse=','))
active = aggregate(list(active=participant$active), participant[, strata, drop=FALSE], sum)
if(nrow(active) == 0L) {
return(structure(list(), "select.count"=track$get_steps()))
}
if(active.use.mean) {
w = 1 + sum(active.day.after, active.day.before, na.rm=TRUE)
active$active = active$active / w # Use mean of participants seen on window size
}
# now calculate number of participant by health status for the current week
# now we only take account of the week of interest
weekly = weekly[ weekly$onset == date, ] # only keep our week of interest
weekly = weekly[ weekly$person_id %in% participant$person_id, ] # only keep weekly from active participant
track$add("weekly_for_part", nrow(weekly))
if(nrow(weekly) == 0L) {
r = list()
attr(r, "select.count") = track$get_steps()
return(r)
}
# Count number of syndrom by user and by week
count = aggregate(as.list(weekly[, syndromes]), list(person_id=weekly$person_id), sum)
count = merge(count, self$intake[ , c('person_id', strata)], by='person_id', all.x=T) # get the geo code for each user
# make syndrom exclusive in a date ?
# nop now
count = dplyr::mutate_at(count, syndromes, ~as.integer(. > 0L))
#count[, syndromes] = as.integer(count[, syndromes] > 0L) # syndrom counted only once for each user
# aggregate by strata
count.strata = aggregate( count[, syndromes, drop=F], count[, strata, drop=F], sum, na.rm=T)
if( nrow(count.strata) == 0L ) {
count.strata = active
count.strata[, syndromes] = as.numeric(NA)
} else {
# merge with active
count.strata = merge(active, count.strata, by=strata, all.x=T)
}
if( !is.na(self$output) ) {
# Compute incidence from count data
r = calc_adjusted_incidence(count.strata, design=self$design, syndromes=self$syndromes, output=self$output)
} else {
r = list(count = count.strata)
}
attr(r, "select.count") <- track$get_steps()
r
},
#' @description
#' Compute weekly incidence for all requested weeks
#' Apply the full algorithm for rs2014 incidence computation
#' prepare data and then compute for each week by calling estimate_incidence_rs2014_week
#' @param dates dates to compute, if NULL will compute on all dates available in weekly data
#' @param verbose show verbose messages if TRUE
#' @param progress show progress
compute = function(dates=NULL, verbose=T, progress=T) {
# Prepare data for all weeks (common part)
if(verbose) {
str(self$params)
message("Preparing data...", appendLF = FALSE)
} else {
if( !is.null(progress) ) {
cat('~')
}
}
self$verbose = verbose
self$prepare()
if(verbose) {
message("done.")
}
inc = NULL
inc.zlow = NULL
inc.age = NULL
count = NULL
if( is.null(dates) ) {
dates = unique(self$weekly$onset)
}
use.progress = FALSE
if( isTRUE(progress) ) {
progress = dplyr::progress_estimated(length(dates), min_time = 5)
use.progress = TRUE
}
for(i in seq_along(dates)) {
date = dates[i]
if(verbose) {
message("Day", date,"\n")
} else {
if( use.progress ) {
progress$tick()$print()
}
}
r = self$compute_date(date=date)
if( !is.null(r$inc) && nrow(r$inc) > 0 ) {
ii = r$inc
ii$date = date
inc = dplyr::bind_rows(inc, ii)
}
if( !is.null(r$zlow) && nrow(r$zlow) > 0) {
ii = r$zlow
ii$date = date
inc.zlow = dplyr::bind_rows(inc.zlow, ii)
}
if( !is.null(r$age) && nrow(r$age) > 0) {
ii = r$age
ii$date = date
inc.age = dplyr::bind_rows(inc.age, ii)
}
if( !is.null(r$count) && nrow(r$count) > 0) {
ii = r$count
ii$date = date
count = dplyr::bind_rows(count, ii)
}
}
if(use.progress) {
cat("\n")
}
structure(
list(
inc=inc,
inc.zlow=inc.zlow,
inc.age=inc.age,
count=count
),
syndromes = self$syndromes,
design=self$design,
params=self$params,
method="rs2014"
)
} # Compute
) # Public
) # R6Class
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.