R/incidence_design.R

Defines functions calc_adjusted_incidence calc.conf.int output_incidence design_incidence get_columns_for_incidence

Documented in calc_adjusted_incidence calc.conf.int design_incidence get_columns_for_incidence output_incidence

# This library aims at providing set of function to compute incidence from InfluenzaNet data
#
# Several functions are provided:
#
# - IncidenceRS2014
# - IncidenceDailyRS2014
#
# - (weekly, intake, syndromes, params, design, output=NULL, ...) : Estimator Function interface
# weekly, intake, syndromes, params, design, output=NULL
# intake = last intake for each participant with necessary columns for stratification
# syndromes = list of indicators column for each "syndrome" for which estimate an incidence
# params  = parameters for a given estimator
# design  = design stratification definition (@see design.incidence)
# output  = kind of output to get inc=incidence (national level), zlow="lower geographic level (z)", "age" age-specific incidence
#         age categories should be in "age.cat" column in intake df.
#         It could be an character vector or a structure with more option returned by output.incidence
#
# Helpers:
#
# - design_incidence() : define a stratification structre
# - output_incidence() : define an output type and extra parameters for output results


#' Returns list of columns needed of weekly survey to compute incidence
#' @export
get_columns_for_incidence = function() {
  c('timestamp', get_symptoms_aliases(), 'fever.sudden','highest.temp','same.episode','sympt.start','fever.start','sympt.sudden','sympt.cause')
}

#' Describe the stratification to use for incidence computation (parameter design)
#'
#'
#' Create a structure usable by incidence computation functions.
#' The returned structure embed all data & metadata about stratification
#' Only handle strata on age and geo level for instance
#' By convention age group column will be names "age.cat"
#'
#' @param age.categories list of breaks
#' @param year.pop year of the population to fetch
#' @param geo geographic level id (see geography & geo.levels in platform config.). could be character or structure return by geo_level function
#' @param geo_column name of the geographic column  (value return by strata.call), geo_level's column name by default
#' @param geo_area id of geographic area to keep at the geographic level
#' @param use.gender use gender as strata
#' @param ... other parameters to include in the design
#' @return design.incidence object
#' @export
design_incidence = function(age.categories, year.pop, geo, geo_column=NULL, geo_area=NULL, use.gender=FALSE, ...) {

  if( is.null(geo_column) ) {
    geo_column = geo_column(geo)
  }

  if( is.null(geo_column) ) {
    stop(paste("Unknown geo level ", geo))
  }

  strata = geo_column

  pop = NULL
  if( !is.null(age.categories) ) {
    strata = c(strata, 'age.cat')
    pop = load_population_age(geo=geo, year=year.pop, age.breaks=age.categories)
    if( is.null(geo) ) {
      # if geo is null, then there is no "geographic" dimension (only one geo strata)
      # fake it with a geo_column with a single value
      # The same should be done in the user's data
      pop[, geo_column] = 1
    }
  } else {
    if( !is.null(year.pop) ) {
      pop = load_population(geo=geo, year=year.pop)
    }
  }

  if( !is.null(pop) ) {
    n = names(pop)
    n[ n == 'all' ] <- 'population'
    names(pop) <- n

    if( !is.null(geo_area) ) {
      pop = pop[ pop[[geo_column]] %in% geo_area, ]
    }

  }

  if(use.gender) {
    strata = c(strata, 'gender')
  }

  data = list(
    age.categories = age.categories,
    population = pop, # population  at geographic level for stratification
    strata = strata,
    geo_level = geo,
    geo_column=geo_column,
    year.pop = year.pop,
    use.gender = use.gender
  )

  l = list(...)
  if(length(l) > 0) {
    for(n in names(l)) {
      data[[n]] = l[[n]]
    }
  }
  structure(data, class="design.incidence")
}

#' Define structure containing incidence results output and some metadata
#'
#' This function which allow to pass some extra parameters to the output function \code{\link{calc_adjusted_incidence}}
#'
#' @param types types of measures computed
#' @param conf.int confidence interval computed
#' @param adjust adjusted measures
#' @param ... data sets
#' @export
output_incidence = function(types, conf.int=T, adjust=T, ...) {
  data = list(types=types)
  l = list(...)
  if(length(l) > 0) {
    for(n in names(l)) {
      data[[n]] = l[[n]]
    }
  }
  ss = structure(data, class="output.incidence")
  attr(ss, 'conf.int') <- conf.int
  attr(ss, 'calc.adj') <- adjust
  ss
}


#' Compute confidence interval for rate
#' using poisson distribution law
#' @param inc data.frame from incidence estimator
#' @param v column name containing syndrome count
#' @param type crude to create rate column name
#' @param unit if rate need to be rescale
calc.conf.int = function(inc, v, type='crude', unit=1) {
  y = inc[[v]]
  i = is.na(y)
  y[i] = 0 # Fake 0 value will be replaced by NA
  d = epitools::pois.exact(y, inc$active)
  cc = c('upper','lower')
  d = d[, cc]
  d[i, cc] = NA
  d$upper = d$upper * unit
  d$lower = d$lower * unit
  names(d) <- paste(v, type, cc, sep='.')
  d
}


#' Compute crude & adjusted incidence from a count data
#'
#' This function is shared by all incidence computation methods, once active participants & syndrome counts are provided
#'
#' Ouput columns are ([syndrome] is the column with logical value for presence of one syndrome in the weekly data, for example "ili"):
#'  - active : count of active participants
#'  - [syndrome] : count for this syndrome (or variable)
#'  - [syndrome].crude : crude incidence (without adjustment, count/active participant ratio)
#'  - [syndrome].adj : Adjusted incidence (taking into account of stratification design)
#'  - [syndrome].crude.(upper|lower) : upper & lower bound of CI95 for crude incidence (Exact poisson CI)
#'  - [syndrome].adj.(upper|lower) : upper & lower boud of CI95 for ajusted incidence (DKES method)
#'
#' @param count.week data.frame(), count for a week. Count of active participants is in "active" column
#' @param design design structure (defining strata and reference data like population in each strata)
#' @param syndromes char() list of column names for each syndrome count
#' @param output char() list of requested output "inc"=global incidence, "zlow"=lower geo level incidence, "age" by age incidence,
#' @export
calc_adjusted_incidence = function(count.week, design, syndromes, output) {

  if(nrow(count.week) == 0) {
    return(list())
  }

  if(class(output) == "character") {
    output.obj = output_incidence(output)
  } else {
    if(!"output.incidence" %in% class(output)) {
      stop("output should be a character vector or a output.incidence object")
    }
    output.obj = output
  }

  conf.int = attr(output.obj, 'conf.int')
  calc.adj = attr(output.obj, 'calc.adj')
  output = output.obj$types

  calc_inc_strata = function(count, column.strata, column.prop) {

    if(calc.adj && conf.int) {
      count$active.weight = count[ , column.prop ] / count$active
    }

    if( length(column.strata) > 1 ) {
      g = as.list(count[, column.strata])
    } else {
      g = list(count[, column.strata])
      names(g) <- column.strata
    }

    if( calc.adj ) {

      for(v in syndromes) {
        v.adj = paste0(v,'.adj')
        count[, v.adj ] = count[, column.prop] * as.vector(count[, v]) / count$active
        if(conf.int) {
          # Estimated variance of the weighted counts is sum(wi^2 * count) for a strata
          # see ref below
          v.w2 = paste0(v, '.w2')
          count[, v.w2 ] = count$active.weight^2 * count[, v]
        }
      }

      v = paste0(syndromes,'.adj')
      if(conf.int) {
        v.w2 = paste0(syndromes,'.w2')
      } else {
        v.w2 = c()
      }

      inc.adj = aggregate(as.list(count[, c(v, v.w2)]), g, sum, na.rm=T)
    }
    inc.crude = aggregate(as.list(count[, c(syndromes, 'active' )]), g, sum, na.rm=T)
    inc.crude[ paste0(syndromes,'.crude') ] = inc.crude[syndromes] / inc.crude$active

    if(conf.int) {
      for(v in syndromes) {
        d = calc.conf.int(inc.crude, v, unit=1, type="crude")
        inc.crude = cbind(inc.crude, d)
      }
    }

    if( calc.adj ) {

      inc.crude = cbind(inc.crude, inc.adj)
      if(conf.int) {
        # Now compute adjusted confidence interval
        # Based on DKES estimator Fay & Feuer, 1997, Stat In Med (16) p791-801
        # Based on Poisson confidence interval using a rescaled distribution
        # y = weighted rate, xi = count in strata; wi = weight in strata; Ni = Sample size in (i)th strata;
        # x = sum(xi); popi = population in strata i; popT = Total population (all strata = sum(popi))
        # Up(x) & Lp(x) 1/2* Chisq quantile for respectively (1 - a / 2, DF=2(x+1)) and (a/2, DF=2x)
        # wi = 1/Ni * (popi / popT)
        # v = sum(wi^2 * xi)
        # U(y) = y + ( sqrt(v) / sqrt(x) ) * ( Up(x) - x)
        # L(y) = y + ( sqrt(v) / sqrt(x) ) * ( Lp(x) - x)

        conf.level = .95
        N. = 1 - ((1 - conf.level)/2)

        for(v in syndromes) {
          v.w2 = paste0(v, '.w2') # Variance of count
          v.adj = paste0(v, '.adj')
          x = inc.crude[, v] # Total Counts

          x.up = 0.5 * stats::qchisq(1 - N., 2 * (x + 1), lower.tail= F)

          inc.crude[, paste0(v.adj, '.upper') ] = inc.crude[, v.adj] + ( sqrt( inc.crude[, v.w2]) / sqrt(x) ) * (x.up - x)

          x.low = 0.5 * stats::qchisq(N., 2 * x, lower.tail=F)

          inc.crude[, paste0(v.adj, '.lower') ] = inc.crude[, v.adj] + ( sqrt( inc.crude[, v.w2]) / sqrt(x) ) * (x.low - x)
        }
      }
    }
    inc.crude
  }

  # result data
  r = list()

  # No stratification
  # compute crude incidence for each row (assuming that are independent count)
  if( is.null(design) ) {
    if( any(output != "inc") ) {
      warning("design is NULL, cannot compute output type other than overall incidence")
    }
    for(v in syndromes) {
      count.week[, paste0(v, '.crude') ] = count.week[, v ] / count.week$active
      if(conf.int) {
        d = calc.conf.int(count.week, v, unit=1, type="crude")
        count.week = cbind(count.week, d)
      }
    }
    r$inc = count.week
    return(r)
  }

  # List of stratification columns
  strata = design$strata

  # population by age-group at the geo level
  population = design$population

  `%>%` <- dplyr::`%>%`

  if(design$use.gender) {
    # Population is provided with gender in extra columns, we need to use them instead of population
    population = population %>% dplyr::select(-population)
    genders = c('male','female')
    # Pivot to longer format with gender as extra column
    population = dplyr::bind_rows(lapply(genders, function(g) {
      cols = names(population)
      cols = c(cols[ !cols %in% genders], g) # only keep non gender columns and add the currrent one
      population %>%
        dplyr::select(!!!syms(cols)) %>%
        dplyr::rename(population := !!sym(g)) %>%
        dplyr::mutate(gender=!!g)
    }))
  }

  pop.total = sum(population$population)

  population$prop.pop.all = population$population / pop.total # Proportion of a given age-group in overall population (i.e  (prop pop in geo levels) * (pop age-group in each level) )

  count.week = merge(count.week, population[, c(strata, 'prop.pop.all', 'population')], by=strata, all=T)

  # Incidence calculation by strata with adjustment
  # count = count data.frame (syndrome + active), column.strata = strata column, column.prop= column with adjustment factor
  if( !is.null(count.week$yw ) ) {
    group = "yw"
  } else {
    group = c()
  }

  if("inc" %in% output) {
    # Overall incidence
    count.week$zone = 1
    r$inc = calc_inc_strata(count.week, column.strata=c("zone", group), column.prop='prop.pop.all')
  }

  if( any( c('zlow','age') %in% output ) ) {
    # population in each geographic area (at lower level)
    pop.zlow = aggregate(stats::as.formula(paste("population ~ ", design$geo_column)), data=population, sum)
    names(pop.zlow) <- c(design$geo_column, 'pop.zlow')
    count.week = merge(count.week, pop.zlow, by=design$geo_column, all.x=T)
  }

  if("zlow" %in% output) {
    count.week$prop.age.zlow = count.week$population / count.week$pop.zlow # proportion of an age-group in each geographic area (at the lower level)
    r$zlow = calc_inc_strata(count.week, column.strata=c(group, design$geo_column), column.prop='prop.age.zlow')
  }
  if("age" %in% output) {
    count.week$prop.zlow = count.week$pop.zlow / pop.total # proportion of each geographic level in overall population
    r$age = calc_inc_strata(count.week, column.strata=c(group, "age.cat"), column.prop='prop.zlow')
  }
  if("count" %in% output) {
    r$count = count.week
  }
  r
}
cturbelin/ifnBase documentation built on Nov. 5, 2023, 12:54 p.m.