R/QuasipoissonTestPredictData.r

Defines functions QuasipoissonTrainPredictData FormatDatasetWeekly FormatDatasetDaily

Documented in QuasipoissonTrainPredictData

FormatDatasetDaily <- function(data) {
  # variables used in data.table functions in this function
  . <- NULL
  trend <- NULL
  dayOfYear <- NULL
  dayOfWeek <- NULL
  denominator <- NULL
  # end

  data[, trend := as.numeric(date) - 13000]
  data[, dayOfYear := data.table::yday(date)]
  data[, dayOfWeek := data.table::wday(date)]
  data[denominator == 0, denominator := 1]

  return(data)
}

FormatDatasetWeekly <- function(data, weeklyDenominatorFunction = sum) {
  # variables used in data.table functions in this function
  . <- NULL
  n <- NULL
  denominator <- NULL
  trend <- NULL
  pop <- NULL
  HelligdagIndikator <- NULL
  # end

  data <- data[year >= 2006 & week %in% 1:52]
  data[, trend := as.numeric(date) - 13000]
  data <- data[, .(
    n = sum(n),
    denominator = weeklyDenominatorFunction(denominator),
    HelligdagIndikator = mean(HelligdagIndikator),
    trend = mean(trend)
  ),
  by = .(date, year, week)
  ]

  data <- data[, .(
    n = sum(n),
    denominator = weeklyDenominatorFunction(denominator),
    HelligdagIndikator = mean(HelligdagIndikator),
    trend = mean(trend)
  ),
  by = .(year, week)
  ]

  data[denominator == 0, denominator := 1]

  return(data)
}


#' Quasipoisson Core Algorithm
#'
#' This is the core algorithm of sykdomspulsen.
#'
#' Description: Applys a surveillance algorithm based on a quasi-poisson regression
#' model to the selected data. The difference from the Farrington algorithm is in how
#' seasonality is accounted for (here it is adjusted for, in Farrington it is removed
#' by design.
#'
#' @param datasetTrain Training data.table
#' @param datasetPredict Prediction data.table
#' @param reweights Number (greater or equal to 0) of residual reweights adjusting for previous outbreaks (default: 1; 1 reweight)
#' @param remove.pandemic.year true/false (default: false; keep 2009 data)
#' @param remove.highcounts Number between 0 and 1 of fraction of high counts to be removed from prediction, to remove impact of earlier outbreaks (default: 0)
#' @param sign.level Significance level for the prediction intervals (default: 0.05)
#' @param isDaily Is it daily data or weekly data?
#' @param v Version (Not in use)
#' @param weeklyDenominatorFunction sum or mean - should the denominator be summed or meaned over time
#' @importFrom glm2 glm2
#' @import data.table
#' @import stringr
#' @import stats
#' @export QuasipoissonTrainPredictData
QuasipoissonTrainPredictData <- function(
                                         datasetTrain,
                                         datasetPredict,
                                         reweights = 1,
                                         remove.pandemic.year = F,
                                         remove.highcounts = 0,
                                         sign.level = 0.05,
                                         isDaily = TRUE,
                                         v = 1,
                                         weeklyDenominatorFunction = sum) {
  # variables used in data.table functions in this function
  consult <- NULL
  n <- NULL
  threshold0 <- NULL
  threshold2 <- NULL
  threshold4 <- NULL
  threshold6 <- NULL
  zscore <- NULL
  cumE1 <- NULL
  cumL1 <- NULL
  cumU1 <- NULL
  failed <- NULL
  revcumE1 <- NULL
  revcumL1 <- NULL
  revcumU1 <- NULL
  denominator <- NULL
  displayDay <- NULL
  # end

  # FUNCTION quasipoisson.algorithm
  #
  # Description: Applys a surveillance algorithm based on a quasi-poisson regression
  # model to the selected data. The difference from the Farrington algorithm is in how
  # seasonality is accounted for (here it is adjusted for, in Farrington it is removed
  # by design.
  #
  # Input (arguments):
  # dataset: data frame with columns for number of cases, covariates and dates per day
  # datecol: name of date-column (default: 'Dato')
  # predinterval: length of prediction interval (default: 30; last 30 days)
  # historical.data.years: number (should be greater or equal to at least 3) of full years of background data (default: 5)
  # mod.pred.window: number (greater or equal to 0) of days window between datasets used for modelling and prediction(default: 90)
  # reweights: number (greater or equal to 0) of residual reweights adjusting for previous outbreaks (default: 1; 1 reweight)
  # remove.pandemic.year: true/false (default: false; keep 2009 data)
  # remove.highcounts: number between 0 and 1 of fraction of high counts to be removed from prediction, to remove impact of earlier outbreaks (default: 0)
  # sign.level: significance level for the prediction intervals (default: 5%)

  datasetTrain[denominator == 0, denominator := 1]
  datasetTrain[, year := as.numeric(format.Date(date, "%G"))] # Week-based year, instead of normal year (%Y)
  datasetTrain[, week := as.numeric(format.Date(date, "%V"))] # Week-based year, instead of normal year (%Y)
  # datasetTrain[,week := data.table::isoweek(date)] #ISO-week, instead of others (%W and %U)

  datasetPredict[denominator == 0, denominator := 1]
  datasetPredict[, year := as.numeric(format.Date(date, "%G"))] # Week-based year, instead of normal year (%Y)
  datasetPredict[, week := as.numeric(format.Date(date, "%V"))] # Week-based year, instead of normal year (%Y)
  # datasetPredict[,week := data.table::isoweek(date)] #ISO-week, instead of others (%W and %U)

  # SET REGRESSION FORMULA:
  if (isDaily) {
    regformula <- n ~ offset(log(denominator)) + trend + factor(dayOfWeek) + sin(2 * pi * dayOfYear / 366) + cos(2 * pi * dayOfYear / 366) + HelligdagIndikator

    datasetTrain <- FormatDatasetDaily(datasetTrain)
    datasetPredict <- FormatDatasetDaily(datasetPredict)
  } else {
    regformula <- n ~ offset(log(denominator)) + trend + sin(2 * pi * (week - 1) / 52) + cos(2 * pi * (week - 1) / 52) + HelligdagIndikator

    datasetTrain <- FormatDatasetWeekly(datasetTrain, weeklyDenominatorFunction = weeklyDenominatorFunction)
    datasetPredict <- FormatDatasetWeekly(datasetPredict, weeklyDenominatorFunction = weeklyDenominatorFunction)
  }

  if (remove.pandemic.year == T) {
    datasetTrain <- datasetTrain[year != "2009"]
  }

  # If chosen, remove upper given percentage of counts from the prediction:
  if (remove.highcounts > 0) {
    datasetTrain <- datasetTrain[n < quantile(n, (1 - remove.highcounts)), ]
  }

  # FIT QUASI-POISSON REGRESSION MODEL ON THE TRAINING SET:
  normalFunction <- function(regformula, datasetTrain) {
    fit <- glm2::glm2(regformula, data = datasetTrain, family = quasipoisson, na.action = na.omit)
    return(list(fit = fit, failed = !fit$converged))
  }
  exceptionalFunction <- function(err) {
    return(list(fit = NaN, failed = TRUE))
  }
  poisreg <- tryCatch(normalFunction(regformula, datasetTrain), error = exceptionalFunction, warning = exceptionalFunction)

  if (poisreg$failed) {
    datasetPredict[, threshold0 := 0.0]
    datasetPredict[, threshold2 := 5.0]
    datasetPredict[, threshold4 := 10.0]
    datasetPredict[, threshold6 := 15.0]
    datasetPredict[, zscore := 0.0]

    datasetPredict[, cumE1 := 0.0]
    datasetPredict[, cumL1 := -5.0]
    datasetPredict[, cumU1 := 5.0]
    datasetPredict[, failed := TRUE]
  } else {
    # REFIT THE REGRESSION USING RESIDUAL WEIGHTS (TO DOWNWEIGHT PREVIOUS OUTBREAKS):
    w_i <- rep(1, nrow(datasetTrain))
    datasetTrain <- cbind(datasetTrain, w_i)

    for (i in sort(1:reweights)) {
      dispersion_parameter <- summary(poisreg$fit)$dispersion
      if (i == 0) {
        break
      }
      try({
        anscombe.res <- anscombe.residuals(poisreg$fit, dispersion_parameter)
        anscombe.res[anscombe.res < 1] <- 1 # Alt. 2.58?
        datasetTrain[, w_i := anscombe.res^(-2)] # The weight
        Gamma <- nrow(datasetTrain) / sum(datasetTrain$w_i)
        datasetTrain[, w_i := Gamma * w_i] # Makes sum(w_i) = n
        poisreg$fit <- glm2::glm2(regformula, data = datasetTrain, weights = w_i, family = quasipoisson, na.action = na.omit)
        dispersion_parameter <- summary(poisreg$fit)$dispersion
        od <- max(1, sum(poisreg$fit$weights * poisreg$fit$residuals^2) / poisreg$fit$df.r)
      }, TRUE)
    }

    # CALCULATE SIGNAL THRESHOLD (prediction interval from Farrington 1996):
    pred <- predict(poisreg$fit, type = "response", se.fit = T, newdata = datasetPredict)
    datasetPredict[, threshold0 := pred$fit]
    datasetPredict[, threshold2 := FarringtonThreshold(pred, phi = dispersion_parameter, z = 2, skewness.transform = "2/3")]
    datasetPredict[, threshold4 := FarringtonThreshold(pred, phi = dispersion_parameter, z = 4, skewness.transform = "2/3")]
    datasetPredict[, threshold6 := FarringtonThreshold(pred, phi = dispersion_parameter, z = 6, skewness.transform = "2/3")]
    datasetPredict[, zscore := FarringtonZscore(pred, phi = dispersion_parameter, z = 6, skewness.transform = "2/3", y = n)]

    datasetPredict[, stderr := FarringtonSEinGammaSpace(pred, phi = dispersion_parameter, z = 6, skewness.transform = "2/3")]
    datasetPredict[, cumE1 := n^(2 / 3) - threshold0^(2 / 3)]
    datasetPredict[, cumL1 := (cumE1 - 2 * stderr)^(3 / 2)]
    datasetPredict[, cumU1 := (cumE1 + 2 * stderr)^(3 / 2)]
    datasetPredict[, cumE1 := cumE1^(3 / 2)]

    datasetPredict[, revcumE1 := threshold0^(2 / 3) - n^(2 / 3)]
    datasetPredict[, revcumL1 := (revcumE1 - 2 * stderr)^(3 / 2)]
    datasetPredict[, revcumU1 := (revcumE1 + 2 * stderr)^(3 / 2)]
    datasetPredict[, revcumE1 := revcumE1^(3 / 2)]

    datasetPredict[is.nan(cumE1), cumE1 := -revcumE1]
    datasetPredict[is.nan(cumL1), cumL1 := -revcumU1]
    datasetPredict[is.nan(cumU1), cumU1 := -revcumL1]
    datasetPredict[, revcumE1 := NULL]
    datasetPredict[, revcumL1 := NULL]
    datasetPredict[, revcumU1 := NULL]
    datasetPredict[, stderr := NULL]
    datasetPredict[, failed := FALSE]
  }

  datasetPredict <- AddXToWeekly(datasetPredict)
  datasetPredict <- AddWkyrAndDisplayDateToWeekly(datasetPredict)

  if (isDaily) {
    datasetPredict[, displayDay := date]
  } else {
    datasetPredict[, date := displayDay]
  }
  datasetPredict[, displayDay := as.Date(displayDay)]
  datasetPredict[, date := as.Date(date)]

  return(datasetPredict[, VARS$REQ_RESULTS_BASIC, with = F])
}
raubreywhite/dashboards_sykdomspuls documentation built on Nov. 29, 2018, 4:23 a.m.