# R/QuasipoissonTestPredictData.r In raubreywhite/dashboards_sykdomspuls: Norwegian Syndromic Surveillance System - NorSySS

#### Documented in QuasipoissonTrainPredictData

```FormatDatasetDaily <- function(data) {
# variables used in data.table functions in this function
. <- NULL
trend <- NULL
dayOfYear <- NULL
dayOfWeek <- NULL
denominator <- NULL
# end

data[, trend := as.numeric(date) - 13000]
data[, dayOfYear := data.table::yday(date)]
data[, dayOfWeek := data.table::wday(date)]
data[denominator == 0, denominator := 1]

return(data)
}

FormatDatasetWeekly <- function(data, weeklyDenominatorFunction = sum) {
# variables used in data.table functions in this function
. <- NULL
n <- NULL
denominator <- NULL
trend <- NULL
pop <- NULL
HelligdagIndikator <- NULL
# end

data <- data[year >= 2006 & week %in% 1:52]
data[, trend := as.numeric(date) - 13000]
data <- data[, .(
n = sum(n),
denominator = weeklyDenominatorFunction(denominator),
HelligdagIndikator = mean(HelligdagIndikator),
trend = mean(trend)
),
by = .(date, year, week)
]

data <- data[, .(
n = sum(n),
denominator = weeklyDenominatorFunction(denominator),
HelligdagIndikator = mean(HelligdagIndikator),
trend = mean(trend)
),
by = .(year, week)
]

data[denominator == 0, denominator := 1]

return(data)
}

#' Quasipoisson Core Algorithm
#'
#' This is the core algorithm of sykdomspulsen.
#'
#' Description: Applys a surveillance algorithm based on a quasi-poisson regression
#' model to the selected data. The difference from the Farrington algorithm is in how
#' seasonality is accounted for (here it is adjusted for, in Farrington it is removed
#' by design.
#'
#' @param datasetTrain Training data.table
#' @param datasetPredict Prediction data.table
#' @param reweights Number (greater or equal to 0) of residual reweights adjusting for previous outbreaks (default: 1; 1 reweight)
#' @param remove.pandemic.year true/false (default: false; keep 2009 data)
#' @param remove.highcounts Number between 0 and 1 of fraction of high counts to be removed from prediction, to remove impact of earlier outbreaks (default: 0)
#' @param sign.level Significance level for the prediction intervals (default: 0.05)
#' @param isDaily Is it daily data or weekly data?
#' @param v Version (Not in use)
#' @param weeklyDenominatorFunction sum or mean - should the denominator be summed or meaned over time
#' @importFrom glm2 glm2
#' @import data.table
#' @import stringr
#' @import stats
#' @export QuasipoissonTrainPredictData
QuasipoissonTrainPredictData <- function(
datasetTrain,
datasetPredict,
reweights = 1,
remove.pandemic.year = F,
remove.highcounts = 0,
sign.level = 0.05,
isDaily = TRUE,
v = 1,
weeklyDenominatorFunction = sum) {
# variables used in data.table functions in this function
consult <- NULL
n <- NULL
threshold0 <- NULL
threshold2 <- NULL
threshold4 <- NULL
threshold6 <- NULL
zscore <- NULL
cumE1 <- NULL
cumL1 <- NULL
cumU1 <- NULL
failed <- NULL
revcumE1 <- NULL
revcumL1 <- NULL
revcumU1 <- NULL
denominator <- NULL
displayDay <- NULL
# end

# FUNCTION quasipoisson.algorithm
#
# Description: Applys a surveillance algorithm based on a quasi-poisson regression
# model to the selected data. The difference from the Farrington algorithm is in how
# seasonality is accounted for (here it is adjusted for, in Farrington it is removed
# by design.
#
# Input (arguments):
# dataset: data frame with columns for number of cases, covariates and dates per day
# datecol: name of date-column (default: 'Dato')
# predinterval: length of prediction interval (default: 30; last 30 days)
# historical.data.years: number (should be greater or equal to at least 3) of full years of background data (default: 5)
# mod.pred.window: number (greater or equal to 0) of days window between datasets used for modelling and prediction(default: 90)
# reweights: number (greater or equal to 0) of residual reweights adjusting for previous outbreaks (default: 1; 1 reweight)
# remove.pandemic.year: true/false (default: false; keep 2009 data)
# remove.highcounts: number between 0 and 1 of fraction of high counts to be removed from prediction, to remove impact of earlier outbreaks (default: 0)
# sign.level: significance level for the prediction intervals (default: 5%)

datasetTrain[denominator == 0, denominator := 1]
datasetTrain[, year := as.numeric(format.Date(date, "%G"))] # Week-based year, instead of normal year (%Y)
datasetTrain[, week := as.numeric(format.Date(date, "%V"))] # Week-based year, instead of normal year (%Y)
# datasetTrain[,week := data.table::isoweek(date)] #ISO-week, instead of others (%W and %U)

datasetPredict[denominator == 0, denominator := 1]
datasetPredict[, year := as.numeric(format.Date(date, "%G"))] # Week-based year, instead of normal year (%Y)
datasetPredict[, week := as.numeric(format.Date(date, "%V"))] # Week-based year, instead of normal year (%Y)
# datasetPredict[,week := data.table::isoweek(date)] #ISO-week, instead of others (%W and %U)

# SET REGRESSION FORMULA:
if (isDaily) {
regformula <- n ~ offset(log(denominator)) + trend + factor(dayOfWeek) + sin(2 * pi * dayOfYear / 366) + cos(2 * pi * dayOfYear / 366) + HelligdagIndikator

datasetTrain <- FormatDatasetDaily(datasetTrain)
datasetPredict <- FormatDatasetDaily(datasetPredict)
} else {
regformula <- n ~ offset(log(denominator)) + trend + sin(2 * pi * (week - 1) / 52) + cos(2 * pi * (week - 1) / 52) + HelligdagIndikator

datasetTrain <- FormatDatasetWeekly(datasetTrain, weeklyDenominatorFunction = weeklyDenominatorFunction)
datasetPredict <- FormatDatasetWeekly(datasetPredict, weeklyDenominatorFunction = weeklyDenominatorFunction)
}

if (remove.pandemic.year == T) {
datasetTrain <- datasetTrain[year != "2009"]
}

# If chosen, remove upper given percentage of counts from the prediction:
if (remove.highcounts > 0) {
datasetTrain <- datasetTrain[n < quantile(n, (1 - remove.highcounts)), ]
}

# FIT QUASI-POISSON REGRESSION MODEL ON THE TRAINING SET:
normalFunction <- function(regformula, datasetTrain) {
fit <- glm2::glm2(regformula, data = datasetTrain, family = quasipoisson, na.action = na.omit)
return(list(fit = fit, failed = !fit\$converged))
}
exceptionalFunction <- function(err) {
return(list(fit = NaN, failed = TRUE))
}
poisreg <- tryCatch(normalFunction(regformula, datasetTrain), error = exceptionalFunction, warning = exceptionalFunction)

if (poisreg\$failed) {
datasetPredict[, threshold0 := 0.0]
datasetPredict[, threshold2 := 5.0]
datasetPredict[, threshold4 := 10.0]
datasetPredict[, threshold6 := 15.0]
datasetPredict[, zscore := 0.0]

datasetPredict[, cumE1 := 0.0]
datasetPredict[, cumL1 := -5.0]
datasetPredict[, cumU1 := 5.0]
datasetPredict[, failed := TRUE]
} else {
# REFIT THE REGRESSION USING RESIDUAL WEIGHTS (TO DOWNWEIGHT PREVIOUS OUTBREAKS):
w_i <- rep(1, nrow(datasetTrain))
datasetTrain <- cbind(datasetTrain, w_i)

for (i in sort(1:reweights)) {
dispersion_parameter <- summary(poisreg\$fit)\$dispersion
if (i == 0) {
break
}
try({
anscombe.res <- anscombe.residuals(poisreg\$fit, dispersion_parameter)
anscombe.res[anscombe.res < 1] <- 1 # Alt. 2.58?
datasetTrain[, w_i := anscombe.res^(-2)] # The weight
Gamma <- nrow(datasetTrain) / sum(datasetTrain\$w_i)
datasetTrain[, w_i := Gamma * w_i] # Makes sum(w_i) = n
poisreg\$fit <- glm2::glm2(regformula, data = datasetTrain, weights = w_i, family = quasipoisson, na.action = na.omit)
dispersion_parameter <- summary(poisreg\$fit)\$dispersion
od <- max(1, sum(poisreg\$fit\$weights * poisreg\$fit\$residuals^2) / poisreg\$fit\$df.r)
}, TRUE)
}

# CALCULATE SIGNAL THRESHOLD (prediction interval from Farrington 1996):
pred <- predict(poisreg\$fit, type = "response", se.fit = T, newdata = datasetPredict)
datasetPredict[, threshold0 := pred\$fit]
datasetPredict[, threshold2 := FarringtonThreshold(pred, phi = dispersion_parameter, z = 2, skewness.transform = "2/3")]
datasetPredict[, threshold4 := FarringtonThreshold(pred, phi = dispersion_parameter, z = 4, skewness.transform = "2/3")]
datasetPredict[, threshold6 := FarringtonThreshold(pred, phi = dispersion_parameter, z = 6, skewness.transform = "2/3")]
datasetPredict[, zscore := FarringtonZscore(pred, phi = dispersion_parameter, z = 6, skewness.transform = "2/3", y = n)]

datasetPredict[, stderr := FarringtonSEinGammaSpace(pred, phi = dispersion_parameter, z = 6, skewness.transform = "2/3")]
datasetPredict[, cumE1 := n^(2 / 3) - threshold0^(2 / 3)]
datasetPredict[, cumL1 := (cumE1 - 2 * stderr)^(3 / 2)]
datasetPredict[, cumU1 := (cumE1 + 2 * stderr)^(3 / 2)]
datasetPredict[, cumE1 := cumE1^(3 / 2)]

datasetPredict[, revcumE1 := threshold0^(2 / 3) - n^(2 / 3)]
datasetPredict[, revcumL1 := (revcumE1 - 2 * stderr)^(3 / 2)]
datasetPredict[, revcumU1 := (revcumE1 + 2 * stderr)^(3 / 2)]
datasetPredict[, revcumE1 := revcumE1^(3 / 2)]

datasetPredict[is.nan(cumE1), cumE1 := -revcumE1]
datasetPredict[is.nan(cumL1), cumL1 := -revcumU1]
datasetPredict[is.nan(cumU1), cumU1 := -revcumL1]
datasetPredict[, revcumE1 := NULL]
datasetPredict[, revcumL1 := NULL]
datasetPredict[, revcumU1 := NULL]
datasetPredict[, stderr := NULL]
datasetPredict[, failed := FALSE]
}