## hide all code chunks in the output, but show errors
knitr::opts_chunk$set(echo = FALSE,         # hide all code chunks in output
                      error = TRUE,         # show errors if they appear, but don't stop
                      warning = FALSE,      # hide all warnings
                      message = FALSE,      # hide all messages
                      fig.width = 16*0.39,  # figure width for plots (cm to inch conversion)
                      fig.height = 20*0.39, # figure height for plots (cm to inch conversion)
                      tab.cap.style = "Table Caption",
                      tab.cap.pre = "Table ",
                      tab.cap.sep = ": ")

## Installing required packages for this template
required_packages <- c("here",         # find your files
                       "stringr",      # clean text
                       "epitrix",      # epi helpers and tricks
                       "dplyr",        # clean/shape data
                       "tidyr",        # clean/shape data
                       "purrr",        # for iterating over projects
                       "rio",          # read in data
                       "ggplot2",      # create plots and charts
                       "tsibble",      # for time series plotting
                       "slider",       # for moving averages
                       "feasts",       # for time series decomposition and autocorrelation
                       "forecast",     # fit sine and cosine terms to data (note: must load after feasts)
                       "imputeTS",     # for filling in missing values
                       "knitr",        # for making clean tables
                       "flextable",    # for making clean tables
                       "yardstick",    # for model forecast performance estimates
                       "surveillance", # for aberration methods
                       "officer",      # for formatting word output
                       "officedown",   # for formatting word output
                       "trending")     # for fitting regressions and forecasting

for (pkg in required_packages) {
  ## install packages if not already present
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg)
  }
  ## load packages in to the current session
  library(pkg, character.only = TRUE)
}

## Installing required packages from github
git_packages <- c("epitsa") # for msf data tools and tsa utilities

for (pkg in git_packages) {
  ## install packages from the appropriate repo if not already present
  if (!pkg %in% rownames(installed.packages())) {
    if (pkg == "epitsa") {
      remotes::install_github("R4EPI/epitsa")
    }
  }
  ## load packages in to the current session
  library(pkg, character.only = TRUE)
}
## whether analyses should be based on date of admission (if not, date of exit)
admissions_based <- FALSE

## whether to show detailed outputs (data quality and descriptive analysis)
show_details <- FALSE

## whether to run the time series analysis
include_tsa <- TRUE

## whether to include aberration methods (only relevant for TSA)
include_aberration <- FALSE

## which aberration method to use (options are: "farrington", "glrnb")
aberration_method <- "glrnb"

## define which wards are included by country
## "Medical" excludes ITFC + maternity in IPD ("Medical extension" includes them)
medical_wards <- if_else(z %in% c("Bangladesh", "South Sudan", "Nigeria",
                                  "Afghanistan", "DRC North Kivu"),
                         "Medical", "Medical extension")

## if you only want to run a specific country: uncomment below and define
# z <- "DRC North Kivu"
## define dates to plot
## the first date of interest
start_date <- yearweek("2014 W01") ## using tsibble

## the surveillance cut-off date (i.e. end of baseline, start of prediction period)
surveillance_date <- yearweek("2019-12-31")

## the last date of interest (i.e. end of prediction)
end_date <- yearweek("2020-12-31") ## using tsibble

## find how many weeks are in the period (year) of interest
num_weeks <- as.numeric(end_date - surveillance_date)

## define sequence of all weeks to label axes in plots
## get a vector of all the dates in between
all_dates <- as.Date(start_date:end_date, origin = "1970-01-01") ## using base

## pull the unique weeks which occur
all_weeks <- seq(start_date, end_date, by = 1) %>% factor() ## using tsibble

## change weeks to dates
all_epiweeks <- as.Date(seq(start_date, end_date, by = 1)) ## using tsibble
## define the filepath
file_path <- here::here("Data", "Processed", z)
## define breaks as weeks
date_breaks <- seq(as.Date(start_date), max(all_epiweeks), by = "6 months")

## define ggplot themes
tsa_axes <-
  ## set the x-axis limits based on weeks of interest, labelled every six months
  scale_x_date(breaks = date_breaks,
               date_labels = "%Y %b",
               limits = c(first(all_epiweeks),
                          last(all_epiweeks)))

tsa_theme <-
  ## give classic black/white axes for plots and set text size
  theme_classic() +
  theme(
    ## set the size for text
    text = element_text(size = 15),
    ## colour and size the grid lines in the plot
    panel.grid.major = element_line(colour = "grey90"),
    panel.grid.minor = element_line(colour = "grey90", size = 0.5),
    ## add space between faceted plots (axis labels run in to each other)
    panel.spacing = unit(1, "lines"),
    ## rotate x axis labels
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    ## put legend below plot
    legend.position = "bottom",
    legend.direction = "horizontal"
  )

## define appropriate label for the x-axis
week_lable <- if_else(admissions_based,
                      "Calendar week of admission",
                      "Calendar week of discharge")

## define appropriate label for use in figure captions
caption_lable <- if_else(admissions_based, "Admissions", "Discharges")

## define appropriate label for some y-axes
unit_lable <- str_c(caption_lable, " (n)")
## set the data source depending on the country
data_source <- if_else(z %in% c("CAR", "India"), "HIS", "Both")

## pull ipd data
clean_data <- read_msf_data(file_path,
                            site = "IPD",
                            data_source = data_source,
                            ## don't use DHIS team's chronic disease definitions
                            chronic_defs = FALSE)

## set admission and exit dates more than 1 year before the start year to NA
clean_data$date_of_admission[which(clean_data$date_of_admission < min(all_dates) - 365)] <- NA
clean_data$date_of_exit[which(clean_data$date_of_exit < min(all_dates) - 365)] <- NA

## set admission and exit dates in the future to NA (i.e. after current date)
clean_data$date_of_admission[which(clean_data$date_of_admission > Sys.Date())] <- NA
clean_data$date_of_exit[which(clean_data$date_of_exit > Sys.Date())] <- NA

## drop appropriate missings depending on whether going by admission or exit
if (admissions_based) {
  ## drop those with missing date of admission
  clean_data <- filter(clean_data, !is.na(date_of_admission))
  ## also need to drop missing exit for admissions based?
} else {
  ## drop those with missing date of exit
  clean_data <- filter(clean_data, !is.na(date_of_exit))
}

## drop duplicates, based on:
## "calendar_week", "project", "case_number", "age_years", "sex", "diagnosis_at_exit_primary"
clean_data <- filter(clean_data, !duplicate_row)
## create a categorical variable for diseases of interest, using linelist data
clean_data <- clean_data %>%
  ## rename diseases as appropriate
  ## set all those not of interest to "Other"
  mutate(disease = case_when(
    ## URTI was only introduced in DHIS2 - so do not use for TSA
    # diagnosis_at_exit_primary %in% c("Lower respiratory tract infection",
    #                                  "Upper respiratory tract infection") ~ "ARI",
    # diagnosis_at_exit_primary == "Upper respiratory tract infection" ~ "URTI",
    diagnosis_at_exit_primary == "Lower respiratory tract infection" ~ "LRTI",
    diagnosis_at_exit_primary %in% c("Diarrhoea, acute watery",
                                     "Diarrhoea, bloody") ~ "Diarrhoea",
    diagnosis_at_exit_primary == "Malaria" ~ "Malaria",
    diagnosis_at_exit_primary == "Tuberculosis, pulmonary" ~ "TB",
    ## HIV removed because collected in different tools (e.g. fuchia etc)
    # diagnosis_at_exit_primary == "HIV and HIV-related illnesses" ~ "HIV related",
    ## chronic diseases removed because of different definitions between HIS/DHIS2
    # diagnosis_at_exit_primary %in% c("Chronic diseases",
    #                                  "Acute hypertensive crisis",
    #                                  "Acute cardiac events",
    #                                  "Acute complications of diabetes",
    #                                  "Chronic cardio & cerebrovascular disease",
    #                                  "Chronic complications of diabetes",
    #                                  "Non-infectious respiratory diseases - chronic",
    #                                  "Non-infectious respiratory disease - acute exacerbation",
    #                                  "Epilepsy") ~ "Chronic diseases",
    TRUE ~ "Other"
  ))

## create an age group variable by specifying categorical breaks
clean_data$age_group <- sitrep::age_categories(clean_data$age_years,
                                               breakers = c(0, 3, 15, 30, 45)) %>%
  ## change NAs to "Missing"
  forcats::fct_explicit_na("Missing")

## remove extra words from project names
clean_data$project <- str_remove(clean_data$project,
                                 " Secondary Healthcare| Secondary Health care| Primary Healthcare| Primary healthcare| South Sudan Refugees| Healthcare")

## define unique projects (for looping over later)
projects <- unique(clean_data$project)

## drop inpatient TFC and maternity where appropriate
if (medical_wards == "Medical") {
  clean_data <- clean_data %>%
    filter(!admission_ward %in% c("Inpatient TFC", "Maternity ward"))
}
## pull ipd bed count data
bed_counts <- read_msf_data(file_path,
                            site = "IPD Beds",
                            ## pull from the appropriate data source (HIS or DHIS2)
                            data_source = data_source,
                            ## only count beds from appropriate wards
                            wards = medical_wards)

## remove extra words from project names
bed_counts$project <- str_remove(bed_counts$project,
                                 " Secondary Healthcare| Secondary Health care| Primary Healthcare| Primary healthcare| South Sudan Refugees| Healthcare")

## find duplicates and only keep the higher bed count
bed_counts <- bed_counts %>%
  group_by(country, project, report_year, week, month) %>%
  mutate(
    ## get the number of times a duplicate occurs
    num_dupes = n(),
    duped = if_else(num_dupes > 1, TRUE, FALSE)
  ) %>%
  arrange(-beds) %>%
  slice(1) %>%
  ungroup()
## pull timelines data
timelines <- rio::import(system.file("extdata", "timelines.xlsx", package = "epitsa"),
                         which = "msf_info") %>%
  ## filter to only keep the country currently being run
  filter(country == z & project %in% projects)

## make sure dates are formatted as dates
timelines$date_event <- as.Date(timelines$date_event, origin = "1899-12-30")

## make in to an epiweek
timelines$calendar_week <- yearweek(timelines$date_event)
This report is based on data management and data analysis scripts written by Alexander
Spina. If you have questions, please reach out to your epidemiology advisor.
The report uses data available for projects (`r str_c(projects, collapse = ", ")`)
in `r str_replace(z, "_", " ")` from old Excel-based HIS tools as well as DHIS2
databases as of `r format(Sys.Date(), format = "%d %B %Y")`.
For a breakdown of the time period covered by different data sources see the
appendix.
For more details see the concept note.
This report is structured by data from outpatient departments (OPD) and inpatient departments (IPD). [Note: OPD will be added in a later draft]
There are initial descriptive sections, for both OPD and IPD, to evaluate data quality and availability.
There is then a section on time series analysis, which uses only the most suitable data available. It has two parts: a surveillance analysis, which looks for excess cases of disease or deaths by week, and an impact analysis, which uses segmented regression to assess whether COVID-19 had an effect on service utilisation.
The first case of confirmed COVID-19 in the country was in
`r filter(timelines, event == "Covid country") %>% summarise(min(calendar_week)) %>% pull()`.
## use dataset of events by date
## save as object for later use
intervention_weeks <- timelines %>%
  ## only keep the dates for covid cases
  filter(event %in% c("Covid suspect", "Covid confirmed", "Covid region")) %>%
  ## only keep relevant columns
  select(project, calendar_week, event) %>%
  ## spread dataset in to wide format by project (event becomes columns)
  pivot_wider(id_cols = project,
              names_from = event,
              values_from = calendar_week)

## create table for output documents
## rename columns appropriately
intervention_weeks %>%
  select(Project = project,
         "Suspected (MSF linelist)" = "Covid suspect",
         "Confirmed (MSF linelist)" = "Covid confirmed",
         "Confirmed (region)" = "Covid region") %>%
  ## show as a formatted table in the output
  flextable() %>%
  autofit() %>%
  set_caption("Calendar week of Covid-19 case reports at MSF facilities and in the region, by project")

## only keep the regional date for use in segmented regression
intervention_weeks <- intervention_weeks %>%
  select(project, confirmed_region = "Covid region")
\newpage
(Epi-advisor to comment on most relevant results here)
\newpage
This section looks at availability and quality of data from inpatient departments for each project.
## define how to group (either by admission or exit date)
age_case_counts <- if (admissions_based) {
  group_by(clean_data, project, epiweek, age_group)
} else {
  group_by(clean_data, project, epiweek_death, age_group)
}

## get patient counts (admissions or discharges) by age group
age_patient_counts <- age_case_counts %>%
  summarise(disease = "Patients",
            total = n()) %>%
  ## remove grouping so can be worked with after
  ungroup()

## get counts for diseases by age group
age_case_counts <- age_case_counts %>%
  ## get counts for each disease
  count(disease, name = "total") %>%
  ## remove grouping so can be worked with after
  ungroup()

## if grouped by exit date - rename epiweek_death to epiweek
## this makes all code the same after this point
if (!admissions_based) {
  age_patient_counts <- rename(age_patient_counts, epiweek = epiweek_death)
  age_case_counts <- rename(age_case_counts, epiweek = epiweek_death)
}

## make patient counts tsibble conform - to fill in missings
age_patient_counts <- age_patient_counts %>%
  ## make epiweek in to a tsibble compatible week variable
  mutate(epiweek = yearweek(epiweek)) %>%
  ## make counts in to a tsibble so it recognises weeks as the main time variable
  as_tsibble(index = epiweek, key = c(project, disease, age_group)) %>%
  ## add in missing weeks with NAs
  fill_gaps() %>%
  ## make back in to a tibble for manipulation
  as_tibble()

## make case counts tsibble conform - to fill in missings
age_case_counts <- age_case_counts %>%
  mutate(epiweek = yearweek(epiweek)) %>%
  as_tsibble(index = epiweek, key = c(project, disease, age_group)) %>%
  fill_gaps() %>%
  as_tibble()

## get counts for disease overall
case_counts <- age_case_counts %>%
  ## summarise by project, week and disease (drop age_group)
  group_by(project, epiweek, disease) %>%
  summarise(total = sum(total, na.rm = TRUE)) %>%
  ## remove grouping so can be worked with after
  ungroup()

## get patient counts (admissions or discharges) by week
patient_counts <- age_patient_counts %>%
  group_by(project, epiweek) %>%
  summarise(disease = "Patients",
            total = sum(total, na.rm = TRUE)) %>%
  ungroup()

## get the overall death counts by age group
age_death_counts <- clean_data %>%
  group_by(project, epiweek_death, age_group) %>%
  summarise(disease = "Deaths",
            total = sum(inpatient_exit_status == "Dead", na.rm = TRUE)) %>%
  ## rename epiweek_death to epiweek (so can use with other counts)
  rename(epiweek = epiweek_death) %>%
  ungroup()

## get the disease specific death counts by age group
age_disease_death_counts <- clean_data %>%
  group_by(project, epiweek_death, age_group, disease) %>%
  summarise(deaths = sum(inpatient_exit_status == "Dead", na.rm = TRUE)) %>%
  rename(epiweek = epiweek_death) %>%
  ungroup()

## make overall death counts tsibble conform - to fill in missings
age_death_counts <- age_death_counts %>%
  mutate(epiweek = yearweek(epiweek)) %>%
  as_tsibble(index = epiweek, key = c(project, disease, age_group)) %>%
  fill_gaps() %>%
  as_tibble()

## make disease specific death counts tsibble conform - to fill in missings
age_disease_death_counts <- age_disease_death_counts %>%
  mutate(epiweek = yearweek(epiweek)) %>%
  as_tsibble(index = epiweek, key = c(project, disease, age_group)) %>%
  fill_gaps() %>%
  as_tibble()

## get the overall death counts
death_counts <- age_death_counts %>%
  ## summarise by project and week (drop age_group)
  group_by(project, epiweek) %>%
  summarise(disease = "Deaths",
            total = sum(total, na.rm = TRUE)) %>%
  ungroup()

## get the disease specific death counts
disease_death_counts <- age_disease_death_counts %>%
  ## summarise by project, week and disease (drop age_group)
  group_by(project, epiweek, disease) %>%
  summarise(deaths = sum(deaths, na.rm = TRUE))
## bind together counts and deaths
tile_counts <- bind_rows(patient_counts, case_counts, death_counts)

tile_counts <- tile_counts %>%
  ## only keep the three indicators of interest
  filter(disease %in% c("Patients", "LRTI", "Deaths")) %>%
  ## create a categorical variable for whether data and counts are available or not
  mutate(categories = case_when(
    is.na(total) ~ NA_character_,
    total == 0 ~ "Zero count",
    total > 0 ~ "Counts available"))
## using tile_counts, plot counts by week and colour by category
ggplot(filter(tile_counts, !disease %in% c("lrti_rt", "deaths_rt")),
       aes(x = as.Date(epiweek), y = disease, fill = categories)) +
  ## plot as tiles with a thin white border
  geom_tile(colour = "white", size = 0.25) +
  ## plot for each site individually
  facet_grid(project ~ .) +
  ## choose colours and set the category names for the legend
  scale_fill_manual(values = c("#009E73", "#E69F00"),
                    na.value = "Grey90",
                    labels = c("Counts (>0)", "Zero count", "Missing")) +
  ## add axes titles appropriately
  labs(x = week_lable, y = element_blank()) +
  ## add in theme and axis labels
  tsa_axes +
  tsa_theme
## using tile_counts
tile_counts %>%
  ## don't use the rate measures
  # filter(!disease %in% c("lrti_rt", "deaths_rt")) %>%
  ## only keep weeks before 2020
  filter(epiweek %in% all_epiweeks[all_epiweeks < as.Date(surveillance_date)]) %>%
  ## calculate by site and disease
  group_by(project, disease) %>%
  ## get the number of weeks with non-missing, non-zero counts
  ## and divide by the expected total
  summarise(complete_n = sum(!is.na(total) & total != 0),
            completeness = round(complete_n /
                                   sum(all_epiweeks < as.Date(surveillance_date)) * 100,
                                 digits = 1)) %>%
  ## change column names appropriately
  rename("Project" = project,
         "Disease" = disease,
         "Weeks with data (n)" = complete_n,
         "Completeness (%)" = completeness) %>%
  ## print as a clean table
  kable(caption = "Table: Completeness of data by project site and indicator for IPD until end 2019")
## using the case counts
admission_counts <- case_counts %>%
  ## make sure diseases appear in the correct order
  mutate(disease = factor(disease,
                          levels = c("Other",
                                     "LRTI",
                                     "Diarrhoea",
                                     "Malaria",
                                     "TB",
                                     # "HIV related",
                                     # "Chronic diseases",
                                     NULL # so others above can be commented out
                                     )))
## define the height to use for text labels
height <- admission_counts %>%
  ## for each project and week
  group_by(project, epiweek) %>%
  ## find the tallest peak
  summarise(height = sum(total)) %>%
  ## find the highest peak out of all projects
  summarise(height = max(height)) %>%
  ## take three quarters of that
  summarise(max(height) * 0.75) %>%
  ## return the value
  pull()

## using admission_counts, plot counts by week and colour by disease
ggplot() +
  ## plot for each site individually
  facet_grid(project ~ .) +
  ## plot as a bar chart, colour by disease, use counts as they are
  geom_bar(data = admission_counts,
           aes(x = as.Date(epiweek), y = total, fill = disease, colour = disease),
           stat = "identity") +
  ## add in dhis2 start date line and label
  geom_vline(data = filter(timelines, event == "Start DHIS2"),
             aes(xintercept = as.Date(calendar_week)),
             linetype = 2) +
  geom_label(data = filter(timelines, event == "Start DHIS2"),
             aes(x = as.Date(calendar_week), y = height, label = event)) +
  ## choose colours for boxes and outlines
  scale_fill_manual(values = c("Grey90",
                               "#72A162", ## green
                               "#485C6E", ## dark blue
                               "#9F4D48", ## red
                               "#F1B077", ## orange
                               # "#E2FCFF", ## sky blue
                               # "#AAA970", ## beige
                               NULL       ## so others can be commented out
                               )) +
  scale_colour_manual(values = c("Grey90",
                                 "#72A162", ## green
                                 "#485C6E", ## dark blue
                                 "#9F4D48", ## red
                                 "#F1B077", ## orange
                                 # "#E2FCFF", ## sky blue
                                 # "#AAA970", ## beige
                                 NULL
                                 )) +
  ## make y and x axes meet at the origin (x = 0, y = 0)
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  ## label the axes appropriately and remove legend titles
  labs(x = week_lable, y = unit_lable,
       colour = element_blank(), fill = element_blank()) +
  ## add in theme and axis labels
  tsa_axes +
  tsa_theme
## join case counts and patient counts
cases <- left_join(case_counts,
                   ## from patient counts only keep project and epiweek, and rename the
                   ## "total" column to "patients" (separate column on merge)
                   select(patient_counts, project, epiweek, patients = total),
                   ## combine project and epiweek as unique identifier
                   by = c("project", "epiweek"))

## set rows where admissions are less than 10 to NA (otherwise throws off everything)
cases[which(cases$patients < 10), c("total", "patients")] <- NA

## add in a rate column
cases <- cases %>%
  mutate(rate = total / patients * 10000)

## join deaths and counts
## this is in two steps because the denominators are different (overall vs diseases)
deaths1 <- left_join(death_counts,
                     ## from patient counts only keep project and epiweek, and rename the
                     ## "total" column to "patients" (separate column on merge)
                     select(patient_counts, project, epiweek, patients = total),
                     by = c("project", "epiweek")) %>%
  ## change disease to "Overall" so it is labelled more clearly
  mutate(disease = "Overall")

deaths2 <- left_join(disease_death_counts,
                     ## from case counts keep project, epiweek and disease, and rename
                     ## the "total" column to "patients" (separate column on merge)
                     select(case_counts, project, epiweek, patients = total, disease),
                     by = c("project", "epiweek", "disease")) %>%
  rename(total = deaths)

## combine overall and disease specific death counts
deaths <- bind_rows(deaths1, deaths2)

## set rows where admissions are less than 10 to NA (otherwise throws off everything)
## get weeks where patient admissions are low
low_count_weeks <- filter(patient_counts, total < 10) %>%
  mutate(filler = TRUE,
         epiweek = as.Date(epiweek)) %>%
  select(project, epiweek, filler)

## merge with deaths temporarily to get the weeks to set to empty
deaths <- left_join(deaths, low_count_weeks, by = c("project", "epiweek"))

## set patient and total columns to empty
deaths[which(deaths$filler), c("total", "patients")] <- NA

## drop extra variable
deaths <- select(deaths, -filler)

## add in a rate column
deaths <- deaths %>%
  mutate(rate = total / patients * 10000)

## drop the intermediate datasets
rm(deaths1, deaths2)
## join case counts and patient counts
age_cases <- left_join(age_case_counts,
                       ## from patient counts keep project, epiweek and age_group, and
                       ## rename the "total" column to "patients" (separate column on merge)
                       select(age_patient_counts, project, epiweek, age_group, patients = total),
                       ## combine project, epiweek and age_group as unique identifier
                       by = c("project", "epiweek", "age_group"))

## set rows where admissions are less than 10 to NA (otherwise throws off everything)
age_cases[which(age_cases$patients < 10), c("total", "patients")] <- NA

## add in a rate column
age_cases <- age_cases %>%
  mutate(rate = total / patients * 10000)

## join deaths and counts
## this is in two steps because the denominators are different (overall vs diseases)
deaths1 <- left_join(age_death_counts,
                     select(age_patient_counts, project, age_group, epiweek, patients = total),
                     by = c("project", "epiweek", "age_group")) %>%
  ## change disease to "Overall" so it is labelled more clearly
  mutate(disease = "Overall")

deaths2 <- left_join(age_disease_death_counts,
                     select(age_case_counts, project, epiweek, age_group, patients = total, disease),
                     by = c("project", "epiweek", "disease", "age_group")) %>%
  rename(total = deaths)

## combine overall and disease specific death counts
age_deaths <- bind_rows(deaths1, deaths2)

## set rows where admissions are less than 10 to NA (otherwise throws off everything)
## get weeks where patient admissions are low
low_count_weeks <- filter(age_patient_counts, total < 10) %>%
  mutate(filler = TRUE,
         epiweek = as.Date(epiweek)) %>%
  select(project, epiweek, age_group, filler)

## merge with deaths temporarily to get the weeks to set to empty
age_deaths <- left_join(age_deaths, low_count_weeks,
                        by = c("project", "epiweek", "age_group"))

## set patient and total columns to empty
age_deaths[which(age_deaths$filler), c("total", "patients")] <- NA

## drop extra variable
age_deaths <- select(age_deaths, -filler)

## add in a rate column
age_deaths <- age_deaths %>%
  mutate(rate = total / patients * 10000)

## drop the intermediate datasets
rm(deaths1, deaths2)
## plot case counts by week
ggplot(filter(cases, disease %in% c("Diarrhoea", "LRTI", "Malaria")),
       aes(x = as.Date(epiweek), y = total)) +
  ## plot as a line chart
  ## identity specifies to use the counts as they are
  geom_line(stat = "identity") +
  ## plot for each site and disease individually
  facet_grid(project + disease ~ .) +
  ## make y and x axes meet at the origin (x = 0, y = 0)
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  ## add axis titles appropriately
  labs(x = week_lable, y = "Cases (n)") +
  ## add in theme and axis labels
  tsa_axes +
  tsa_theme
## plot case rates by week
ggplot(filter(cases, disease %in% c("Diarrhoea", "LRTI", "Malaria")),
       aes(x = as.Date(epiweek), y = rate)) +
  ## plot as a line chart
  geom_line(stat = "identity") +
  ## plot for each site and disease individually
  facet_grid(project + disease ~ .) +
  ## make y and x axes meet at the origin (x = 0, y = 0)
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  ## add axis titles appropriately
  labs(x = week_lable, y = "Rate (cases/10,000 discharges)") +
  ## add in theme and axis labels
  tsa_axes +
  tsa_theme
## plot age-specific case rates by week
ggplot(filter(age_cases, disease %in% c("LRTI")),
       aes(x = as.Date(epiweek), y = rate)) +
  ## plot as a line chart
  geom_line(stat = "identity") +
  ## plot for each site and age group individually
  facet_grid(project + age_group ~ .) +
  ## make y and x axes meet at the origin (x = 0, y = 0)
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  ## add axis titles appropriately
  labs(x = week_lable, y = "Rate (cases/10,000 discharges)") +
  ## add in theme and axis labels
  tsa_axes +
  tsa_theme
## plot death counts by week
ggplot(filter(deaths, disease %in% c("Overall", "LRTI", "Malaria")),
       aes(x = epiweek, y = total)) +
  ## plot as a line chart
  geom_line(stat = "identity") +
  ## plot for each site and disease individually
  facet_grid(project + disease ~ .) +
  ## make y and x axes meet at the origin (x = 0, y = 0)
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  ## add axis titles appropriately
  labs(x = week_lable, y = "Deaths (n)") +
  ## add in theme and axis labels
  tsa_axes +
  tsa_theme
## plot mortality rates by week
ggplot(filter(deaths, disease %in% c("Overall", "LRTI", "Malaria")),
       aes(x = epiweek, y = rate)) +
  ## plot as a line chart
  geom_line(stat = "identity") +
  ## plot for each site and disease individually
  facet_grid(project + disease ~ .) +
  ## make y and x axes meet at the origin (x = 0, y = 0)
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  ## add axis titles appropriately
  labs(x = week_lable, y = "Mortality (deaths/10,000 discharges)") +
  ## add in theme and axis labels
  tsa_axes +
  tsa_theme
\newpage
This section uses the most appropriate data from project sites to look at
trends over time. It is divided into two parts. The first (surveillance) uses
regression forecasts to detect abnormal changes in LRTI and death rates,
evaluating whether rates are higher or lower than would be expected based on
historical trends.
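To make the approach concrete, below is a minimal sketch of the idea (simulated data, and a single seasonal harmonic fitted with `MASS::glm.nb` rather than the `trending` and `forecast::fourier` pipeline used later in this report): a negative binomial regression with a linear trend and sine/cosine seasonality, used to forecast the following year.

```r
## a minimal sketch, assuming simulated weekly counts (illustrative only)
library(MASS) # for glm.nb()

set.seed(1)
week    <- 1:260                                       # five years of weekly data
mu_true <- exp(3 + 0.002 * week + 0.4 * sin(2 * pi * week / 52))
cases   <- rnbinom(260, mu = mu_true, size = 10)       # overdispersed counts

## negative binomial regression with a linear trend and one seasonal harmonic
fit <- glm.nb(cases ~ week + sin(2 * pi * week / 52) + cos(2 * pi * week / 52))

## forecast the next 52 weeks from the fitted baseline
pred <- predict(fit, newdata = data.frame(week = 261:312), type = "response")
```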
The second analysis (impact) uses segmented regression to assess whether there
has been a change in patient presentations to the facility after the
introduction of COVID-19 in the project location.
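As a rough sketch of segmented (interrupted time series) regression on simulated data (`intervention`, `post` and `weeks_since` are illustrative names, not objects from this template):

```r
## a minimal sketch, assuming a known intervention week (illustrative only)
set.seed(2)
week         <- 1:156                             # three years of weekly data
intervention <- 100                               # hypothetical week COVID-19 arrived
post         <- as.numeric(week >= intervention)  # level-change indicator
weeks_since  <- pmax(0, week - intervention)      # slope-change term
visits       <- rpois(156, lambda = exp(5 + 0.001 * week - 0.3 * post))

fit <- glm(visits ~ week + post + weeks_since, family = quasipoisson)
round(summary(fit)$coefficients, 3)
```

The coefficient on `post` captures a step change in the level of presentations at the intervention week, while `weeks_since` captures a change in trend afterwards.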
Currently only data from IPD are considered, as these are more consistent in
terms of sufficient case counts and reporting.
This analysis evaluates whether rates are higher or lower than would be expected based on historical trends. See the appendix for technical details of the methods used.
Based on the data available from previous years, and taking into account seasonality and trend, we would expect case (or death) rates to fall within certain ranges (prediction intervals [PI]) 95% (or 99%) of the time.
If observed case rates in the current year fall above or below these ranges, this suggests a change in case rates compared with what we would expect.
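As a worked example with hypothetical numbers (not taken from the data): if a model predicts a mean of 50 cases with negative binomial dispersion `size = 10` for a given week, the 95% prediction interval can be read off the negative binomial quantiles.

```r
## hypothetical numbers: a modelled mean (mu) and dispersion (size) for one week
mu   <- 50
size <- 10
c(lower = qnbinom(0.025, mu = mu, size = size),
  upper = qnbinom(0.975, mu = mu, size = size))
## an observed count above 'upper' (or below 'lower') would be flagged
## as outside the expected range
```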
However, peaks above the range do not necessarily indicate an unusual occurrence (such as an outbreak). Observed peaks may be due to different causative organisms/strains for particular syndromes (e.g. LRTI or diarrhoea) or to changes in reporting (e.g. the shift to DHIS2).
The alert threshold is the level above which a sustained change in case rates is suggested (compared with the baseline from previous years and taking preceding weeks into account), as illustrated in the sketch below.
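The aberration method applied later (`glrnb` from {surveillance}) formalises this idea. As a loose, hypothetical illustration of "sustained" versus one-off exceedance, a simple CUSUM accumulates evidence across consecutive weeks (all numbers below are made up):

```r
## a minimal sketch of cumulative evidence of exceedance (illustrative only)
observed <- c(48, 55, 47, 60, 62, 65, 70)  # hypothetical weekly counts
expected <- 50                             # hypothetical baseline mean
k        <- 2                              # allowance before evidence accrues
cusum <- Reduce(function(s, x) max(0, s + x - expected - k),
                observed, init = 0, accumulate = TRUE)[-1]
cusum > 25                                 # hypothetical alarm threshold
```

A single high week barely moves the statistic, whereas several consecutive elevated weeks push it over the threshold.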
## make cases in to a tsibble dataset
cases <- cases %>%
  ## make epiweek in to a tsibble compatible week variable
  # mutate(epiweek = yearweek(epiweek)) %>%
  ## make counts in to a tsibble so it recognises weeks as the main time variable
  as_tsibble(index = epiweek, key = c(project, disease))

## create a moving average variable (deals with missings)
cases <- cases %>%
  group_by_key() %>%
  mutate(ma_2wk = slide_dbl(rate, ~mean(.x, na.rm = TRUE), .before = 2))

## alternatively interpolate missings by linear trend
## between the two nearest adjacent points
cases <- cases %>%
  mutate(total = na_interpolation(total),
         patients = na_interpolation(patients),
         ## re-calculate rate based on interpolations
         rate_int = total / patients * 10000,
         ## create a population per 10,000 variable
         population = patients / 10000)

## to check what values have been imputed compared to the original
# imputeTS::ggplot_na_imputations(cases$rate, cases$rate_int)

## only keep lrti cases
lrti_cases <- filter(cases, disease == "LRTI") %>%
  as_tsibble(key = project, index = epiweek) %>%
  ungroup()
## decompose
dcmp <- lrti_cases %>%
  model(classical_decomposition(rate_int, type = "additive")) %>%
  components() %>%
  autoplot()

## autocorrelation
atcrl <- lrti_cases %>%
  ACF(rate_int, lag_max = 53) %>%
  autoplot()

## get spectral periodogram for extracting the weeks with the highest frequencies
## (checking for seasonality)
periodo <- purrr::map(projects,
                      function(i)
                        ## run periodogram for each project producing the raw spectrum
                        periodogram(filter(lrti_cases, project == i),
                                    "rate_int",
                                    output = "raw")) %>%
  ## add in the project names
  set_names(projects) %>%
  ## pull out spectrum and frequency in to a dataframe for plotting
  purrr::map_df(~tibble(freq = .$freq, spec = .$spec), .id = "name")

## get the weeks that appear as peaks
peak_weeks <- purrr::map(projects,
                         ## run the periodogram function for each project
                         ## (default produces weeks)
                         function(i)
                           periodogram(filter(lrti_cases, project == i),
                                       "rate_int"))

## get a quick plot (ggplot, but just takes x/y coords rather than a df)
pgram <- ggplot(data = periodo, aes(x = 1 / (freq / 52), y = log(spec))) +
  facet_grid(name ~ .) +
  geom_line() +
  labs(x = "Period (Weeks)", y = "Log(density)") +
  tsa_theme
## define fourier terms (sine/cosine)
fourier_terms <- purrr::map(
  ## for each of the projects
  projects,
  ## run fourier (subsetting data appropriately)
  function(i) {
    ## combine fourier terms for weeks prior to and after the cut-off
    ## fourier terms for the year of interest are predicted
    rbind(
      ## get fourier terms for previous years
      forecast::fourier(
        filter(lrti_cases,
               project == i,
               epiweek <= surveillance_date) %>%
          select(epiweek, total),
        ## choose the number of sine/cosine term pairs to include
        K = 2
      ),
      ## forecast fourier terms for the year of interest
      forecast::fourier(
        filter(lrti_cases,
               project == i,
               epiweek <= surveillance_date) %>%
          select(epiweek, total),
        K = 2,
        h = num_weeks
      )
    )
  }) %>%
  ## bind together the list (bind_rows doesn't work because cols are not named)
  do.call(rbind, .)

## add in missing weeks until the end of the year
lrti_cases <- lrti_cases %>%
  ## group by the project
  group_by_key() %>%
  ## for each group add rows from the highest epiweek to the end of the year
  group_modify(~add_row(.,
                        epiweek = seq(max(.$epiweek) + 1,
                                      end_date,
                                      by = 1)))

## add fourier terms to the dataset
lrti_cases$fourier <- fourier_terms
## split data for fitting and prediction
dat <- lrti_cases %>%
  group_by(project, cut_off = epiweek < surveillance_date) %>%
  nest()

## define the model you want to fit (negative binomial)
model <- glm_nb_model(
  ## set the number of cases as the outcome of interest
  total ~
    ## use epiweek to account for the trend
    epiweek +
    ## use the fourier terms to account for seasonality
    fourier +
    ## include the population as an offset (to model rates)
    offset(log(population))
)

## fit the baseline model
dat <- dat %>%
  mutate(
    fit = ifelse(
      ## for data after the cut-off date (i.e. year of interest) set the model to NA
      ## (don't run a model)
      !cut_off,
      NA,
      ## for baseline data fit the model defined above for each project
      purrr::map(data, ~fit(model, .x))
    ))

## need to change population to 1 for prediction (this way predict returns a rate)
dat <- dat %>%
  ## within the nested list loop over the data frames for each project
  mutate(data = purrr::map(data,
                           ## within each dataframe, change population to 1
                           ~mutate(.x, population = 1)))

## use the models to get estimates and prediction intervals for each project
observed <- list()

## for each project
for (i in unique(dat$project)) {

  ## get the prediction intervals for the baseline period
  ## using the regression fit on baseline data
  observed[[paste0(i, "_fitted")]] <- predict(dat$fit[dat$project == i & dat$cut_off][[1]],
                                              dat$data[dat$project == i & dat$cut_off][[1]],
                                              simulate_pi = FALSE)

  ## get the prediction intervals for the forecast period (year of interest)
  observed[[paste0(i, "_predicted")]] <- predict(
    ## use the model defined on baseline data
    dat$fit[dat$project == i & dat$cut_off][[1]],
    ## predict based on data (fourier) from the prediction period
    dat$data[dat$project == i & !dat$cut_off][[1]],
    simulate_pi = FALSE)

  ## store the project name for the baseline period output
  observed[[paste0(i, "_fitted")]]$project <- i
  ## store the project name for the predicted period output
  observed[[paste0(i, "_predicted")]]$project <- i
}

## combine the outputs in to one data frame
observed <- bind_rows(observed)
## calculate the residuals
observed <- observed %>%
  mutate(resid = rate_int - estimate)

## are the residuals fairly constant over time (if not: outbreaks? change in practice?)
res_plot1 <- observed %>%
  ggplot(aes(x = epiweek, y = resid)) +
  facet_grid(project ~ .) +
  geom_line() +
  geom_point() +
  labs(x = "epiweek", y = "Residuals")

## is there autocorrelation in the residuals (is there a pattern to the error?)
res_plot2 <- observed %>%
  as_tsibble(index = epiweek, key = project) %>%
  ACF(resid, lag_max = 53) %>%
  autoplot()

## are the residuals normally distributed (are we under- or over-estimating?)
res_plot3 <- observed %>%
  ggplot(aes(x = resid)) +
  facet_grid(project ~ .) +
  geom_histogram(binwidth = 100) +
  geom_rug() +
  labs(y = "count")

## compare observed counts to their residuals
## there should also be no pattern
res_plot4 <- observed %>%
  ggplot(aes(x = estimate, y = resid)) +
  facet_grid(project ~ .) +
  geom_point() +
  labs(x = "Fitted", y = "Residuals")

## formally test autocorrelation of the residuals
## H0 is that residuals are from a white-noise series (i.e. random)
## test for independence
## if the p value is significant then non-random
ljungbox <- list()
for (i in projects) {
  ljungbox[[i]] <- Box.test(observed$resid[observed$project == i],
                            type = "Ljung-Box")
}
## Cross validation: predicting week(s) ahead based on a sliding window
## expand your data by rolling over in 52 week windows (before + after)
## to predict 52 weeks ahead
## (creates longer and longer chains of observations - keeps older data)

## define the window to roll over
roll_window <- 52

## define the number of weeks ahead to predict
weeks_ahead <- 52

## do for each project
purrr::map(
  ## for each of the projects
  projects,
  ## run the forecast validation
  function(i) {

    ## only use cases before the year of interest (i.e. 2020)
    case_roll <- lrti_cases %>%
      filter(epiweek < surveillance_date & project == i) %>%
      select(project, epiweek, disease, total, patients, rate_int, population) %>%
      ## drop the last x observations
      ## depending on how many weeks ahead forecasting
      ## (otherwise will be an actual forecast to "unknown")
      slice(1:(n() - weeks_ahead)) %>%
      as_tsibble(index = epiweek, key = c(disease)) %>%
      ## roll over each week in x after windows to create a grouping ID
      ## depending on the rolling window specified
      stretch_tsibble(.init = roll_window, .step = 1) %>%
      ## drop the first couple - as they have no "before" cases
      filter(.id > roll_window)

    forecasts <- purrr::map(
      unique(case_roll$.id),
      function(j) {

        ## only keep the current fold being fit
        mini_data <- filter(case_roll, .id == j) %>%
          as_tibble()

        ## create a data set for forecasting on
        forecast_data <- tibble(
          project = rep.int(NA, weeks_ahead),
          epiweek = seq(max(mini_data$epiweek) + 1,
                        max(mini_data$epiweek) + weeks_ahead,
                        by = 1),
          disease = rep.int(NA, weeks_ahead),
          total = rep.int(NA, weeks_ahead),
          patients = rep.int(NA, weeks_ahead),
          rate_int = rep.int(NA, weeks_ahead),
          .id = rep.int(NA, weeks_ahead)
        )

        ## add the forecast data to the original
        mini_data <- bind_rows(mini_data, forecast_data)

        ## define the cut-off based on the latest non-missing count data
        cv_cut_off <- mini_data %>%
          ## only keep non-missing rows
          drop_na(rate_int) %>%
          ## get the latest week
          summarise(max(epiweek)) %>%
          ## extract so it is not in a dataframe
          pull()

        ## make mini_data back in to a tsibble
        mini_data <- tsibble(mini_data, index = epiweek)

        ## define fourier terms (sine/cosine)
        mini_data <- mini_data %>%
          mutate(
            ## combine fourier terms for weeks prior to and after the cut-off date
            fourier = rbind(
              ## get fourier terms for previous years
              forecast::fourier(
                ## only keep the rows before the cut-off
                filter(mini_data, epiweek <= cv_cut_off) %>%
                  select(epiweek, total),
                ## include one pair of sine/cosine terms
                K = 1
              ),
              ## predict the fourier terms for the following year (using baseline data)
              forecast::fourier(
                filter(mini_data, epiweek <= cv_cut_off) %>%
                  select(epiweek, total),
                K = 1,
                ## predict 52 weeks ahead
                h = weeks_ahead
              )
            )
          )

        ## split data for fitting and prediction
        dat <- mini_data %>%
          group_by(epiweek <= cv_cut_off) %>%
          group_split()

        ## define the model you want to fit (negative binomial)
        model <- glm_nb_model(
          ## set the number of cases as the outcome of interest
          total ~
            ## use epiweek to account for the trend
            epiweek +
            ## use the fourier terms to account for seasonality
            fourier +
            ## include the population as an offset (to model rates)
            offset(log(population))
        )

        ## define which data to use for fitting and which for predicting
        fitting_data <- pluck(dat, 2)
        pred_data <- pluck(dat, 1)

        ## fit the model
        fitted_model <- trending::fit(model, fitting_data)

        ## need to change population to 1 for prediction (this way predict returns a rate)
        pred_data <- pred_data %>%
          mutate(population = 1)

        ## forecast with the data you want to predict with
        forecasts <- fitted_model %>%
          predict(pred_data, simulate_pi = FALSE) %>%
          ## only keep the week and the forecast estimate
          select(epiweek, estimate)
      }
    )

    ## make the list in to a data frame with all the forecasts
    forecasts <- bind_rows(forecasts)

    ## join the forecasts with the observed
    forecasts <- left_join(forecasts,
                           filter(lrti_cases, project == i) %>%
                             as_tibble() %>%
                             select(epiweek, rate_int),
                           by = "epiweek")

    ## using {yardstick} compute metrics
    ## RMSE: Root mean squared error
    ## MAE:  Mean absolute error
    ## MASE: Mean absolute scaled error
    ## MAPE: Mean absolute percent error
    model_metrics <- bind_rows(
      ## in the forecast dataset compare the observed to the predicted
      rmse(forecasts, rate_int, estimate),
      mae(forecasts, rate_int, estimate),
      mase(forecasts, rate_int, estimate),
      mape(forecasts, rate_int, estimate)
    ) %>%
      ## only keep the metric type and its output
      select(Metric = .metric, Measure = .estimate) %>%
      ## make in to wide format so rows can be bound after
      pivot_wider(names_from = Metric, values_from = Measure) %>%
      ## add in the current project name
      mutate(project = i)

    ## return model metrics
    model_metrics
  }
) %>%
  ## combine in to one data frame
  bind_rows()
for (i in projects) {

  ## define the surveillance time series object
  ## subset cases to the project being run
  lrti_sts <- sts(observed = lrti_cases$rate_int[which(lrti_cases$project == i &
                                                         ## have to drop the added empty rows
                                                         !is.na(lrti_cases$rate_int))],
                  # population = matrix(lrti_cases$population[which(lrti_cases$project == i)]),
                  start = c(
                    ## subset to only keep the year from start_date
                    as.numeric(str_sub(start_date, 1, 4)),
                    ## subset to only keep the week from start_date
                    as.numeric(str_sub(start_date, 7, 8))),
                  freq = 52)

  ## define which weeks to include
  ## (use the first week available in the data - otherwise SSudan fails, missing early data)
  weekrange <- surveillance_date - min(lrti_cases$epiweek[lrti_cases$project == i])

  #### using the farrington algorithm
  if (aberration_method == "farrington") {

    ## define the control options
    ctrl <- list(
      ## define the time period to compute the threshold for (i.e. 2020)
      range = which(lrti_sts@epoch > weekrange),
      b = 5,                   ## how many years backwards for the baseline
      w = 2,                   ## rolling window size in weeks
      weightsThreshold = 2.58, ## reweighting past outbreaks (improved noufaily method - original suggests 1)
      ## pastWeeksNotIncluded = 3, ## use all weeks available (noufaily suggests dropping 26)
      trend = TRUE,
      pThresholdTrend = 1,     ## 0.05 normally, however 1 is advised in the improved method (i.e. always keep the trend)
      thresholdMethod = "nbPlugin",
      populationOffset = TRUE
    )

    ## apply the farrington flexible method
    detectionmethod <- farringtonFlexible(lrti_sts, ctrl)
  }

  ### using glrnb
  ## set up using https://cran.r-project.org/web/packages/surveillance/vignettes/glrnb.pdf
  if (aberration_method == "glrnb") {

    ## define the control options
    ctrl <- list(
      ## define the time period to compute the threshold for (i.e. 2020)
      range = which(lrti_sts@epoch > weekrange),
      mu0 = list(S = 2,          ## number of fourier terms (harmonics) to include
                 trend = TRUE,   ## whether to include a trend or not
                 refit = FALSE), ## whether to refit the model after each alarm
      ## c.ARL = threshold for the GLR statistic (arbitrary)
      ## 3 ~ middle ground for minimising false positives
      ## 1 fits to the 99% PI of glm.nb - with changes after peaks (threshold lowered for alert)
      c.ARL = 2,
      # theta = log(1.5),        ## equates to a 50% increase in cases in an outbreak
      ret = "cases"              ## return the threshold upperbound as case counts
    )

    ## apply the glrnb method
    detectionmethod <- glrnb(lrti_sts, control = ctrl, verbose = FALSE)
  }

  ## add the outbreak threshold to the dataset
  observed[which(observed$project == i &
                   observed$epiweek >= surveillance_date &
                   ## have to drop the added empty rows
                   !is.na(observed$rate_int)), "threshold"] <- detectionmethod@upperbound
}
## define the appropriate y-axis label
unit_lable <- if_else(admissions_based,
                      "LRTI (n/10,000 admissions)",
                      "LRTI (n/10,000 discharges)")

## plot cases and fits with prediction intervals
base_plot <- ggplot(observed, aes(x = as.Date(epiweek))) +
  ## plot for each project row-wise
  facet_grid(project ~ .) +
  ## plot interpolated observed values as a line - label as observed (colour defined later)
  geom_line(aes(y = rate_int, colour = "Observed")) +
  ## plot points for the observed rates - colour by whether the value is above the PI
  geom_point(aes(y = rate_int,
                 colour = if_else(rate_int > upper_pi, "Excess", "Observed"),
                 size = if_else(rate_int > upper_pi, "trigger", "normal"))) +
  ## plot 95% prediction intervals as a ribbon - alpha sets transparency
  geom_ribbon(aes(ymin = lower_pi, ymax = upper_pi, alpha = "95% PI"),
              fill = "grey60") +
  ## plot modelled values as a line
  geom_line(aes(y = estimate, colour = "Fitted")) +
  ## set the colours for the defined parts (match names to colours)
  scale_colour_manual(values = c("Observed" = "black",
                                 "Fitted" = "blue",
                                 "Excess" = "red",
                                 "Alert threshold" = "orange")) +
  ## define how see-through parts are (match names to alpha)
  scale_alpha_manual(values = c("95% PI" = 0.4)) +
  scale_size_manual(values = c("trigger" = 2, "normal" = NA_real_),
                    guide = FALSE) +
  ## reverse the order of the legend
  guides(colour = guide_legend(reverse = TRUE)) +
  ## make y and x axes meet at the origin (x = 0, y = 0)
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  ## label the axes appropriately and remove legend titles
  labs(x = week_lable, y = unit_lable,
       colour = element_blank(), alpha = element_blank()) +
  ## add in theme and axis labels
  tsa_axes +
  tsa_theme

## if aberration methods are set to be included in the control chunk then add a dashed line
if (include_aberration) {
  base_plot <- base_plot +
    ## add in the upper bound of the aberration algorithm
    geom_line(aes(y = threshold, colour = "Alert threshold"),
              linetype = "dashed")
}

## return the plot
base_plot
## make deaths in to a tsibble dataset
deaths <- deaths %>%
  ## make epiweek in to a tsibble compatible week variable
  mutate(epiweek = yearweek(epiweek)) %>%
  ## make counts in to a tsibble so it recognises weeks as the main time variable
  as_tsibble(index = epiweek, key = c(project, disease))

## create a moving average variable (deals with missings)
deaths <- deaths %>%
  group_by_key() %>%
  mutate(ma_2wk = slide_dbl(rate, ~mean(.x, na.rm = TRUE), .before = 2))

## alternatively interpolate missings by linear trend
## between the two nearest adjacent points
deaths <- deaths %>%
  mutate(total = na_interpolation(total),
         patients = na_interpolation(patients),
         ## re-calculate rate based on interpolations
         rate_int = total / patients * 10000,
         ## create a population per 10,000 variable
         population = patients / 10000)

## to check what values have been imputed compared to the original
# imputeTS::ggplot_na_imputations(deaths$rate, deaths$rate_int)

## only keep overall deaths
overall_deaths <- filter(deaths, disease == "Overall") %>%
  as_tsibble(key = project, index = epiweek) %>%
  ungroup()
## get the median, min, max and 95% centile range from previous years
avg_counts <- overall_deaths %>%
  ## turn in to a tibble to be able to change to numeric weeks
  as_tibble() %>%
  ## create a week number variable (without year)
  mutate(week = format(epiweek, format = "%V")) %>%
  ## filter for previous years
  filter(epiweek < surveillance_date) %>%
  ## for each project and week number
  group_by(project, week) %>%
  ## calculate median, min, max and then the 95% range based on percentiles
  summarise(avg = median(rate_int, na.rm = TRUE),
            minz = min(rate_int, na.rm = TRUE),
            maxz = max(rate_int, na.rm = TRUE),
            low = quantile(rate_int, 0.025),
            high = quantile(rate_int, 0.975))

## get rates for the current year by week
year_counts <- overall_deaths %>%
  as_tibble() %>%
  mutate(week = format(epiweek, format = "%V"),
         year = format(epiweek, format = "%Y"),
         ## create a tag for whether it is in the current year or not
         filler = if_else(epiweek < surveillance_date, "Past years", "Current year"))

## start a plot
ggplot() +
  ## do for each project
  facet_grid(project ~ .) +
  ## plot the centile range as a shaded area
  geom_ribbon(data = avg_counts,
              aes(x = week, ymin = low, ymax = high, group = 1,
                  fill = "Past 95% centile range"),
              alpha = 0.3) +
  ## plot only the current year and colour based on the tag from above
  ## (will all be "Current year")
  geom_line(data = filter(year_counts, year == 2020),
            aes(x = week, y = rate_int, group = year, colour = filler)) +
  ## plot the median as a dotted line
  geom_line(data = avg_counts,
            aes(x = week, y = avg, colour = "Past median", group = 1),
            linetype = 2) +
  ## choose appropriate colours and fill
  scale_colour_manual(values = c("Past median" = "black",
                                 "Current year" = "red",
                                 "Past years" = "grey60")) +
  scale_fill_manual(values = c("Past 95% centile range" = "grey80"),
                    guide = guide_legend(reverse = TRUE)) +
  ## make y and x axes meet at the origin (x = 0, y = 0)
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  ## fix the x axis labels to only show every fourth number
  ## and show a 0 in front of numbers less than 10
  scale_x_discrete(breaks = seq(01, 53, by = 4) %>% str_pad(2, pad = "0")) +
  ## label axes appropriately
  labs(x = "Week of discharge",
       y = "Mortality rate (deaths/10,000 discharges)",
       colour = element_blank(),
       fill = element_blank()) +
  ## apply tsa theme for axes etc
  tsa_theme
## decompose
dcmp <- overall_deaths %>%
  model(classical_decomposition(rate_int, type = "additive")) %>%
  components() %>%
  autoplot()

## autocorrelation
atcrl <- overall_deaths %>%
  ACF(rate_int, lag_max = 53) %>%
  autoplot()

## get spectral periodogram for extracting the weeks with the highest frequencies
## (checking for seasonality)
periodo <- purrr::map(projects,
                      function(i)
                        ## run periodogram for each project producing the raw spectrum
                        periodogram(filter(overall_deaths, project == i),
                                    "rate_int",
                                    output = "raw")) %>%
  ## add in the project names
  set_names(projects) %>%
  ## pull out spectrum and frequency in to a dataframe for plotting
  purrr::map_df(~tibble(freq = .$freq, spec = .$spec), .id = "name")

## get the weeks that appear as peaks
peak_weeks <- purrr::map(projects,
                         ## run the periodogram function for each project
                         ## (default produces weeks)
                         function(i)
                           periodogram(filter(overall_deaths, project == i),
                                       "rate_int"))

## get a quick plot (ggplot, but just takes x/y coords rather than a df)
pgram <- ggplot(data = periodo, aes(x = 1 / (freq / 52), y = log(spec))) +
  facet_grid(name ~ .) +
  geom_line() +
  labs(x = "Period (Weeks)", y = "Log(density)") +
  tsa_theme
## define fourier terms (sine/cosine)
fourier_terms <- purrr::map(
  ## for each of the projects
  projects,
  ## run fourier (subsetting data appropriately)
  function(i) {
    ## combine fourier terms for weeks prior to and after the cut-off
    ## fourier terms for the year of interest are predicted
    rbind(
      ## get fourier terms for previous years
      forecast::fourier(
        filter(overall_deaths,
               project == i,
               epiweek <= surveillance_date) %>%
          select(epiweek, total),
        ## choose the number of sine/cosine term pairs to include
        K = 2
      ),
      ## forecast fourier terms for the year of interest
      forecast::fourier(
        filter(overall_deaths,
               project == i,
               epiweek <= surveillance_date) %>%
          select(epiweek, total),
        K = 2,
        h = num_weeks
      )
    )
  }) %>%
  ## bind together the list (bind_rows doesn't work because cols are not named)
  do.call(rbind, .)

## add in missing weeks until the end of the year
overall_deaths <- overall_deaths %>%
  ## group by the project
  group_by_key() %>%
  ## for each group add rows from the highest epiweek to the end of the year
  group_modify(~add_row(.,
                        epiweek = seq(max(.$epiweek) + 1,
                                      end_date,
                                      by = 1)))

## add fourier terms to the dataset
overall_deaths$fourier <- fourier_terms
## split data for fitting and prediction
dat <- overall_deaths %>%
  group_by(project, cut_off = epiweek < surveillance_date) %>%
  nest()

## define the model you want to fit (negative binomial)
model <- glm_nb_model(
  ## set the number of deaths as the outcome of interest
  total ~
    ## use epiweek to account for the trend
    epiweek +
    ## use the fourier terms to account for seasonality
    fourier +
    ## include the population as an offset (to model rates)
    offset(log(population))
)

## fit the baseline model
dat <- dat %>%
  mutate(
    fit = ifelse(
      ## for data after the cut-off date (i.e. year of interest) set the model to NA
      ## (don't run a model)
      !cut_off,
      NA,
      ## for baseline data fit the model defined above for each project
      purrr::map(data, ~fit(model, .x))
    ))

## need to change population to 1 for prediction (this way predict returns a rate)
dat <- dat %>%
  ## within the nested list loop over the data frames for each project
  mutate(data = purrr::map(data,
                           ## within each dataframe, change population to 1
                           ~mutate(.x, population = 1)))

## use the models to get estimates and prediction intervals for each project
observed <- list()

## for each project
for (i in unique(dat$project)) {

  ## get the prediction intervals for the baseline period
  ## using the regression fit on baseline data
  observed[[paste0(i, "_fitted")]] <- predict(dat$fit[dat$project == i & dat$cut_off][[1]],
                                              dat$data[dat$project == i & dat$cut_off][[1]],
                                              simulate_pi = FALSE)

  ## get the prediction intervals for the forecast period (year of interest)
  observed[[paste0(i, "_predicted")]] <- predict(
    ## use the model defined on baseline data
    dat$fit[dat$project == i & dat$cut_off][[1]],
    ## predict based on data (fourier) from the prediction period
    dat$data[dat$project == i & !dat$cut_off][[1]],
    simulate_pi = FALSE)

  ## store the project name for the baseline period output
  observed[[paste0(i, "_fitted")]]$project <- i
  ## store the project name for the predicted period output
  observed[[paste0(i, "_predicted")]]$project <- i
}

## combine the outputs in to one data frame
observed <- bind_rows(observed)
## calculate the residuals
observed <- observed %>%
  mutate(resid = rate_int - estimate)

## are the residuals fairly constant over time (if not: outbreaks? change in practice?)
res_plot1 <- observed %>%
  ggplot(aes(x = epiweek, y = resid)) +
  facet_grid(project ~ .) +
  geom_line() +
  geom_point() +
  labs(x = "epiweek", y = "Residuals")

## is there autocorrelation in the residuals (is there a pattern to the error?)
res_plot2 <- observed %>%
  as_tsibble(index = epiweek, key = project) %>%
  ACF(resid, lag_max = 53) %>%
  autoplot()

## are the residuals normally distributed (are we under- or over-estimating?)
res_plot3 <- observed %>%
  ggplot(aes(x = resid)) +
  facet_grid(project ~ .) +
  geom_histogram(binwidth = 100) +
  geom_rug() +
  labs(y = "count")

## compare observed counts to their residuals
## there should also be no pattern
res_plot4 <- observed %>%
  ggplot(aes(x = estimate, y = resid)) +
  facet_grid(project ~ .) +
  geom_point() +
  labs(x = "Fitted", y = "Residuals")

## formally test autocorrelation of the residuals
## H0 is that residuals are from a white-noise series (i.e. random)
## test for independence
## if the p value is significant then non-random
ljungbox <- list()
for (i in projects) {
  ljungbox[[i]] <- Box.test(observed$resid[observed$project == i],
                            type = "Ljung-Box")
}
## Cross validation: predicting week(s) ahead based on a sliding window
## expand your data by rolling over in 52 week windows (before + after)
## to predict 52 weeks ahead
## (creates longer and longer chains of observations - keeps older data)

## define the window to roll over
roll_window <- 52

## define the number of weeks ahead to predict
weeks_ahead <- 52

## do for each project
purrr::map(
  ## for each of the projects
  projects,
  ## run the forecast validation
  function(i) {

    ## only use deaths before the year of interest (i.e. 2020)
    case_roll <- overall_deaths %>%
      filter(epiweek < surveillance_date & project == i) %>%
      select(project, epiweek, disease, total, patients, rate_int, population) %>%
      ## drop the last x observations
      ## depending on how many weeks ahead forecasting
      ## (otherwise will be an actual forecast to "unknown")
      slice(1:(n() - weeks_ahead)) %>%
      as_tsibble(index = epiweek, key = c(disease)) %>%
      ## roll over each week in x after windows to create a grouping ID
      ## depending on the rolling window specified
      stretch_tsibble(.init = roll_window, .step = 1) %>%
      ## drop the first couple - as they have no "before" cases
      filter(.id > roll_window)

    forecasts <- purrr::map(
      unique(case_roll$.id),
      function(j) {

        ## only keep the current fold being fit
        mini_data <- filter(case_roll, .id == j) %>%
          as_tibble()

        ## create a data set for forecasting on
        forecast_data <- tibble(
          project = rep.int(NA, weeks_ahead),
          epiweek = seq(max(mini_data$epiweek) + 1,
                        max(mini_data$epiweek) + weeks_ahead,
                        by = 1),
          disease = rep.int(NA, weeks_ahead),
          total = rep.int(NA, weeks_ahead),
          patients = rep.int(NA, weeks_ahead),
          rate_int = rep.int(NA, weeks_ahead),
          .id = rep.int(NA, weeks_ahead)
        )

        ## add the forecast data to the original
        mini_data <- bind_rows(mini_data, forecast_data)

        ## define the cut-off based on the latest non-missing count data
        cv_cut_off <- mini_data %>%
          ## only keep non-missing rows
          drop_na(rate_int) %>%
          ## get the latest week
          summarise(max(epiweek)) %>%
          ## extract so it is not in a dataframe
          pull()

        ## make mini_data back in to a tsibble
        mini_data <- tsibble(mini_data, index = epiweek)

        ## define fourier terms (sine/cosine)
        mini_data <- mini_data %>%
          mutate(
            ## combine fourier terms for weeks prior to and after the cut-off date
            fourier = rbind(
              ## get fourier terms for previous years
              forecast::fourier(
                ## only keep the rows before the cut-off
                filter(mini_data, epiweek <= cv_cut_off) %>%
                  select(epiweek, total),
                ## include one pair of sine/cosine terms
                K = 1
              ),
              ## predict the fourier terms for the following year (using baseline data)
              forecast::fourier(
                filter(mini_data, epiweek <= cv_cut_off) %>%
                  select(epiweek, total),
                K = 1,
                ## predict 52 weeks ahead
                h = weeks_ahead
              )
            )
          )

        ## split data for fitting and prediction
        dat <- mini_data %>%
          group_by(epiweek <= cv_cut_off) %>%
          group_split()

        ## define the model you want to fit (negative binomial)
        model <- glm_nb_model(
          ## set the number of deaths as the outcome of interest
          total ~
            ## use epiweek to account for the trend
            epiweek +
            ## use the fourier terms to account for seasonality
            fourier +
            ## include the population as an offset (to model rates)
            offset(log(population))
        )

        ## define which data to use for fitting and which for predicting
        fitting_data <- pluck(dat, 2)
        pred_data <- pluck(dat, 1)

        ## fit the model
        fitted_model <- trending::fit(model, fitting_data)

        ## need to change population to 1 for prediction (this way predict returns a rate)
        pred_data <- pred_data %>%
          mutate(population = 1)

        ## forecast with the data you want to predict with
        forecasts <- fitted_model %>%
          predict(pred_data, simulate_pi = FALSE) %>%
          ## only keep the week and the forecast estimate
          select(epiweek, estimate)
      }
    )

    ## make the list in to a data frame with all the forecasts
    forecasts <- bind_rows(forecasts)

    ## join the forecasts with the observed
    forecasts <- left_join(forecasts,
                           filter(overall_deaths, project == i) %>%
                             as_tibble() %>%
                             select(epiweek, rate_int),
                           by = "epiweek")

    ## using {yardstick} compute metrics
    ## RMSE: Root mean squared error
    ## MAE:  Mean absolute error
    ## MASE: Mean absolute scaled error
    ## MAPE: Mean absolute percent error
    model_metrics <- bind_rows(
      ## in the forecast dataset compare the observed to the predicted
      rmse(forecasts, rate_int, estimate),
      mae(forecasts, rate_int, estimate),
      mase(forecasts, rate_int, estimate),
      mape(forecasts, rate_int, estimate)
    ) %>%
      ## only keep the metric type and its output
      select(Metric = .metric, Measure = .estimate) %>%
      ## make in to wide format so rows can be bound after
      pivot_wider(names_from = Metric, values_from = Measure) %>%
      ## add in the current project name
      mutate(project = i)

    ## return model metrics
    model_metrics
  }
) %>%
  ## combine in to one data frame
  bind_rows()
for (i in projects) {

  ## define surveillance time series object
  ## subset the case data to the project being run
  deaths_sts <- sts(observed = overall_deaths$rate_int[which(overall_deaths$project == i &
                                                               ## have to drop the added empty rows
                                                               !is.na(overall_deaths$rate_int))],
                    # population = matrix(overall_deaths$population[which(overall_deaths$project == i)]),
                    start = c(
                      ## subset to only keep the year from start_date
                      as.numeric(str_sub(start_date, 1, 4)),
                      ## subset to only keep the week from start_date
                      as.numeric(str_sub(start_date, 7, 8))),
                    freq = 52)

  ## define which weeks to include
  ## (use the first week available in the data - otherwise South Sudan fails, missing early data)
  weekrange <- surveillance_date - min(overall_deaths$epiweek[overall_deaths$project == i])

  #### using the farrington algorithm
  if (aberration_method == "farrington") {

    ## define control
    ctrl <- list(
      ## define the time period to compute thresholds for (i.e. 2020)
      range = which(deaths_sts@epoch > weekrange),
      b = 5,                    ## how many years backwards for baseline
      w = 2,                    ## rolling window size in weeks
      weightsThreshold = 2.58,  ## reweighting past outbreaks (improved noufaily method - original suggests 1)
      ## pastWeeksNotIncluded = 3, ## use all weeks available (noufaily suggests dropping 26)
      trend = TRUE,
      pThresholdTrend = 1,      ## 0.05 normally, however 1 is advised in the improved method (i.e. always keep)
      thresholdMethod = "nbPlugin",
      populationOffset = TRUE
    )

    ## apply the farrington flexible method
    detectionmethod <- farringtonFlexible(deaths_sts, ctrl)
  }

  #### using glrnb
  ## set up using https://cran.r-project.org/web/packages/surveillance/vignettes/glrnb.pdf
  if (aberration_method == "glrnb") {

    ## define control options
    ctrl <- list(
      ## define the time period to compute thresholds for (i.e. 2020)
      range = which(deaths_sts@epoch > weekrange),
      mu0 = list(S = 2,          ## number of fourier terms (harmonics) to include
                 trend = TRUE,   ## whether to include trend or not
                 refit = FALSE), ## whether to refit model after each alarm
      ## c.ARL = threshold for GLR statistic (arbitrary)
      ## 3 ~ middle ground for minimising false positives
      ## 1 fits to the 99%PI of glm.nb - with changes after peaks (threshold lowered for alert)
      c.ARL = 2,
      # theta = log(1.5),        ## equates to a 50% increase in cases in an outbreak
      ret = "cases"              ## return the threshold upperbound as case counts
    )

    ## apply the glrnb method
    detectionmethod <- glrnb(deaths_sts, control = ctrl, verbose = FALSE)
  }

  ## add the outbreak threshold to the dataset
  observed[which(observed$project == i &
                   observed$epiweek >= surveillance_date &
                   ## have to drop the added empty rows
                   !is.na(observed$rate_int)),
           "threshold"] <- detectionmethod@upperbound
}
## define the appropriate y-axis label
unit_lable <- if_else(admissions_based,
                      "LRTI (n/10,000 admissions)",
                      "LRTI (n/10,000 discharges)")

## plot cases and fits with prediction intervals
base_plot <- ggplot(observed, aes(x = as.Date(epiweek))) +
  ## plot for each project row-wise
  facet_grid(project ~ .) +
  ## plot interpolated observed values as a line - label observed (colour defined later)
  geom_line(aes(y = rate_int, colour = "Observed")) +
  ## plot points for the observed rates - colour by whether above the PI
  geom_point(aes(y = rate_int,
                 colour = if_else(rate_int > upper_pi, "Excess", "Observed"),
                 size = if_else(rate_int > upper_pi, "trigger", "normal"))) +
  ## plot 95% prediction intervals as a ribbon - alpha sets transparency
  geom_ribbon(aes(ymin = lower_pi, ymax = upper_pi, alpha = "95% PI"),
              fill = "grey60") +
  ## plot modelled values as a line
  geom_line(aes(y = estimate, colour = "Fitted")) +
  ## set the colours for defined parts (match names to colours)
  scale_colour_manual(values = c("Observed" = "black",
                                 "Fitted" = "blue",
                                 "Excess" = "red",
                                 "Alert threshold" = "orange")) +
  ## define how see-through parts are (match names to alpha)
  scale_alpha_manual(values = c("95% PI" = 0.4)) +
  scale_size_manual(values = c("trigger" = 2, "normal" = NA_real_),
                    guide = "none") +
  ## reverse the order of the legend
  guides(colour = guide_legend(reverse = TRUE)) +
  ## make y and x axes meet at the origin (x = 0, y = 0)
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  ## label the axes appropriately and remove legend titles
  labs(x = week_lable, y = unit_lable,
       colour = element_blank(), alpha = element_blank()) +
  ## add in theme and axis labels
  tsa_axes +
  tsa_theme

## if aberration methods are set to be included in the control chunk then add a dashed line
if (include_aberration) {
  base_plot <- base_plot +
    ## add in the upper bound of the aberration algorithm
    geom_line(aes(y = threshold, colour = "Alert threshold"),
              linetype = "dashed")
}

## return the plot
base_plot
\newpage
This section evaluates the impact of Covid-19 on services by using
segmented regression for specific diseases of interest.
It assesses whether there have been changes in case numbers of diseases other
than LRTI that could be related to the Covid-19 pandemic
(e.g. changes in health-seeking behaviour, or interruptions or changes in service provision).
It uses rates over time to see whether these changed after the date the
first confirmed Covid-19 case was diagnosed in the project region (or the nearest
applicable region with reliable information).
These results are interpreted in the same way as the surveillance results above. Based on the data available from previous years (the pre-Covid-19 period), we calculate an expected rate for the post-Covid-19 period, taking the trend (increase or decrease) over time and seasonal variation into account. We then calculate the percentage difference between the expected rate and the observed rate in the post-Covid-19 period.
A relevant change in rate would have both confidence interval limits either above or below zero (i.e. a clear increase or decrease) and a small p-value. Note that small case numbers or noisy (inconsistent) data will produce unreliable results, so a small p-value with a very wide confidence interval may not actually be relevant.
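As a minimal sketch of this comparison, assuming a data frame `post_covid` restricted to the post-Covid-19 weeks, with the observed rate (`rate_int`) and the model-expected rate (`estimate`) per week; the object name is illustrative:

```r
## sketch: percentage difference between observed and expected rates
library(dplyr)

post_covid <- post_covid %>%
  mutate(perc_diff = 100 * (rate_int - estimate) / estimate)
```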
## make patient counts in to a tsibble (to be able to add fourier terms)
patient_counts <- patient_counts %>%
  ## make counts in to a tsibble so it recognises weeks as the main time variable
  as_tsibble(index = epiweek, key = project)

## join bed counts to use as a denominator
patient_counts <- left_join(patient_counts,
                            select(bed_counts, project, week, beds),
                            by = c("project", "epiweek" = "week"))

## make rows where admissions or beds are less than 10 NA (otherwise throws off everything)
patient_counts[which(patient_counts$total < 10 |
                       patient_counts$beds < 10),
               c("total", "beds")] <- NA

## create a moving average variable (deals with missings)
# patient_counts <- patient_counts %>%
#   group_by_key() %>%
#   mutate(ma_2wk = slide_dbl(rate, ~mean(.x, na.rm = TRUE), .before = 2))

## alternatively interpolate missings by linear trend
## between the two nearest adjacent points
patient_counts <- patient_counts %>%
  mutate(total = na_interpolation(total),
         beds = na_interpolation(beds),
         ## re-calculate the rate based on interpolations
         rate_int = total / beds * 10000,
         ## create a population per 10,000 variable
         population = beds / 10000)
## In general, the negative binomial regression model can be written as follows:
# log(Y_t) = β_0 + β_1*t + β_2*δ(t-t_0) + β_3*(t-t_0)^+ + log(pop_t) + e_t
# Y_t is the number of IPD cases observed in year t
# pop_t is the population size in 100,000s in year t
# t_0 is the last year (7th year) of the pre-period (including the transition time if there is any)
# δ(x) is the indicator function (it is 0 if x ≤ 0 and 1 if x > 0)
# (x)^+ is the cut-off operator (it is x if x > 0 and 0 otherwise)
# e_t denotes the residual
# Therefore β_2*δ(t-t_0) + β_3*(t-t_0)^+ is the generalised linear part of the post-period,
# which is zero in the pre-period.
# The estimates of β_2 and β_3 will denote the effects of the intervention.
# More periods/terms can be added to control for seasonality.

## join the intervention weeks to the dataset by project
patient_counts <- left_join(patient_counts, intervention_weeks, by = "project")

## define variables for regression
patient_counts <- patient_counts %>%
  group_by_key() %>%
  mutate(
    ## corresponds to t in the formula
    ## count of weeks (could probably also just use the straight epiweek variable)
    linear = row_number(epiweek),
    ## corresponds to δ(t-t0) in the formula
    ## pre or post covid period
    intervention = as.numeric(epiweek >= confirmed_region),
    ## corresponds to (t-t0)^+ in the formula
    ## count of weeks post intervention
    ## (choose the larger number between 0 and whatever comes from the calculation)
    time_post = pmax(0, epiweek - confirmed_region + 1))

## define fourier terms (sincos)
patient_counts$fourier <- purrr::map(
  ## for each of the projects
  projects,
  ## run fourier (subsetting data appropriately)
  function(i) {
    ## only keep the relevant columns and project
    forecast::fourier(
      filter(patient_counts, project == i) %>%
        select(epiweek, total),
      ## choose the number of sincos terms to include
      K = 2
    )
  }) %>%
  ## bind together a list (bind_rows doesn't work because cols are not named)
  do.call(rbind, .)

## split data for fitting by project
dat <- patient_counts %>%
  as_tibble() %>%
  group_by(project) %>%
  nest()

## define the model you want to fit (negative binomial)
model <- glm_nb_model(
  ## set number of cases as outcome of interest
  total ~
    ## use epiweek to account for the trend
    epiweek +
    ## use the fourier terms to account for seasonality
    fourier +
    ## add in whether in the pre- or post-period
    intervention +
    ## add in the time post intervention
    time_post +
    ## include the population as an offset (to model rates)
    offset(log(population))
)

## fit the baseline model
dat <- dat %>%
  mutate(
    ## for the baseline data fit the model defined above for each project
    fit = purrr::map(data, ~fit(model, .x)))

## need to change population to 1 for prediction (this way it returns a rate)
dat <- dat %>%
  ## within the nested list loop over the data frames for each project
  mutate(data = purrr::map(data,
                           ## within each dataframe, change population to be 1
                           ~mutate(.x, population = 1)))

## calculate confidence intervals and prediction intervals for plotting
observed <- dat %>%
  mutate(predicted = purrr::map(project,
                                ~predict(dat$fit[dat$project == .x][[1]],
                                         dat$data[dat$project == .x][[1]],
                                         simulate_pi = FALSE))) %>%
  ## pull projects in to a single data frame
  select(predicted) %>%
  unnest(cols = c(predicted))
# exponentiate results for IRRs and then pull bits out to get percentage change
dat %>%
  ## use broom to exponentiate terms and get a confidence interval for each project
  mutate(outputs = purrr::map(fit,
                              function(i) {
                                ## extract the original negative binomial regression
                                get_model(i) %>%
                                  ## get a tidy dataframe of results
                                  broom::tidy(exponentiate = TRUE, conf.int = TRUE)
                              })) %>%
  ## only keep project and the data frame with exponentiated results
  select(project, outputs) %>%
  ## pull out of the list in to a dataframe
  unnest(cols = c(outputs)) %>%
  ## only keep the intervention value
  filter(term == "intervention") %>%
  ## change the IRR to percentage change for estimate and CIs
  mutate(
    ## for each of the columns of interest - create a new column
    across(
      all_of(c("estimate", "conf.low", "conf.high")),
      ## apply the formula to calculate percentage change
      .f = function(i) 100 * (i - 1),
      ## add a suffix to new column names with "_perc"
      .names = "{.col}_perc")
  ) %>%
  ## only keep (and rename) certain columns
  select("Percentage change" = estimate_perc,
         "95%CI low" = conf.low_perc,
         "95%CI high" = conf.high_perc,
         "p-value" = p.value) %>%
  ## show as a formatted table in the output
  flextable::flextable() %>%
  ## select the number of digits wanted in different columns
  flextable::colformat_double(j = c("Percentage change",
                                    "95%CI low",
                                    "95%CI high"),
                              digits = 1) %>%
  flextable::colformat_double(j = c("p-value"), digits = 3) %>%
  autofit() %>%
  set_caption("Percentage change in patient rates")
## define the height to use for text labels
height <- patient_counts %>%
  as_tibble() %>%
  ## for each project
  group_by(project) %>%
  ## find the highest peak out of all projects
  summarise(height = max(rate_int)) %>%
  ## take 3/4s of that
  summarise(max(height) * 0.75) %>%
  ## return the value
  pull()

## define the appropriate y-axis label
unit_lable <- if_else(admissions_based,
                      "Admissions (n/10,000 beds)",
                      "Discharges (n/10,000 beds)")

## plot cases and fits with prediction intervals
ggplot() +
  ## plot for each project row-wise
  facet_grid(project ~ .) +
  ## plot interpolated observed values as a line - label observed (colour defined later)
  geom_line(data = patient_counts,
            aes(x = as.Date(epiweek), group = intervention,
                y = rate_int, colour = "Observed")) +
  ## add in the Covid-19 region start date line and label
  geom_vline(data = filter(timelines, event == "Covid region"),
             aes(xintercept = as.Date(calendar_week)),
             linetype = 2, colour = "red") +
  geom_label(data = filter(timelines, event == "Covid region"),
             aes(x = as.Date(calendar_week), y = height, label = "Covid-19")) +
  ## plot 95% prediction intervals as a ribbon - alpha sets transparency
  geom_ribbon(data = observed,
              aes(x = as.Date(epiweek), group = intervention,
                  ymin = lower_pi, ymax = upper_pi, alpha = "95% PI"),
              fill = "grey60") +
  ## plot modelled values as a line
  geom_line(data = observed,
            aes(x = as.Date(epiweek), group = intervention,
                y = estimate, colour = "Fitted")) +
  ## set the colours for defined parts (match names to colours)
  scale_colour_manual(values = c("Observed" = "black", "Fitted" = "blue")) +
  scale_alpha_manual(values = c("95% PI" = 0.4)) +
  ## reverse the order of the legend
  guides(colour = guide_legend(reverse = TRUE)) +
  ## make y and x axes meet at the origin (x = 0, y = 0)
  scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
  ## label the axes appropriately and remove legend titles
  labs(x = week_lable, y = unit_lable,
       colour = element_blank(), alpha = element_blank()) +
  ## add in theme and axis labels
  tsa_axes +
  tsa_theme
\newpage
## In general, the negative binomial regression model can be written as follows:
# log(Y_t) = β_0 + β_1*t + β_2*δ(t-t_0) + β_3*(t-t_0)^+ + log(pop_t) + e_t
# Y_t is the number of IPD cases observed in year t
# pop_t is the population size in 100,000s in year t
# t_0 is the last year (7th year) of the pre-period (including the transition time if there is any)
# δ(x) is the indicator function (it is 0 if x ≤ 0 and 1 if x > 0)
# (x)^+ is the cut-off operator (it is x if x > 0 and 0 otherwise)
# e_t denotes the residual
# Therefore β_2*δ(t-t_0) + β_3*(t-t_0)^+ is the generalised linear part of the post-period,
# which is zero in the pre-period.
# The estimates of β_2 and β_3 will denote the effects of the intervention.
# More periods/terms can be added to control for seasonality and population.

## join the intervention weeks to the dataset by project
cases <- left_join(cases, intervention_weeks, by = "project")

## define variables for regression
cases <- cases %>%
  group_by_key() %>%
  mutate(
    ## corresponds to t in the formula
    ## count of weeks (could probably also just use the straight epiweek variable)
    linear = row_number(epiweek),
    ## corresponds to δ(t-t0) in the formula
    ## pre or post covid period
    intervention = as.numeric(epiweek >= confirmed_region),
    ## corresponds to (t-t0)^+ in the formula
    ## count of weeks post intervention
    time_post = pmax(0, epiweek - confirmed_region + 1))

## drop HIV related because there are hardly any cases
cases <- filter(cases, disease != "HIV related")

## drop the Balukhali project and malaria in Bangladesh
if (z == "Bangladesh") {
  cases <- filter(cases, project != "Balukhali" & disease != "Malaria")
  projects <- projects[!projects %in% "Balukhali"]
}

## drop diarrhoea for DRC South Kivu
if (z == "DRC South Kivu") {
  cases <- filter(cases, disease != "Diarrhoea")
}

## drop diarrhoea, TB and chronic diseases for Nigeria
if (z == "Nigeria") {
  cases <- filter(cases, !disease %in% c("Diarrhoea", "TB", "Chronic diseases"))
}

## drop TB in Pakistan
if (z == "Pakistan") {
  cases <- filter(cases, !disease %in% c("TB"))
}

## define fourier terms (sincos)
cases$fourier <- purrr::map(
  ## for each of the projects
  projects,
  function(.x) {
    ## within each project do for each disease
    map(unique(cases$disease),
        function(.y) {
          ## run fourier (subsetting data appropriately)
          forecast::fourier(
            ## only keep the relevant columns, project and disease
            filter(cases, project == .x & disease == .y) %>%
              select(epiweek, total),
            ## choose the number of sincos terms to include
            K = 2
          )
        })}) %>%
  ## pull disease-specific datasets out of the project-specific lists
  flatten() %>%
  ## bind together a list (bind_rows doesn't work because cols are not named)
  do.call(rbind, .)
## split data for fitting by project and disease
dat <- cases %>%
  as_tibble() %>%
  group_by(project, disease) %>%
  nest()

## define the model you want to fit (negative binomial)
model <- glm_nb_model(
  ## set number of cases as outcome of interest
  total ~
    ## use epiweek to account for the trend
    epiweek +
    ## use the fourier terms to account for seasonality
    fourier +
    ## add in whether in the pre- or post-period
    intervention +
    ## add in the time post intervention
    time_post +
    ## include the population as an offset (to model rates)
    offset(log(population))
)

## fit the baseline model
dat <- dat %>%
  mutate(
    ## for the baseline data fit the model defined above for each project
    fit = purrr::map(data, ~fit(model, .x)))

## need to change population to 1 for prediction (this way it returns a rate)
dat <- dat %>%
  ## within the nested list loop over the data frames for each project
  mutate(data = purrr::map(data,
                           ## within each dataframe, change population to be 1
                           ~mutate(.x, population = 1)))

## calculate confidence intervals and prediction intervals for plotting
observed <- dat %>%
  mutate(predicted = purrr::map2(.x = project, .y = disease,
                                 ~predict(dat$fit[dat$project == .x & dat$disease == .y][[1]],
                                          dat$data[dat$project == .x & dat$disease == .y][[1]],
                                          simulate_pi = FALSE))) %>%
  ## pull projects in to a single data frame
  select(predicted) %>%
  unnest(cols = c(predicted))
# exponentiate results for IRRs and then pull bits out to get percentage change
dat %>%
  ## use broom to exponentiate terms and get a confidence interval for each project
  mutate(outputs = purrr::map(fit,
                              function(i) {
                                ## extract the original negative binomial regression
                                get_model(i) %>%
                                  ## get a tidy dataframe of results
                                  broom::tidy(exponentiate = TRUE, conf.int = TRUE)
                              })) %>%
  ## only keep project, disease and the data frame with exponentiated results
  select(project, disease, outputs) %>%
  ## pull out of the list in to a dataframe
  unnest(cols = c(outputs)) %>%
  ## only keep the intervention value
  filter(term == "intervention") %>%
  ## change the IRR to percentage change for estimate and CIs
  mutate(
    ## for each of the columns of interest - create a new column
    across(
      all_of(c("estimate", "conf.low", "conf.high")),
      ## apply the formula to calculate percentage change
      .f = function(i) 100 * (i - 1),
      ## add a suffix to new column names with "_perc"
      .names = "{.col}_perc")
  ) %>%
  ## only keep (and rename) certain columns
  ## (use the "_perc" columns so the table shows percentage change, not the raw IRR)
  select(Project = project,
         Disease = disease,
         "Percentage change" = estimate_perc,
         "95%CI low" = conf.low_perc,
         "95%CI high" = conf.high_perc,
         "p-value" = p.value) %>%
  ## show as a formatted table in the output
  flextable::flextable() %>%
  ## select the number of digits wanted in different columns
  flextable::colformat_double(j = c("Percentage change",
                                    "95%CI low",
                                    "95%CI high"),
                              digits = 1) %>%
  flextable::colformat_double(j = c("p-value"), digits = 3) %>%
  autofit() %>%
  set_caption("Percentage change in disease specific rates")
## define the appropriate y-axis label
unit_lable <- if_else(admissions_based,
                      "Disease admissions (n/10,000 admissions)",
                      "Disease discharges (n/10,000 discharges)")

## loop over projects to produce faceted plots for each (otherwise too big for the word doc)
## walk is the same as map but does it quietly (doesn't show the list number in the output)
purrr::walk(projects,
            ~ print(
              ## plot cases and fits with prediction intervals
              ggplot(data = filter(cases, project == .x),
                     aes(x = as.Date(epiweek), group = intervention)) +
                ## plot for each disease row-wise (one project per plot)
                facet_grid(project + disease ~ ., scales = "free_y") +
                ## plot interpolated observed values as a line - label observed (colour defined later)
                geom_line(data = filter(cases, project == .x),
                          aes(x = as.Date(epiweek), group = intervention,
                              y = rate_int, colour = "Observed")) +
                ## plot 95% prediction intervals as a ribbon - alpha sets transparency
                geom_ribbon(data = filter(observed, project == .x),
                            aes(x = as.Date(epiweek), group = intervention,
                                ymin = lower_pi, ymax = upper_pi, alpha = "95% PI"),
                            fill = "grey60") +
                ## plot modelled values as a line
                geom_line(data = filter(observed, project == .x),
                          aes(x = as.Date(epiweek), group = intervention,
                              y = estimate, colour = "Fitted")) +
                ## set the colours for defined parts (match names to colours)
                scale_colour_manual(values = c("Observed" = "black", "Fitted" = "blue")) +
                scale_alpha_manual(values = c("95% PI" = 0.4)) +
                ## reverse the order of the legend
                guides(colour = guide_legend(reverse = TRUE)) +
                ## make y and x axes meet at the origin (x = 0, y = 0)
                scale_y_continuous(expand = c(0, 0), limits = c(0, NA)) +
                ## label the axes appropriately and remove legend titles
                labs(x = week_lable, y = unit_lable,
                     colour = element_blank(), alpha = element_blank()) +
                ## add in theme and axis labels
                tsa_axes +
                tsa_theme
            ))
\newpage
The methods used here are explained in detail elsewhere [@hyndmann_forecasting_2019; @salmon_monitoring_2016]; what follows is a brief summary of the techniques.
Data sources

<!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/// wards_included \\
This chunk pulls together a sentence on whether maternity and ITFC wards were
included in the analysis.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
wards_included <- if_else(medical_wards == "Medical",
                          "This analysis only considered medical wards (not including maternity or inpatient therapeutic feeding centres)",
                          "This analysis considered all medical wards (including maternity and inpatient therapeutic feeding centres)")
`r wards_included`. The various data sources used are listed below.
## using the cleaned IPD data clean_data %>% ## do by project and data source type group_by(project, source) %>% ## get the beginning and end of each source reporting summarise(Start = min(calendar_week, na.rm = TRUE), Stop = max(calendar_week, na.rm = TRUE)) %>% ## capitalise var names rename(Project = project, Source = source) %>% ## show as a formatted table in the output flextable() %>% autofit() %>% set_caption("Time periods which data sources were used by project")
Data preparation
Given the variation in project activities over time, we present rates per 10,000 patient discharges rather than counts, as below:
$rate_{cases} = \frac{n_{cases}}{n_{discharges}} \times 10,000$
For weeks where the total number of patients was less than ten, counts were set to missing.
Where weeks were missing counts, linear interpolation was used to fill these gaps [@moritz_imputets_2017]. This involves fitting a line between the previous and the next available non-missing values, and then filling in the missing values based on that line.
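A minimal sketch of these preparation steps, assuming a weekly data frame `counts` with numeric columns `total` (cases) and `discharges` (denominator); the object and column names are illustrative:

```r
## sketch: censor small denominators, interpolate gaps, then compute the rate
library(dplyr)
library(imputeTS)

counts <- counts %>%
  mutate(
    ## set weeks with fewer than ten patients to missing
    across(c(total, discharges), ~ if_else(discharges < 10, NA_real_, .x)),
    ## fill the gaps by linear interpolation between adjacent non-missing weeks
    across(c(total, discharges), na_interpolation),
    ## rate per 10,000 discharges
    rate = total / discharges * 10000
  )
```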
Descriptive
Classical decomposition was used [@hyndmann_forecasting_2019-1]. This involves breaking down the time series into its component parts in order to understand whether there is a long-term upward or downward trend, whether there are differences by season, and whether there are longer-term cycles beyond seasons.
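A minimal sketch of this decomposition with {feasts}, assuming a tsibble `weekly` indexed by `epiweek` with an interpolated rate column `rate_int` (both names illustrative):

```r
## sketch: break a weekly series into trend, seasonal and remainder components
library(dplyr)   # for the pipe
library(feasts)  # also attaches {fabletools} for model() and components()

weekly %>%
  model(classical_decomposition(rate_int, type = "additive")) %>%
  components() %>%
  autoplot()
```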
Regression
A baseline of expected case rates was created by fitting a negative binomial regression (generalised linear model) of case data between 2014 and 2019. This regression included a term for the trend as well as the two best fitting sine and cosine terms (using Fourier series) [@hyndmann_forecasting_2019-2]. Two Fourier terms were used as there are some arguments for irregular LRTI seasonality in warmer climates [@hirve_influenza_2016; @lam_nonannual_2018]. Where there is no strong second peak, the second term does not substantially shift the fitted regression. Fourier terms are calculated independently for each project and disease.
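For orientation, a minimal sketch of this baseline model is given below, assuming a data frame `baseline` of weekly counts (`total`) with a week index (`epiweek`) and a denominator (`population`); the object names are illustrative, while `glm_nb_model()` and `forecast::fourier()` are the functions used in the analysis chunks above.

```r
## sketch of the baseline regression, under the assumptions stated above
library(forecast)
library(trending)

## two pairs of sine and cosine terms over a 52-week seasonal period
baseline$fourier <- forecast::fourier(ts(baseline$total, frequency = 52), K = 2)

## negative binomial regression with trend, seasonality and a population offset
model <- glm_nb_model(total ~ epiweek + fourier + offset(log(population)))
fitted_model <- trending::fit(model, baseline)
```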
This regression model was then used to predict the case rate in each week of the following year, and to calculate 95% and 99% prediction intervals [@hyndmann_forecasting_2019-3]. Prediction intervals were calculated from quantiles of the negative binomial distribution for each fitted value [@jombart_r_2020].
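As an illustration of the principle (not the exact internals of the packages used), prediction bounds can be sketched from negative binomial quantiles, assuming a fitted mean `mu` and dispersion `theta` taken from the model (both names illustrative):

```r
## sketch: 95% prediction interval from negative binomial quantiles,
## assuming `mu` (fitted mean) and `theta` (dispersion) come from the fitted model
lower_pi <- qnbinom(0.025, mu = mu, size = theta)
upper_pi <- qnbinom(0.975, mu = mu, size = theta)
```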
Regression validation
To validate how well the model performed, two things were done. First, we examined the differences between the fitted values from the regression and the observed values (the residuals) [@hyndmann_forecasting_2019-4]. In a well-fitting model these residuals should be uncorrelated, have constant variance and a mean of zero, and be approximately normally distributed; in other words, the fit should not be systematically worse in one time period than in another. Second, we used time series cross-validation [@hyndmann_forecasting_2019-5]. This involves creating multiple overlapping subsets of the data and assessing how well the model predicts 52 weeks ahead. From this we obtain several measures of how accurately the model predicts the observed values.
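A rough sketch of these two checks, assuming a numeric vector of residuals (`model_resid`) and a data frame `cv` holding observed and forecast rates from the cross-validation; all names are illustrative:

```r
## sketch of the validation checks, under the assumptions stated above
library(yardstick)

## Ljung-Box test: H0 is that the residuals are white noise (no autocorrelation);
## a small p-value suggests structure the model has not captured
Box.test(model_resid, type = "Ljung-Box")

## accuracy of the cross-validated forecasts against the observed rates
rmse(cv, truth = observed_rate, estimate = forecast_rate)
mae(cv, truth = observed_rate, estimate = forecast_rate)
```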
Segmented regression
This uses negative binomial regression for interrupted time series analysis.
The regression includes a term for the trend as well as the two best-fitting
sine and cosine terms (using Fourier series) [@hyndmann_forecasting_2019-2].
On top of this it includes terms for being in the pre- or post-intervention
period and for the number of weeks post-intervention.
Here the interruption point is the first week with a confirmed Covid-19 case
in the region of each project (or the nearest relevant region).
Effectively this fits two regressions (one pre, one post), which enables the
calculation of an incidence rate ratio (IRR).
From the IRR and its 95% CI it is possible to calculate the percentage change
with the formula below, where the estimate is either the IRR or a CI limit
[@taljaard_use_2014].
$\%\Delta = 100 \times (estimate - 1)$
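As a minimal sketch of this conversion, assuming `fitted` is a model fit from {trending} as in the analysis chunks above (the object name is illustrative):

```r
## sketch: convert the intervention IRR and its CI to percentage change
library(dplyr)
library(broom)
library(trending)

get_model(fitted) %>%                             # underlying glm.nb fit
  tidy(exponentiate = TRUE, conf.int = TRUE) %>%  # IRRs with 95% CIs
  filter(term == "intervention") %>%
  mutate(across(c(estimate, conf.low, conf.high), ~ 100 * (.x - 1)))
```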
\newpage