In R4EPI/sitrep: Report templates and helper functions for applied epidemiology

Introduction to this template

This is a template which can be used to create a report from a retrospective mortality survey.

There are sections for reading and cleaning data, followed by weighting and observation time calculations and then survey analysis.
For a more detailed explanation of this template, please visit https://r4epis.netlify.com/surveys
Feedback and suggestions are welcome at the GitHub issues page
Text within <! > will not show in your final document. These comments are used to explain the template. You can delete them if you want.

Installing and loading required packages

## hide all code chunks in the output, but show errors
knitr::opts_chunk$set(echo = FALSE, error = TRUE, warning = FALSE, message = FALSE, 
                      fig.width = 6*1.25, fig.height = 6)


## set default NA to - in output, define figure width/height
options(knitr.kable.NA = "-")

# Ensures the package "pacman" is installed
if (!require("pacman")) {
     install.packages("pacman") }

# Install and load required packages for this template
pacman::p_load(
  knitr,       # create output docs
  here,        # find your files
  rio,         # for importing data
  janitor,     # clean/shape data
  dplyr,       # clean/shape data
  tidyr,       # clean/shape data
  forcats,     # manipulate and rearrange factors
  stringr,     # manipulate texts
  ggplot2,     # create plots and charts
  ggalluvial,  # for visualising population flows
  apyramid,    # plotting age pyramids
  sitrep,      # MSF field epi functions
  survey,      # for survey functions
  srvyr,       # dplyr wrapper for survey package
  gtsummary,   # produce tables
  flextable,   # for styling output tables
  labelled,    # add labels to variables
  matchmaker,  # recode datasets with dictionaries
  lubridate,   # working with dates
  parsedate    # guessing dates
  )

## set default text size to 18 for plots
## give classic black/white axes for plots
ggplot2::theme_set(theme_classic(base_size = 18))

## generates MSF standard dictionary for Kobo 
study_data_dict <- msf_dict_survey("Mortality")
## generates MSF standard dictionary for Kobo in long format (for recoding)
study_data_dict_long <- msf_dict_survey(disease = "Mortality", compact = FALSE)


## generates a fake dataset for use as an example in this template
## this dataset already has household and individual levels merged
study_data_raw <- gen_data(dictionary = "Mortality",
                           varnames   = "name",
                           numcases   = 1000)

### Read in data ---------------------------------------------------------------


## Excel file ------------------------------------------------------------------

## read in household data sheet
# study_data_hh <- rio::import(here::here("mortality_survey.xlsx"), 
#                               which = "mortality_survey_erb", na = "")

## read in individual level data sheet
# study_data_indiv <- rio::import(here::here("mortality_survey.xlsx", 
#                               which = "r1", na = "")


## Excel file with password ----------------------------------------------------
## Use this section if your Excel has a password.

# install.packages(c("excel.link", "askpass"))
# library(excel.link)

# study_data_hh <- xl.read.file(here::here("mortality_survey.xlsx"),
#                              xl.sheet = "mortality_survey_erb",
#                              password = askpass::askpass(prompt = "please enter file password"))

# study_data_indiv <- xl.read.file(here::here("mortality_survey.xlsx"),
#                              xl.sheet = "r1",
#                              password = askpass::askpass(prompt = "please enter file password"))

## Excel file ------------------------------------------------------------------
## to read in a specific sheet use "which"
# study_data_hh <- rio::import(here::here("mortality_survey.xlsx"), which = "Sheet1")

## Excel file -- Specific range ------------------------------------------------
## you can specify a range in an excel sheet.
# study_data_hh <- rio::import(here::here("mortality_survey.xlsx"), range = "B2:J102")

## Excel file with password ----------------------------------------------------
## use this section if your Excel has a password.
# install.packages(c("excel.link", "askpass"))
# library(excel.link)
# study_data_hh <- xl.read.file(here::here("mortality_survey.xlsx"),
#                              xl.sheet = "Sheet1",
#                              password = askpass::askpass(prompt = "please enter file password"))

## CSV file --------------------------------------------------------------------
# study_data_hh <- rio::import(here::here("mortality_survey.csv"))

## Stata data file -------------------------------------------------------------
# study_data_hh <- rio::import(here::here("mortality_survey.dat"))

## join the individual and household data to form a complete data set
#study_data_raw <- left_join(study_data_hh, study_data_indiv, by = c("_index" = "_parent_index"))

## Data dictionary -------------------------------------------------------------


## read in a kobo data dictionary
## important to specify template is FALSE (otherwise you get the generic dict)
# study_data_dict <- msf_dict_survey(name = here::here("mortality_survey_dict.xlsx"), 
#                                    template = FALSE)

## look at the dictionary by uncommenting the line below
# View(study_data_dict) 


## Clean column names ----------------------------------------------------------
## This step fixes the column names so they are easy to use in R.


## make a copy of your orginal dataset and name it study_data_cleaned
study_data_cleaned <- study_data_raw

## Sometimes you want to rename specific variables
## For example we want to add details to violance_nature_other
## This is because otherwise it will be duplicated after we clean names below
## The formula for this is rename(data, NEW_NAME = OLD_NAME). 
## You can add multiple column name recodes by simply separating them with a comma
study_data_cleaned <- rename(study_data_cleaned,
                             violence_nature_other_details    =  violence_nature_other)

## define clean variable names using clean_names from the janitor package. 
study_data_cleaned <- janitor::clean_names(study_data_cleaned)

## create a unique identifier by combining indeces of the two levels 
## x and y are added on to the end of variables that are duplicated 
## (in this case, parent and child index)
# study_data_cleaned <- study_data_cleaned %>% 
#   mutate(uid = str_glue("{index}_{index_y}"))

## MSF Survey Dictionary ----------------------------------------------------------

## generates MSF standard dictionary for Kobo 
# study_data_dict <- msf_dict_survey("Mortality")
## generates MSF standard dictionary for Kobo in long format (for recoding)
# study_data_dict_long <- msf_dict_survey(disease = "Mortality", compact = FALSE)

## look at the standard dictionary by uncommenting the line below
# View(study_data_dict) 
## You will need to recode your variables to match the data dictionary. This is
## addressed below.



## Clean column names ----------------------------------------------------------
## This step fixes the column names so they are easy to use in R.

## make a copy of your orginal dataset and name it study_data_cleaned
# study_data_cleaned <- study_data_raw

## define clean variable names using clean_names from the janitor package. 
# study_data_cleaned <- janitor::clean_names(study_data_cleaned)

## Match column names ---------------------------------------------------------
## This step helps you match your variables to the standard variables.
## This step will require some patience. Courage!

## Use the function msf_dict_rename_helper() to create a template based on the
## standard dictionary. This will copy a rename command like the one above to your
## clipboard.

# msf_dict_rename_helper("mortality", varnames = "name")

## Paste the result below and your column names to the matching variable.

## Be careful! You still need to be aware of what each variable means and what
## values it takes.

## If there are any variables that are in the MSF dictionary that are not in
## your data set, then you should comment them out, but be aware that some
## analyses may not run because of this. 


## PASTE HERE


## Here is an EXAMPLE for changing a few specific names. function. In this
## example, we have the columns "gender" and "age" that we want to rename as
## "sex" and "age_years". 
## The formula for this is rename(data, NEW_NAME = OLD_NAME).

# study_data_cleaned <- rename(study_data_cleaned, 
#                            sex       = gender, # TEXT
#                            age_years = age     # INTEGER_POSITIVE
# )

## Read data -------------------------------------------------------------------

## This step reads in your population data from Excel.
## You may need to rename your columns.

# population_data <- rio::import(here::here("population.xlsx"), which = "Sheet1")

## repeat preparation steps as appropriate


## Enter counts directly -------------------------------------------------------


## Below is an example of how to enter population counts by groups. 

# population_data_age <- gen_population(
#   groups = c("0-2", "3-14", "15-29", "30-44", "45+"),
#   counts = c(3600, 18110, 13600, 8080, 6600),
#   strata = NULL) %>%
#   rename(age_group = groups,
#     population = n)


## Create counts from proportions ----------------------------------------------
## This step helps you estimate sub-group size with proportions.
## You need to replace the total_pop and proportions. You can change the groups
## to fit your needs. 

## Here we repeat the steps for two regions (district A and B) then bind the two
## together 


## generate population data by age groups in years for district A
population_data_age_district_a <- gen_population(total_pop = 10000, # set total population 
  groups      = c("0-2", "3-14", "15-29", "30-44", "45+"), # set groups
  proportions = c(0.0340, 0.1811, 0.1380, 0.0808, 0.0661), # set proportions for each group
  strata      = c("Male", "Female")) %>%           # stratify by gender
  rename(age_group  = groups,                      # rename columns (NEW NAME = OLD NAME)
         sex        = strata,
         population = n) %>% 
  mutate(health_district = "district_a")           # add a column to identify region 


## generate population data by age groups in years for district B
population_data_age_district_b <- gen_population(total_pop = 10000, # set total population 
  groups      = c("0-2", "3-14", "15-29", "30-44", "45+"), # set groups
  proportions = c(0.0340, 0.1811, 0.1380, 0.0808, 0.0661), # set proportions for each group
  strata      = c("Male", "Female")) %>%           # stratify by gender
  rename(age_group  = groups,                      # rename columns (NEW NAME = OLD NAME)
         sex        = strata,
         population = n) %>% 
  mutate(health_district = "district_b")           # add a column to identify region 



## bind region population data together to get overall population 
population_data_age <- bind_rows(population_data_age_district_a, 
                                 population_data_age_district_b)

cluster_counts <- tibble(cluster = c("Village 1", "Village 2", "Village 3", "Village 4", 
                                     "Village 5", "Village 6", "Village 7", "Village 8",
                                     "Village 9", "Village 10"), 
                         households = c(700, 400, 600, 500, 300, 
                                        800, 700, 400, 500, 500))

## view the first ten rows of data
head(study_data_raw, n = 10)

## view your whole dataset interactivley (in an excel style format)
View(study_data_raw)

## overview of variable types and contents
str(study_data_raw)

## get summary: 
## mean, median and max values of variables
## counts for categorical variables
## also gives number of NAs
summary(study_data_raw)

## view unique values contained in variables 
## you can run this for any column -- just replace the column name
unique(study_data_raw$sex)

## check for logical date inconsistencies 
## for example check deaths before births and return corresponding IDs
## unique ID is generated in the prepare_kobo_data chunk (need to uncomment)
study_data_raw %>% 
  filter(date_death < date_birth) %>% 
  select("uid")

## use the dfSummary function in combination with view
## note that view is not capitalised with this package
# summarytools::dfSummary(study_data_cleaned) %>%
#   summarytools::view()

## Kobo standard data --------------------------------------------------------
## If you got your data from Kobo, use this portion of the code.
## If not, comment this section out and use the below.

## make sure all date variables are formatted as dates 
## get the names of variables which are dates
DATEVARS <- study_data_dict %>% 
  filter(type == "date") %>% 
  filter(name %in% names(study_data_cleaned)) %>% 
  ## filter to match the column names of your data
  pull(name) # select date vars

## find if there are date variables which are completely empty
EMPTY_DATEVARS <- purrr::map(DATEVARS, ~all(
  is.na(study_data_cleaned[[.x]])
  )) %>% 
  unlist() %>% 
  which()

## remove exclude the names of variables which are completely emptys
DATEVARS <- DATEVARS[-EMPTY_DATEVARS]

## change to dates 
## use the parse_date() function to make a first pass at date variables.
## parse_date() produces a complicated format - so we use as.Date() to simplify
study_data_cleaned <- study_data_cleaned %>%
  mutate(
    across(.cols = all_of(DATEVARS),
           .fns = ~parsedate::parse_date(.x) %>% as.Date()))


## Non-Kobo data -------------------------------------------------------------
## Use this section if you did not have Kobo data. 

## use the parse_date() function to make a first pass at date variables.
## parse_date() produces a complicated format - so we use as.Date() to simplify# study_data_cleaned <- study_data_cleaned %>%
#   mutate(
#     across(.cols = matches("date|Date"),
#            .fns  = ~parsedate::parse_date(.x) %>% as.Date()))


## Define recall period --------------------------------------------------------

## set the start/end of recall period
## can be changed to date variables from dataset 
## (e.g. arrival date & date questionnaire)
study_data_cleaned <- study_data_cleaned %>% 
  mutate(recall_start = as.Date("2018-02-01"), 
         recall_end   = as.Date("2018-05-01")
  )

## Fix wrong dates ------------------------------------------------------------- 

## Some dates will be unrealistic or wrong.
## Here is an example of how to manually fix dates. 
## Look at your data and edit as needed.

## set specific unrealistic dates to NA
# study_data_cleaned <- mutate(study_data_cleaned,
#                              death_date < as.Date("2017-11-01") ~ as.Date(NA), 
#                              death_date == as.Date("2081-01-01") ~ as.Date("2018-01-01"))

## set inappropriate dates to NA based on rules 
## e.g. arrivals before start, departures departures after end
# study_data_cleaned <- study_data_cleaned %>%
#   mutate(arrived_date = ifelse(arrived_date < recall_start, 
#                                              as.Date(NA), 
#                                              arrived_date),
#          birthday_date = ifelse(birthday_date < recall_start, 
#                                     as.Date(NA),
#                                     birthday_date),
#          left_date = ifelse(left_date > recall_end, 
#                                         as.Date(NA),
#                                         left_date),
#          death_date = ifelse(death_date > recall_end,
#                                      as.Date(NA),
#                                      death_date)
#          )

## Age group variables ----------------------------------------------------------
## This step shows you how to create categorical variables from numeric variables.
## We have some intermediate steps on the way.

## make sure age is an integer 
study_data_cleaned <- study_data_cleaned %>% 
  mutate(age_years = as.integer(age_years))

## create an age group variable by specifying categorical breaks (of years)
study_data_cleaned <- study_data_cleaned %>% 
  mutate(age_group = age_categories(age_years, 
                                    breakers = c(0, 3, 15, 30, 45)
                                    ))

## create age group variable for under 2 years based on months
study_data_cleaned <- study_data_cleaned %>% 
  mutate(age_group_mon = if_else(
    ## if months <= 12 then calculated age groups 
    age_months <= 12, 
    age_categories(study_data_cleaned$age_months, 
                                        breakers = c(0, 6, 9, 12), 
                                        ceiling = TRUE), 
    ## otherwise set new variable to missing
    factor(NA)))


## alternatively, create an age group variable specify a sequence
# study_data_cleaned$age_group <- age_categories(study_data_cleaned$age,
#                                                lower = 0, 
#                                                upper = 100, 
#                                                by = 10)

## If you already have an age group variable defined, you should manually
## arrange the categories
# study_data_cleaned$age_group <- factor(study_data_cleaned$age_group,
#                                        c("0-4y", "5-14y", "15-29y", "30-44y", "45+y"))


## to combine different age categories use the following function 
## this prioritises the smaller unit, meaning:
## it will order your age_category from smallest to largest unit 
## i.e. days first then months then years (depending on which are given)
study_data_cleaned <- group_age_categories(study_data_cleaned, 
                                           years  = age_group,
                                           months = age_group_mon)

## recode values with matchmaker package (value labels cant be used for analysis) 
study_data_cleaned <- match_df(study_data_cleaned, 
         study_data_dict_long, 
         from = "option_name", 
         to = "option_label_english", 
         by = "name", 
         order = "option_order_in_set")

## Change a yes/no variable in to TRUE/FALSE
## create a new variable called consent 
## where the old one is yes place TRUE in the new one
## (this could also be done with the if_else or case_when function - 
## but this option is shorter)
study_data_cleaned <- study_data_cleaned %>% 
  mutate(consent = consent == "Yes")


## Change a yes/no variable in to TRUE/FALSE
## create a new variable called died 
## where the old one is yes place TRUE in the new one
study_data_cleaned <- study_data_cleaned %>% 
  mutate(died = died == "Yes")


## Recode character variables
## This step shows how to fix misspellings in the geographic region variable.
## Ideally, you want these values to match and population data!
# study_data_cleaned <- study_data_cleaned %>%
#   mutate(village_name = case_when(
#     village_name == "Valliages 1"       ~ "village 1",
#     village_name == "Village1"          ~ "village 1",
#     village_name == "Town 3"            ~ "village 3"
#     village_name == "Town3"             ~ "village 3",
#     TRUE ~ as.character(village_name))
#   ))


## create a character variable based off groups of a different variable 
study_data_cleaned <- study_data_cleaned %>% 
  mutate(health_district = case_when(
    cluster_number %in% c(1:5) ~ "district_a", 
    TRUE ~ "district_b"
  ))


## Set the levels of a factor 
# study_data_cleaned <- study_data_cleaned %>%
#   mutate(cause = factor(cause,
#                         levels = c("fever_malaria", "diarrhoea", 
#                                    "respiratory_infection", "trauma_accident", 
#                                    "during_pregnancy", "during_delivery",
#                                    "post_partum", "violence", "dont_know", 
#                                    "other")))

## explicitly replace NA of a factor
study_data_cleaned <- study_data_cleaned %>%
  mutate(cause = fct_explicit_na(cause, na_level = "Not Applicable"))

## if you had a multi-choice question and the missing values were given as a "."
## replace missing values in multi-choice questions to "" so that we can
## filter them out later.
# study_data_cleaned <- study_data_cleaned %>%
#   mutate(
#     across(
#       .cols = contains("violence"), # all variables that contain the word "violence"
#       .fns = ~fct_explicit_na(as.character(.), "") # replace missing values with ""
#       )
#     )


## fix factor levels -----------------------------------------------------------

## make sure there are the appropriate levels (names)
study_data_cleaned$no_consent_reason <- fct_recode(study_data_cleaned$no_consent_reason,
                                            "No time" = "I do not have time",
                                            "Previous negative experience" = "I have had a negative experience with a previous survey.",
                                            "No perceived benefit" = "There is no benefit to me and my family"
                                           )

## Find start and end dates/causes ---------------------------------------------

## create new variables for start and end dates/causes
study_data_cleaned <- study_data_cleaned %>%
  ## choose earliest date entered in survey
  ## from births, household arrivals, and camp arrivals 
  find_start_date("date_birth",
                  "date_arrived",
                  period_start = "recall_start",
                  period_end   = "recall_end",
                  datecol      = "startdate",
                  datereason   = "startcause" 
                 ) %>%
  ## choose earliest date entered in survey
  ## from camp departures, death and end of the study
  find_end_date("date_left",
                "date_death",
                period_start = "recall_start",
                period_end   = "recall_end",
                datecol      = "enddate",
                datereason   = "endcause" 
               ) %>%
  ## label those that were present at the start/end (except births/deaths)
  mutate(startcause = if_else(startdate == recall_start & startcause != "birthday_date",
                              "Present at start", startcause)) %>%
  mutate(endcause = if_else(enddate == recall_end & endcause != "death_date", 
                            "Present at end", endcause))

## fix factor levels -----------------------------------------------------------

## make sure there are the appropriate levels (names)
study_data_cleaned$startcause <- fct_recode(study_data_cleaned$startcause,
                                            "Present at start" = "Present at start",
                                            "Born" = "date_birth",
                                            "Other arrival" = "date_arrived"
                                           )

## make sure there are the appropriate levels (names)
study_data_cleaned$endcause <- fct_recode(study_data_cleaned$endcause,
                                          "Present at end" = "Present at end",
                                          "Died" = "date_death",
                                          "Other departure" = "date_left")

## month of death --------------------------------------------------------------

## create a variable for month of end (used for deaths by month)
## note this is not used because we have to iterrate in the cmr_month chunk
# study_data_cleaned <- study_data_cleaned %>% 
#   mutate(endmonth = tsibble::yearmonth(enddate))

## time sequence of events -----------------------------------------------------

## check that you do not have any negative observation times
## (e.g. any rows with end date before start date)
check_dates <- assert_positive_timespan(study_data_cleaned, startdate, enddate)

## return the unique identifiers which have negative observation time 
# check_dates$uid

## observation time ------------------------------------------------------------

## Define observation time in days
study_data_cleaned <- study_data_cleaned %>% 
  mutate(obstime = as.numeric(enddate - startdate))

## create variable names with labelled  

## define a list of names and labels based on the dictionary 
var_labels <- setNames(
  ## add variable labels as a list
  as.list(study_data_dict$short_name), 
  ## name the list with the current variable names
  study_data_dict$name)

## add the variable labels to dataset 
study_data_cleaned <- study_data_cleaned %>% 
  set_variable_labels(
    ## set the labels with the object defined above
    .labels = var_labels, 
    ## do not throw an error if some names dont match 
    ## not all names in the dictionary appear in the dataset 
    .strict = FALSE)

## it is possible to update individual variables manually too
study_data_cleaned <- study_data_cleaned %>% 
  set_variable_labels(
    ## variable name = variable label 
    age_years     = "Age (years)",
    age_group     = "Age group (years)",
    age_months    = "Age (months)", 
    age_group_mon = "Age group (months)",
    age_category  = "Age category", 
    died          = "Died", 
    startcause    = "Observation start", 
    endcause      = "Observation end"
    )

## Drop unused rows  -----------------------------------------------------------

## store the cases that you drop so you can describe them (e.g. non-consenting 
## or wrong village/cluster)
dropped <- study_data_cleaned %>% 
  filter(!consent | is.na(startdate) | is.na(enddate) | 
           ## note that whatever you use for weights cannot be missing!
           village_name == "Other"| is.na(age_years) | is.na(sex))

## use the dropped cases to remove the unused rows from the survey data set  
study_data_cleaned <- anti_join(study_data_cleaned, dropped, by = names(dropped))


## Drop columns ----------------------------------------------------------------
## OPTIONAL: This step shows you how you can remove certain variables.
## study_data_cleaned <- select(study_data_cleaned, -c("age_years", "sex"))

## OPTIONAL: if you want to inspect certain variables, you can select these by
## name or column number. This example creates a reduced dataset for the first
## three columns, age_years, and sex.

# study_data_reduced <- select(study_data_cleaned, c(1:3, "age_years", "sex")

## option 1: only keep the first occurrence of duplicate case 
study_data_cleaned <- study_data_cleaned %>% 
  ## find duplicates based on case number, sex and age group 
  ## only keep the first occurrence 
  distinct(uid, sex, age_group, .keep_all = TRUE)

# ## option 2: create flagging variables for duplicates (then use to browse)
# 
# study_data_cleaned <- study_data_cleaned %>% 
#   ## choose which variables to use for finding unique rows 
#   group_by(uid, sex, age_group) %>% 
#   mutate(
#     ## get the number of times duplicate occurs 
#     num_dupes = n(), 
#     duped = if_else(num_dupes > 1 , TRUE, FALSE)
#   )
# 
# ## browse duplicates based on flagging variables 
# study_data_cleaned %>% 
#   ## only keep rows that are duplicated
#   filter(duped) %>% 
#   ## arrange by variables of interest 
#   arrange(uid, sex, age_group) %>% 
#   View()
# 
# ## filter duplicates to only keep the row with the earlier entry 
# study_data_cleaned %>% 
#   ## choose which variables to use for finding unique rows 
#   group_by(uid, sex, age_group) %>% 
#   ## sort to have the earliest date by person first
#   arrange(as.Date(start)) %>% 
#   ## only keep the earliest row 
#   slice(1)

## stratified ------------------------------------------------------------------

## create a variable called "surv_weight_strata" 
## contains weights for each individual - by age group, sex and health district
study_data_cleaned <- add_weights_strata(x = study_data_cleaned, 
                                         p = population_data_age, 
                                         surv_weight = "surv_weight_strata",
                                         surv_weight_ID = "surv_weight_ID_strata",
                                         age_group, sex, health_district)


## cluster ---------------------------------------------------------------------

## get the number of individuals interviewed per household 
## adds a variable with counts of the household (parent) index variable
study_data_cleaned <- study_data_cleaned %>% 
  add_count(index, name = "interviewed")


## create cluster weights 
study_data_cleaned <- add_weights_cluster(x = study_data_cleaned, 
                                          cl = cluster_counts, 
                                          eligible = member_number, 
                                          interviewed = interviewed, 
                                          cluster_x = village_name, 
                                          cluster_cl = cluster, 
                                          household_x = index, 
                                          household_cl = households, 
                                          surv_weight = "surv_weight_cluster", 
                                          surv_weight_ID = "surv_weight_ID_cluster", 
                                          ignore_cluster = FALSE, 
                                          ignore_household = FALSE)

## stratified and cluster ------------------------------------------------------

## create a survey weight for cluster and strata 
study_data_cleaned <- study_data_cleaned %>% 
  mutate(surv_weight_cluster_strata = surv_weight_strata * surv_weight_cluster)

## simple random ---------------------------------------------------------------

survey_design_simple <- study_data_cleaned %>% 
  as_survey_design(ids = 1, # 1 for no cluster ids 
                   weights = NULL, # No weight added
                   strata = NULL # sampling was simple (no strata)
                  )

## stratified ------------------------------------------------------------------

survey_design_strata <- study_data_cleaned %>% 
  as_survey_design(ids = 1, # 1 for no cluster ids 
                   weights = surv_weight_strata, # weight variable created above 
                   strata = health_district # sampling was stratified by district
                  )

## cluster ---------------------------------------------------------------------

survey_design_cluster <- study_data_cleaned %>% 
  as_survey_design(ids = village_name, # cluster ids 
                   weights = surv_weight_cluster, # weight variable created above 
                   strata = NULL # sampling was simple (no strata)
                  )

## stratified cluster ----------------------------------------------------------

survey_design <- study_data_cleaned %>% 
  as_survey_design(ids = village_name, # cluster ids 
                   weights = surv_weight_cluster_strata, # weight variable created above 
                   strata = health_district # sampling was stratified by district
                  )

rio::export(study_data_cleaned, here::here("data", str_glue("study_data_cleaned_{Sys.Date()}.xlsx")))

Results

Survey inclusion

## get counts of number of clusters 
num_clus <- study_data_cleaned %>%
  ## trim data to unique clusters
  distinct(cluster_number) %>% 
  ## get number of rows (count how many unique)
  nrow()

## get counts of number households 
num_hh <- study_data_cleaned %>% 
  ## get unique houses by cluster
  distinct(cluster_number, household_number) %>% 
  ## get number of rounds (count how many unique)
  nrow()

We included r num_hh households across r num_clus clusters in this survey analysis.

Among the r nrow(dropped) individuals excluded from the survey analysis, r fmt_count(dropped, consent) individuals were excluded due to missing start- or end-dates and r fmt_count(dropped, !consent) were excluded for lack of consent. The reasons for no consent are shown below.

## using the dataset with dropped individuals 
dropped %>% 
  ## only keep reasons for no consent 
  select(no_consent_reason) %>% 
  ## make NAs a factor level called missing
  ## to make the proportion of the total, including the missings
  mutate(no_consent_reason = fct_explicit_na(no_consent_reason,
                                             na_level = "Missing")) %>%
  ## get counts and proportions 
  tbl_summary() %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

## get counts of the number of households per cluster
clustersize <- study_data_cleaned %>% 
  ## trim data to only unique households within each cluster
  distinct(cluster_number, household_number) %>%
  ## count the number of households within each cluster
  count(cluster_number) %>% 
  pull(n)

## get the median number of households per cluster
clustermed <- median(clustersize)

## get the min and max number of households per cluster
## paste these together seperated by a dash 
clusterrange <- str_c(range(clustersize), collapse = "--")

## get counts of children per household 
## do this by cluster as household IDs are only unique within clusters
hhsize <- study_data_cleaned %>% 
  count(cluster_number, household_number) %>%
  pull(n) 

## get median number of children per household
hhmed <- median(hhsize)
## get the min and max number of children per household
## paste these together seperated by a dash 
hhrange <- str_c(range(hhsize), collapse = "--")

## get standard deviation 
hhsd <- round(sd(hhsize), digits = 1)

The median number of households per cluster was r clustermed, with a range of r clusterrange. The median number of children per household was r hhmed (range: r hhrange, standard deviation: r hhsd).

Demographic information

In total we included r nrow(study_data_cleaned) in the survey analysis. The age break down and a comparison with the source population is shown below.

## counts and props of the study population
ag <- tabyl(study_data_cleaned, age_group, show_na = FALSE) %>% 
  mutate(n_total = sum(n), 
         age_group = fct_inorder(age_group))


## counts and props of the source population
propcount <- population_data_age %>% 
  group_by(age_group) %>%
    tally(population) %>%
    mutate(percent = n / sum(n))

## bind together the columns of two tables, group by age, and perform a 
## binomial test to see if n/total is significantly different from population
## proportion.
  ## suffix here adds to text to the end of columns in each of the two datasets
left_join(ag, propcount, by = "age_group", suffix = c("", "_pop")) %>%
  group_by(age_group) %>%

  ## broom::tidy(binom.test()) makes a data frame out of the binomial test and
  ## will add the variables p.value, parameter, conf.low, conf.high, method, and
  ## alternative. We will only use p.value here. You can include other
  ## columns if you want to report confidence intervals
  mutate(binom = list(broom::tidy(binom.test(n, n_total, percent_pop)))) %>%
  unnest(cols = c(binom)) %>% # important for expanding the binom.test data frame
  mutate(
    across(.cols = contains("percent"), 
            .fns = ~.x * 100)) %>%

  ## Adjusting the p-values to correct for false positives 
  ## (because testing multiple age groups). This will only make 
  ## a difference if you have many age categories
  mutate(p.value = p.adjust(p.value, method = "holm")) %>%

  ## Only show p-values over 0.001 (those under report as <0.001)
  mutate(p.value = ifelse(p.value < 0.001, "<0.001", as.character(round(p.value, 3)))) %>%

  ## rename the columns appropriately
  select(
    "Age group" = age_group,
    "Study population (n)" = n,
    "Study population (%)" = percent,
    "Source population (n)" = n_pop,
    "Source population (%)" = percent_pop,
    "P-value" = p.value
  ) %>%
  # produce styled output table with auto-adjusted column widths with {flextable}
  qflextable() %>% 
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit") %>% 
  ## set to only show 1 decimal place 
  colformat_double(digits = 1)

## compute the median age 
medage <- median(study_data_cleaned$age_years)
## paste the lower and upper quartile together
iqr <- str_c(  # basically copy paste together the following
  ## calculate the 25% and 75% of distribution, with missings removed
  quantile(     
    study_data_cleaned$age_years, 
    probs = c(0.25, 0.75), 
    na.rm = TRUE), 
  ## between lower and upper place an en-dash
  collapse = "--")


## compute overall sex ratio 
sex_ratio <- study_data_cleaned %>% 
  count(sex) %>% 
  pivot_wider(names_from = sex, values_from = n) %>%
  mutate(ratio = round(Male/Female, digits = 1)) %>%
  pull(ratio)

## compute sex ratios by age group 
sex_ratio_age <- study_data_cleaned %>% 
  count(age_group, sex) %>% 
  pivot_wider(names_from = sex, values_from = n) %>%
  mutate(ratio = round(Male/Female, digits = 1)) %>%
  select(age_group, ratio)

## sort table by ascending ratio then select the lowest (first)
min_sex_ratio_age <- arrange(sex_ratio_age, ratio) %>% slice(1)

pregnant_women_in_data <- study_data_cleaned %>% 
  ## filter for only women in the data
  filter(sex == "Female") %>% 
  ## get counts (ignore missings)
  tabyl(age_group, pregnant, show_na = FALSE) %>%
  ## add in percentages based on column totals
  adorn_percentages(denominator = "col") %>% 
  ## format percentages
  adorn_pct_formatting() %>% 
  ## arrange proportions in descending order
  arrange(desc(Yes)) %>% 
  ## grab the first row
  slice(1)

Among the r nrow(study_data_cleaned) surveyed individuals, there were r fmt_count(study_data_cleaned, sex == "Female") females and r fmt_count(study_data_cleaned, sex == "Male") males (unweighted). The male to female ratio was r sex_ratio in the surveyed population. The lowest male to female ratio was r min_sex_ratio_age$ratio in the r min_sex_ratio_age$age_group year age group. The median age of surveyed individuals was r medage years (Q1-Q3 of r iqr years). Children under five years of age made up r fmt_count(study_data_cleaned, age_years < 5)of the surveyed individuals. The highest number of surveyed individuals (unweighted) were in the r table(study_data_cleaned$age_group) %>% which.max() %>% names() year age group. The number of surveyed women who were pregnant was r fmt_count(study_data_cleaned, sex == "Female" & pregnant == "Yes"). The highest proportion of pregnant women was among the r pregnant_women_in_data$age_group year age group, at r pregnant_women_in_data$yes.

Unweighted age distribution of population by year age group and gender.

# get cross tabulated counts and proportions
study_data_cleaned %>% 
  tbl_cross( 
    row = age_group, 
    col = sex, 
    percent = "cell") %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Unweighted age distribution of population by age category (month and year) and gender.

# get cross tabulated counts and proportions
study_data_cleaned %>% 
  tbl_cross( 
    row = age_category, 
    col = sex, 
    percent = "cell") %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

There were r fmt_count(dropped, is.na(sex)) cases missing information on sex and r fmt_count(dropped, is.na(age_group)) missing age group.

Unweighted age and gender distribution of household population covered by the survey. <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// age_pyramid \\

This chunk creates an unweighted (using study_data_cleaned) age/sex pyramid of your cases. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

age_pyramid(study_data_cleaned, 
            age_group, 
            split_by = sex, 
            proportional = TRUE) + 
  labs(y = "Proportion", x = "Age group (years)") +    # change axis labels
  theme(legend.position = "bottom",     # move legend to bottom
        legend.title = element_blank(), # remove title
        text = element_text(size = 18)  # change text size
       )

Unweighted age and gender distribution, stratified by health district, of household population covered by the survey. <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// age_pyramid_strata \\

This chunk creates an unweighted (using study_data_cleaned) age/sex pyramid of your cases, stratified by health district.

If you have a stratified survey design this may be useful for visualising if you have an excess of representation in either sex or gender in any of your survey strata. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

age_pyramid(study_data_cleaned, 
                 age_group, 
                 split_by = sex,
                 stack_by = health_district,
                 proportional = TRUE, 
                 pal = c("red", "blue")) + 
  labs(y = "Proportion", x = "Age group (years)") +    # change axis labels
  theme(legend.position = "bottom",     # move legend to bottom
        legend.title = element_blank(), # remove title
        text = element_text(size = 18)  # change text size
       )

Weighted age and gender distribution of household population covered by the survey.

age_pyramid(survey_design,
                 age_group,
                 split_by = sex, 
                 proportion = TRUE) +
  labs(y = "Proportion (weighted)", x = "Age group (years)") + # change axis labels
  theme(legend.position = "bottom",     # move legend to bottom
        legend.title = element_blank(), # remove title
        text = element_text(size = 18)  # change text size
       )

Mortality

## weighted counts and proportion of dead
death_props <- survey_design %>% 
  select(died) %>% 
  tbl_svysummary() %>% 
  ## add the weighted total
  add_n() %>%   
  ## add in confidence intervals

  ## add in deff 

  ## modify the column headers
  modify_header(
    list(
      n ~ "**Weighted total (N)**",
      stat_0 ~ "**Weighted Count (n)**"
    )
    )

## mortality per 10,000 persons/day with CI
CMR <- survey_design %>% 
  ## survey ratio used to account for observation time 
  summarize(mortality = survey_ratio(died * 10000, obstime, vartype = "ci")) %>% 
  ## if negative CIs set to zero
  mutate(mortality_low = if_else(mortality_low < 0, 0, mortality_low), 
         mortality_upp = if_else(mortality_upp < 0, 0, mortality_upp)) %>% 
  ## merge confidence intervals for mortality in to one column (do not change to percent)
  unite_ci("Mortality", starts_with("mortality"), m100 = FALSE, percent = FALSE) %>%
  pull("Mortality")

During the recall period the weighted number and proportion of deaths in the population was r inline_text(death_props, variable = "died", column = "stat_0"), with a weighted confidence interval of r #inline_text(death_props, variable = "died", column = "add_stat_1"), and a design effect of r #inline_text(death_props, variable = "died", column = "add_stat_2"). This is a crude mortality rate of r CMR deaths per 10000 person-days.

## weighted counts and proportion of dead
death_props_strat <- survey_design %>% 
  select(died, health_district) %>%
  tbl_svysummary(by = health_district) %>% 
  ## add the weighted total
  add_n() %>%   
  ## add in confidence intervals

  ## add in deff 

  ## modify the column headers
  modify_header(
    list(
      n ~ "**Weighted total (N)**",
      stat_1 ~ "**District A Weighted Count (n)** \n N={n}", 
      stat_2 ~ "**District B Weighted Count (n)** \n N={n}" 
    )
    )


## mortality per 10,000 persons/day with CI
CMR_strat <- survey_design %>% 
  group_by(health_district) %>% 
  ## survey ratio used to account for observation time 
  summarize(mortality = survey_ratio(as.numeric(died) * 10000, obstime, vartype = "ci")) %>% 
  ## if negative CIs set to zero
  mutate(mortality_low = if_else(mortality_low < 0, 0, mortality_low), 
         mortality_upp = if_else(mortality_upp < 0, 0, mortality_upp)) %>% 
  ## merge confidence intervals for mortality in to one column (do not change to percent)
  unite_ci("Mortality", starts_with("mortality"), m100 = FALSE, percent = FALSE, digits = 1)

In District A the weighted number of deaths was r inline_text(death_props_strat, variable = "died", column = "stat_1"), which gives a weighted proportion of r #death_props_strat %>% pull("district_a ci"), and a design effect of r #death_props_strat %>% pull("district_a deff") %>% round(digits = 3).
In comparison, the weighted number of deaths in District B was r inline_text(death_props_strat, variable = "died", column = "stat_2"), the weighted proportion was r #death_props_strat %>% pull("district_b ci") and the design effect was r #death_props_strat %>% pull("district_b deff") %>% round(digits = 3).
The crude mortality rate in District A was r CMR_strat %>% filter(health_district == "district_a") %>% pull(Mortality) deaths per 10000 person-days and in District B was r CMR_strat %>% filter(health_district == "district_b") %>% pull(Mortality)

Below is a graph of the weighted mortality ratio per 10,000 population by month.

## retrieve all months that we have in the dataset
study_months <- seq.Date(
  min(study_data_cleaned$recall_start), 
  max(study_data_cleaned$recall_end), 
  by = "month"
  )

## retrieve weighted mortality ratios by month
CMR_month <- purrr::map(
  ## for each month run the function
  study_months, 
           function(x) {

             ## define beginning and end of month 
             month_start <- x 
             month_end <- lubridate::ceiling_date(x, unit = "month") - 1
             ## define all the days in the current month
             month_span <- seq.Date(month_start, month_end, by = "day")


             survey_design %>% 
               ## drop if enddate before or startdate after current month
               filter(
                 startdate <= month_end, 
                 enddate >= month_start, 
               ) %>% 
               mutate(
                 # if startdate before current month then set to beginning of month
                 startdate = if_else(startdate < month_start, month_start, startdate), 
                 # edit to died in current month 
                 died =  died & (enddate %in% month_span),
                 ## if enddate after current month then set to end of month
                 enddate = if_else(enddate > month_end, month_end, enddate), 
                 ## calculate new obstime for the month
                 obstime = as.numeric(enddate - startdate)
                 ) %>% 
               ## survey ratio includes the observation times in calculation
               summarize(mortality = survey_ratio(as.numeric(died) * 10000, obstime, vartype = "ci"))  %>% 
                ## if count zero or negative CIs set to zero
                mutate(mortality = if_else(is.infinite(mortality), 0, mortality),
                       mortality_low = if_else(mortality_low < 0, 0, mortality_low), 
                       mortality_upp = if_else(mortality_upp < 0, 0, mortality_upp), 
                       ## add in the current month 
                       month = factor(format(x, format = "%B %Y")))
             }) %>% 
  ## combine list in to one dataframe
  bind_rows()

## plot with months as x axis and mortality rate as y
ggplot(CMR_month, aes(x = month, y = mortality)) + 
  geom_bar(stat = "identity", col = "black", fill = "red") +  # plot as bars 
  geom_errorbar(aes(ymin = mortality_low, ymax = mortality_upp, width = 0.2)) + # add CIs
  scale_y_continuous(expand = c(0,0)) + # set origin for axes
  labs(x = "Month", y = "Mortality rate (per 10,000)")  # add labels to axes

Weighted counts, proportions and cause-specific mortality ratios, by reported causes of death.

## weighted counts and proportions by cause of death 
cause_of_death_prop <- survey_design %>% 
  # proportions only among those who died
  filter(died) %>% 
  select(cause) %>% 
  tbl_svysummary()
  ## add ci 

## weighted cause-specific mortality ratios
cause_of_death_mort <- survey_design %>% 
  filter(died) %>%  # rates only among those who died
  group_by(cause) %>% 
  ## use survey ratio to calculate mortality rate per 10000 person years
  summarize(mortality = survey_ratio(as.numeric(died) * 10000, obstime, vartype = "ci")) %>% 
  ## if negative CIs set to zero
  mutate(mortality_low = if_else(mortality_low < 0, 0, mortality_low), 
         mortality_upp = if_else(mortality_upp < 0, 0, mortality_upp)) %>% 
  ## merge confidence intervals for mortality in to one column 
  ## (do not change to percent)
  unite_ci("mortality", starts_with("mortality"),
           m100 = FALSE, percent = FALSE, digits = 1)

## Join the counts, props and mortality ratios together
cause_of_death_prop %>% 
  ## edit the underlying table for gtsummary
  modify_table_body(
    ## join the prop table with mortality table
    ~dplyr::left_join(
        .x,
        cause_of_death_mort,
        ## row labels in prop gtsummary match the cause in the mortality table
        by = c("label" = "cause")
      )
  ) %>% 
  ## modify the column headers
  modify_header(
    list(
      stat_0 ~ "**Deaths \n N={round(N, digits = 0)}**", 
      mortality ~ "**Mortality per 10,000 person/days (95% CI)**" 
    )
    ) %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Weighted counts and proportions of reported causes of death by health district <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// weighted_cause_death_strata \\

The below chunk creates a weighted table of counts proportions for
reported causes of death by strata

Note that low counts or short observation times may lead to a confidence interval that crosses zero (i.e. negative) for mortality ratios. These should be interpreted as if no deaths or recoded to zero (impossible to have negative deaths). ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->

survey_design %>% 
  filter(died) %>% # proportions only among those who died
  select(cause, health_district) %>%
  tbl_svysummary(by = health_district) %>%   
  ## add in confidence intervals

  ## modify the column headers
  modify_header(
    list(
      stat_1 ~ "**District A Weighted Count (n)** \n N={n}", 
      stat_2 ~ "**District B Weighted Count (n)** \n N={n}" 
    )
    ) %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Weighted cause specific mortality ratios for reported causes of death by health district

survey_design %>% 
  filter(died) %>%  # proportions only among those who died
  group_by(cause, health_district) %>% 
  summarize(mortality = survey_ratio(as.numeric(died) * 10000, obstime, vartype = "ci")) %>% 
  ## if negative CIs set to zero
  mutate(mortality_low = if_else(mortality_low < 0, 0, mortality_low), 
         mortality_upp = if_else(mortality_upp < 0, 0, mortality_upp)) %>% 
  unite_ci("mort", starts_with("mortality"),
           m100 = FALSE, percent = FALSE, digits = 1) %>% 
  pivot_wider(id_cols = cause,     # choose variable for rows
              names_from = health_district, # choose variable for columns 
              values_from = mort) %>%       # choose what goes in cells
  select("Main cause of death" = cause, 
         "District A MR (95%CI)" = "district_a", 
         "District B MR (95%CI)" = "district_b") %>%
  # produce styled output table with auto-adjusted column widths with {flextable}
  qflextable() %>% 
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

cause_of_death_sex <- survey_design %>% 
  group_by(sex) %>% 
  summarize(mortality = survey_ratio(as.numeric(died) * 10000, obstime, vartype = "ci")) %>% 
  ## if negative CIs set to zero
  mutate(mortality_low = if_else(mortality_low < 0, 0, mortality_low), 
         mortality_upp = if_else(mortality_upp < 0, 0, mortality_upp)) %>% 
  ## merge confidence intervals for mortality in to one column (do not change to percent)
  unite_ci("mortality", starts_with("mortality"), m100 = FALSE, percent = FALSE)

male_mortality   <- cause_of_death_sex %>% filter(sex == "Male") %>% pull(mortality)
female_mortality <- cause_of_death_sex %>% filter(sex == "Female") %>% pull(mortality)

The gender specific mortality rate was r female_mortality deaths/10,000 persons/day amongst females and r male_mortality deaths/10,000 persons/day amongst males.

Reported causes of death and cause-specific mortality rates, by age group, weighted

survey_design %>% 
  filter(died) %>% # proportions only among those who died
  select(cause, age_group) %>%
  tbl_svysummary(by = age_group) %>%   
  ## add in confidence intervals

  ## modify the column headers
  modify_spanning_header(
    all_stat_cols() ~ "**Age group (years)**"
  ) %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Reported causes of death and cause-specific mortality rates, by gender, weighted

survey_design %>% 
  filter(died) %>% # proportions only among those who died
  select(cause, sex) %>%
  tbl_svysummary(by = sex) %>% 
  ## add in confidence intervals

  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Population dynamics

at_start   <- sum(study_data_cleaned$startcause == "Present at start", na.rm = TRUE)
at_end     <- sum(study_data_cleaned$endcause == "Present at end", na.rm = TRUE)
net_change <- round( (1 - (abs(at_start - at_end) / at_start)) * 100, digits = 1)

Among the r nrow(study_data_cleaned) surveyed individuals included in the analyses; there were r fmt_count(study_data_cleaned, startcause == "Present at start") household members present at the start of the recall period. There were r fmt_count(study_data_cleaned, startcause != "Present at start") individuals who arrived during the recall period and r fmt_count(study_data_cleaned, endcause != "Present at end") who departed. This resulted in r fmt_count(study_data_cleaned, endcause == "Present at end") individuals who were present at the end of the recall period; for a net change in sample population of r net_change%.

Below shows the unweighted counts and proportions of arrival reasons during the study period.

study_data_cleaned %>% 
  select(startcause) %>% 
  tbl_summary() %>% 
  ## modify the column headers
  modify_header(
    list(
      stat_0 ~ "**Individuals** \n N={N}"
    )
    ) %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

This shows the unweighted counts and proportions of departure reasons during the study period

study_data_cleaned %>% 
  select(endcause) %>% 
  tbl_summary() %>% 
  ## modify the column headers
  modify_header(
    list(
      stat_0 ~ "**Individuals** \n N={N}"
    )
    ) %>% 
  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

This parallel set diagram shows the flow of population between each individual starting and end points during the study period.

## summarize data
flow_table <- study_data_cleaned %>%
  count(startcause, endcause, sex)  # get counts

## plot your dataset 
  ## on the x axis is the start and end causes
  ## on the y show n - gives it as counts (could also be changed to proportion)
ggplot(data = flow_table,
       aes(axis1 = startcause, axis2 = endcause,
           y = n)) +
  ## define x axis limits and labels
  scale_x_discrete(limits = c("Start\ncause", "End\ncause")) +
  ## colour lines by sex 
  geom_alluvium(aes(fill = sex)) +
  ## fill in the label boxes grey
  geom_stratum(fill = "grey90") +
  ## add in text of labels to boxes
  geom_label(stat = "stratum", aes(label = after_stat(stratum))) +
  ## define y axis label
  labs(y = "Frequency (n)") + 
  ## adjust theme
  theme_minimal() + 
  theme(
    title = element_text(size = 26),
    text = element_text(size = 26),
    legend.position = "bottom", 
    legend.title = element_blank()
  )

Health seeking behaviour

## get counts of the number of households with someone ill 
ill_num <- study_data_cleaned %>% 
  ## only keep households with someone ill 
  filter(ill_hh_number > 0) %>% 
  ## trim data to only unique households within each cluster
  distinct(cluster_number, household_number, ill_hh_number) %>%
  summarise(
    ## get the number of households with an ill person
    ill_hh       = n(), 
    ## get the proportion of households with an ill person
    ill_hh_prop  = round(ill_hh / num_hh * 100, digits = 1),
    ## get the median number of ill household members
    ill_hh_med   = median(ill_hh_number),
    ## get the range and standard deviation of ill household members
    ill_hh_range = str_c(range(clustersize), collapse = "--"),
    ill_hh_sd    = round(sd(hhsize), digits = 1)
  )

The unweighted number of households with at least one person ill was r str_glue("{ill_num$ill_hh} ({ill_num$ill_hh_prop}%)"). Among households with an ill member, the median number of ill people was r str_glue("{ill_num$ill_hh_med} (range: {ill_num$ill_hh_range}, standard deviation: {ill_num$ill_hh_sd})")

Symptoms reported among those who were sick

survey_design %>% 
  filter(ill_recently == "Yes") %>% # filter for those who were sick in last 2 weeks
  select(cause_illness) %>%
  tbl_svysummary() %>% 
  ## add in confidence intervals

  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Symptoms reported among those who were sick, stratified by age group

survey_design %>% 
  filter(ill_recently == "Yes") %>% # filter for those who were sick in last 2 weeks
  select(cause_illness, age_group) %>%
  tbl_svysummary(by = age_group) %>%   
  ## add in confidence intervals

  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Symptoms reported among those who were sick, stratified by gender

survey_design %>% 
  filter(ill_recently == "Yes") %>% # filter for those who were sick in last 2 weeks
  select(cause_illness, sex) %>%
  tbl_svysummary(by = sex) %>%   
  ## add in confidence intervals

  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Reason reported not seeking healthcare among those who were sick, and did not seek help

survey_design %>% 
  # filter for those who were sick in last 2 weeks and did not seek help
  filter(ill_recently == "Yes" & 
           care_illness_recent == "No") %>% 
  select(no_care_illness) %>%
  tbl_svysummary() %>% 
  ## add in confidence intervals

  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Violence

Number, place and types of violence reported among those who had an experience

survey_design %>% 
  # filter for those who experience violence
  filter(violent_episode == "Yes" ) %>% 
  select(violent_episodes_number, violence_nature, uniform, place_violence) %>%
  tbl_svysummary() %>% 
  ## add in confidence intervals

  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Malaria

Current febrile status and health seeking behaviour among children reporting fever in the last two weeks

survey_design %>% 
  # filter for those who were sick in last 2 weeks and less than 5 years old
  filter(fever_past_weeks == "Yes" & 
           age_years < 5) %>% 
  select(fever_now, care_fever) %>%
  tbl_svysummary() %>% 
  ## add in confidence intervals

  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")

Reason reported not seeking healthcare among children who were sick, and did not seek help

survey_design %>% 
  # filter for those who were sick in last 2 weeks, less than 5 years old didnt seek help
  filter(fever_past_weeks == "Yes" & 
           age_years < 5 & 
           care_fever == "No") %>% 
  select(reason_no_care) %>%
  tbl_svysummary() %>% 
  ## add in confidence intervals

  ## make variable names bold 
  bold_labels() %>% 
  # change to flextable format
  as_flex_table() %>%
  # make header text bold (using {flextable})
  bold(part = "header") %>% 
  # make your table fit to the maximum width of the word document
  set_table_properties(layout = "autofit")