In dosgillespie/hseclean: Health Survey Data Wrangling

# Chunk options applied to the code chunks of this vignette
knitr::opts_chunk$set(collapse = TRUE, comment = "#>", fig.pos = "H")

suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(testthat))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(hseclean))

Preparing the data

To prepare for the process that imputes missing data, run the full set of functions to read and clean the data. Note that these cleaning functions have already filled in some missing data using simple rules:

For 2015+, the clean_age() function has randomly assigned single years of age within each age category.
If someone is classified as a current smoker, is younger than 16 and has missing data on the time between waking and having their first cigarette, we assume that this time is one hour or more.
If someone is younger than age 16 and has missing data on their number of children (for survey years prior to 2015), we assume that they had no children.
If someone of any age is classified as a current smoker but has a missing value for the amount smoked, then we fill that missing value with the average amount smoked within a year, age, sex and IMD quintile subgroup.
Information on education, employment and socioeconomic status is triangulated across several variables.

The number of children in the household is missing for years 2015+. This is imputed in the function clean_family() based on the fit of a multinomial model to years 2012-2014 (see vignette("covariate_data")).

The function select_data() has the option to filter the data to retain only complete cases for certain variables. In the example code below, complete_vars keeps only rows with complete data on the key survey variables age, sex, year, quarter, psu and cluster, and on the key smoking and drinking variables, "cig_smoker_status" and "drinks_now". Note that imd_quintile is not included in complete_vars in this example, even though it is used later as a predictor in the imputation; add it to complete_vars if complete cases on imd_quintile are required.

# Prepare the data by applying all cleaning functions 
# and retaining all derived variables

# Root path passed as `root` to each read_20xx() call below.
# NOTE(review): this is a machine-specific location (presumably where the
# survey data files are stored) - change it to match your own setup.
root_dir <- "/Volumes/Shared/"

# Apply the full sequence of cleaning functions to one year of survey data,
# then use select_data() to restrict the ages, years and variables that are
# retained and to keep only complete cases on the key variables.
#
# data: one year of survey data, as returned by a read_20xx() function.
# Returns the cleaned and filtered data.
cleandata <- function(data) {
  data %>%
    # demographic, socioeconomic and health variables
    clean_age() %>%
    clean_demographic() %>%
    clean_education() %>%
    clean_economic_status() %>%
    clean_family() %>%
    clean_income() %>%
    clean_health_and_bio() %>%
    # smoking variables
    smk_status() %>%
    smk_former() %>%
    smk_life_history() %>%
    smk_amount() %>%
    # alcohol variables
    alc_drink_now_allages() %>%
    alc_weekmean_adult() %>%
    alc_sevenday_adult() %>%
    alc_sevenday_child() %>%
    select_data(
      ages = 12:89,
      years = 2001:2017,

      # variables to retain
      # ("censor_age" was previously listed twice; it is now listed once)
      keep_vars = c(
        "wt_int", "psu", "cluster", "year", "quarter",
        "age", "age_cat", "censor_age", "sex", "imd_quintile",
        "ethnicity_4cat", "ethnicity_2cat",
        "degree", "relationship_status", "employ2cat", "social_grade",
        "kids", "income5cat",
        "nssec3_lab", "man_nonman", "activity_lstweek", "eduend4cat",

        # NOTE(review): "hse_disgest" is kept exactly as written - confirm
        # the spelling matches the column produced by clean_health_and_bio()
        "hse_cancer", "hse_endocrine", "hse_heart", "hse_mental",
        "hse_nervous", "hse_eye", "hse_ear", "hse_respir",
        "hse_disgest", "hse_urinary", "hse_skin", "hse_muscskel",
        "hse_infect", "hse_blood",

        "weight", "height",

        "cig_smoker_status", "years_since_quit", "years_reg_smoker",
        "cig_ever",
        "cigs_per_day", "smoker_cat", "banded_consumption", "cig_type",
        "time_to_first_cig",
        "smk_start_age", "smk_stop_age",

        "drinks_now",
        "drink_freq_7d", "n_days_drink", "peakday", "binge_cat",
        "beer_units", "wine_units", "spirit_units", "rtd_units",
        "weekmean",
        "perc_spirit_units", "perc_wine_units", "perc_rtd_units",
        "perc_beer_units",
        "drinker_cat",
        "spirits_pref_cat", "wine_pref_cat", "rtd_pref_cat", "beer_pref_cat",
        "total_units7_ch"
      ),

      # The variables that must have complete cases
      complete_vars = c(
        "age", "sex", "year", "quarter", "psu", "cluster",
        "cig_smoker_status", "drinks_now"
      )
    )
}

# Read and clean each survey year from 2001 to 2017 in turn, then bind the
# yearly datasets together in one big dataset
data <- combine_years(lapply(2001:2017, function(survey_year) {
  # Look up this year's reader function by name, e.g. read_2001()
  read_year <- get(paste0("read_", survey_year), mode = "function")
  cleandata(read_year(root = root_dir))
}))

# clean the survey weights
data <- clean_surveyweights(data)

Multiple imputation

The variable with the most missingness in the data is income5cat (19% missing). In this example, the other variables to be imputed are: kids, ethnicity_4cat, eduend4cat, degree, relationship_status, nssec3_lab, activity_lstweek.

To conduct the multiple imputation, we use the R package mice [@Rmice]. The process of running the multiple imputation can take a long time and consume a lot of RAM. There is a range of mice documentation and tutorials online.

In hseclean, multiple imputation is implemented in a basic way by the impute_data_mice() function.

mice fits a chained series of regression equations that predict the missing values of variables based on their relationships with other selected variables in the data. The impute_data_mice() function currently only imputes categorical variables, which could be one of three types: "logreg" - binary Logistic regression; "polr" - ordered Proportional odds model; "polyreg" - unordered Polytomous logistic regression.

In running the multiple imputation, the number of imputed versions of the data (n_imputations) is selected (choosing a small number e.g. < 5 helps keep the size of the resulting imputed data manageable), and the variables to either be predicted or to inform the prediction are selected. If a variable is just going to inform the prediction of the other variables but is not going to be predicted itself, then the model type is set to "", otherwise to one of "logreg", "polr" or "polyreg".

# Tabulate each variable that has missing values; NA gets its own row
# whenever any values are missing
table(data[["ethnicity_4cat"]], useNA = "ifany")      # 350 missing (0.2%)
table(data[["eduend4cat"]], useNA = "ifany")          # 179 missing
table(data[["degree"]], useNA = "ifany")              # 94 missing
table(data[["relationship_status"]], useNA = "ifany") # 7210 missing
table(data[["kids"]], useNA = "ifany")                # 24,710 missing
table(data[["income5cat"]], useNA = "ifany")          # 35,072 missing
table(data[["nssec3_lab"]], useNA = "ifany")          # 552 missing
table(data[["activity_lstweek"]], useNA = "ifany")


# Set a broader age category variable.
# findInterval() returns, for each age, the index of the band whose lower
# break it has reached, so an age of exactly 16 falls in the second band.
# The first band therefore holds ages below 16 and is labelled "12-15"
# (it was previously mislabelled "12-16", overlapping with "16-17").
age_breaks <- c(-10, 16, 18, 25, 35, 50, 65, 75, 1000)
age_labels <- c(
  "12-15",
  "16-17",
  "18-24",
  "25-34",
  "35-49",
  "50-64",
  "65-74",
  "75-89"
)

data[, agegroup := age_labels[findInterval(age, age_breaks)]]

# Run the imputation.
# Each entry pairs a variable with its model type. An empty string means the
# variable only informs the prediction of the others and is not itself
# imputed.
imputation_methods <- c(
  agegroup            = "",
  sex                 = "",
  imd_quintile        = "",
  ethnicity_4cat      = "polyreg",
  eduend4cat          = "polyreg",
  degree              = "logreg",
  relationship_status = "polyreg",
  kids                = "polyreg",
  income5cat          = "polyreg",
  nssec3_lab          = "polyreg",
  activity_lstweek    = "polyreg"
)

imp <- impute_data_mice(
  data = data,
  var_names = names(imputation_methods),
  var_methods = unname(imputation_methods),
  n_imputations = 1
)

# Take a copy of the data element of the imputation result
data_imp <- copy(imp$data)