In STAPM/hseclean: Health Survey Data Wrangling

Reading the HSE data files

There are separate functions in hseclean to read each year of HSE data. You must specify the file path to where the data is stored. The functions read in all variables related to tobacco and alcohol, plus selected socio-economic and other descriptor variables.

# Read the 2001 HSE data file. `root` and `file` together give its location
# (presumably joined to form the full path - confirm against read_2001()).
test_2001 <- read_2001(
  root = "X:/",
  file = "ScHARR/PR_Consumption_TA/HSE/HSE 2001/UKDA-4628-tab/tab/hse01ai.tab"
)

hseclean also contains separate functions for reading each year of the Scottish Health Survey (SHeS) data, e.g. read_SHeS_2008().

Processing socioeconomic, demographic and health variables

There are separate functions that each focus on processing a different theme of socioeconomic, demographic and health variables. See vignette("covariate_data"). Note that the order in which the cleaning functions are applied can matter, as some functions use variables that are cleaned by others.

library(magrittr)

# Root of the drive on which the HSE data is stored - edit to match your system.
# It must be defined before it is passed to the read function below.
root_dir <- "/Volumes/Shared/"
#root_dir <- "X:/"

# Read the 2017 data and apply the covariate cleaning functions in sequence.
# The order of the steps can matter because some cleaning functions use
# variables that are cleaned by others.
temp <- read_2017(root = root_dir) %>%
  clean_age %>%
  clean_family %>%
  clean_demographic %>%
  clean_education %>%
  clean_economic_status %>%
  clean_income %>%
  clean_health_and_bio

Alcohol data

A detailed description of how to clean the alcohol data is given in vignette("alcohol_data"). As an example, here is the workflow to plot the frequency of drinking among people who drank in 2017.

library(magrittr)
library(dplyr)
library(ggplot2)

# Frequency of drinking in 2017 among drinkers
root_dir <- "/Volumes/Shared/"
#root_dir <- "X:/"

# Read and clean the 2017 data, keep respondents aged 8-89 whose drinks_now
# status is "drinker", average drink_freq_7d within each imd_quintile and
# age_cat combination, and draw the averages as a dodged bar chart.
read_2017(root = root_dir) %>%
  clean_age %>%
  clean_demographic %>%
  alc_drink_now_allages %>%
  filter(age < 90, age >= 8, drinks_now == "drinker") %>%
  group_by(imd_quintile, age_cat) %>%
  # Use TRUE rather than T: T is an ordinary variable that can be reassigned
  summarise(av_freq = mean(drink_freq_7d, na.rm = TRUE)) %>%
  ggplot(aes(x = imd_quintile, y = av_freq, fill = age_cat)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  ylab("average number of days drink in a week")

Clean all years of smoking and alcohol data

See vignette("smoking_data").

library(magrittr)

# Bundle the individual cleaning functions into a single function that can be
# applied to each year of data in turn.
#
# data: one year of survey data, as produced by one of the read functions.
# Returns the output of select_data() after all cleaning steps have been
# applied (ages 12-89, years 2001-2017, retaining the variables listed below).
cleandata <- function(data) {

  # variables to retain
  retained_vars <- c(
    "wt_int", "psu", "cluster", "year", "quarter",
    "age", "age_cat", "sex", "imd_quintile",
    "ethnicity_4cat", "ethnicity_2cat",
    "degree", "relationship_status", "employ2cat", "social_grade",
    "kids", "income5cat",
    "nssec3_lab", "man_nonman", "activity_lstweek", "eduend4cat",

    "hse_cancer", "hse_endocrine", "hse_heart", "hse_mental",
    "hse_nervous", "hse_eye", "hse_ear", "hse_respir",
    "hse_disgest", "hse_urinary", "hse_skin", "hse_muscskel",
    "hse_infect", "hse_blood",

    "weight", "height", "bmi",

    "cig_smoker_status", "years_since_quit", "years_reg_smoker", "cig_ever",
    "cigs_per_day", "smoker_cat", "banded_consumption", "cig_type",
    "time_to_first_cig",
    "smk_start_age", "smk_stop_age", "censor_age", "giveup_smk",

    "drinks_now",
    "drink_freq_7d", "n_days_drink", "peakday", "binge_cat",
    "beer_units", "wine_units", "spirit_units", "rtd_units",
    "weekmean",
    "perc_spirit_units", "perc_wine_units", "perc_rtd_units",
    "perc_beer_units",
    "drinker_cat",
    "spirits_pref_cat", "wine_pref_cat", "rtd_pref_cat", "beer_pref_cat",
    "total_units7_ch"
  )

  # The variables that must have complete cases
  required_vars <- c("age", "sex", "year", "quarter", "psu", "cluster")

  # Covariates first, then smoking, then alcohol, then the final selection
  cleaned <- data %>%
    clean_age() %>%
    clean_demographic() %>%
    clean_education() %>%
    clean_economic_status() %>%
    clean_family() %>%
    clean_income() %>%
    clean_health_and_bio() %>%
    smk_status() %>%
    smk_former() %>%
    smk_life_history() %>%
    smk_amount() %>%
    alc_drink_now_allages() %>%
    alc_weekmean_adult() %>%
    alc_sevenday_adult() %>%
    alc_sevenday_child() %>%
    select_data(
      ages = 12:89,
      years = 2001:2017,
      keep_vars = retained_vars,
      complete_vars = required_vars
    )

  cleaned
}

# Read and clean each year of data (2001 to 2017, in order) and bind them
# together in one big dataset. For each year the matching read_<year>()
# function is looked up by name, so the list passed to combine_years() holds
# the same 17 cleaned datasets as calling each reader explicitly.
data <- combine_years(lapply(2001:2017, function(survey_year) {
  read_year <- get(paste0("read_", survey_year), mode = "function")
  cleandata(read_year(root = root_dir))
}))

# clean the survey weights
data <- clean_surveyweights(data)

Summarise data

The function survey::svyby() in the survey R package is used by the function prop_summary() in hseclean to estimate the uncertainty around proportions calculated from a binary variable - prop_summary() was designed to simplify the process of estimating smoking prevalence from the HSE data, stratified by a specified set of variables.

# Estimate the proportion of respondents whose `smk.state` is "current"
# (levels_1) versus "former" or "never" (levels_0), stratified by year, sex
# and imd_quintile.
# NOTE(review): `hse_data` is not created by the snippets above, which name the
# combined dataset `data` - confirm which object should be passed here.
prop_smokers <- prop_summary(
  data = hse_data,
  var_name = "smk.state",
  levels_1 = "current",
  levels_0 = c("former", "never"),
  strat_vars = c("year", "sex", "imd_quintile")
)

Missing data imputation

hseclean uses the function mice::mice() in the mice R package, implemented in a basic way by the impute_data_mice() function. See vignette("missing_data").

# Run the imputation (takes a long time)
# var_names and var_methods are laid out so that each row of methods lines up
# with the same row of variables (presumably matched by position - confirm
# against impute_data_mice()). The first three variables are given "" as
# their method.
imp <- impute_data_mice(
  data = hse_data,
  var_names = c(
    "smk.state", "agegroup", "sex",
    "imd_quintile", "degree", "kids", "income5cat",
    "relationship_status", "employ2cat", "social_grade"
  ),
  var_methods = c(
    "", "", "",
    "polr", "logreg", "polr", "polr",
    "polyreg", "logreg", "logreg"
  ),
  n_imputations = 5
)

# imp$data is a single data.table containing all 5 imputed versions of the data
# copy() is a data.table function; data.table is not attached in this example,
# so it is called through its namespace.
hse_data_imputed <- data.table::copy(imp$data)