data-raw/0-prep-understanding-society.R

##
##  Warning!
##  Requires ~8GB RAM to run the script
##

if (!dir.exists("inst/extdata/restricted_us")) {

  if (Sys.info()["sysname"] != "Linux") {
    warning("It looks like you're not using a Linux system\n
            I cannot automatically unzip and place the
            Understanding Society data files in the correct place.\n
            Manually copy the *.tab files in UKDA-6614-tab/tab
            to inst/extdata/restricted_us/\n
            You can also move the documentation to
            inst/doc/restricted_us [optional]")
  } else {
    dir.create("inst/extdata/restricted_us/", showWarnings = FALSE)

    # Looks like filename is unique; use 6614TAB_ to obtain the filename
    us_zip <- list.files("inst/extdata/", pattern = "6614TAB_",
                         full.names = TRUE)
    unzip(us_zip, exdir = "inst/extdata/restricted_us/")

    # Move documentation
    dir.create("inst/doc/restricted_us",
               recursive = TRUE, showWarnings = FALSE)
    system2("mv", args = c("inst/extdata/restricted_us/UKDA-6614-tab/mrdoc",
                           "inst/doc/restricted_us"))

    # Move data files
    system2("mv",
            args = c("inst/extdata/restricted_us/UKDA-6614-tab/tab/*.tab",
                     "inst/extdata/restricted_us/"))
    system2("rm", c("-rf", "inst/extdata/restricted_us/UKDA-6614-tab/tab/"))

    # Move README and file information
    system2("mv", c("inst/extdata/restricted_us/UKDA-6614-tab/*.*",
                    "inst/doc/restricted_us/"))
    system2("rm", c("-rf", "inst/extdata/restricted_us/UKDA-6614-tab/"))

  }
  rm(us_zip)

}


if (!file.exists("data/us.RData")) {

  # Prepare wave f ====
  usf <- readr::read_tsv("inst/extdata/restricted_us/f_indresp.tab",
                  na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")
                  # -11 only available for IEMB
                  # -10 not available for IEMB
                  # -9 missing
                  # -8 Inapplicable
                  # -7 Proxy
                  # -2 Refused
                  # -1 Don't know
  ) %>%
    select(pidp, f_hidp,   # id and household id
           f_hhorig,       # sample origin (i.e. BHPS)
           f_dvage,        # derived age
           f_sex,          # sex corrected
           f_racel,        # ethnic group
           f_hiqual_dv,    # highest qualification
           f_jbstat,       # current economic activity
           f_health,       # long-standing illness or disability
           f_scsf1,        # self-reported general health
           f_jbnssec_dv,   # NS-SEC/social class
           f_jbnssec8_dv,  # NS-SEC 8
           f_jbnssec5_dv,  # NS-SEC 5
           f_jbnssec3_dv,  # NS-SEC 3
           f_mlstat,       # marital status
           f_scopngbhe,    # willing to improve neighbourhood
           f_scopngbhg,    # similar to others in neighbourhood
           f_nbrcoh3,      # people in neighbourhood can be trusted
           f_scopngbha,    # belong to neighbourhood
           f_orga1,        # active in organisation
           f_orga2,        # active in organisation
           f_orga3,        # active in organisation
           f_orga4,        # active in organisation
           f_orga5,        # active in organisation
           f_orga6,        # active in organisation
           f_orga7,        # active in organisation
           f_orga8,        # active in organisation
           f_orga9,        # active in organisation
           f_orga10,       # active in organisation
           f_orga11,       # active in organisation
           f_orga12,       # active in organisation
           f_orga13,       # active in organisation
           f_orga14,       # active in organisation
           f_orga15,       # active in organisation
           f_orga16,       # active in organisation
           f_nbrcoh4,      # people in neighbourhood don't get along
           f_crdark,       # feel safe walking alone at night
           f_siminc,       # Prop friends with similar income
           f_simrace,      # Friends similar ethnicity
           f_poleff4,      # don't have say in what g'ment does
           f_finnow,       # subjective financial status
           f_closenum,     # number of close friends
           f_scghqa,       # GHQ: concentration
           f_scghqb,       # GHQ: loss of sleep
           f_scghqc,       # GHQ: playing a useful role
           f_scghqd,       # GHQ: capable of making decisions
           f_scghqe,       # GHQ: constantly under strain
           f_scghqf,       # GHQ: problem overcoming difficulties
           f_scghqg,       # GHQ: enjoy day-to-day activities
           f_scghqh,       # GHQ: ability to face problems
           f_scghqi,       # GHQ: unhappy or depressed
           f_scghqj,       # GHQ: losing confidence
           f_scghqk,       # GHQ: believe worthless
           f_scghql,       # GHQ: general happiness
           f_ncigs,        # Cigs smoked
           f_netlv_1,      # distance best friend lives
           f_netlv_2,      # distance best friend lives
           f_netlv_3,      # distance best friend lives
           f_netph_1,      # best friend often in touch
           f_netph_2,      # best friend often in touch
           f_netph_3,      # best friend often in touch
           f_save,         # saves money
                           # received core benefit:
           f_benbase1,     #   1 == income support
           f_benbase2,     #   2 == job-seekers allowance
                           # other benefit:
           f_othben5,      #   5 == working tax credit;
           f_othben8,      #   8 == housing benefit
           f_benctc,       # received child tax credit
           f_bendis2,      # disability benefits (ESA == 2)
           f_hubuys,       # who does food shopping? (for cohabiting couples)
           f_hcondno1,
           f_hcondno2,
           f_hcondno3,
           f_hcondno4,
           f_hcondno5,
           f_hcondno6,
           f_hcondno7,
           f_hcondno8,
           f_hcond17       # clinical depression
           # f_btype6,     # tax credits
           # f_disdif,     # Type of impairment or disability
           # f_hcondno1,   # Diagnosed health condition(s)
           # f_hcondno2, f_hcondno3, f_hcondno4, f_hcondno5, f_hcondno6,
           # f_hcondno7,
           # f_hcondns1,   # still has condition
           # f_hcondns2, f_hcondns3, f_hcondns4, f_hcondns5, f_hcondns6,
           # f_hcondns7,
           # f_scsf2a,     # health limits moderate activities
           # f_scsf2b,     # health limits several flights of stairs
           # f_scsf3a,     # health limits work (last 4 wks)
           # f_scsf3b,     # health limits kind of work (last 4 wks)
           # f_scsf4a,     # mental health accomp. less (last 4 wks)
           # f_scsf4b,     # mental health worked less carefully (last 4 wks)
           # f_scsf5,      # pain interfered w/ work (last 4 wks)
           # f_scsf6a,     # felt calm & peaceful (last 4 wks)
           # f_scsf6b,     # Had lots of energy (last 4 wks)
           # f_scsf6c,     # Felt downhearted/depressed (last 4 wks)
           # f_scsf7,      # health (p or m) interfered w/ social life (4 wks)
           # f_sclfsat1,     # Satisfaction with health
           # f_sclfsato,     # Satisfaction with life overall
    )

  # Prepare wave e ====
  use <- readr::read_tsv("inst/extdata/restricted_us/e_indresp.tab",
                         na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")) %>%
    select(pidp, e_hidp,   # id and household id
           e_hhorig,       # sample origin (i.e. BHPS)
           e_dvage,        # derived age
           e_sex,          # sex corrected
           e_racel,        # ethnic group
           e_hiqual_dv,    # highest qualification
           e_jbstat,       # current economic activity
           e_health,       # long-standing illness or disability
           e_scsf1,        # self-reported general health
           e_jbnssec_dv,   # NS-SEC/social class
           e_jbnssec8_dv,  # NS-SEC 8
           e_jbnssec5_dv,  # NS-SEC 5
           e_jbnssec3_dv,  # NS-SEC 3
           e_mlstat,       # marital status
           e_finnow,       # subjective financial status
           e_scghqa,       # GHQ: concentration
           e_scghqb,       # GHQ: loss of sleep
           e_scghqc,       # GHQ: playing a useful role
           e_scghqd,       # GHQ: capable of making decisions
           e_scghqe,       # GHQ: constantly under strain
           e_scghqf,       # GHQ: problem overcoming difficulties
           e_scghqg,       # GHQ: enjoy day-to-day activities
           e_scghqh,       # GHQ: ability to face problems
           e_scghqi,       # GHQ: unhappy or depressed
           e_scghqj,       # GHQ: losing confidence
           e_scghqk,       # GHQ: believe worthless
           e_scghql,       # GHQ: general happiness
           e_ncigs,        # cigs smoked per day
           e_benunemp1,    # job-seekers allowance
           e_btype2,       # income support
           e_btype3,       # sickness benefit, incl. ESA
           # I think there's an error in the coding of Universal Credit
           # it's labelled as e_btype10 but is in the wrong order
           e_btype6,       # tax credits
           e_hcondno1,
           e_hcondno2,
           e_hcondno3,
           e_hcondno4,
           e_hcondno5,
           e_hcondno6,
           e_hcondno7,
           e_hcond17       # clinical depression
    )


  # Prepare wave d ====
  usd <- readr::read_tsv("inst/extdata/restricted_us/d_indresp.tab",
                  na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")) %>%
    select(pidp, d_hidp,
           d_hhorig,
           d_dvage,
           d_sex,
           d_racel,
           d_hiqual_dv,
           d_jbstat,       # economic activity
           d_health,
           d_scsf1,
           d_jbnssec_dv,   # NS-SEC/social class
           d_jbnssec8_dv,  # NS-SEC 8
           d_jbnssec5_dv,  # NS-SEC 5
           d_jbnssec3_dv,  # NS-SEC 3
           d_mlstat,       # marital status
           d_finnow,       # subjective financial situation
           d_scghqa,       # GHQ: concentration
           d_scghqb,       # GHQ: loss of sleep
           d_scghqc,       # GHQ: playing a useful role
           d_scghqd,       # GHQ: capable of making decisions
           d_scghqe,       # GHQ: constantly under strain
           d_scghqf,       # GHQ: problem overcoming difficulties
           d_scghqg,       # GHQ: enjoy day-to-day activities
           d_scghqh,       # GHQ: ability to face problems
           d_scghqi,       # GHQ: unhappy or depressed
           d_scghqj,       # GHQ: losing confidence
           d_scghqk,       # GHQ: believe worthless
           d_scghql,       # GHQ: general happiness
           d_save,         # saves money
           d_benunemp1,    # job-seekers allowance
           d_btype2,       # income support
           d_btype3,       # sickness benefit, incl. ESA
           # I think there's an error in the coding of Universal Credit
           # it's labelled as e_btype10 but is in the wrong order
           d_btype6,       # tax credits
           d_hubuys,       # who does food shopping? (for cohabiting couples)
           d_hcondno1,
           d_hcondno2,
           d_hcondno3,
           d_hcondno4,
           d_hcondno5,
           d_hcondno6,
           d_hcondno7,
           d_hcondno8,
           d_hcondno9,
           d_hcondno10,
           d_hcond17     # clinical depression
    )


  # Prepare wave c ====
  usc <- readr::read_tsv("inst/extdata/restricted_us/c_indresp.tab",
                  na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")) %>%
    select(pidp, c_hidp,
           c_hhorig,
           c_dvage,
           c_sex,
           c_racel,
           c_hiqual_dv,
           c_jbstat,
           c_health,
           c_sf1,
           c_jbnssec_dv,   # NS-SEC/social class
           c_jbnssec8_dv,  # NS-SEC 8
           c_jbnssec5_dv,  # NS-SEC 5
           c_jbnssec3_dv,  # NS-SEC 3
           c_mlstat,       # marital status
           c_scopngbhe,    # willing improve neighbourhood
           c_scopngbhg,    # similar to others in neighbourhood
           c_scopngbha,    # belong neighbourhood
           c_nbrcoh3,      # people in neighbourhood can be trusted
           c_crdark,       # safe walking alone at night
           c_orga1,        # active in organisation
           c_orga2,        # active in organisation
           c_orga3,        # active in organisation
           c_orga4,        # active in organisation
           c_orga5,        # active in organisation
           c_orga6,        # active in organisation
           c_orga7,        # active in organisation
           c_orga8,        # active in organisation
           c_orga9,        # active in organisation
           c_orga10,       # active in organisation
           c_orga11,       # active in organisation
           c_orga12,       # active in organisation
           c_orga13,       # active in organisation
           c_orga14,       # active in organisation
           c_orga15,       # active in organisation
           c_orga16,       # active in organisation
           c_nbrcoh4,      # people in neighbourhood don't get along
           c_poleff4,      # don't have say in what g'ment does
           c_siminc,       # Prop. friends with similar income
           c_simrace,      # Prop. friends similar ethnicity
           c_finnow,       # subjective financial situation
           c_benunemp1,    # job-seekers allowance
           c_btype2,       # income support
           c_btype3,       # sickness benefit (inc. ESA)
           # I think there's an error in the coding of Universal Credit
           # it's labelled as e_btype10 but is in the wrong order
           c_btype6,       # tax credits
           c_closenum,     # Number of close friends
           c_scghqa,       # GHQ: concentration
           c_scghqb,       # GHQ: loss of sleep
           c_scghqc,       # GHQ: playing a useful role
           c_scghqd,       # GHQ: capable of making decisions
           c_scghqe,       # GHQ: constantly under strain
           c_scghqf,       # GHQ: problem overcoming difficulties
           c_scghqg,       # GHQ: enjoy day-to-day activities
           c_scghqh,       # GHQ: ability to face problems
           c_scghqi,       # GHQ: unhappy or depressed
           c_scghqj,       # GHQ: losing confidence
           c_scghqk,       # GHQ: believe worthless
           c_scghql,       # GHQ: general happiness
           c_netlv_1,      # distance best friend lives
           c_netlv_2,      # distance best friend lives
           c_netlv_3,      # distance best friend lives
           c_netph_1,      # best friend often in touch
           c_netph_2,      # best friend often in touch
           c_netph_3,      # best friend often in touch
           c_hcondno1,
           c_hcondno2,
           c_hcondno3,
           c_hcondno4,
           c_hcondno5,
           c_hcondno6,
           c_hcondno7,
           c_hcondno8,
           c_hcond17       # clinical depression
    )


  # Prepare wave b ====
  usb <- readr::read_tsv("inst/extdata/restricted_us/b_indresp.tab",
                  na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")) %>%
    select(pidp, b_hidp,
           b_hhorig,
           b_dvage,
           b_sex,
           b_racel,
           b_hiqual_dv,
           b_jbstat,
           b_health,
           b_sf1,
           b_jbnssec_dv,   # NS-SEC/social class
           b_jbnssec8_dv,  # NS-SEC 8
           b_jbnssec5_dv,  # NS-SEC 5
           b_jbnssec3_dv,  # NS-SEC 3
           b_mlstat,       # marital status
           b_finnow,       # subjective financial situation
           b_scghqa,       # GHQ: concentration
           b_scghqb,       # GHQ: loss of sleep
           b_scghqc,       # GHQ: playing a useful role
           b_scghqd,       # GHQ: capable of making decisions
           b_scghqe,       # GHQ: constantly under strain
           b_scghqf,       # GHQ: problem overcoming difficulties
           b_scghqg,       # GHQ: enjoy day-to-day activities
           b_scghqh,       # GHQ: ability to face problems
           b_scghqi,       # GHQ: unhappy or depressed
           b_scghqj,       # GHQ: losing confidence
           b_scghqk,       # GHQ: believe worthless
           b_scghql,       # GHQ: general happiness
           b_ncigs,        # cigs smoked per day
           b_save,         # saves money
           b_benunemp1,    # job-seekers allowance
           b_btype2,       # income support
           b_btype3,       # sickness benefit (inc. ESA)
           # I think there's an error in the coding of Universal Credit
           # it's labelled as e_btype10 but is in the wrong order
           b_btype6,       # tax credits
           b_hubuys,       # who does food shopping? (for cohabiting couples)
           b_hcondno1,
           b_hcondno2,
           b_hcondno3,
           b_hcondno4,
           b_hcondno5,
           b_hcondno6,
           b_hcondno7,
           b_hcondno8,
           b_hcondn17      # clinical depression
    )


  # Prepare wave a ====
  usa <- readr::read_tsv("inst/extdata/restricted_us/a_indresp.tab",
                  na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")) %>%
    select(pidp, a_hidp,
           a_hhorig,
           a_dvage,
           a_sex,
           a_racel,
           a_hiqual_dv,
           a_jbstat,
           a_health,
           a_sf1,
           a_jbnssec_dv,   # NS-SEC/social class
           a_jbnssec8_dv,  # NS-SEC 8
           a_jbnssec5_dv,  # NS-SEC 5
           a_jbnssec3_dv,  # NS-SEC 3
           a_mlstat,       # marital status
           a_scopngbhe,    # willing to improve neighbourhood
           a_scopngbhg,    # similar to others in neighbourhood
           a_scopngbha,    # belong to neighbourhood
           a_sctrust,      # trust in others (neighbourhood)
           a_finnow,       # subjective financial situation
           a_benunemp1,    # job-seekers allowance
           a_btype2,       # income support
           a_btype3,       # sickness benefit (inc. ESA)
           # I think there's an error in the coding of Universal Credit
           # it's labelled as e_btype10 but is in the wrong order
           a_btype6,       # tax credits
           a_scghqa,       # GHQ: concentration
           a_scghqb,       # GHQ: loss of sleep
           a_scghqc,       # GHQ: playing a useful role
           a_scghqd,       # GHQ: capable of making decisions
           a_scghqe,       # GHQ: constantly under strain
           a_scghqf,       # GHQ: problem overcoming difficulties
           a_scghqg,       # GHQ: enjoy day-to-day activities
           a_scghqh,       # GHQ: ability to face problems
           a_scghqi,       # GHQ: unhappy or depressed
           a_scghqj,       # GHQ: losing confidence
           a_scghqk,       # GHQ: believe worthless
           a_scghql,       # GHQ: general happiness
           a_hcond17       # clinical depression
    )


  # Merge waves ====
  us <- usf %>%
    full_join(use, by = "pidp") %>%
    full_join(usd, by = "pidp") %>%
    full_join(usc, by = "pidp") %>%
    full_join(usb, by = "pidp") %>%
    full_join(usa, by = "pidp")

  rm(usa, usb, usc, usd, use, usf)

  stopifnot(
    us$pidp == unique(us$pidp),
    !(any(us == -9, na.rm = TRUE))
  )


  # Prepare household responses ====
  hhf <- readr::read_tsv("inst/extdata/restricted_us/f_hhresp.tab",
                  na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")) %>%
    select(f_hidp,
           f_hsownd, f_tenure_dv,
           f_ncars,
           f_heatch,
           f_gor_dv,
           f_hsbeds,        # no. bedrooms
           f_hsrooms,       # no. other rooms
           f_hhsize,        # no. people currently in household
           f_nkids015,      # no. children 0-15 in hhold
           f_fihhmngrs_dv,  # gross hh income month before interview
           f_rent,          # last rental payment
           f_rentwc,        # weeks covered by last rent payment f_rent
           f_xpmg,          # last mortgage payment
           f_xphsdb,        # problems paying housing costs
           f_xpaltob_g3     # £ spent on alcohol
           )

  hhe <- readr::read_tsv("inst/extdata/restricted_us/e_hhresp.tab",
                  na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")) %>%
    select(e_hidp,
           e_hsownd, e_tenure_dv,
           e_ncars,
           e_heatch,
           e_gor_dv,
           e_hsbeds,        # no. bedrooms
           e_hsrooms,       # no. other rooms
           e_hhsize,        # no. people currently in household
           e_nkids_dv,      # no. children in hhold
           e_fihhmngrs_dv,  # gross hh income month before interview
           e_rent,          # last rent payment £
           e_rentwc,        # weeks covered by e_rent
           e_xpmg,          # last mortgage payment
           e_xphsdb,        # behind with rent/mortgage
           e_xpaltob_g3     # £ spent on alcohol
    )

  hhd <- readr::read_tsv("inst/extdata/restricted_us/d_hhresp.tab",
                  na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")) %>%
    select(d_hidp,
           d_hsownd, d_tenure_dv,
           d_ncars,
           d_heatch,
           d_gor_dv,
           d_hsbeds,
           d_hsrooms,
           d_hhsize,
           d_nkids015,
           d_fihhmngrs_dv,  # gross hh income month before interview
           d_rent,          # last rent payment £
           d_rentwc,        # weeks covered by d_rent
           d_xphsdb,        # problems paying for housing costs
           d_xpmg,        # last mortgage payment
           d_xpaltob_g3     # £ spent on alcohol
    )

  hhc <- readr::read_tsv("inst/extdata/restricted_us/c_hhresp.tab",
                  na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")) %>%
    select(c_hidp,
           c_hsownd, c_tenure_dv,
           c_ncars,
           c_heatch,
           c_gor_dv,
           c_hsbeds,
           c_hsrooms,
           c_hhsize,
           c_nkids015,      # no. children (<16) in hhold
           c_fihhmngrs_dv,  # gross hh income month before interview
           c_rent,          # last rent payment £
           c_rentwc,        # period covered c_rent
           c_xpmg,          # last mortgage payment
           c_xphsdb,        # behind with rent/mortgage
           c_xpaltob_g3     # £ spent on alcohol
    )

  hhb <- readr::read_tsv("inst/extdata/restricted_us/b_hhresp.tab",
                  na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")) %>%
    select(b_hidp,
           b_hsownd, b_tenure_dv,
           b_ncars,
           b_heatch,
           b_gor_dv,
           b_hsbeds,
           b_hsrooms,
           b_hhsize,
           b_nkids015,      # no. children in hhold (<16)
           b_fihhmngrs_dv,  # gross hh income month before interview
           b_xphsdb,        # problems paying housing costs
           b_xpmg,          # last mortgage payment
           b_rent,          # last rent payment £
           b_rentwc,        # weeks covered b_rent
           b_xpaltob_g3     # £ spent on alcohol
    )

  hha <- readr::read_tsv("inst/extdata/restricted_us/a_hhresp.tab",
                  na = c("-11", "-10", "-9", "-8", "-7", "-2", "-1")) %>%
    select(a_hidp,
           a_hsownd, a_tenure_dv,
           a_ncars,
           a_heatch,
           a_gor_dv,
           a_hsbeds,
           a_hsrooms,
           a_hhsize,
           a_numchild,      # no. children in hhold
           a_fihhmngrs_dv,  # gross hh income month before interview
           a_rent,          # last rent payment £
           a_rentwc,        # weeks covered last rent payment
           a_xpmg,          # last mortgage payment
           a_xphsdb,        # problems paying housing costs
           a_xpaltob_g3     # £ spent on alcohol
    )

  stopifnot(
    !(any(hhe == -9, na.rm = TRUE)),
    !(any(hhd == -9, na.rm = TRUE)),
    !(any(hhc == -9, na.rm = TRUE)),
    !(any(hhb == -9, na.rm = TRUE)),
    !(any(hha == -9, na.rm = TRUE))
  )


  # Merge household responses ====
  us <- us %>%
    left_join(hhf, by = "f_hidp") %>%
    left_join(hhe, by = "e_hidp") %>%
    left_join(hhd, by = "d_hidp") %>%
    left_join(hhc, by = "c_hidp") %>%
    left_join(hhb, by = "b_hidp") %>%
    left_join(hha, by = "a_hidp")

  rm(hhf, hhe, hhd, hhc, hhb, hha)


  # # Nurse health assessments ====
  #
  # nhac <- readr::read_tsv("inst/extdata/restricted_us_nha/c_indresp_ns.tab")
  #
  # nhac <- select(nhac, -c_waist3)  # All NA
  #
  # nhac[nhac == -9] <- NA  # missing
  # nhac[nhac == -8] <- NA  # Inapplicable
  # nhac[nhac == -7] <- NA  # Proxy
  # nhac[nhac == -2] <- NA  # Refused
  # nhac[nhac == -1] <- NA  # Don't know
  #
  # stopifnot(
  #   !(any(nhac == -9, na.rm = TRUE))
  # )
  #
  # nhab <- readr::read_tsv("inst/extdata/restricted_us_nha/b_indresp_ns.tab",
  #                  col_types = cols(
  #                    b_wtpc        = col_double()
  #                  )
  # )
  #
  # nhab[nhab == -9] <- NA  # missing
  # nhab[nhab == -8] <- NA  # Inapplicable
  # nhab[nhab == -7] <- NA  # Proxy
  # nhab[nhab == -2] <- NA  # Refused
  # nhab[nhab == -1] <- NA  # Don't know
  #
  # stopifnot(
  #   !(any(nhab == -9, na.rm = TRUE))
  # )
  #
  #
  # # Merge nurse health assessments ====
  #
  # us <- left_join(us, nhac, by = "pidp")
  # us <- left_join(us, nhab, by = "pidp")
  #
  # rm(nhac, nhab)


  # Test ====
  stopifnot(  # make sure variables from all waves are present
    any(grepl("a_", colnames(us))),
    any(grepl("b_", colnames(us))),
    any(grepl("c_", colnames(us))),
    any(grepl("d_", colnames(us))),
    any(grepl("e_", colnames(us))),
    any(grepl("f_", colnames(us))),
    any(grepl("hh", colnames(us)))
  )


  # Code region ====
  us$region <- NA
  us$region <- us$f_gor_dv
  us$region[is.na(us$region)] <- us$e_gor_dv[is.na(us$region)]
  us$region[is.na(us$region)] <- us$d_gor_dv[is.na(us$region)]
  us$region[is.na(us$region)] <- us$c_gor_dv[is.na(us$region)]
  us$region[is.na(us$region)] <- us$b_gor_dv[is.na(us$region)]
  us$region[is.na(us$region)] <- us$a_gor_dv[is.na(us$region)]

  us$region <- factor(us$region, levels = 1:12,
                      labels = c("NE", "NW", "YH", "EM", "WM", "EE",
                                 "LN", "SE", "SW", "WA", "SC", "NI"),
                      ordered = FALSE)

  us <- us %>%
    select(-f_gor_dv, -e_gor_dv, -d_gor_dv, -c_gor_dv, -b_gor_dv, -a_gor_dv)


  # Code respondent origin ====
  # Was going to remove BHPS sample members because of survey fatigue
  # us$origin <- NA
  # us$origin <- us$f_hhorig
  # us$origin[is.na(us$origin)] <- us$e_hhorig[is.na(us$origin)]
  # us$origin[is.na(us$origin)] <- us$d_hhorig[is.na(us$origin)]
  # us$origin[is.na(us$origin)] <- us$c_hhorig[is.na(us$origin)]
  # us$origin[is.na(us$origin)] <- us$b_hhorig[is.na(us$origin)]
  # us$origin[is.na(us$origin)] <- us$a_hhorig[is.na(us$origin)]

  # us <- us %>%
  #   filter(origin != 3) %>%
  #   filter(origin != 4) %>%
  #   filter(origin != 5) %>%
  #   filter(origin != 6)

  us <- us %>%
    select(-f_hhorig, -e_hhorig, -d_hhorig, -c_hhorig, -b_hhorig, -a_hhorig)


  # Recode age ====
  # cut us$c_dvage to match census
  us$age_full <- NA
  us$age_full <- us$f_dvage
  us$age_full[is.na(us$age_full)] <- us$e_dvage[is.na(us$age_full)] + 1
  us$age_full[is.na(us$age_full)] <- us$d_dvage[is.na(us$age_full)] + 2
  us$age_full[is.na(us$age_full)] <- us$c_dvage[is.na(us$age_full)] + 3
  us$age_full[is.na(us$age_full)] <- us$b_dvage[is.na(us$age_full)] + 4
  us$age_full[is.na(us$age_full)] <- us$a_dvage[is.na(us$age_full)] + 5
  us$age_full[is.na(us$age_full)] <- round(mean(us$age_full, na.rm = TRUE))

  us$age <- cut(us$age_full,
                breaks = c(min(us$age_full),
                           17, 19, 24, 29, 44, 59, 64, 74, 84, 89,
                           max(us$age_full)),
                include.lowest = TRUE,
                labels = c("age_16_17", "age_18_19", "age_20_24",
                           "age_25_29", "age_30_44", "age_45_59",
                           "age_60_64", "age_65_74", "age_75_84",
                           "age_85_89", "age_90_plus"))

  stopifnot(
    all(!is.na(us$age))
  )

  us <- us %>%
    select(-f_dvage, -e_dvage, -d_dvage, -c_dvage, -b_dvage, -a_dvage)


  # Recode sex ====
  us$sex <- NA
  us$sex <- us$f_sex
  us$sex[is.na(us$sex)] <- us$e_sex[is.na(us$sex)]
  us$sex[is.na(us$sex)] <- us$d_sex[is.na(us$sex)]
  us$sex[is.na(us$sex)] <- us$c_sex[is.na(us$sex)]
  us$sex[is.na(us$sex)] <- us$b_sex[is.na(us$sex)]
  us$sex[is.na(us$sex)] <- us$a_sex[is.na(us$sex)]

  us$sex <- factor(us$sex,
                   levels = 1:2,
                   labels = c("sex_male", "sex_female"))

  us$sex <- forcats::fct_relevel(us$sex, "sex_female", "sex_male")

  us <- us %>%
    select(-f_sex, -e_sex, -d_sex, -c_sex, -b_sex, -a_sex)


  # Recode ethnicity ====
  us$eth_full <- NA
  us$eth_full <- us$f_racel
  us$eth_full[is.na(us$eth_full)] <- us$e_racel[is.na(us$eth_full)]
  us$eth_full[is.na(us$eth_full)] <- us$d_racel[is.na(us$eth_full)]
  us$eth_full[is.na(us$eth_full)] <- us$c_racel[is.na(us$eth_full)]
  us$eth_full[is.na(us$eth_full)] <- us$b_racel[is.na(us$eth_full)]
  us$eth_full[is.na(us$eth_full)] <- us$a_racel[is.na(us$eth_full)]

  us$eth_full <- factor(us$eth_full, levels = c(1:17, 97),
                        labels = c("White British",
                                   "Irish",
                                   "Gypsy or Irish Traveller",
                                   "other White",
                                   "White and Black Caribbean",
                                   "White and Black African",
                                   "White and Asian",
                                   "other mixed",
                                   "Indian",
                                   "Pakistani",
                                   "Bangladeshi",
                                   "Chinese",
                                   "other Asian",
                                   "Caribbean",
                                   "African",
                                   "other black",
                                   "Arab",
                                   "other ethnic group"),
                        ordered = FALSE)

  us <- us %>%
    select(-f_racel, -e_racel, -d_racel, -c_racel, -b_racel, -a_racel)


  # Code ethnic minority group
  us$eth_minority <- NA
  us$eth_minority[us$eth_full == "White British"] <- FALSE
  us$eth_minority[us$eth_full != "White British"] <- TRUE


  # Code reduced ethnic groups
  # Reduced ethnic group classes - to match census aggregate files
  us$eth <- NA
  us$eth[us$eth_full == "White British"]             <- 1
  us$eth[us$eth_full == "Irish"]                     <- 2
  us$eth[us$eth_full == "Gypsy or Irish Traveller"]  <- 3
  us$eth[us$eth_full == "other White"]               <- 3
  us$eth[us$eth_full == "White and Black Caribbean"] <- 4
  us$eth[us$eth_full == "White and Black African"]   <- 4
  us$eth[us$eth_full == "White and Asian"]           <- 4
  us$eth[us$eth_full == "other mixed"]               <- 4
  us$eth[us$eth_full == "Indian"]                    <- 5
  us$eth[us$eth_full == "Pakistani"]                 <- 5
  us$eth[us$eth_full == "Bangladeshi"]               <- 5
  us$eth[us$eth_full == "Chinese"]                   <- 5
  us$eth[us$eth_full == "other Asian"]               <- 5
  us$eth[us$eth_full == "Caribbean"]                 <- 6
  us$eth[us$eth_full == "African"]                   <- 6
  us$eth[us$eth_full == "other black"]               <- 6
  us$eth[us$eth_full == "Arab"]                      <- 7
  us$eth[us$eth_full == "other ethnic group"]        <- 7
  us$eth <- factor(us$eth,
                   levels = 1:7,
                   labels = c("eth_british",
                              "eth_irish",
                              "eth_other_white",
                              "eth_mixed_multiple_ethnic",
                              "eth_asian_asian_british",
                              "eth_black_african_caribbean_british",
                              "eth_other_ethnicity"),
                   ordered = FALSE)


  # Recode education ====
  # Create most recent known hiqual
  us$tmp <- NA
  us$tmp <- us$f_hiqual_dv
  us$tmp[is.na(us$tmp)] <- us$e_hiqual_dv[is.na(us$tmp)]
  us$tmp[is.na(us$tmp)] <- us$d_hiqual_dv[is.na(us$tmp)]
  us$tmp[is.na(us$tmp)] <- us$c_hiqual_dv[is.na(us$tmp)]
  us$tmp[is.na(us$tmp)] <- us$b_hiqual_dv[is.na(us$tmp)]
  us$tmp[is.na(us$tmp)] <- us$a_hiqual_dv[is.na(us$tmp)]

  us$qual <- NA
  us$qual[us$tmp == 9] <- 0  # No qualification
  # GCSE A*-C Level 2; GCSE D-G Level 1. US not split, so all GCSEs L2
  us$qual[us$tmp == 4] <- 2  # L1 or L2: GCSE
  us$qual[us$tmp == 3] <- 3  # L3: AS/A Level
  us$qual[us$tmp == 2] <- 4  # L4: Degree or above
  us$qual[us$tmp == 1] <- 4  # L4: Degree or above
  us$qual[us$tmp == 5] <- 5  # other qualification

  us$qual <- factor(us$qual,
                    levels = c(0, 2:5),
                    labels = c("qual_0",
                               "qual_2",
                               "qual_3",
                               "qual_4_plus",
                               "qual_other"),
                    ordered = FALSE)

  us <- us %>%
    select(-tmp,
           -f_hiqual_dv, -e_hiqual_dv, -d_hiqual_dv, -c_hiqual_dv, -b_hiqual_dv,
           -a_hiqual_dv)


  # Recode car or van availability ====
  us$car <- NA
  us$car <- us$f_ncars
  us$car[is.na(us$car)] <- us$e_ncars[is.na(us$car)]
  us$car[is.na(us$car)] <- us$d_ncars[is.na(us$car)]
  us$car[is.na(us$car)] <- us$c_ncars[is.na(us$car)]
  us$car[is.na(us$car)] <- us$b_ncars[is.na(us$car)]
  us$car[is.na(us$car)] <- us$a_ncars[is.na(us$car)]

  us$car <- replace(us$car, us$car > 2 & !is.na(us$car), 2)
  us$car <- factor(us$car, levels = 0:2,
                   labels = c("car_0", "car_1", "car_2_plus"),
                   ordered = FALSE)

  us <- us %>%
    select(-f_ncars, -e_ncars, -d_ncars, -c_ncars, -b_ncars, -a_ncars)


  # Recode heat ====
  us$heat <- NA
  us$heat <- us$f_heatch
  us$heat[is.na(us$heat)] <- us$e_heatch[is.na(us$heat)]
  us$heat[is.na(us$heat)] <- us$d_heatch[is.na(us$heat)]
  us$heat[is.na(us$heat)] <- us$c_heatch[is.na(us$heat)]
  us$heat[is.na(us$heat)] <- us$b_heatch[is.na(us$heat)]
  us$heat[is.na(us$heat)] <- us$a_heatch[is.na(us$heat)]

  us$heat[us$heat == 2] <- 0

  us$heat <- factor(us$heat, levels = 0:1,
                    labels = c("heat_no", "heat_yes"),
                    ordered = FALSE)

  us <- us %>%
    select(-f_heatch, -e_heatch, -d_heatch, -c_heatch, -b_heatch, -a_heatch)


  # Recode tenure ====
  us$ten_full <- NA
  us$ten_full <- us$f_hsownd
  us$ten_full[is.na(us$ten_full)] <- us$e_hsownd[is.na(us$ten_full)]
  us$ten_full[is.na(us$ten_full)] <- us$d_hsownd[is.na(us$ten_full)]
  us$ten_full[is.na(us$ten_full)] <- us$c_hsownd[is.na(us$ten_full)]
  us$ten_full[is.na(us$ten_full)] <- us$b_hsownd[is.na(us$ten_full)]
  us$ten_full[is.na(us$ten_full)] <- us$a_hsownd[is.na(us$ten_full)]

  us$ten_full <- factor(us$ten_full, levels = c(1:5, 97),
                        labels = c("owned_outright",
                                   "owned_mortgage_loan",
                                   "shared_ownership",
                                   "rented",
                                   "living_rent_free",
                                   "other_tenure"),
                        ordered = FALSE)

  # Reduced categories of tenure to match census aggregates
  us$ten <- NA
  us$ten[us$ten_full == "owned_outright"]      <- 1
  us$ten[us$ten_full == "owned_mortgage_loan"] <- 2
  us$ten[us$ten_full == "shared_ownership"]    <- 2
  us$ten[us$ten_full == "rented"]              <- 3
  us$ten[us$ten_full == "living_rent_free"]    <- 3
  us$ten[us$ten_full == "other_tenure"]        <- NA

  us$ten <- factor(us$ten,
                   levels = 1:3,
                   labels = c("ten_owned_outright",
                              "ten_owned_mortgage_shared",
                              "ten_rented"),
                   ordered = FALSE)

  us <- us %>%
    select(-f_tenure_dv, -e_tenure_dv, -d_tenure_dv, -c_tenure_dv,
           -b_tenure_dv, -a_tenure_dv) %>%
    select(-f_hsownd, -e_hsownd, -d_hsownd, -c_hsownd, -b_hsownd, -a_hsownd)


  # Recode economic activity ====
  us$econ_act <- NA
  us$econ_act <- us$f_jbstat
  us$econ_act[is.na(us$econ_act)] <- us$e_jbstat[is.na(us$econ_act)]
  us$econ_act[is.na(us$econ_act)] <- us$d_jbstat[is.na(us$econ_act)]
  us$econ_act[is.na(us$econ_act)] <- us$c_jbstat[is.na(us$econ_act)]
  us$econ_act[is.na(us$econ_act)] <- us$b_jbstat[is.na(us$econ_act)]
  us$econ_act[is.na(us$econ_act)] <- us$a_jbstat[is.na(us$econ_act)]

  # levels checked as correct 2018-01-12
  us$econ_act <- factor(us$econ_act, levels = c(1:11, 97),
                        labels = c("self_emp",
                                   "emp",
                                   "unemp",
                                   "retired",
                                   "maternity",
                                   "home_fam",
                                   "stu_ft",
                                   "lt_sick",
                                   "gment_train",
                                   "unpaid_fam",
                                   "apprent",
                                   "other"))

  # Reaggregate some variables to match census
  us$econ_act <- us$econ_act %>%
    forcats::fct_collapse(home_fam = c("home_fam", "maternity"),
                          other    = c("other", "gment_train", "unpaid_fam",
                                       "apprent"))

  us$econ_act <- us$econ_act %>%
    forcats::fct_recode(
      eca_emp     = "emp",
      eca_homefam = "home_fam",
      eca_ltsick  = "lt_sick",
      eca_other   = "other",
      eca_retired = "retired",
      eca_selfemp = "self_emp",
      eca_student = "stu_ft",
      eca_unemp   = "unemp"
    ) %>%
    forcats::fct_relevel(  # order alphabetically for simulation
      "eca_emp", "eca_homefam", "eca_ltsick", "eca_other", "eca_retired",
      "eca_selfemp", "eca_student", "eca_unemp"
    )

  us <- us %>%
    select(-f_jbstat, -e_jbstat, -d_jbstat, -c_jbstat, -b_jbstat, -a_jbstat)


  # Overcrowding (> 1 persons per room) ====
  us$rooms <- NA
  # add bedrooms + other accommodation rooms
  us$rooms <- us$f_hsbeds + us$f_hsrooms

  # make sure bedrooms and other rooms are from the same wave, i.e. the
  # same property
  # e.g. e_hsrooms + d_hsbeds might be two different properties
  us$rooms[is.na(us$rooms)] <-
    us$e_hsbeds[is.na(us$rooms)] + us$e_hsrooms[is.na(us$rooms)]
  us$rooms[is.na(us$rooms)] <-
    us$d_hsbeds[is.na(us$rooms)] + us$d_hsrooms[is.na(us$rooms)]
  us$rooms[is.na(us$rooms)] <-
    us$c_hsbeds[is.na(us$rooms)] + us$c_hsrooms[is.na(us$rooms)]
  us$rooms[is.na(us$rooms)] <-
    us$b_hsbeds[is.na(us$rooms)] + us$b_hsrooms[is.na(us$rooms)]
  us$rooms[is.na(us$rooms)] <-
    us$a_hsbeds[is.na(us$rooms)] + us$a_hsrooms[is.na(us$rooms)]

  us$ppr <- NA
  us$ppr <- us$f_hhsize
  us$ppr[is.na(us$ppr)] <- us$e_hhsize[is.na(us$ppr)]
  us$ppr[is.na(us$ppr)] <- us$d_hhsize[is.na(us$ppr)]
  us$ppr[is.na(us$ppr)] <- us$c_hhsize[is.na(us$ppr)]
  us$ppr[is.na(us$ppr)] <- us$b_hhsize[is.na(us$ppr)]
  us$ppr[is.na(us$ppr)] <- us$a_hhsize[is.na(us$ppr)]

  us$ppr <- us$ppr / us$rooms

  us$ocrowd <- NA
  us$ocrowd[us$ppr >  1] <- 1
  us$ocrowd[us$ppr <= 1] <- 0

  us$ocrowd <- factor(us$ocrowd, levels = c(0, 1),
                      labels = c("ocrowd_no",
                                 "ocrowd_yes"),
                      ordered = FALSE)

  stopifnot(  # check ocrowd coded properly
    all.equal(nrow(filter(us, ppr > 1,  ocrowd == "ocrowd_no")),  0),
    all.equal(nrow(filter(us, ppr <= 1, ocrowd == "ocrowd_yes")), 0)
  )

  # hhsize needed for alcohol consumption
  us <- us %>%
    select(-rooms, -ppr) %>%
    select(-f_hsbeds, -e_hsbeds, -d_hsbeds, -c_hsbeds, -b_hsbeds, -a_hsbeds) %>%
    select(-f_hsrooms, -e_hsrooms, -d_hsrooms, -c_hsrooms, -b_hsrooms,
           -a_hsrooms)


  # Limiting long-term illness or disability ====
  us$llid <- NA
  us$llid <- us$f_health
  us$llid[is.na(us$llid)] <- us$e_health[is.na(us$llid)]
  us$llid[is.na(us$llid)] <- us$d_health[is.na(us$llid)]
  us$llid[is.na(us$llid)] <- us$c_health[is.na(us$llid)]
  us$llid[is.na(us$llid)] <- us$b_health[is.na(us$llid)]
  us$llid[is.na(us$llid)] <- us$a_health[is.na(us$llid)]

  us$llid <- replace(us$llid, us$llid == 2, 0)
  us$llid <- factor(us$llid, levels = 0:1,
                    labels = c("llid_no", "llid_yes"))

  us <- us %>%
    select(-f_health, -e_health, -d_health, -c_health, -b_health, -a_health)


  # Recode self-reported general health ====
  us$tmp <- NA
  us$tmp <- us$f_scsf1
  us$tmp[is.na(us$tmp)] <- us$e_scsf1[is.na(us$tmp)]
  us$tmp[is.na(us$tmp)] <- us$d_scsf1[is.na(us$tmp)]
  us$tmp[is.na(us$tmp)] <- us$c_sf1[is.na(us$tmp)]
  us$tmp[is.na(us$tmp)] <- us$b_sf1[is.na(us$tmp)]
  us$tmp[is.na(us$tmp)] <- us$a_sf1[is.na(us$tmp)]

  us$srgh <- NA
  us$srgh[us$tmp == 1] <- 5  # excellent
  us$srgh[us$tmp == 2] <- 4  # very good
  us$srgh[us$tmp == 3] <- 3  # good
  us$srgh[us$tmp == 4] <- 2  # fair
  us$srgh[us$tmp == 5] <- 1  # poor

  us$srgh <- factor(us$srgh, levels = 5:1,
                    labels = c("srgh_excellent",
                               "srgh_verygood",
                               "srgh_good",
                               "srgh_fair",
                               "srgh_poor"),
                    ordered = FALSE)

  us <- us %>%
    select(-f_scsf1, -e_scsf1, -d_scsf1, -c_sf1, -b_sf1, -a_sf1,
           -tmp)


  # Recode social class ====
  us$class8 <- NA
  us$class8 <- us$f_jbnssec8_dv
  us$class8[is.na(us$class8)] <- us$e_jbnssec8_dv[is.na(us$class8)]
  us$class8[is.na(us$class8)] <- us$d_jbnssec8_dv[is.na(us$class8)]
  us$class8[is.na(us$class8)] <- us$c_jbnssec8_dv[is.na(us$class8)]
  us$class8[is.na(us$class8)] <- us$b_jbnssec8_dv[is.na(us$class8)]
  us$class8[is.na(us$class8)] <- us$a_jbnssec8_dv[is.na(us$class8)]

  # Full NS-SEC, NS-SEC5 and NS-SEC3 have same number of NAs as NSSEC8
  # us$class <- NA
  # us$class <- us$f_jbnssec_dv
  # us$class[is.na(us$class)] <- us$e_jbnssec_dv[is.na(us$class)]
  # us$class[is.na(us$class)] <- us$d_jbnssec_dv[is.na(us$class)]
  # us$class[is.na(us$class)] <- us$c_jbnssec_dv[is.na(us$class)]
  # us$class[is.na(us$class)] <- us$b_jbnssec_dv[is.na(us$class)]
  # us$class[is.na(us$class)] <- us$a_jbnssec_dv[is.na(us$class)]
  #
  # us$class5 <- NA
  # us$class5 <- us$f_jbnssec5_dv
  # us$class5[is.na(us$class5)] <- us$e_jbnssec5_dv[is.na(us$class5)]
  # us$class5[is.na(us$class5)] <- us$d_jbnssec5_dv[is.na(us$class5)]
  # us$class5[is.na(us$class5)] <- us$c_jbnssec5_dv[is.na(us$class5)]
  # us$class5[is.na(us$class5)] <- us$b_jbnssec5_dv[is.na(us$class5)]
  # us$class5[is.na(us$class5)] <- us$a_jbnssec5_dv[is.na(us$class5)]
  #
  # us$class3 <- NA
  # us$class3 <- us$f_jbnssec3_dv
  # us$class3[is.na(us$class3)] <- us$e_jbnssec3_dv[is.na(us$class3)]
  # us$class3[is.na(us$class3)] <- us$d_jbnssec3_dv[is.na(us$class3)]
  # us$class3[is.na(us$class3)] <- us$c_jbnssec3_dv[is.na(us$class3)]
  # us$class3[is.na(us$class3)] <- us$b_jbnssec3_dv[is.na(us$class3)]
  # us$class3[is.na(us$class3)] <- us$a_jbnssec3_dv[is.na(us$class3)]

  us$class8 <- factor(us$class8, levels = 1:9,
                      labels = c("Large employers & higher management",
                      "Higher professional",
                      "Lower management & professional",
                      "Intermediate",
                      "Small employers & own account",
                      "Lower supervisory & technical",
                      "Semi-routine",
                      "Routine",
                      "Never worked & LT unemployed"),
           ordered = FALSE)

  us <- us %>%
    select(-f_jbnssec_dv, -f_jbnssec8_dv, -f_jbnssec5_dv, -f_jbnssec3_dv) %>%
    select(-e_jbnssec_dv, -e_jbnssec8_dv, -e_jbnssec5_dv, -e_jbnssec3_dv) %>%
    select(-d_jbnssec_dv, -d_jbnssec8_dv, -d_jbnssec5_dv, -d_jbnssec3_dv) %>%
    select(-c_jbnssec_dv, -c_jbnssec8_dv, -c_jbnssec5_dv, -c_jbnssec3_dv) %>%
    select(-b_jbnssec_dv, -b_jbnssec8_dv, -b_jbnssec5_dv, -b_jbnssec3_dv) %>%
    select(-a_jbnssec_dv, -a_jbnssec8_dv, -a_jbnssec5_dv, -a_jbnssec3_dv)


  # # Tax credits ====
  # us$taxcred <- NA
  # us$taxcred <- us$f_btype6
  # us$taxcred[is.na(us$taxcred)] <- us$e_btype6[is.na(us$taxcred)]
  # us$taxcred[is.na(us$taxcred)] <- us$d_btype6[is.na(us$taxcred)]
  # us$taxcred[is.na(us$taxcred)] <- us$c_btype6[is.na(us$taxcred)]
  # us$taxcred[is.na(us$taxcred)] <- us$b_btype6[is.na(us$taxcred)]
  # us$taxcred[is.na(us$taxcred)] <- us$a_btype6[is.na(us$taxcred)]


  # Marital status ====
  us$marital <- NA
  us$marital <- us$f_mlstat
  us$marital[is.na(us$marital)] <- us$e_mlstat[is.na(us$marital)]
  us$marital[is.na(us$marital)] <- us$d_mlstat[is.na(us$marital)]
  us$marital[is.na(us$marital)] <- us$c_mlstat[is.na(us$marital)]
  us$marital[is.na(us$marital)] <- us$b_mlstat[is.na(us$marital)]
  us$marital[is.na(us$marital)] <- us$a_mlstat[is.na(us$marital)]

  # checked these levels are correct 2018-01-18
  us$marital <- factor(us$marital, levels = 1:9,
                       labels = c("single",
                                  "married",
                                  "civil_part",
                                  "separated",
                                  "divorced",
                                  "widowed",
                                  "sep_civil_part",
                                  "former_civil_part",
                                  "surviving_civil_part")) %>%
    forcats::fct_collapse(
      separated = c("separated", "sep_civil_part"),
      divorced  = c("divorced", "former_civil_part"),
      widowed   = c("widowed", "surviving_civil_part")) %>%
    forcats::fct_recode(
      mar_civil_part = "civil_part",
      mar_divorced   = "divorced",
      mar_married    = "married",
      mar_separated  = "separated",
      mar_single     = "single",
      mar_widowed    = "widowed"
    ) %>%
    forcats::fct_relevel(  # make alphabetical for simulation
      "mar_civil_part", "mar_divorced", "mar_married", "mar_separated",
      "mar_single", "mar_widowed"
    )

  us <- us %>%
    select(-f_mlstat, -e_mlstat, -d_mlstat, -c_mlstat, -b_mlstat, -a_mlstat)


  # Couple ====
  # This is couple, married or not
  # In waves f_, d_ and b_ asked about domestic chores
  # if there's any answer (or they're married) they're part of a couple
  # I have to assume the remaining NAs are single
  us$couple <- NA
  us$couple[us$f_hubuys > 0 & !is.na(us$f_hubuys)] <- 1
  us$couple[is.na(us$couple) & us$marital == "mar_married"] <- 1
  us$couple[is.na(us$couple) & us$d_hubuys > 0 & !is.na(us$d_hubuys)] <- 1
  us$couple[is.na(us$couple) & us$b_hubuys > 0 & !is.na(us$b_hubuys)] <- 1

  us <- us %>%
    select(-f_hubuys, -d_hubuys, -b_hubuys)


  # Neighbourhood cohesion ====
  # Paper 13 neighbourhood cohesion:
  #   extent to which neighbours pull together to improve neighbourhood
  # No direct analogy but proxy:
  #   willing to improve neighbourhood, and
  #   similar to others in neighbourhood

  # Willing to improve neighbourhood
  us$imp_nhood <- NA
  us$imp_nhood <- us$f_scopngbhe
  us$imp_nhood[is.na(us$imp_nhood)] <- us$c_scopngbhe[is.na(us$imp_nhood)]
  us$imp_nhood[is.na(us$imp_nhood)] <- us$a_scopngbhe[is.na(us$imp_nhood)]

  # Similar to others in neighbourhood
  us$sim_nhood <- NA
  us$sim_nhood <- us$f_scopngbhg
  us$sim_nhood[is.na(us$sim_nhood)] <- us$c_scopngbhg[is.na(us$sim_nhood)]
  us$sim_nhood[is.na(us$sim_nhood)] <- us$a_scopngbhg[is.na(us$sim_nhood)]

  # dplyr::case_when() doesn't yet work inside mutate()
  # <= 2 == strongly agree + agree
  # >= 3 == Neither + disagree + strongly disagree
  us <- us %>%
    mutate(nhood_coh = if_else(imp_nhood <= 2 & sim_nhood <= 2,
                               "ncoh_yes", "ncoh_no"))

  stopifnot(
    # more agree + strongly agree than cohesive neighbourhoods
    nrow(us[us$imp_nhood <= 2 & !is.na(us$imp_nhood), ]) >
      nrow(us[us$nhood_coh == "ncoh_yes" & !is.na(us$nhood_coh), ]),

    nrow(us[us$sim_nhood <= 2 & !is.na(us$sim_nhood), ]) >
      nrow(us[us$nhood_coh == "ncoh_yes" & !is.na(us$nhood_coh), ])
  )

  us <- us %>%
    select(-imp_nhood, -sim_nhood,
           -f_scopngbhe, -c_scopngbhe, -a_scopngbhe,
           -f_scopngbhg, -c_scopngbhg, -a_scopngbhg)


  # Neighbourhood trust ====
  # Waves f and c are different to how wave a is coded
  # Wave f and c: strongly agree + agree == trust
  # wave a: most people can be trusted == trust
  us$nhood_trust <- NA
  us$nhood_trust <- if_else(us$f_nbrcoh3 <= 2, "trust_yes", "trust_no")
  us$nhood_trust[is.na(us$nhood_trust)] <-
    if_else(us$c_nbrcoh3[is.na(us$nhood_trust)] <= 2,
            "trust_yes",
            "trust_no")
  us$nhood_trust[is.na(us$nhood_trust)] <-
    if_else(us$a_sctrust[is.na(us$nhood_trust)] == 1,
            "trust_yes",
            "trust_no")

  us <- us %>%
    select(-f_nbrcoh3, -c_nbrcoh3, -a_sctrust)


  # Neighbourhood belonging ====
  us$nhood_belong <- NA
  us$nhood_belong <- us$f_scopngbha
  us$nhood_belong[is.na(us$nhood_belong)] <-
    us$c_scopngbha[is.na(us$nhood_belong)]
  us$nhood_belong[is.na(us$nhood_belong)] <-
    us$a_scopngbha[is.na(us$nhood_belong)]

  us$nhood_belong <- if_else(us$nhood_belong <= 2, "belong_yes", "belong_no")

  us <- us %>%
    select(-f_scopngbha, -c_scopngbha, -a_scopngbha)


  # Civic participation ====
  civic_vars <- colnames(us[, grep("f_orga", colnames(us))])
  civic_vars <- civic_vars[civic_vars != "f_orga96"]  # 'none of these'

  us$civic <- NA
  us$civic <- rowSums(us[, civic_vars])

  civic_vars <- colnames(us[, grep("c_orga", colnames(us))])
  civic_vars <- civic_vars[civic_vars != "c_orga96"]  # 'none of these'

  us$civic[is.na(us$civic)] <- rowSums(us[is.na(us$civic), civic_vars])
  us$civic <- if_else(us$civic >= 1, "civic_yes", "civic_no")

  us <- us %>%
    select(-f_orga1,  -f_orga2,  -f_orga3,  -f_orga4,  -f_orga5,  -f_orga6,
           -f_orga7,  -f_orga8,  -f_orga9,  -f_orga10, -f_orga11, -f_orga12,
           -f_orga13, -f_orga14, -f_orga15, -f_orga16,
           -c_orga1,  -c_orga2,  -c_orga3,  -c_orga4,  -c_orga5,  -c_orga6,
           -c_orga7,  -c_orga8,  -c_orga9,  -c_orga10, -c_orga11, -c_orga12,
           -c_orga13, -c_orga14, -c_orga15, -c_orga16)
  rm(civic_vars)


  # Social cohesion ====
  # People in this neighbourhood get along - paper 13
  # People in this neighbourhood DON'T get along - US
  # Reverse, so take strongly disagree + disagree
  us$social_coh <- NA
  us$social_coh <- us$f_nbrcoh4
  us$social_coh[is.na(us$social_coh)] <- us$c_nbrcoh4[is.na(us$social_coh)]
  us$social_coh <- if_else(us$social_coh >= 4, "scoh_yes", "scoh_no")

  us <- us %>%
    select(-f_nbrcoh4, -c_nbrcoh4)


  # Safe walking alone at night ====
  us$safe_alone <- NA
  us$safe_alone <- us$f_crdark
  us$safe_alone[is.na(us$safe_alone)] <- us$c_crdark[is.na(us$safe_alone)]

  # Recode to safe or unsafe
  # 1 = very safe
  us$safe_alone[us$safe_alone == 2] <- 1  # fairly safe

  us$safe_alone[us$safe_alone == 3] <- 2  # a bit unsafe
  us$safe_alone[us$safe_alone == 4] <- 2  # very unsafe
  us$safe_alone[us$safe_alone == 5] <- 2  # don't go out after dark

  us$safe_alone <- factor(us$safe_alone, levels = 1:2,
                          labels = c("safe_yes",
                                     "safe_no"))

  us <- us %>%
    select(-f_crdark, -c_crdark)


  # Similar income to friends ====
  # Paper 13 doesn't specify which levels map to 'hetergeneous' or homogeneous
  # friendship. Assume ~ <= 1/2 is hetero
  us$sim_inc <- NA
  us$sim_inc <- us$f_siminc
  us$sim_inc[is.na(us$sim_inc)] <- us$c_siminc[is.na(us$sim_inc)]

  us$sim_inc <- if_else(us$sim_inc >= 3, "siminc_het", "siminc_hom")

  us <- us %>%
    select(-f_siminc, -c_siminc)


  # Similiar ethnicity to friends ====
  # Paper 13 again doesn't specify which levels map to hetero/homo
  # Assume again ~ <= 1/2 hetero; >= 1/2 homo
  us$sim_eth <- NA
  us$sim_eth <- us$f_simrace
  us$sim_eth[is.na(us$sim_eth)] <- us$c_simrace[is.na(us$sim_eth)]

  us$sim_eth <- if_else(us$sim_eth >= 3, "simeth_het", "simeth_hom")

  us <- us %>%
    select(-f_simrace, -c_simrace)


  # Political efficacy ====
  # Don't have a say in what government does, reversed
  us$pol_eff <- NA
  us$pol_eff <- us$f_poleff4
  us$pol_eff[is.na(us$pol_eff)] <- us$c_poleff4[is.na(us$pol_eff)]

  us$pol_eff <- if_else(us$pol_eff >= 4, "poleff_yes", "poleff_no")

  us <- us %>%
    select(-f_poleff4, -c_poleff4)



  # Financial situation ====
  us$fin_sit <- NA
  us$fin_sit <- us$f_finnow
  us$fin_sit[is.na(us$fin_sit)] <- us$e_finnow[is.na(us$fin_sit)]
  us$fin_sit[is.na(us$fin_sit)] <- us$d_finnow[is.na(us$fin_sit)]
  us$fin_sit[is.na(us$fin_sit)] <- us$c_finnow[is.na(us$fin_sit)]
  us$fin_sit[is.na(us$fin_sit)] <- us$b_finnow[is.na(us$fin_sit)]
  us$fin_sit[is.na(us$fin_sit)] <- us$a_finnow[is.na(us$fin_sit)]

  us$fin_sit <- if_else(us$fin_sit <= 2, "fin_good", "fin_bad")

  us <- us %>%
    select(-f_finnow, -e_finnow, -d_finnow, -c_finnow, -b_finnow, -a_finnow)


  # Social isolation ====
  # >=1 close friend
  us$soc_isol <- NA
  us$soc_isol <- us$f_closenum
  us$soc_isol[is.na(us$soc_isol)] <- us$c_closenum[is.na(us$soc_isol)]

  us$soc_isol <- if_else(us$soc_isol > 0, "isol_no", "isol_yes")

  us <- us %>%
    select(-f_closenum, -c_closenum)


  # Income ====
  # I've tested:
  #   - total earnings annually (x_prearna)
  #   - total earnings weekly (x_prearnw)
  #   - total personal income annually (x_prfitba)
  #   - total personal income weekly (x_prfitw)
  #
  #    All (even when combined) have low response rates and ~ 77k missing cases


  # General Health Questionnaire (GHQ) ====
  recode_ghq <- function(varname) {
    x <- NA
    x <- us[[paste0("f_", varname)]]
    x[is.na(x)] <- us[[paste0("f_", varname)]][is.na(x)]
    x[is.na(x)] <- us[[paste0("e_", varname)]][is.na(x)]
    x[is.na(x)] <- us[[paste0("d_", varname)]][is.na(x)]
    x[is.na(x)] <- us[[paste0("c_", varname)]][is.na(x)]
    x[is.na(x)] <- us[[paste0("b_", varname)]][is.na(x)]
    x[is.na(x)] <- us[[paste0("a_", varname)]][is.na(x)]

    x
  }

  us$ghq_conc     <- recode_ghq("scghqa")
  us$ghq_sleep    <- recode_ghq("scghqb")
  us$ghq_useful   <- recode_ghq("scghqc")
  us$ghq_decision <- recode_ghq("scghqd")
  us$ghq_strain   <- recode_ghq("scghqe")
  us$ghq_diff     <- recode_ghq("scghqf")
  us$ghq_enjoy    <- recode_ghq("scghqg")
  us$ghq_face     <- recode_ghq("scghqh")
  us$ghq_unhappy  <- recode_ghq("scghqi")
  us$ghq_confid   <- recode_ghq("scghqj")
  us$ghq_worth    <- recode_ghq("scghqk")
  us$ghq_happy    <- recode_ghq("scghql")

  us$ghq_conc     <- if_else(us$ghq_conc <= 2,
                             "ghq_conc_good", "ghq_conc_bad")

  us$ghq_sleep    <- if_else(us$ghq_sleep <= 2,
                             "ghq_sleep_good", "ghq_sleep_bad")

  us$ghq_useful   <- if_else(us$ghq_useful <= 2,
                             "ghq_useful_good", "ghq_useful_bad")

  us$ghq_decision <- if_else(us$ghq_decision <= 2,
                             "ghq_decision_good", "ghq_decision_bad")

  us$ghq_strain   <- if_else(us$ghq_strain <= 2,
                             "ghq_strain_good", "ghq_strain_bad")

  us$ghq_diff     <- if_else(us$ghq_diff <= 2,
                             "ghq_diff_good", "ghq_diff_bad")

  us$ghq_enjoy    <- if_else(us$ghq_enjoy <= 2,
                             "ghq_enjoy_good", "ghq_enjoy_bad")

  us$ghq_face     <- if_else(us$ghq_face <= 2,
                             "ghq_face_good", "ghq_face_bad")

  us$ghq_unhappy  <- if_else(us$ghq_unhappy <= 2,
                             "ghq_unhappy_good", "ghq_unhappy_bad")

  us$ghq_confid   <- if_else(us$ghq_confid <= 2,
                             "ghq_confid_good", "ghq_confid_bad")

  us$ghq_worth    <- if_else(us$ghq_worth <= 2,
                             "ghq_worth_good", "ghq_worth_bad")

  us$ghq_happy    <- if_else(us$ghq_happy <= 2,
                             "ghq_happy_good", "ghq_happy_bad")

  us <- us %>%
    select(-f_scghqa, -e_scghqa, -d_scghqa, -c_scghqa, -b_scghqa, -a_scghqa,
           -f_scghqb, -e_scghqb, -d_scghqb, -c_scghqb, -b_scghqb, -a_scghqb,
           -f_scghqc, -e_scghqc, -d_scghqc, -c_scghqc, -b_scghqc, -a_scghqc,
           -f_scghqd, -e_scghqd, -d_scghqd, -c_scghqd, -b_scghqd, -a_scghqd,
           -f_scghqe, -e_scghqe, -d_scghqe, -c_scghqe, -b_scghqe, -a_scghqe,
           -f_scghqf, -e_scghqf, -d_scghqf, -c_scghqf, -b_scghqf, -a_scghqf,
           -f_scghqg, -e_scghqg, -d_scghqg, -c_scghqg, -b_scghqg, -a_scghqg,
           -f_scghqh, -e_scghqh, -d_scghqh, -c_scghqh, -b_scghqh, -a_scghqh,
           -f_scghqi, -e_scghqi, -d_scghqi, -c_scghqi, -b_scghqi, -a_scghqi,
           -f_scghqj, -e_scghqj, -d_scghqj, -c_scghqj, -b_scghqj, -a_scghqj,
           -f_scghqk, -e_scghqk, -d_scghqk, -c_scghqk, -b_scghqk, -a_scghqk,
           -f_scghql, -e_scghql, -d_scghql, -c_scghql, -b_scghql, -a_scghql)
  rm(recode_ghq)


  # Cigarettes smoked ====
  us$cig <- NA
  us$cig <- us$f_ncigs
  us$cig[is.na(us$cig)] <- us$e_ncigs[is.na(us$cig)]
  us$cig[is.na(us$cig)] <- us$b_ncigs[is.na(us$cig)]

  us$cig <- if_else(us$cig > 0, "cig_yes", "cig_no")

  us <- us %>%
    select(-f_ncigs, -e_ncigs, -b_ncigs)


  # Alcohol ====
  # units = (volume (ml) * ABV) / 1000
  # 1 pint = 568.261ml (Google)
  # Source of alcohol costs: institute alcohol studies
  # http://www.ias.org.uk/uploads/pdf/Factsheets/Alcohol%20pricing%20factsheet%20April%202014.pdf
  # All dates 2012 data
  #
  # ABV from NHS.uk/livewell/alcohol/pages/alcohol-units.aspx and IAS
  #
  # Beer:    £2.70  pint 3.6% abv
  # Lager:   £2.96  pint 5.2% abv
  # Cider:   £1.82  pint 5.3% abv
  # Sherry:  £6.55  70cl 17.5% abv
  # Wine:    £4.04  75cl 12% abv
  # Whiskey: £15.35 70cl 40% abv
  # Vodka:   £13.86 70cl 37.5% abv

  units <- data.frame(
    beer    = c(2.7,   3.6,  568),
    lager   = c(2.96,  5.2,  568),
    cider   = c(1.82,  5.3,  568),
    sherry  = c(6.55,  17.5, 700),
    wine    = c(4.04,  12.0, 750),
    whiskey = c(15.35, 40.0, 700),
    vodka   = c(13.86, 37.5, 700)
  )
  units <- as.data.frame(t(units))
  colnames(units) <- c("cost", "abv", "ml")

  units$units     <- (units$ml * units$abv) / 1000
  # source: http://www.nhs.uk/Livewell/alcohol/Pages/alcohol-units.aspx

  units$unit_cost <- units$units / units$cost
  units$unit_cost <- round(units$unit_cost, digits = 2)
  unit_cost <- median(units$unit_cost)

  us <- us %>%
    mutate(f_alc = (f_xpaltob_g3 / (f_hhsize - f_nkids015)) / unit_cost / 4,
           e_alc = (e_xpaltob_g3 / (e_hhsize - e_nkids_dv)) / unit_cost / 4,
           d_alc = (d_xpaltob_g3 / (d_hhsize - d_nkids015)) / unit_cost / 4,
           c_alc = (c_xpaltob_g3 / (c_hhsize - c_nkids015)) / unit_cost / 4,
           b_alc = (b_xpaltob_g3 / (b_hhsize - b_nkids015)) / unit_cost / 4,
           a_alc = (a_xpaltob_g3 / (a_hhsize - a_numchild)) / unit_cost / 4)

  us$alcohol <- NA
  us$alcohol <- us$f_alc
  us$alcohol[is.na(us$alcohol)] <- us$e_alc[is.na(us$alcohol)]
  us$alcohol[is.na(us$alcohol)] <- us$d_alc[is.na(us$alcohol)]
  us$alcohol[is.na(us$alcohol)] <- us$c_alc[is.na(us$alcohol)]
  us$alcohol[is.na(us$alcohol)] <- us$b_alc[is.na(us$alcohol)]
  us$alcohol[is.na(us$alcohol)] <- us$a_alc[is.na(us$alcohol)]

  # Low/high risk thresholds
  # cmo2016a, p. 4
  us$alcohol <- if_else(us$alcohol <= 14, "alcohol_low", "alcohol_high")

  us <- us %>%
    select(-f_xpaltob_g3, -e_xpaltob_g3, -d_xpaltob_g3,
           -c_xpaltob_g3, -b_xpaltob_g3, -a_xpaltob_g3,
           -f_alc, -e_alc, -d_alc, -c_alc, -b_alc, -a_alc)
  rm(units, unit_cost)


  # Best friend distance ====
  us <- us %>%
    rowwise() %>%
    mutate(f_bfdist = min(f_netlv_1, f_netlv_2, f_netlv_3),
           c_bfdist = min(c_netlv_1, c_netlv_2, c_netlv_3))

  us$bf_dist <- NA
  us$bf_dist <- us$f_bfdist
  us$bf_dist[is.na(us$bf_dist)] <- us$f_bfdist[is.na(us$bf_dist)]

  # Paper: < 5 mins. Use < 1 mile as proxy
  us$bf_dist <- if_else(us$bf_dist > 1, "bfdist_far", "bfdist_near")

  us <- us %>%
    select(-f_bfdist, -c_bfdist,
           -f_netlv_1, -f_netlv_2, -f_netlv_3,
           -c_netlv_1, -c_netlv_2, -c_netlv_3)


  # Best friend often in touch ====
  # Paper: frequent contact == at least weekly
  us <- us %>%
    rowwise() %>%
    mutate(f_bfoften = min(f_netph_1, f_netph_2, f_netph_3),
           c_bfoften = min(c_netph_1, c_netph_2, c_netph_3))

  us$bf_often <- NA
  us$bf_often <- us$f_bfoften
  us$bf_often[is.na(us$bf_often)] <- us$c_bfoften[is.na(us$bf_often)]

  us$bf_often <- if_else(us$bf_often > 1, "bfoften_not", "bfoften_often")

  us <- us %>%
    select(-f_bfoften, -c_bfoften,
           -f_netph_1, -f_netph_2, -f_netph_3,
           -c_netph_1, -c_netph_2, -c_netph_3)


  # Savings ====
  # Budgeting/money management skills
  # Too many NAs save_reg
  us$save <- NA
  us$save <- us$f_save
  us$save[is.na(us$save)] <- us$d_save[is.na(us$save)]
  us$save[is.na(us$save)] <- us$b_save[is.na(us$save)]
  us$save <- if_else(us$save == 1, "save_yes", "save_no")

  us <- us %>%
    select(-f_save, -d_save, -b_save)


  # Number of children ====
  us$nkids <- NA
  us$nkids <- us$f_nkids015
  us$nkids[is.na(us$nkids)] <- us$e_nkids_dv[is.na(us$nkids)]
  us$nkids[is.na(us$nkids)] <- us$d_nkids015[is.na(us$nkids)]
  us$nkids[is.na(us$nkids)] <- us$c_nkids015[is.na(us$nkids)]
  us$nkids[is.na(us$nkids)] <- us$b_nkids015[is.na(us$nkids)]
  us$nkids[is.na(us$nkids)] <- us$a_numchild[is.na(us$nkids)]

  us <- us %>%
    select(-f_nkids015, -e_nkids_dv, -d_nkids015, -c_nkids015, -b_nkids015,
           -a_numchild)

  # Can't use numeric data with rakeR::extract() easily because it creates a
  # new variable for each level of the variable (i.e. one for each unique
  # integer)
  # Recode and top-code to 5 kids
  us$kids <- cut(us$nkids,
                 breaks = c(0, 1, 2, 3, 4, 5, max(us$nkids, na.rm = TRUE)),
                 labels = c("kids_0", "kids_1", "kids_2", "kids_3", "kids_4",
                            "kids_5ormore"),
                 right = FALSE, include.lowest = TRUE)

  # Household income and poverty ====
  # Many measures of poverty based on household income (i.e. children do not
  # earn but are not in poverty if earning adults aren't)

  # US provides monthly hh income. Convert to annual.
  # Poverty defined as <60% median income
  # Median income in 2011 was £23,200 (source: ons2013c, p. 5)
  us$hh_income <- us$f_fihhmngrs_dv
  us$hh_income[is.na(us$hh_income)] <- us$e_fihhmngrs_dv[is.na(us$hh_income)]
  us$hh_income[is.na(us$hh_income)] <- us$d_fihhmngrs_dv[is.na(us$hh_income)]
  us$hh_income[is.na(us$hh_income)] <- us$c_fihhmngrs_dv[is.na(us$hh_income)]
  us$hh_income[is.na(us$hh_income)] <- us$b_fihhmngrs_dv[is.na(us$hh_income)]
  us$hh_income[is.na(us$hh_income)] <- us$a_fihhmngrs_dv[is.na(us$hh_income)]

  # Set negative values to 0
  us$hh_income[us$hh_income < 0] <- 0L

  us <- us %>%
    select(-f_fihhmngrs_dv, -e_fihhmngrs_dv, -d_fihhmngrs_dv, -c_fihhmngrs_dv,
           -b_fihhmngrs_dv, -a_fihhmngrs_dv)



  # Number of cohabitants ====
  us$cohab <- NA
  us$cohab <- us$f_hhsize
  us$cohab[is.na(us$cohab)] <- us$e_hhsize[is.na(us$cohab)]
  us$cohab[is.na(us$cohab)] <- us$d_hhsize[is.na(us$cohab)]
  us$cohab[is.na(us$cohab)] <- us$c_hhsize[is.na(us$cohab)]
  us$cohab[is.na(us$cohab)] <- us$b_hhsize[is.na(us$cohab)]
  us$cohab[is.na(us$cohab)] <- us$a_hhsize[is.na(us$cohab)]

  us$cohab <- us$cohab - 1  # remove the respondent!

  us <- us %>%
    select(-f_hhsize, -e_hhsize, -d_hhsize, -c_hhsize, -b_hhsize, -a_hhsize)


  # Clinical depression ====
  us$depress <- NA_integer_

  hconds <- colnames(us)[grepl("_hcond", colnames(us))]
  for (i in hconds) {
    us[[i]] <- as.integer(us[[i]])
    us$depress[is.na(us$depress) & us[[i]] == 17 & !is.na(us[[i]])] <- 1
  }
  rm(i)

  us$depress[is.na(us$depress)] <- us$f_hcond17[is.na(us$depress)]
  us$depress[is.na(us$depress)] <- us$e_hcond17[is.na(us$depress)]
  us$depress[is.na(us$depress)] <- us$d_hcond17[is.na(us$depress)]
  us$depress[is.na(us$depress)] <- us$c_hcond17[is.na(us$depress)]
  us$depress[is.na(us$depress)] <- us$b_hcondn17[is.na(us$depress)]
  us$depress[is.na(us$depress)] <- us$a_hcond17[is.na(us$depress)]

  us$depress <- factor(us$depress, levels = 0:1,
                       labels = c("depress_no", "depress_yes"),
                       ordered = FALSE)


  # IHD and stroke ====
  # Air pollution is associated with IHD and stroke (among others)
  # This ihd measure therefore includes IND and stroke
  # IHD includes angina and myocardial infarction (MI)
  us$ihd <- NA_integer_
  for (i in hconds) {
    us[[i]] <- as.integer(us[[i]])
    us$ihd[is.na(us$ihd) & !is.na(us[[i]]) &
             us[[i]] == 4 |  # coronary heart disease
             us[[i]] == 5 |  # angina
             us[[i]] == 6 |  # myocardial infarction
             us[[i]] == 7] <- 1   # stroke
  }
  us$ihd[is.na(us$ihd)] <- 0

  us$ihd <- factor(us$ihd, levels = 0:1,
                   labels = c("ihd_no", "ihd_yes"))

  us <- us %>%
    select(-f_hcond17, -e_hcond17, -d_hcond17, -c_hcond17, -b_hcondn17,
           -a_hcond17)

  us <- us %>%
    select(-f_hcondno1:-f_hcondno8) %>%
    select(-e_hcondno1:-e_hcondno7) %>%
    select(-d_hcondno1:-d_hcondno10) %>%
    select(-c_hcondno1:-c_hcondno8) %>%
    select(-b_hcondno1:-b_hcondno8)


  # Problems paying housing costs ====
  us$prob_pay <- us$f_xphsdb
  us$prob_pay[is.na(us$prob_pay)] <- us$e_xphsdb[is.na(us$prob_pay)]
  us$prob_pay[is.na(us$prob_pay)] <- us$d_xphsdb[is.na(us$prob_pay)]
  us$prob_pay[is.na(us$prob_pay)] <- us$c_xphsdb[is.na(us$prob_pay)]
  us$prob_pay[is.na(us$prob_pay)] <- us$b_xphsdb[is.na(us$prob_pay)]
  us$prob_pay[is.na(us$prob_pay)] <- us$a_xphsdb[is.na(us$prob_pay)]

  us <- us %>%
    select(-f_xphsdb, -e_xphsdb, -d_xphsdb, -c_xphsdb, -b_xphsdb, -a_xphsdb)

  us$prob_pay <- factor(us$prob_pay, levels = 1:2,
                        labels = c("prob_pay_yes", "prob_pay_no"))


  # Housing costs ====
  # household income is based on monthly, so standardise on monthly
  us$f_rent[us$f_rentwc == 1 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 1 & !is.na(us$f_rentwc)]  * 52 / 12
  us$f_rent[us$f_rentwc == 2 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 2 & !is.na(us$f_rentwc)]  * 52 / 2 / 12
  us$f_rent[us$f_rentwc == 3 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 3 & !is.na(us$f_rentwc)]  * 52 / 3 / 12
  us$f_rent[us$f_rentwc == 4 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 4 & !is.na(us$f_rentwc)]  * 52 / 4 / 12
  us$f_rent[us$f_rentwc == 7 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 7 & !is.na(us$f_rentwc)]  / 2
  us$f_rent[us$f_rentwc == 8 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 8 & !is.na(us$f_rentwc)]  * 8  / 12
  us$f_rent[us$f_rentwc == 9 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 9 & !is.na(us$f_rentwc)]  * 9  / 12
  us$f_rent[us$f_rentwc == 10 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 10 & !is.na(us$f_rentwc)] * 10 / 12
  us$f_rent[us$f_rentwc == 13 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 13 & !is.na(us$f_rentwc)] / 3
  us$f_rent[us$f_rentwc == 26 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 26 & !is.na(us$f_rentwc)] / 6
  us$f_rent[us$f_rentwc == 52 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 52 & !is.na(us$f_rentwc)] / 12
  us$f_rent[us$f_rentwc == 90 & !is.na(us$f_rentwc)] <-
    us$f_rent[us$f_rentwc == 90 & !is.na(us$f_rentwc)] * 52 / 12
  us$f_rent[us$f_rentwc == 95 & !is.na(us$f_rentwc)] <- NA  # one-off payment
  us$f_rent[us$f_rentwc == 96 & !is.na(us$f_rentwc)] <- NA  # none of these

  us$e_rent[us$e_rentwc == 1 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 1 & !is.na(us$e_rentwc)]  * 52 / 12
  us$e_rent[us$e_rentwc == 2 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 2 & !is.na(us$e_rentwc)]  * 52 / 2 / 12
  us$e_rent[us$e_rentwc == 3 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 3 & !is.na(us$e_rentwc)]  * 52 / 3 / 12
  us$e_rent[us$e_rentwc == 4 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 4 & !is.na(us$e_rentwc)]  * 52 / 4 / 12
  us$e_rent[us$e_rentwc == 7 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 7 & !is.na(us$e_rentwc)]  / 2
  us$e_rent[us$e_rentwc == 8 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 8 & !is.na(us$e_rentwc)]  * 8  / 12
  us$e_rent[us$e_rentwc == 9 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 9 & !is.na(us$e_rentwc)]  * 9  / 12
  us$e_rent[us$e_rentwc == 10 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 10 & !is.na(us$e_rentwc)] * 10 / 12
  us$e_rent[us$e_rentwc == 13 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 13 & !is.na(us$e_rentwc)] / 3
  us$e_rent[us$e_rentwc == 26 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 26 & !is.na(us$e_rentwc)] / 6
  us$e_rent[us$e_rentwc == 52 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 52 & !is.na(us$e_rentwc)] / 12
  us$e_rent[us$e_rentwc == 90 & !is.na(us$e_rentwc)] <-
    us$e_rent[us$e_rentwc == 90 & !is.na(us$e_rentwc)] * 52 / 12
  us$e_rent[us$e_rentwc == 95 & !is.na(us$e_rentwc)] <- NA  # one-off payment
  us$e_rent[us$e_rentwc == 96 & !is.na(us$e_rentwc)] <- NA  # none of these

  us$d_rent[us$d_rentwc == 1 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 1 & !is.na(us$d_rentwc)]  * 52 / 12
  us$d_rent[us$d_rentwc == 2 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 2 & !is.na(us$d_rentwc)]  * 52 / 2 / 12
  us$d_rent[us$d_rentwc == 3 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 3 & !is.na(us$d_rentwc)]  * 52 / 3 / 12
  us$d_rent[us$d_rentwc == 4 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 4 & !is.na(us$d_rentwc)]  * 52 / 4 / 12
  us$d_rent[us$d_rentwc == 7 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 7 & !is.na(us$d_rentwc)]  / 2
  us$d_rent[us$d_rentwc == 8 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 8 & !is.na(us$d_rentwc)]  * 8  / 12
  us$d_rent[us$d_rentwc == 9 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 9 & !is.na(us$d_rentwc)]  * 9  / 12
  us$d_rent[us$d_rentwc == 10 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 10 & !is.na(us$d_rentwc)] * 10 / 12
  us$d_rent[us$d_rentwc == 13 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 13 & !is.na(us$d_rentwc)] / 3
  us$d_rent[us$d_rentwc == 26 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 26 & !is.na(us$d_rentwc)] / 6
  us$d_rent[us$d_rentwc == 52 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 52 & !is.na(us$d_rentwc)] / 12
  us$d_rent[us$d_rentwc == 90 & !is.na(us$d_rentwc)] <-
    us$d_rent[us$d_rentwc == 90 & !is.na(us$d_rentwc)] * 52 / 12
  us$d_rent[us$d_rentwc == 95 & !is.na(us$d_rentwc)] <- NA  # one-off payment
  us$d_rent[us$d_rentwc == 96 & !is.na(us$d_rentwc)] <- NA  # none of these

  us$c_rent[us$c_rentwc == 1 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 1 & !is.na(us$c_rentwc)]  * 52 / 12
  us$c_rent[us$c_rentwc == 2 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 2 & !is.na(us$c_rentwc)]  * 52 / 2 / 12
  us$c_rent[us$c_rentwc == 3 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 3 & !is.na(us$c_rentwc)]  * 52 / 3 / 12
  us$c_rent[us$c_rentwc == 4 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 4 & !is.na(us$c_rentwc)]  * 52 / 4 / 12
  us$c_rent[us$c_rentwc == 7 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 7 & !is.na(us$c_rentwc)]  / 2
  us$c_rent[us$c_rentwc == 8 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 8 & !is.na(us$c_rentwc)]  * 8  / 12
  us$c_rent[us$c_rentwc == 9 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 9 & !is.na(us$c_rentwc)]  * 9  / 12
  us$c_rent[us$c_rentwc == 10 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 10 & !is.na(us$c_rentwc)] * 10 / 12
  us$c_rent[us$c_rentwc == 13 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 13 & !is.na(us$c_rentwc)] / 3
  us$c_rent[us$c_rentwc == 26 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 26 & !is.na(us$c_rentwc)] / 6
  us$c_rent[us$c_rentwc == 52 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 52 & !is.na(us$c_rentwc)] / 12
  us$c_rent[us$c_rentwc == 90 & !is.na(us$c_rentwc)] <-
    us$c_rent[us$c_rentwc == 90 & !is.na(us$c_rentwc)] * 52 / 12
  us$c_rent[us$c_rentwc == 95 & !is.na(us$c_rentwc)] <- NA  # one-off payment
  us$c_rent[us$c_rentwc == 96 & !is.na(us$c_rentwc)] <- NA  # none of these

  us$b_rent[us$b_rentwc == 1 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 1 & !is.na(us$b_rentwc)]  * 52 / 12
  us$b_rent[us$b_rentwc == 2 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 2 & !is.na(us$b_rentwc)]  * 52 / 2 / 12
  us$b_rent[us$b_rentwc == 3 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 3 & !is.na(us$b_rentwc)]  * 52 / 3 / 12
  us$b_rent[us$b_rentwc == 4 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 4 & !is.na(us$b_rentwc)]  * 52 / 4 / 12
  us$b_rent[us$b_rentwc == 7 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 7 & !is.na(us$b_rentwc)]  / 2
  us$b_rent[us$b_rentwc == 8 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 8 & !is.na(us$b_rentwc)]  * 8  / 12
  us$b_rent[us$b_rentwc == 9 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 9 & !is.na(us$b_rentwc)]  * 9  / 12
  us$b_rent[us$b_rentwc == 10 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 10 & !is.na(us$b_rentwc)] * 10 / 12
  us$b_rent[us$b_rentwc == 13 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 13 & !is.na(us$b_rentwc)] / 3
  us$b_rent[us$b_rentwc == 26 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 26 & !is.na(us$b_rentwc)] / 6
  us$b_rent[us$b_rentwc == 52 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 52 & !is.na(us$b_rentwc)] / 12
  us$b_rent[us$b_rentwc == 90 & !is.na(us$b_rentwc)] <-
    us$b_rent[us$b_rentwc == 90 & !is.na(us$b_rentwc)] * 52 / 12
  us$b_rent[us$b_rentwc == 95 & !is.na(us$b_rentwc)] <- NA  # one-off payment
  us$b_rent[us$b_rentwc == 96 & !is.na(us$b_rentwc)] <- NA  # none of these

  us$a_rent[us$a_rentwc == 1 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 1 & !is.na(us$a_rentwc)]  * 52 / 12
  us$a_rent[us$a_rentwc == 2 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 2 & !is.na(us$a_rentwc)]  * 52 / 2 / 12
  us$a_rent[us$a_rentwc == 3 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 3 & !is.na(us$a_rentwc)]  * 52 / 3 / 12
  us$a_rent[us$a_rentwc == 4 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 4 & !is.na(us$a_rentwc)]  * 52 / 4 / 12
  us$a_rent[us$a_rentwc == 7 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 7 & !is.na(us$a_rentwc)]  / 2
  us$a_rent[us$a_rentwc == 8 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 8 & !is.na(us$a_rentwc)]  * 8  / 12
  us$a_rent[us$a_rentwc == 9 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 9 & !is.na(us$a_rentwc)]  * 9  / 12
  us$a_rent[us$a_rentwc == 10 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 10 & !is.na(us$a_rentwc)] * 10 / 12
  us$a_rent[us$a_rentwc == 13 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 13 & !is.na(us$a_rentwc)] / 3
  us$a_rent[us$a_rentwc == 26 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 26 & !is.na(us$a_rentwc)] / 6
  us$a_rent[us$a_rentwc == 52 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 52 & !is.na(us$a_rentwc)] / 12
  us$a_rent[us$a_rentwc == 90 & !is.na(us$a_rentwc)] <-
    us$a_rent[us$a_rentwc == 90 & !is.na(us$a_rentwc)] * 52 / 12
  us$a_rent[us$a_rentwc == 95 & !is.na(us$a_rentwc)] <- NA  # one-off payment
  us$a_rent[us$a_rentwc == 96 & !is.na(us$a_rentwc)] <- NA  # none of these

  us <- us %>%
    select(-f_rentwc, -e_rentwc, -d_rentwc, -c_rentwc, -b_rentwc, -a_rentwc)

  us$hh_cost <- us$f_xpmg
  us$hh_cost[is.na(us$hh_cost)] <- us$f_rent[is.na(us$hh_cost)]

  us$hh_cost[is.na(us$hh_cost)] <- us$e_xpmg[is.na(us$hh_cost)]
  us$hh_cost[is.na(us$hh_cost)] <- us$e_rent[is.na(us$hh_cost)]

  us$hh_cost[is.na(us$hh_cost)] <- us$d_xpmg[is.na(us$hh_cost)]
  us$hh_cost[is.na(us$hh_cost)] <- us$d_rent[is.na(us$hh_cost)]

  us$hh_cost[is.na(us$hh_cost)] <- us$c_xpmg[is.na(us$hh_cost)]
  us$hh_cost[is.na(us$hh_cost)] <- us$c_rent[is.na(us$hh_cost)]

  us$hh_cost[is.na(us$hh_cost)] <- us$b_xpmg[is.na(us$hh_cost)]
  us$hh_cost[is.na(us$hh_cost)] <- us$b_rent[is.na(us$hh_cost)]

  us$hh_cost[is.na(us$hh_cost)] <- us$a_xpmg[is.na(us$hh_cost)]
  us$hh_cost[is.na(us$hh_cost)] <- us$a_rent[is.na(us$hh_cost)]

  # top code to £20,000
  us$hh_cost[us$hh_cost > 20000] <- 20000

  us <- us %>%
    select(-f_xpmg, -e_xpmg, -d_xpmg, -c_xpmg, -b_xpmg, -a_xpmg) %>%
    select(-f_rent, -e_rent, -d_rent, -c_rent, -b_rent, -a_rent)


  # Benefits ====
  us$uc_replace <- 0
  us$uc_replace[us$f_benbase1 == 1] <- 1  # income support
  us$uc_replace[us$f_benbase2 == 1] <- 1  # JSA
  us$uc_replace[us$f_othben5  == 1] <- 1  # working tax credit
  us$uc_replace[us$f_othben8  == 1] <- 1  # housing benefit
  us$uc_replace[us$f_benctc   == 1] <- 1  # housing benefit (2 == no)
  us$uc_replace[us$f_bendis2  == 1] <- 1  # ESA

  us$uc_replace[is.na(us$uc_replace) & us$e_benunemp1 == 1] <- 1  # JSA
  us$uc_replace[is.na(us$uc_replace) & us$e_btype2    == 1] <- 1  # income sup
  us$uc_replace[is.na(us$uc_replace) & us$e_btype3    == 1] <- 1  # ESA
  us$uc_replace[is.na(us$uc_replace) & us$e_btype6    == 1] <- 1  # tax cred

  us$uc_replace[is.na(us$uc_replace) & us$d_benunemp1 == 1] <- 1  # JSA
  us$uc_replace[is.na(us$uc_replace) & us$d_btype2    == 1] <- 1  # income sup
  us$uc_replace[is.na(us$uc_replace) & us$d_btype3    == 1] <- 1  # ESA
  us$uc_replace[is.na(us$uc_replace) & us$d_btype6    == 1] <- 1  # tax cred

  us$uc_replace[is.na(us$uc_replace) & us$c_benunemp1 == 1] <- 1  # JSA
  us$uc_replace[is.na(us$uc_replace) & us$c_btype2    == 1] <- 1  # income sup
  us$uc_replace[is.na(us$uc_replace) & us$c_btype3    == 1] <- 1  # ESA
  us$uc_replace[is.na(us$uc_replace) & us$c_btype6    == 1] <- 1  # tax cred

  us$uc_replace[is.na(us$uc_replace) & us$b_benunemp1 == 1] <- 1  # JSA
  us$uc_replace[is.na(us$uc_replace) & us$b_btype2    == 1] <- 1  # income sup
  us$uc_replace[is.na(us$uc_replace) & us$b_btype3    == 1] <- 1  # ESA
  us$uc_replace[is.na(us$uc_replace) & us$b_btype6    == 1] <- 1  # tax cred

  us$uc_replace[is.na(us$uc_replace) & us$a_benunemp1 == 1] <- 1  # JSA
  us$uc_replace[is.na(us$uc_replace) & us$a_btype2    == 1] <- 1  # income sup
  us$uc_replace[is.na(us$uc_replace) & us$a_btype3    == 1] <- 1  # ESA
  us$uc_replace[is.na(us$uc_replace) & us$a_btype6    == 1] <- 1  # tax cred

  us <- us %>%
    select(-f_benbase1, -f_benbase2, -f_othben5, -f_othben8, -f_benctc,
           -f_bendis2,
           -e_benunemp1, -e_btype2,  -e_btype3, -e_btype6,
           -d_benunemp1, -d_btype2,  -d_btype3, -d_btype6,
           -c_benunemp1, -c_btype2,  -c_btype3, -c_btype6,
           -b_benunemp1, -b_btype2,  -b_btype3, -b_btype6,
           -a_benunemp1, -a_btype2,  -a_btype3, -a_btype6)


  # Save data frame ====
  save(us, file = "data/us.RData", compress = "xz")
  rm(us)

}
philmikejones/thesis documentation built on May 31, 2019, 2:50 p.m.