apsclean: Annual Population Survey Data Wrangling

Documented in aps_clean_demographic

#' Clean demographic data
#'
#' Cleans the raw data for age, gender, ethnicity, region, and marital status.
#'
#' The columns `year`, `sex` and `govtof` are used unconditionally and must be
#' present in `data`. The columns `age`, `ethukeul`, `eth01`, `marstt` and
#' `marsta` are optional and are only processed when present.
#'
#' @param data Data table (or an object coercible with `as.data.table()`)
#'   holding the raw survey variables.
#'
#' @return A data.table containing the input columns plus the derived
#'   variables `age_cat`, `cohort`, `region`, `country` and, where the
#'   corresponding raw variables exist, `ethnicity_4cat`, `ethnicity_2cat` and
#'   `relationship_status`. `sex` is recoded to "Male"/"Female". The raw
#'   columns `govtof`, `ethukeul`, `eth01`, `marstt` and `marsta` are removed.
#'   When `age` is present, rows in the "90+" age band are dropped.
#' @export
aps_clean_demographic <- function(
  data
) {

  data <- as.data.table(data)

  # age

  # Lower bounds of each age band; label k applies to ages in
  # [age_breaks[k], age_breaks[k + 1]).
  age_breaks <- c(-10, 2, 5, 8, 11, 13, 16, 18, seq(20, 90, 5))
  age_labels <- c(
    "0-1", "2-4", "5-7", "8-10", "11-12", "13-15", "16-17", "18-19",
    "20-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54",
    "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85-89",
    "90+"
  )

  if ("age" %in% colnames(data)) {

    data[, age := as.double(age)]

    # Make agebands
    data[, age_cat := age_labels[findInterval(age, age_breaks)]]

    # Select ages up to 90 years. Rows with a missing age band are dropped
    # as well, because data.table treats NA in `i` as FALSE.
    data <- data[age_cat != "90+"]

  } else {

    # No age information: create both columns as missing so the remaining
    # steps (and downstream code) can rely on them existing. The 90+ filter
    # is skipped, since it would otherwise fail on the absent age band.
    data[, age := NA_real_]
    data[, age_cat := NA_character_]

  }

  # Calculate birth cohort
  data[, cohort := year - age]


  # gender

  # Codes 1 and 2 index into the label vector.
  data[, sex := c("Male", "Female")[sex]]

  # ethnicity

  if ("ethukeul" %in% colnames(data)) {
    data[ethukeul == 1, ethnicity_4cat := "white"]
    data[ethukeul == 2, ethnicity_4cat := "mixed"]
    data[ethukeul == 8, ethnicity_4cat := "black"]
    data[ethukeul %in% c(3, 4, 5, 6, 7, 9), ethnicity_4cat := "asian_other"]

    data[ethukeul == 1, ethnicity_2cat := "white"]
    data[ethukeul %in% seq(2, 9, 1), ethnicity_2cat := "non-white"]

    data <- subset(data, select = -c(ethukeul))
  }

  # eth01 uses a different coding; where both variables are present, matching
  # eth01 codes overwrite the values derived from ethukeul above.
  if ("eth01" %in% colnames(data)) {
    data[eth01 == 1, ethnicity_4cat := "white"]
    data[eth01 == 2, ethnicity_4cat := "mixed"]
    data[eth01 == 4, ethnicity_4cat := "black"]
    data[eth01 %in% c(3, 5, 6), ethnicity_4cat := "asian_other"]

    data[eth01 == 1, ethnicity_2cat := "white"]
    data[eth01 %in% c(2, 3, 4, 5, 6), ethnicity_2cat := "non-white"]

    data <- subset(data, select = -c(eth01))
  }

  # government office region and country variables

  # Labels are indexed by govtof code 1..13; any other code yields NA.
  # Codes 2 and 3 both map to "North West".
  # NOTE(review): presumably code 3 is a sub-area of the North West in the
  # raw coding - confirm against the survey documentation.
  region_labels <- c(
    "North East",
    "North West",
    "North West",
    "Yorkshire and Humber",
    "East Midlands",
    "West Midlands",
    "Eastern",
    "London",
    "South East",
    "South West",
    "Wales",
    "Scotland",
    "Northern Ireland"
  )
  country_labels <- c(
    rep("England", 10),
    "Wales",
    "Scotland",
    "Northern Ireland"
  )

  data[, region := region_labels[match(govtof, seq_along(region_labels))]]
  data[, country := country_labels[match(govtof, seq_along(country_labels))]]

  data <- subset(data, select = -c(govtof))

  # marital status

  if ("marstt" %in% colnames(data)) {
    # Recode from marstt itself (the column this branch checks for).
    data[marstt == 1, relationship_status := "single"]
    data[marstt == 2, relationship_status := "married"]
    data[marstt %in% c(3, 4, 5), relationship_status := "sep_div_wid"]

    data <- subset(data, select = -c(marstt))
  }

  # Where both variables are present, matching marsta codes overwrite the
  # values derived from marstt above.
  if ("marsta" %in% colnames(data)) {
    data[marsta == 1, relationship_status := "single"]
    data[marsta == 2, relationship_status := "married"]
    data[marsta %in% c(3, 4, 5), relationship_status := "sep_div_wid"]
    # NOTE(review): code 6 is grouped with "married" - presumably a civil
    # partnership category; confirm against the survey documentation.
    data[marsta == 6, relationship_status := "married"]

    data <- subset(data, select = -c(marsta))
  }

  return(data)

}