data-raw/2-simulate-pilot.R

# Pilot simulation: limiting long-term illness or disability


# Individual-level data ====
if (!file.exists("data/pilot_ind.RData")) {

  load("data/us.RData")

  pilot_ind <- us %>%
    select(pidp,
           sex, qual, eth, ten, car, age,
           llid) %>%
    arrange(pidp) %>%
    na.omit()

  save(pilot_ind, file = "data/pilot_ind.RData", compress = "xz")

}


# Prepare constraints ====
if (!file.exists("data/pilot_con.RData")) {

  # Age
  census_age <- readr::read_csv(
    paste0("https://www.nomisweb.co.uk/api/v01/",
           "dataset/NM_145_1.data.csv?date=latest&",
           "geography=1254133554...1254134501,",
           "1254264241...1254264270&rural_urban=",
           "0&cell=1...16&measures=20100")) %>%
    select(GEOGRAPHY_CODE, CELL_NAME, OBS_VALUE) %>%
    rename(code = GEOGRAPHY_CODE,
           age  = CELL_NAME,
           freq = OBS_VALUE) %>%
    filter(age != "Age 0 to 4") %>%  # remove <16 to match US
    filter(age != "Age 5 to 7") %>%
    filter(age != "Age 8 to 9") %>%
    filter(age != "Age 10 to 14") %>%
    filter(age != "Age 15") %>%
    spread(age, freq) %>%
    arrange(code)

  colnames(census_age) <- c("code", "age_16_17", "age_18_19", "age_20_24",
                            "age_25_29", "age_30_44", "age_45_59", "age_60_64",
                            "age_65_74", "age_75_84", "age_85_89",
                            "age_90_plus")

  rakeR::check_constraint(census_age, 978)


  # Car
  census_car <- readr::read_csv(
    paste0("https://www.nomisweb.co.uk/api/v01/dataset/",
           "NM_1065_1.data.csv?date=latest&geography=1254133554",
           "...1254134501,1254264241...1254264270&c_carsno=1...3&",
           "economic_activity=0&measures=20100")) %>%
    select(GEOGRAPHY_CODE, C_CARSNO_NAME, OBS_VALUE) %>%
    rename(code = GEOGRAPHY_CODE,
           cars = C_CARSNO_NAME,
           freq = OBS_VALUE) %>%
    spread(cars, freq) %>%
    arrange(code)

  colnames(census_car) <- c("code", "car_1", "car_2_plus", "car_0")

  stopifnot(
    all.equal(census_age$code, census_car$code)
  )

  # The number of respondents in car does not match age and they must
  #
  #   Find which zone(s) don't match
  #   Find the probabilities/distribution for these zones based on age
  #   Reassign the 'real' population based on these probabilities
  #   Probabilities will match, and correct number of people imputed

  # Get the total population based on the _age variable
  census_age <- census_age %>%
    arrange(code)
  census_car <- census_car %>%
    arrange(code)

  stopifnot(
    all(census_age$code == census_car$code)
  )

  original_length <- nrow(census_car)

  census_car$actual_pop  <- rowSums(census_age[, 2:ncol(census_age)])
  census_car$current_pop <- rowSums(census_car[, 2:ncol(census_car)])
  census_car$current_pop <- census_car$current_pop - census_car$actual_pop
  census_car$diff        <- census_car$actual_pop  - census_car$current_pop

  max_col <- (grep("actual_pop", colnames(census_car))) - 1

  change     <- census_car[census_car$diff != 0, ]
  census_car <- census_car[census_car$diff == 0, ]

  change[, 2:max_col] <- lapply(change[, 2:max_col], function(x) {

    round((x / change[["current_pop"]]) * change[["actual_pop"]])

  })

  change$current_pop <- rowSums(change[, 2:max_col])
  change$diff        <- change$actual_pop - change$current_pop

  census_car <- rbind(census_car, change[change$diff == 0, ])
  change     <- change[change$diff != 0, ]

  change_col <- which.max(lapply(change[change$diff > 0, 2:max_col], mean))

  # use change_col + 1 because change_col created without column 1 (zone)
  change[change$diff > 0, change_col + 1] <-
    change[change$diff > 0, change_col + 1] + 1
  change[change$diff < 0, change_col + 1] <-
    change[change$diff < 0, change_col + 1] - 1

  change$current_pop <- rowSums(change[, 2:max_col])
  change$diff        <- change$actual_pop - change$current_pop

  census_car <- rbind(census_car, change[change$diff == 0, ])

  stopifnot(
    all.equal(rowSums(census_car[, 2:4]), census_car$actual_pop),
    all.equal(nrow(census_car), original_length)
  )

  # Order doesn't match individual level data; re-order
  census_car <- census_car %>%
    select(code, car_0, car_1, car_2_plus) %>%
    arrange(code)

  stopifnot(
    all.equal(sum(census_car[, 2:ncol(census_car)]),
              sum(census_age[, 2:ncol(census_age)])  )
  )

  rakeR::check_constraint(census_car, 978)


  # Ethnicity
  census_eth <- readr::read_csv(
    paste0("https://www.nomisweb.co.uk/api/v01/",
           "dataset/NM_818_1.data.csv?date=latest&",
           "geography=1254133554...1254134501,",
           "1254264241...1254264270&c_age=0&",
           "c_ecopuk11=0&c_ethpuk11=2...8&measures=",
           "20100")) %>%
    select(GEOGRAPHY_CODE, C_ETHPUK11_NAME, OBS_VALUE) %>%
    rename(code         = GEOGRAPHY_CODE,
           ethnic_group = C_ETHPUK11_NAME,
           freq         = OBS_VALUE) %>%
    spread(ethnic_group, freq) %>%
    arrange(code)

  # Rename to match ind_eth names
  colnames(census_eth) <- c("code", "asian_asian_british",
                            "black_african_caribbean_british",
                            "mixed_multiple_ethnic", "other_ethnicity",
                            "british", "irish", "other_white")

  # reorder ethnicity vars to match us data
  census_eth <- select(census_eth, code, british, irish, other_white,
                       mixed_multiple_ethnic, asian_asian_british,
                       black_african_caribbean_british, other_ethnicity)

  stopifnot(
    all.equal(sum(census_eth[, 2:ncol(census_eth)]),
              sum(census_age[, 2:ncol(census_age)])  )
  )

  rakeR::check_constraint(census_eth, 978)


  # Highest qualification
  census_qual <- readr::read_csv(
    paste0("https://www.nomisweb.co.uk/api/v01/",
           "dataset/NM_554_1.data.csv?date=latest&geography=",
           "1254133554...1254134501,1254264241...1254264270&",
           "rural_urban=0&cell=1...7&measures=20100")) %>%
    select(GEOGRAPHY_CODE, CELL_NAME, OBS_VALUE) %>%
    rename(code = GEOGRAPHY_CODE,
           qual = CELL_NAME,
           freq = OBS_VALUE) %>%
    spread(qual, freq) %>%
    arrange(code)

  # Combine some qualifications to make compatible with survey
  # see notes in 0-prep-understanding-society.R
  colnames(census_qual) <- c("code", "apprenticeship", "level_1",
                             "level_2", "level_3", "level_4", "no_qual",
                             "other_qual")
  census_qual$level_2 <- census_qual$level_1 + census_qual$level_2
  census_qual$level_1 <- NULL

  census_qual$other_qual <- census_qual$other_qual + census_qual$apprenticeship
  census_qual$apprenticeship <- NULL

  colnames(census_qual) <- c("code", "qual_2", "qual_3", "qual_4_plus",
                             "qual_0", "qual_other")

  # reorder to match ind_hiq
  census_qual <- select(census_qual, code, qual_0, qual_2, qual_3, qual_4_plus,
                        qual_other)

  stopifnot(
    all.equal(sum(census_qual[, 2:ncol(census_qual)]),
              sum(census_age[, 2:ncol(census_age)])  )
  )

  rakeR::check_constraint(census_qual, 978)


  # Sex
  census_sex <- readr::read_csv(
    paste0("https://www.nomisweb.co.uk/api/v01/",
           "dataset/NM_1414_1.data.csv?date=latest&geography=",
           "1254133554...1254134501,1254264241...1254264270&c_sex=1,2&",
           "c_age=17...39&measures=20100&signature=NPK-0c73734c0f725c979cee3a:",
           "0x7e92136e4376f41731ee5d07a91447ff2668e0ac")) %>%
    select(GEOGRAPHY_CODE, C_SEX_NAME, C_AGE_NAME, OBS_VALUE) %>%
    rename(code = GEOGRAPHY_CODE,
           sex  = C_SEX_NAME,
           age  = C_AGE_NAME,
           freq = OBS_VALUE) %>%
    spread(age, freq) %>%
    arrange(code)

  census_sex$freq <- rowSums(census_sex[, 3:ncol(census_sex)])

  census_sex <- census_sex %>%
    select(code, sex, freq) %>%
    spread(sex, freq) %>%
    arrange(code)

  colnames(census_sex) <- c("code", "female", "male")
  census_sex <- census_sex[, c("code", "male", "female")]  # match us

  stopifnot(
    all.equal(sum(census_sex[, 2:ncol(census_sex)]),
              sum(census_age[, 2:ncol(census_age)])  )
  )

  rakeR::check_constraint(census_sex, 978)


  # Tenure
  census_ten <- readr::read_csv(paste0("https://www.nomisweb.co.uk/api/v01/",
                                       "dataset/NM_1404_1.data.csv?date=latest&geography=1254133554...1254134501,",
                                       "1254264241...1254264270&c_tenhuk11=2,3,5,6&c_age=1...4&c_health=0&",
                                       "measures=20100")) %>%
    select(GEOGRAPHY_CODE, C_TENHUK11_NAME, C_AGE_NAME, OBS_VALUE) %>%
    rename(code   = GEOGRAPHY_CODE,
           tenure = C_TENHUK11_NAME,
           age    = C_AGE_NAME,
           freq   = OBS_VALUE) %>%
    spread(tenure, freq) %>%
    filter(age != "Age 0 to 15") %>%
    select(-age) %>%
    rename(
      owned_outright          = `Owned: Owned outright`,
      owned_mortgage_shared   = `Owned: Owned with a mortgage or loan or shared ownership`,
      rented_private_rentfree = `Rented: Private rented or living rent free`,
      rented_social           = `Rented: Social rented`) %>%
    # Combine rented_private_rentfree + rented_social to match us
    mutate(rented = rented_private_rentfree + rented_social) %>%
    group_by(code) %>%
    select(-rented_private_rentfree, -rented_social) %>%
    summarise(owned_outright        = sum(owned_outright),
              owned_mortgage_shared = sum(owned_mortgage_shared),
              rented                = sum(rented)) %>%
    arrange(code)

  # Number of respondents in tenure does not match other vars (e.g. age).
  # Impute 'correct' numbers from proportion and age population.
  original_length <- nrow(census_ten)

  census_ten$actual_pop  <- rowSums(census_age[, 2:ncol(census_age)])
  census_ten$current_pop <- rowSums(census_ten[, 2:ncol(census_ten)])
  census_ten$current_pop <- census_ten$current_pop - census_ten$actual_pop
  census_ten$diff        <- census_ten$actual_pop  - census_ten$current_pop

  max_col <- (grep("actual_pop", colnames(census_ten))) - 1

  change     <- census_ten[census_ten$diff != 0, ]
  census_ten <- census_ten[census_ten$diff == 0, ]

  change[, 2:max_col] <- lapply(change[, 2:max_col], function(x) {

    round((x / change[["current_pop"]]) * change[["actual_pop"]])

  })

  change$current_pop <- rowSums(change[, 2:max_col])
  change$diff        <- change$actual_pop - change$current_pop

  census_ten <- rbind(census_ten, change[change$diff == 0, ])
  change     <- change[change$diff != 0, ]

  change_col <- which.max(lapply(change[change$diff > 0, 2:max_col], mean))

  # use change_col + 1 because change_col created without column 1 (zone)
  change[change$diff > 0, change_col + 1] <-
    change[change$diff > 0, change_col + 1] + 1
  change[change$diff < 0, change_col + 1] <-
    change[change$diff < 0, change_col + 1] - 1

  change$current_pop <- rowSums(change[, 2:max_col])
  change$diff        <- change$actual_pop - change$current_pop

  census_ten <- rbind(census_ten, change[change$diff == 0, ])

  stopifnot(
    all.equal(rowSums(census_ten[, 2:4]), census_ten$actual_pop),
    all.equal(nrow(census_ten), original_length)
  )

  # reorder to match ind_ten
  census_ten <- census_ten %>%
    select(code, owned_outright, owned_mortgage_shared, rented) %>%
    arrange(code)

  rakeR::check_constraint(census_ten, 978)


  # Create final constraint table
  stopifnot(
    census_age[["code"]] == census_car[["code"]],
    census_age[["code"]] == census_eth[["code"]],
    census_age[["code"]] == census_qual[["code"]],
    census_age[["code"]] == census_sex[["code"]],
    census_age[["code"]] == census_ten[["code"]]
  )

  pilot_con <- inner_join(census_age, census_car, by = "code") %>%
    inner_join(census_eth, by = "code") %>%
    inner_join(census_qual, by = "code") %>%
    inner_join(census_sex, by = "code") %>%
    inner_join(census_ten, by = "code") %>%
    arrange(code)

  stopifnot(
    all.equal(pilot_con$code, unique(pilot_con$code)),
    all.equal(sum(census_age[, 2:ncol(census_age)]),
              (sum(pilot_con[, 2:ncol(pilot_con)])) / 6)
    # 6 no. of constraints
  )

  pilot_con <- pilot_con %>%
    select(code,
           female, male,
           qual_0, qual_2, qual_3, qual_4_plus, qual_other,
           british, irish, other_white, mixed_multiple_ethnic,
           asian_asian_british, black_african_caribbean_british,
           other_ethnicity,
           owned_outright, owned_mortgage_shared, rented,
           car_0, car_1, car_2_plus,
           age_16_17, age_18_19, age_20_24, age_25_29, age_30_44, age_45_59,
           age_60_64, age_65_74, age_75_84, age_85_89, age_90_plus) %>%
    rename(
      sex_female                          = female,
      sex_male                            = male,
      eth_british                         = british,
      eth_irish                           = irish,
      eth_other_white                     = other_white,
      eth_mixed_multiple_ethnic           = mixed_multiple_ethnic,
      eth_asian_asian_british             = asian_asian_british,
      eth_black_african_caribbean_british = black_african_caribbean_british,
      eth_other_ethnicity                 = other_ethnicity,
      ten_owned_outright                  = owned_outright,
      ten_owned_mortgage_shared           = owned_mortgage_shared,
      ten_rented                          = rented)

  save(pilot_con, file = "data/pilot_con.RData", compress = "xz")

  rm(list = ls(pattern = "census_"),
     change, max_col, change_col,
     us)

}


# Weight OA! ====
# fractional weights
if (!file.exists("data/weights_oa.RData")) {

  # weights_oa is huge and can't be uploaded to github without LFS
  # still worth saving to cache it (added to gitignore and build ignore)

  load("data/pilot_con.RData")
  load("data/pilot_ind.RData")

  vars <- colnames(pilot_ind)[2:(ncol(pilot_ind) - 1)]
  weights_oa <- rakeR::weight(pilot_con, pilot_ind,
                              vars = vars, iterations = 15)
  weights_oa$rownames <- row.names(weights_oa)
  save(weights_oa, file = "data/weights_oa.RData", compress = "xz")

}

# extract weights
if (!file.exists("data/pilot_oa_ext.RData")) {

  load("data/weights_oa.RData")
  load("data/pilot_ind.RData")

  weights_oa <- as.data.frame(weights_oa)
  row.names(weights_oa) <- weights_oa$rownames
  weights_oa$rownames <- NULL

  pilot_oa_ext <- rakeR::extract(weights_oa, pilot_ind, "pidp")

  save(pilot_oa_ext, file = "data/pilot_oa_ext.RData", compress = "xz")

}

# integerise weights
if (!file.exists("data/pilot_oa_int.RData")) {

  # Have to use the raw weights so pilot_oa_ext is not suitable
  load("data/weights_oa.RData")
  weights_oa <- as.data.frame(weights_oa)
  row.names(weights_oa) <- weights_oa$rownames
  weights_oa$rownames <- NULL

  pilot_oa_int <- rakeR::integerise(weights_oa, pilot_ind)

  save(pilot_oa_int, file = "data/pilot_oa_int.RData", compress = "xz")

}


# LSOAs ====
if (!file.exists("data/pilot_con_lsoa.RData")) {

  load("data/pilot_con.RData")
  lup <- geolookup::oa_lsoa_msoa_lad

  pilot_con_lsoa <- dplyr::inner_join(pilot_con, lup[, 1:2],
                                      by = c("code" = "oa_cd"))

  stopifnot(
    nrow(pilot_con_lsoa) == 978
  )

  pilot_con_lsoa <- pilot_con_lsoa %>%
    select(-code) %>%
    group_by(lsoa_cd) %>%
    summarise_all(funs(sum)) %>%
    rename(code = lsoa_cd)

  stopifnot(
    all.equal(sum(pilot_con_lsoa[, grep("sex_", colnames(pilot_con_lsoa))]),
              sum(pilot_con[, grep("sex_", colnames(pilot_con))]))
  )

  save(pilot_con_lsoa, file = "data/pilot_con_lsoa.RData", compress = "xz")

}

if (!file.exists("data/weights_lsoa.RData")) {

  load("data/pilot_ind.RData")
  load("data/pilot_con_lsoa.RData")
  vars <- colnames(pilot_ind)[2:(ncol(pilot_ind) - 1)]

  weights_lsoa <- rakeR::weight(pilot_con_lsoa, pilot_ind,
                                vars = vars, iterations = 15)

  weights_lsoa$row_names  <- row.names(weights_lsoa)
  row.names(weights_lsoa) <- NULL

  save(weights_lsoa, file = "data/weights_lsoa.RData", compress = "xz")

}

if (!file.exists("data/pilot_lsoa_ext.RData")) {

  load("data/weights_lsoa.RData")
  load("data/pilot_ind.RData")
  weights_lsoa <- as.data.frame(weights_lsoa)

  row.names(weights_lsoa) <- weights_lsoa$row_names
  weights_lsoa$row_names  <- NULL

  pilot_lsoa_ext <- rakeR::extract(weights_lsoa, pilot_ind, "pidp")

  save(pilot_lsoa_ext, file = "data/pilot_lsoa_ext.RData", compress = "xz")

}


# MSOAs ====
if (!file.exists("data/pilot_con_msoa.RData")) {

  load("data/pilot_con.RData")
  lup <- geolookup::oa_lsoa_msoa_lad

  pilot_con_msoa <- dplyr::inner_join(pilot_con, lup[, c(1, 4)],
                                      by = c("code" = "oa_cd"))

  stopifnot(
    nrow(pilot_con_msoa) == 978
  )

  pilot_con_msoa <- pilot_con_msoa %>%
    select(-code) %>%
    group_by(msoa_cd) %>%
    summarise_all(funs(sum)) %>%
    rename(code = msoa_cd)

  stopifnot(
    all.equal(sum(pilot_con[, grep("sex_", colnames(pilot_con))]),
              sum(pilot_con_msoa[, grep("sex_", colnames(pilot_con_msoa))]))
  )

  save(pilot_con_msoa, file = "data/pilot_con_msoa.RData", compress = "xz")

}

if (!file.exists("data/weights_msoa.RData")) {

  load("data/pilot_con_msoa.RData")
  load("data/pilot_ind.RData")
  vars <- colnames(pilot_ind)[2:(ncol(pilot_ind) - 1)]

  weights_msoa <- rakeR::weight(pilot_con_msoa, pilot_ind,
                                vars = vars, iterations = 15)

  weights_msoa$row_names  <- row.names(weights_msoa)
  row.names(weights_msoa) <- NULL

  save(weights_msoa, file = "data/weights_msoa.RData", compress = "xz")

}

if (!file.exists("data/pilot_msoa_ext.RData")) {

  load("data/weights_msoa.RData")
  load("data/pilot_ind.RData")
  weights_msoa <- as.data.frame(weights_msoa)

  row.names(weights_msoa) <- weights_msoa$row_names
  weights_msoa$row_names  <- NULL

  pilot_msoa_ext <- rakeR::extract(weights_msoa, pilot_ind, "pidp")

  save(pilot_msoa_ext, file = "data/pilot_msoa_ext.RData", compress = "xz")

}
philmikejones/thesis documentation built on May 31, 2019, 2:50 p.m.