R/load_data.R

Defines functions load_data_flows load_data_pums_vacant load_data_pums load_data_estimates load_data_decennial load_data_acs format_variables_acs

format_variables_acs <- function(variables) {

  # find code in 'data-raw/no_moe_vars.R' to pull these vars from api
  no_moes <- c("B00001_001", "B00002_001", "B98001_001", "B98001_002", "B98002_001",
               "B98002_002", "B98002_003", "B98003_001", "B98011_001", "B98012_001",
               "B98012_002", "B98012_003", "B98013_001", "B98013_002", "B98013_003",
               "B98013_004", "B98013_005", "B98013_006", "B98013_007", "B98014_001",
               "B98021_001", "B98021_002", "B98021_003", "B98021_004", "B98021_005",
               "B98021_006", "B98021_007", "B98021_008", "B98021_009", "B98021_010",
               "B98022_001", "B98022_002", "B98022_003", "B98022_004", "B98022_005",
               "B98022_006", "B98022_007", "B98022_008", "B98022_009", "B98022_010",
               "B98031_001", "B98032_001", "B99011_001", "B99011_002", "B99011_003",
               "B99012_001", "B99012_002", "B99012_003", "B99021_001", "B99021_002",
               "B99021_003", "B99031_001", "B99031_002", "B99031_003", "B99051_001",
               "B99051_002", "B99051_003", "B99051_004", "B99051_005", "B99051_006",
               "B99051_007", "B99052_001", "B99052_002", "B99052_003", "B99052_004",
               "B99052_005", "B99052_006", "B99052_007", "B99052PR_001", "B99052PR_002",
               "B99052PR_003", "B99052PR_004", "B99052PR_005", "B99052PR_006",
               "B99052PR_007", "B99053_001", "B99053_002", "B99053_003", "B99061_001",
               "B99061_002", "B99061_003", "B99071_001", "B99071_002", "B99071_003",
               "B99072_001", "B99072_002", "B99072_003", "B99072_004", "B99072_005",
               "B99072_006", "B99072_007", "B99080_001", "B99080_002", "B99080_003",
               "B99081_001", "B99081_002", "B99081_003", "B99081_004", "B99081_005",
               "B99082_001", "B99082_002", "B99082_003", "B99082_004", "B99082_005",
               "B99083_001", "B99083_002", "B99083_003", "B99083_004", "B99083_005",
               "B99084_001", "B99084_002", "B99084_003", "B99084_004", "B99084_005",
               "B99085_001", "B99085_002", "B99085_003", "B99086_001", "B99086_002",
               "B99086_003", "B99087_001", "B99087_002", "B99087_003", "B99087_004",
               "B99087_005", "B99088_001", "B99088_002", "B99088_003", "B99088_004",
               "B99088_005", "B99089_001", "B99089_002", "B99089_003", "B99092_001",
               "B99092_002", "B99092_003", "B99102_001", "B99102_002", "B99102_003",
               "B99103_001", "B99103_002", "B99103_003", "B99104_001", "B99104_002",
               "B99104_003", "B99104_004", "B99104_005", "B99104_006", "B99104_007",
               "B99121_001", "B99121_002", "B99121_003", "B99122_001", "B99122_002",
               "B99122_003", "B99123_001", "B99123_002", "B99123_003", "B99124_001",
               "B99124_002", "B99124_003", "B99125_001", "B99125_002", "B99125_003",
               "B99126_001", "B99126_002", "B99126_003", "B99131_001", "B99131_002",
               "B99131_003", "B99132_001", "B99132_002", "B99132_003", "B99141_001",
               "B99141_002", "B99141_003", "B99142_001", "B99142_002", "B99142_003",
               "B99151_001", "B99151_002", "B99151_003", "B99152_001", "B99152_002",
               "B99152_003", "B99161_001", "B99161_002", "B99161_003", "B99162_001",
               "B99162_002", "B99162_003", "B99162_004", "B99162_005", "B99162_006",
               "B99162_007", "B99163_001", "B99163_002", "B99163_003", "B99163_004",
               "B99163_005", "B99171_001", "B99171_002", "B99171_003", "B99171_004",
               "B99171_005", "B99171_006", "B99171_007", "B99171_008", "B99171_009",
               "B99171_010", "B99171_011", "B99171_012", "B99171_013", "B99171_014",
               "B99171_015", "B99172_001", "B99172_002", "B99172_003", "B99172_004",
               "B99172_005", "B99172_006", "B99172_007", "B99172_008", "B99172_009",
               "B99172_010", "B99172_011", "B99172_012", "B99172_013", "B99172_014",
               "B99172_015", "B99181_001", "B99181_002", "B99181_003", "B99182_001",
               "B99182_002", "B99182_003", "B99183_001", "B99183_002", "B99183_003",
               "B99184_001", "B99184_002", "B99184_003", "B99185_001", "B99185_002",
               "B99185_003", "B99186_001", "B99186_002", "B99186_003", "B99187_001",
               "B99187_002", "B99187_003", "B99187_004", "B99187_005", "B99187_006",
               "B99187_007", "B99191_001", "B99191_002", "B99191_003", "B99191_004",
               "B99191_005", "B99191_006", "B99191_007", "B99191_008", "B99192_001",
               "B99192_002", "B99192_003", "B99192_004", "B99192_005", "B99192_006",
               "B99192_007", "B99192_008", "B99193_001", "B99193_002", "B99193_003",
               "B99193_004", "B99193_005", "B99193_006", "B99193_007", "B99193_008",
               "B99194_001", "B99194_002", "B99194_003", "B99194_004", "B99194_005",
               "B99194_006", "B99194_007", "B99194_008", "B99201_001", "B99201_002",
               "B99201_003", "B99201_004", "B99201_005", "B99201_006", "B99201_007",
               "B99201_008", "B99211_001", "B99211_002", "B99211_003", "B99212_001",
               "B99212_002", "B99212_003", "B99221_001", "B99221_002", "B99221_003",
               "B99231_001", "B99231_002", "B99231_003", "B99232_001", "B99232_002",
               "B99232_003", "B99233_001", "B99233_002", "B99233_003", "B99233_004",
               "B99233_005", "B99234_001", "B99234_002", "B99234_003", "B99234_004",
               "B99234_005", "B99241_001", "B99241_002", "B99241_003", "B99242_001",
               "B99242_002", "B99242_003", "B99243_001", "B99243_002", "B99243_003",
               "B99244_001", "B99244_002", "B99244_003", "B99245_001", "B99245_002",
               "B99245_003", "B99246_001", "B99246_002", "B99246_003", "B992510_001",
               "B992510_002", "B992510_003", "B992511_001", "B992511_002", "B992511_003",
               "B992512_001", "B992512_002", "B992512_003", "B992513_001", "B992513_002",
               "B992513_003", "B992514_001", "B992514_002", "B992514_003", "B992515_001",
               "B992515_002", "B992515_003", "B992516_001", "B992516_002", "B992516_003",
               "B992518_001", "B992518_002", "B992518_003", "B992519_001", "B992519_002",
               "B992519_003", "B99252_001", "B99252_002", "B99252_003", "B992520_001",
               "B992520_002", "B992520_003", "B992521_001", "B992521_002", "B992521_003",
               "B992522_001", "B992522_002", "B992522_003", "B992522_004", "B992522_005",
               "B992522_006", "B992522_007", "B992523_001", "B992523_002", "B992523_003",
               "B99253_001", "B99253_002", "B99253_003", "B99254_001", "B99254_002",
               "B99254_003", "B99255_001", "B99255_002", "B99255_003", "B99256_001",
               "B99256_002", "B99256_003", "B99257_001", "B99257_002", "B99257_003",
               "B99258_001", "B99258_002", "B99258_003", "B99259_001", "B99259_002",
               "B99259_003", "B992701_001", "B992701_002", "B992701_003", "B992702_001",
               "B992702_002", "B992702_003", "B992703_001", "B992703_002", "B992703_003",
               "B992704_001", "B992704_002", "B992704_003", "B992705_001", "B992705_002",
               "B992705_003", "B992706_001", "B992706_002", "B992706_003", "B992707_001",
               "B992707_002", "B992707_003", "B992708_001", "B992708_002", "B992708_003",
               "B992709_001", "B992709_002", "B992709_003", "B99281_001", "B99281_002",
               "B99281_003", "B99282_001", "B99282_002", "B99282_003", "B99282_004",
               "B99282_005", "B99282_006", "B99282_007", "B99282_008", "B99282_009",
               "B99283_001", "B99283_002", "B99283_003", "B99283_004", "B99283_005")

  # First, remove E or M if user has put it in
  variables1 <- map_chr(variables, function(x) {
    if (str_sub(x, -1) %in% c("E", "M")) {
      x <- str_sub(x, 1, -2)
    } else {
      x <- x
    }
  })

  # Now, make unique
  variables2 <- unique(variables1)

  # The comparison profile doesn't have MOEs, so account for that
  if (!any(grepl("^CP[0-9].", variables2))) {

    # Next, separate into vars with and without MOEs
    variables2a <- variables2[!variables2 %in% no_moes]

    variables2_nomoe <- variables2[variables2 %in% no_moes]

    # Now, expand with both E and M if MOE is applicable
    variables3 <- map_chr(variables2a, function(y) paste0(y, c("E", "M"), collapse = ","))

    if (length(variables2_nomoe)) {
      variables3_nomoe <- paste0(variables2_nomoe, "E")

      variables3 <- c(variables3, variables3_nomoe)
    }
  } else {
    variables3 <- paste0(variables2, "E")
  }

  # Now, put together all these strings if need be
  var <- paste0(variables3, collapse = ",")

  return(var)

}

load_data_acs <- function(geography, formatted_variables, key, year, state = NULL,
                          county = NULL, zcta = NULL, survey, show_call = FALSE) {

  base <- paste("https://api.census.gov/data",
                  as.character(year), "acs",
                  survey, sep = "/")

  if (grepl("^DP", formatted_variables)) {
    message("Using the ACS Data Profile")
    base <- paste0(base, "/profile")
  }

  if (grepl("^S[0-9].", formatted_variables)) {
    message("Using the ACS Subject Tables")
    base <- paste0(base, "/subject")
  }

  if (grepl("^CP[0-9].", formatted_variables)) {
    message("Using the ACS Comparison Profile")
    base <- paste0(base, "/cprofile")
  }

  for_area <- paste0(geography, ":*")

  if (!is.null(zcta))  {

    # if (is.null(state)) {
    #   stop("The `zcta` argument requires specifying a state.", call. = FALSE)
    # }

    if (!is.null(county)) {
      stop("ZCTAs do not nest within counties, so `county` should not be specified.",
           call. = FALSE)
    }

    if (!is.null(state)) {
      state <- map_chr(state, function(x) {
        validate_state(x)
      })

      in_area <- paste0("state:", state)

    } else {
      in_area <- NULL
    }

    for_area <- paste0(zcta, collapse = ",")

    vars_to_get <- paste0(formatted_variables, ",NAME")

    call <- GET(base, query = list(get = vars_to_get,
                                   "for" = paste0(geography, ":", for_area),
                                   "in" = in_area,
                                   key = key))

  } else if (!is.null(state) && is.null(zcta)) {

    state <- map_chr(state, function(x) {
      validate_state(x)
    })

    if (length(state) > 1) {
      state <- paste0(state, collapse = ",")
    }

    if (geography == "state") {
      for_area <- paste0("state:", state)
    }

    if (!is.null(county)) {

      county <- map_chr(county, function(x) {
        validate_county(state, x)
      })

      if (length(county) > 1) {
        county <- paste0(county, collapse = ",")
      }

      if (geography == "county") {

        for_area <- paste0("county:", county)
        in_area <- paste0("state:", state)

      } else {

        in_area <- paste0("state:", state,
                          "+county:", county)

      }

    } else {

      if (geography == "block group" && is.null(county)) {
          in_area <- paste0("state:", state,
                            "&in=county:*")
      } else {
        in_area <- paste0("state:", state)
      }

    }

    vars_to_get <- paste0(formatted_variables, ",NAME")

    if (geography == "state" && !is.null(state)) {

      call <- GET(base, query = list(get = vars_to_get,
                                     "for" = for_area,
                                     key = key))
    } else {

      call <- GET(base, query = list(get = vars_to_get,
                                     "for" = for_area,
                                     "in" = in_area,
                                     key = key))
    }


  } else {

    vars_to_get <- paste0(formatted_variables, ",NAME")

    call <- GET(base, query = list(get = vars_to_get,
                                   "for" = paste0(geography, ":*"),
                                   key = key))
  }

  if (show_call) {
    call_url <- gsub("&key.*", "", call$url)
    message(paste("Census API call:", call_url))
  }

  # Make sure call status returns 200, else, print the error message for the user.
  if (call$status_code != 200) {
    msg <- content(call, as = "text")

    if (grepl("The requested resource is not available", msg)) {
      stop("One or more of your requested variables is likely not available at the requested geography.  Please refine your selection.", call. = FALSE)
    } else {
      stop(sprintf("Your API call has errors.  The API message returned is %s.", msg), call. = FALSE)
    }

  }

  # callStatus <- http_status(call)
  # if (callStatus$reason != "OK") {
  #   message(callStatus$category, " ", callStatus$reason, " ", callStatus$message)
  # } else {
  #   content <- content(call, as = "text")
  # }
  #
  # validate_call(content = content, geography = geography, year = year,
  #               dataset = survey)

  content <- content(call, as = "text")

  if (grepl("You included a key with this request", content)) {
    stop("You have supplied an invalid or inactive API key. To obtain a valid API key, visit https://api.census.gov/data/key_signup.html. To activate your key, be sure to click the link provided to you in the email from the Census Bureau that contained your key.", call. = FALSE)
  }

  dat <- fromJSON(content)

  colnames(dat) <- dat[1,]

  dat <- as_tibble(dat)

  dat <- dat[-1,]

  var_vector <- unlist(strsplit(formatted_variables, split = ","))

  dat[var_vector] <- lapply(dat[var_vector], as.numeric)

  v2 <- c(var_vector, "NAME")

  # Get the geography ID variables
  id_vars <- names(dat)[! names(dat) %in% v2]

  # Paste into a GEOID column
  dat$GEOID <- do.call(paste0, dat[id_vars])

  # Now, remove them
  dat <- dat[, !(names(dat) %in% id_vars)]

  return(dat)

}


load_data_decennial <- function(geography, variables, key, year, sumfile, pop_group,
                                state = NULL, county = NULL, show_call = FALSE) {


  var <- paste0(variables, collapse = ",")

  if (year == 1990) {
    vars_to_get <- paste0(var, ",ANPSADPI")
  } else {
    vars_to_get <- paste0(var, ",NAME")
  }

  if (!is.null(pop_group)) {
    if (pop_group == "all") {
      vars_to_get <- paste0(vars_to_get, ",POPGROUP")
      pop_group <- NULL
    }

  }

  base <- paste0("https://api.census.gov/data/",
                                  year,
                                  "/dec/",
                                  sumfile)

  # if (year == 2010) {
  #   base <- paste0("https://api.census.gov/data/",
  #                  year,
  #                  "/dec/",
  #                  sumfile)
  # } else {
  #   base <- paste0("https://api.census.gov/data/",
  #                  year,
  #                  "/",
  #                  sumfile)
  # }




  for_area <- paste0(geography, ":*")

  if (!is.null(state)) {

    state <- map_chr(state, function(x) {
      validate_state(x)
    })

    if (length(state) > 1) {
      state <- paste0(state, collapse = ",")
    }

    if (geography == "state") {
      for_area <- paste0("state:", state)
    }

    if (!is.null(county)) {

      county <- map_chr(county, function(x) {
        validate_county(state, x)
      })

      if (length(county) > 1) {
        county <- paste0(county, collapse = ",")
      }

      if (geography == "county") {

        for_area <- paste0("county:", county)
        in_area <- paste0("state:", state)

      } else {

        in_area <- paste0("state:", state,
                          "+county:", county)

      }

    } else {

      if ((geography == "block group" || geography == "block") && is.null(county)) {
        in_area <- paste0("state:", state,
                          "&in=county:*")
      } else {
        in_area <- paste0("state:", state)
      }

    }

    if (geography == "state" && !is.null(state)) {

      call <- GET(base, query = list(get = vars_to_get,
                                     "for" = for_area,
                                     key = key,
                                     "POPGROUP" = pop_group))
    } else {

      call <- GET(base, query = list(get = vars_to_get,
                                     "for" = for_area,
                                     "in" = in_area,
                                     key = key,
                                     "POPGROUP" = pop_group))
    }
  }

  else {

    call <- GET(base, query = list(get = vars_to_get,
                                   "for" = paste0(geography, ":*"),
                                   key = key,
                                   "POPGROUP" = pop_group))
  }

  if (show_call) {
    call_url <- gsub("&key.*", "", call$url)
    message(paste("Census API call:", call_url))
  }

  # Make sure call status returns 200, else, print the error message for the user.
  # Try to handle 204's here
  if (call$status_code == 204) {

    if (sumfile == "ddhca") {
      rlang::abort(c("Your DDHC-A request returned No Content from the API.",
                     "i" = "The DDHC-A file uses an 'adaptive design' where data availability varies by geography and by population group.",
                     "i" = "Read Section 3-1 at https://www2.census.gov/programs-surveys/decennial/2020/technical-documentation/complete-tech-docs/detailed-demographic-and-housing-characteristics-file-a/2020census-detailed-dhc-a-techdoc.pdf for more information.",
                     "i" = "In tidycensus, use the function `check_ddhca_groups()` to see if your data is available."))

    } else {
      rlang::abort("No content was returned from the API.  Please refine your selection.")

    }

  }

  if (call$status_code != 200) {
    msg <- content(call, as = "text")

    if (grepl("The requested resource is not available", msg)) {
      stop("One or more of your requested variables is likely not available at the requested geography.  Please refine your selection.", call. = FALSE)
    } else if (grepl("unknown variable", msg) && length(variables) > 1 && year < 2010) {
      stop(sprintf("The Census API has returned the error message %s.\nThis may be due to mixing SF1 and SF3 variables. If so, separate your requests to SF1 and SF3 when using `get_decennial()` by using the `sumfile` argument in separate calls.", msg))
    } else {
      stop(sprintf("Your API call has errors.  The API message returned is %s.", msg), call. = FALSE)
    }

  }


  # callStatus <- http_status(call)
  # if (callStatus$reason != "OK") {
  #   if (sumfile == "sf1") {
  #     message("Checking SF3 API for data...")
  #   } else {
  #     message(callStatus$category, " ", callStatus$reason, " ", callStatus$message)
  #   }
  # } else {
  #   content <- content(call, as = "text")
  # }

  content <- content(call, as = "text")

  if (grepl("You included a key with this request", content)) {
    stop("You have supplied an invalid or inactive API key. To obtain a valid API key, visit https://api.census.gov/data/key_signup.html. To activate your key, be sure to click the link provided to you in the email from the Census Bureau that contained your key.", call. = FALSE)
  }

  # Fix issue in SF3 2000 API - https://github.com/walkerke/tidycensus/issues/22
  if (year == 2000 && sumfile == "sf3") {
    content <- str_replace(content, 'O"Brien', "O'Brien")
    content <- str_replace(content, 'Prince George"s', "Prince George's")
    content <- str_replace(content, 'Queen Anne"s', "Queen Anne's")
    content <- str_replace(content, 'St. Mary"s', "St. Mary's")

  }

  dat <- fromJSON(content)

  colnames(dat) <- dat[1,]

  dat <- as_tibble(dat)

  dat <- dat[-1,]

  if (year == 1990) {
    dat <- rename(dat, NAME = ANPSADPI)
  }

  vnum <- variables[variables != "POPGROUP"]

  dat[vnum] <- lapply(dat[vnum], as.numeric)

  v2 <- c(variables, "NAME", "POPGROUP")

  # Get the geography ID variables
  id_vars <- names(dat)[! names(dat) %in% v2]

  # Paste into a GEOID column
  # Apply tract band-aid (GH issue #317)
  if ("tract" %in% id_vars && year == 2000) {
    dat$tract <- stringr::str_pad(dat$tract, 6, "right", "0")
  }

  dat$GEOID <- do.call(paste0, dat[id_vars])


  # Fix issues with ZCTAs if necessary
  if (geography == "zip code tabulation area") {
    l <- unique(nchar(dat$GEOID))
    if (l == 7) {
      dat$GEOID <- str_sub(dat$GEOID, 3, 7)
    }
  }

  # Now, remove them
  dat <- dat[, !(names(dat) %in% id_vars)]

  return(dat)

}


load_data_estimates <- function(geography, product = NULL, variables = NULL, key, year,
                                time_series, state = NULL, county = NULL, show_call = FALSE) {

  if (!is.null(product)) {
    if (!product %in% c("population", "components",
                        "charagegroups", "housing")) {
      stop("You have selected an invalid product.  Valid requests are 'population', 'components', 'characteristics', and 'housing'.", call. = FALSE)
    }
  }

  for_area <- paste0(geography, ":*")

  if (year >= 2019) {
    geo_name <- "NAME"
  } else {
    geo_name <- "GEONAME"
  }

  if (is.null(variables)) {
    if (is.null(product)) {
      stop("Either a product or a vector of variables (from a single product) is required.", call. = FALSE)
    } else {
      if (product == "population") {
        vars_to_get <- paste0(geo_name, ",POP,DENSITY")
      } else if (product == "components") {
        vars_to_get <- paste0(geo_name, ",BIRTHS,DEATHS,DOMESTICMIG,INTERNATIONALMIG,NATURALINC,NETMIG,RBIRTH,RDEATH,RDOMESTICMIG,RINTERNATIONALMIG,RNATURALINC,RNETMIG")
      } else if (product == "housing") {
        vars_to_get <- paste0(geo_name, ",HUEST")
      } else {
        stop("Other products are not yet supported.", call. = FALSE)
      }
    }
  } else {
    if (!is.null(product)) {
      if (product != "charagegroups") {
        stop("Please specify either a product or a vector of variables, but not both.", call. = FALSE)
      }
    } else {
      if (all(variables %in% c("POP", "DENSITY"))) {
        product <- "population"
      } else if (all(variables %in% c("BIRTHS", "DEATHS", "DOMESTICMIG", "INTERNATIONALMIG", "NATURALINC", "NETMIG", "RBIRTH", "RDEATH", "RDOMESTICMIG", "RINTERNATIONALMIG", "RNATURALINC", "RNETMIG"))) {
        product <- "components"
      } else if (all(variables %in% c("HUEST"))) {
        product <- "housing"
      } else {
        stop(
          paste('Variables must be a subset of only one of the following sets of valid variables:',
                'for population estimates, c("POP", "DENSITY");',
                'for housing unit estimates, c("HUEST");',
                'and for components of change estimates, c("BIRTHS", "DEATHS", "DOMESTICMIG", "INTERNATIONALMIG", "NATURALINC", "NETMIG", "RBIRTH", "RDEATH", "RDOMESTICMIG", "RINTERNATIONALMIG", "RNATURALINC", "RNETMIG").'),
          call. = FALSE)
      }
    }
    vars_to_get <- paste0(geo_name, ",", paste0(variables, collapse = ","))
  }

  if (time_series) {
    if (product == "components") {
      if (year >= 2018) {
        vars_to_get <- paste0(vars_to_get, ",PERIOD_CODE")
      } else {
        vars_to_get <- paste0(vars_to_get, ",PERIOD")
      }
    } else {
      if (year >= 2018) {
        vars_to_get <- paste0(vars_to_get, ",DATE_CODE")
      } else {
        vars_to_get <- paste0(vars_to_get, ",DATE_")
      }
    }
  }

  base <- sprintf("https://api.census.gov/data/%s/pep/%s", year, product)

  if (!is.null(state)) {

    state <- map_chr(state, function(x) {
      validate_state(x)
    })

    if (length(state) > 1) {
      state <- paste0(state, collapse = ",")
    }

    if (geography == "state") {
      for_area <- paste0("state:", state)
    }

    if (!is.null(county)) {

      county <- map_chr(county, function(x) {
        validate_county(state, x)
      })

      if (length(county) > 1) {
        county <- paste0(county, collapse = ",")
      }

      if (geography == "county") {

        for_area <- paste0("county:", county)
        in_area <- paste0("state:", state)

      } else {

        in_area <- paste0("state:", state,
                          "+county:", county)
      }

    } else {
      in_area <- paste0("state:", state)
    }




    if (geography == "state" && !is.null(state)) {

      call <- GET(base, query = list(get = vars_to_get,
                                     "for" = for_area,
                                     key = key))
    } else {

      call <- GET(base, query = list(get = vars_to_get,
                                     "for" = for_area,
                                     "in" = in_area,
                                     key = key))
    }


  }

  else {

    call <- GET(base, query = list(get = vars_to_get,
                                   "for" = paste0(geography, ":*"),
                                   key = key))
  }

  if (show_call) {
    call_url <- gsub("&key.*", "", call$url)
    message(paste("Census API call:", call_url))
  }

  # Make sure call status returns 200, else, print the error message for the user.
  if (call$status_code != 200) {
    msg <- content(call, as = "text")

    if (grepl("The requested resource is not available", msg)) {
      stop("One or more of your requested variables is likely not available at the requested geography.  Please refine your selection.", call. = FALSE)
    } else {
      stop(sprintf("Your API call has errors.  The API message returned is %s.", msg), call. = FALSE)
    }

  }


  content <- content(call, as = "text")

  if (grepl("You included a key with this request", content)) {
    stop("You have supplied an invalid or inactive API key. To obtain a valid API key, visit https://api.census.gov/data/key_signup.html. To activate your key, be sure to click the link provided to you in the email from the Census Bureau that contained your key.", call. = FALSE)
  }

  dat <- fromJSON(content)

  colnames(dat) <- dat[1,]

  dat <- as_tibble(dat)

  dat <- dat[-1,]

  var_vector <- unlist(strsplit(vars_to_get, split = ","))

  var_vector <- var_vector[var_vector != geo_name]

  dat[var_vector] <- lapply(dat[var_vector], as.numeric)

  v2 <- c(var_vector, geo_name)

  if ("county" %in% names(dat)) {
    dat$county <- stringr::str_pad(dat$county, 3, "left", "0")
  }

  # Get the geography ID variables
  id_vars <- names(dat)[! names(dat) %in% v2]

  # Paste into a GEOID column
  dat$GEOID <- do.call(paste0, dat[id_vars])

  # Now, remove them
  dat <- dat[, !(names(dat) %in% id_vars)]

  return(dat)
}

load_data_pums <- function(variables, state, puma, key, year, survey,
                           variables_filter, recode, show_call) {

  # for which years is data dictionary available in pums_variables?
  # we'll use this a couple times later on
  recode_years <- 2017:2022

  base <- sprintf("https://api.census.gov/data/%s/acs/%s/pums",
                  year, survey)


  if (!is.null(puma)) {

    if (length(state) > 1) {
      stop('When requesting PUMAs for more than one state, you must set state to "multiple" and set puma to a named vector of state/PUMA pairs.', call. = FALSE)
    }

    # pumas in multiple states can be requested with a named vector of
    # state / puma pairs in the puma argument of get_pums()
    if (state == "multiple") {

      # print FIPS code of states used just once
      purrr::walk(unique(names(puma)), validate_state)

      geo <- purrr::map2_chr(names(puma), unname(puma), function(x, y) {
        paste0("7950000US", suppressMessages(validate_state(x)), y)
      })

      geo <- paste0(geo, collapse = ",")

    } else {
      # if PUMAs requested are in one state only
      state <- validate_state(state)
      geo <- purrr::map_chr(puma, function(x) {
        paste0("7950000US", state, x)
      })

      if (length(puma) > 1) {
        geo <- paste0(geo, collapse = ",")
      }
    }
  } else {
    # if no PUMAs specified, get all PUMAs in each state requested
    if (!is.null(state)) {
      geo <- purrr::map_chr(state, function(x) {
        paste0("0400000US", validate_state(x))
      })

    } else {
      geo <- NULL
    }

    if (length(state) > 1) {
      geo <- paste0(geo, collapse = ",")

    }
  }

  # Handle variables & variable filter
  # If a variables filter is supplied, those variables should be removed from
  # variables (if applicable)
  if (!is.null(variables_filter)) {
    filter_names <- names(variables_filter)

    variables <- variables[!variables %in% filter_names]

    if (length(variables) > 0) {
      var <- paste0(variables, collapse = ",")

      # Handle housing-only query
      if ("SPORDER" %in% filter_names) {
        vars_to_get <- paste0("SERIALNO,WGTP,PWGTP,", var)
      } else {
        vars_to_get <- paste0("SERIALNO,SPORDER,WGTP,PWGTP,", var)
      }
    } else {
      if ("SPORDER" %in% filter_names) {
        vars_to_get <- "SERIALNO,WGTP,PWGTP"
      } else {
        vars_to_get <- "SERIALNO,SPORDER,WGTP,PWGTP"
      }
    }

    # If geo is NULL, state should be added back in here
    if (is.null(geo)) {
      vars_to_get <- paste0(vars_to_get, ",ST")
    }

    # Combine the default query with the variables filter query
    query_default <- list(get = vars_to_get,
                          ucgid = geo,
                          key = key)

    # Collapse vectors if supplied
    variables_filter_collapsed <- purrr::map(variables_filter,
                                             ~{paste0(.x, collapse = ",")})

    query <- c(
      list(get = vars_to_get),
      variables_filter_collapsed,
      list(ucgid = geo, key = key)
    )

  } else {

    var <- paste0(variables, collapse = ",")

    vars_to_get <- paste0("SERIALNO,SPORDER,WGTP,PWGTP,", var)

    # If geo is NULL, state should be added back in here
    if (is.null(geo)) {
      vars_to_get <- paste0(vars_to_get, ",ST")
    }

    query <- list(get = vars_to_get,
                  ucgid = geo,
                  key = key)

  }

  call <- httr::GET(base,
                    query = query,
                    httr::progress())

  if (show_call) {
    call_url <- gsub("&key.*", "", call$url)
    message(paste("Census API call:", call_url))
  }

  # Make sure call status returns 200, else, print the error message for the user.
  if (call$status_code != 200) {
    msg <- content(call, as = "text")

    if (grepl("The requested resource is not available", msg)) {
      stop("One or more of your requested variables is likely not available at the requested geography.  Please refine your selection.", call. = FALSE)
    } else {
      stop(sprintf("Your API call has errors.  The API message returned is %s.", msg), call. = FALSE)
    }

  }


  content <- httr::content(call, as = "text")

  if (grepl("You included a key with this request", content)) {
    stop("You have supplied an invalid or inactive API key. To obtain a valid API key, visit https://api.census.gov/data/key_signup.html. To activate your key, be sure to click the link provided to you in the email from the Census Bureau that contained your key.", call. = FALSE)
  }

  dat <- jsonlite::fromJSON(content)

  colnames(dat) <- dat[1,]

  dat <- dplyr::as_tibble(dat, .name_repair = "minimal")

  dat <- dat[-1,]

  # Convert the weights columns to numeric
  dat <- dat %>%
    dplyr::mutate(
      dplyr::across(
        .cols = dplyr::contains("WGTP"),
        as.numeric
      )
    )
  # dat$WGTP <- as.numeric(dat$WGTP)
  # dat$PWGTP <- as.numeric(dat$PWGTP)

  # Filter the pums lookup table for the selected year and survey
  pums_variables_filter <- tidycensus::pums_variables %>%
    filter(year == !!year, survey == !!survey)

  # Do some clean up of the API response: pad returned values with 0s when
  # necessary to match data dictionary codes and
  # convert variables to numeric according to data dictionary

  # Only works for years included in pums_variables data dictionary
  if (year %in% recode_years) {
    var_val_length <- pums_variables_filter %>%
      filter(!is.na(.data$val_length)) %>%
      distinct(.data$var_code, .data$val_length, .data$val_na)

    num_vars <- pums_variables_filter %>%
      filter(.data$data_type == "num") %>%
      distinct(.data$var_code) %>%
      pull()

    # For all variables in which we know what the length should be, pad with 0s
    dat_padded <- suppressWarnings(
      dat %>%
        select(.data$SERIALNO, .data$SPORDER, any_of(var_val_length$var_code)) %>%
        pivot_longer(
          cols = -c(.data$SERIALNO, .data$SPORDER),
          names_to = "var_code",
          values_to = "val"
        ) %>%
        left_join(var_val_length, by = "var_code") %>%
        mutate(
          val = ifelse(!is.na(.data$val_na) & .data$val_na == .data$val, strrep("b", .data$val_length), .data$val),
          val = ifelse(.data$var_code != "NAICSP", str_pad(.data$val, .data$val_length, pad = "0"), .data$val),
          val = ifelse(.data$var_code == "NAICSP" & .data$val == "*", "bbbbbbbb", .data$val),  # special NULL value returned by API for this var
        ) %>%
        select(-.data$val_length, -.data$val_na) %>%
        pivot_wider(
          names_from = .data$var_code,
          values_from = .data$val
        )
    )

    # Rejoin padded variables to API return
    dat <- dat %>%
      select(.data$SERIALNO, .data$SPORDER, !any_of(var_val_length$var_code)) %>%
      left_join(dat_padded, by = c("SERIALNO", "SPORDER")) %>%
      mutate_at(vars(any_of(num_vars)), as.double)
    }

  # Do you want to return value labels also?
  if (recode) {

    # Only works for years included in pums_variables data dictionary
    if (year %in% recode_years) {
      var_lookup <- pums_variables_filter %>%
        select(.data$var_code, val = .data$val_min, .data$val_label)

      # Vector of variables that are possible to recode
      vars_to_recode <- pums_variables_filter %>%
        filter(.data$recode) %>%
        distinct(.data$var_code) %>%
        pull()

      # Pivot to long format and join variable codes to lookup table with labels
      recoded_long <- suppressWarnings(
        dat %>%
        select(.data$SERIALNO, .data$SPORDER, any_of(vars_to_recode)) %>%
        pivot_longer(
          cols = -c(.data$SERIALNO, .data$SPORDER),
          names_to = "var_code",
          values_to = "val"
        ) %>%
        left_join(var_lookup, by = c("var_code", "val")) %>%
        select(-.data$val)
      )

      # Create a "pivot spec" with nicer names for the labeled columns
      # https://tidyr.tidyverse.org/articles/pivot.html#wider-1
      spec <- recoded_long %>%
        build_wider_spec(
          names_from = .data$var_code,
          values_from = .data$val_label
        ) %>%
        mutate(.name = paste0(.data$var_code, "_label", ""))

      recoded_wide <- recoded_long %>%
        pivot_wider_spec(spec)

      recode_v <- recoded_long %>% pull(var_code) %>% unique()

      for (var in recode_v){

        order_v <- var_lookup %>%
          filter(var_code==var) %>%
          pull(val_label)

        var_label <- str_c(var, "_label")

        recoded_wide[[var_label]] <-
          readr::parse_factor(recoded_wide[[var_label]], ordered=TRUE, levels=order_v)

      }


      # Join recoded columns to API return
      dat <- dat %>%
        left_join(recoded_wide, by = c("SERIALNO", "SPORDER"))
    } else {
      message(paste("Recoding is currently supported for",
                    min(recode_years), "-", max(recode_years),
                    "data. Returning original data only."))
      }
    }
  return(dat)
}

load_data_pums_vacant <- function(variables, state, puma, key, year, survey,
                           variables_filter, recode, show_call) {

  # for which years is data dictionary available in pums_variables?
  # we'll use this a couple times later on
  recode_years <- 2017:2022

  base <- sprintf("https://api.census.gov/data/%s/acs/%s/pums",
                  year, survey)

  # Parse the variables for housing vs. person variables
  housing_catalog <- pums_variables %>%
    dplyr::filter(level != "person" | is.na(level)) %>%
    dplyr::pull(var_code) %>%
    unique()

  housing_variables <- variables[variables %in% housing_catalog]

  if (!is.null(puma)) {

    if (length(state) > 1) {
      stop('When requesting PUMAs for more than one state, you must set state to "multiple" and set puma to a named vector of state/PUMA pairs.', call. = FALSE)
    }

    # pumas in multiple states can be requested with a named vector of
    # state / puma pairs in the puma argument of get_pums()
    if (state == "multiple") {

      # print FIPS code of states used just once
      purrr::walk(unique(names(puma)), validate_state)

      geo <- purrr::map2_chr(names(puma), unname(puma), function(x, y) {
        paste0("7950000US", suppressMessages(validate_state(x)), y)
      })

      geo <- paste0(geo, collapse = ",")

    } else {
      # if PUMAs requested are in one state only
      state <- validate_state(state)
      geo <- purrr::map_chr(puma, function(x) {
        paste0("7950000US", state, x)
      })

      if (length(puma) > 1) {
        geo <- paste0(geo, collapse = ",")
      }
    }
  } else {
    # if no PUMAs specified, get all PUMAs in each state requested
    if (!is.null(state)) {
      geo <- purrr::map_chr(state, function(x) {
        paste0("0400000US", validate_state(x))
      })

    } else {
      geo <- NULL
    }

    if (length(state) > 1) {
      geo <- paste0(geo, collapse = ",")

    }
  }

  # Handle variables & variable filter
  # In this scenario, the "filter" is for vacant households
  # This will be specified on the `get_pums()` side
  if (!is.null(variables_filter)) {
    filter_names <- names(variables_filter)

    housing_variables <- housing_variables[!housing_variables %in% filter_names]

    var <- paste0(housing_variables, collapse = ",")

    vars_to_get <- paste0("SERIALNO,WGTP,", var)

    # If geo is NULL, state should be added back in here
    if (is.null(geo)) {
      vars_to_get <- paste0(vars_to_get, ",ST")
    }

    # Combine the default query with the variables filter query
    query_default <- list(get = vars_to_get,
                          ucgid = geo,
                          key = key)

    # Collapse vectors if supplied
    variables_filter_collapsed <- purrr::map(variables_filter,
                                             ~{paste0(.x, collapse = ",")})

    query <- c(
      list(get = vars_to_get),
      variables_filter_collapsed,
      list(ucgid = geo, key = key)
    )

  } else {

    # Not implemented here
    # var <- paste0(variables, collapse = ",")
    #
    # vars_to_get <- paste0("SERIALNO,SPORDER,WGTP,PWGTP,", var)
    #
    # # If geo is NULL, state should be added back in here
    # if (is.null(geo)) {
    #   vars_to_get <- paste0(vars_to_get, ",ST")
    # }
    #
    # query <- list(get = vars_to_get,
    #               ucgid = geo,
    #               key = key)

  }

  call <- httr::GET(base,
                    query = query,
                    httr::progress())

  if (show_call) {
    call_url <- gsub("&key.*", "", call$url)
    message(paste("Census API call:", call_url))
  }

  # Make sure call status returns 200, else, print the error message for the user.
  if (call$status_code != 200) {
    msg <- content(call, as = "text")

    if (grepl("The requested resource is not available", msg)) {
      stop("One or more of your requested variables is likely not available at the requested geography.  Please refine your selection.", call. = FALSE)
    } else {
      stop(sprintf("Your API call has errors.  The API message returned is %s.", msg), call. = FALSE)
    }

  }


  content <- httr::content(call, as = "text")

  if (grepl("You included a key with this request", content)) {
    stop("You have supplied an invalid or inactive API key. To obtain a valid API key, visit https://api.census.gov/data/key_signup.html. To activate your key, be sure to click the link provided to you in the email from the Census Bureau that contained your key.", call. = FALSE)
  }

  dat <- jsonlite::fromJSON(content)

  colnames(dat) <- dat[1,]

  dat <- dplyr::as_tibble(dat, .name_repair = "minimal")

  dat <- dat[-1,]

  # Convert the weights columns to numeric
  dat <- dat %>%
    dplyr::mutate(
      dplyr::across(
        .cols = dplyr::contains("WGTP"),
        as.numeric
      )
    )
  # dat$WGTP <- as.numeric(dat$WGTP)
  # dat$PWGTP <- as.numeric(dat$PWGTP)

  # Filter the pums lookup table for the selected year and survey
  pums_variables_filter <- tidycensus::pums_variables %>%
    filter(year == !!year, survey == !!survey)

  # Do some clean up of the API response: pad returned values with 0s when
  # necessary to match data dictionary codes and
  # convert variables to numeric according to data dictionary

  # Only works for years included in pums_variables data dictionary
  if (year %in% recode_years) {
    var_val_length <- pums_variables_filter %>%
      dplyr::filter(!is.na(.data$val_length)) %>%
      dplyr::distinct(.data$var_code, .data$val_length, .data$val_na)

    num_vars <- pums_variables_filter %>%
      dplyr::filter(.data$data_type == "num") %>%
      dplyr::distinct(.data$var_code) %>%
      dplyr::pull()

    # For all variables in which we know what the length should be, pad with 0s
    dat_padded <- suppressWarnings(
      dat %>%
        dplyr::select(.data$SERIALNO, dplyr::any_of(var_val_length$var_code)) %>%
        tidyr::pivot_longer(
          cols = -c(.data$SERIALNO),
          names_to = "var_code",
          values_to = "val"
        ) %>%
        dplyr::left_join(var_val_length, by = "var_code") %>%
        dplyr::mutate(
          val = ifelse(!is.na(.data$val_na) & .data$val_na == .data$val, strrep("b", .data$val_length), .data$val),
          val = ifelse(.data$var_code != "NAICSP", str_pad(.data$val, .data$val_length, pad = "0"), .data$val),
          val = ifelse(.data$var_code == "NAICSP" & .data$val == "*", "bbbbbbbb", .data$val),  # special NULL value returned by API for this var
        ) %>%
        dplyr::select(-.data$val_length, -.data$val_na) %>%
        tidyr::pivot_wider(
          names_from = .data$var_code,
          values_from = .data$val
        )
    )

    # Rejoin padded variables to API return
    dat <- dat %>%
      dplyr::select(.data$SERIALNO, !dplyr::any_of(var_val_length$var_code)) %>%
      dplyr::left_join(dat_padded, by = c("SERIALNO")) %>%
      dplyr::mutate_at(vars(dplyr::any_of(num_vars)), as.double)
  }

  # Do you want to return value labels also?
  if (recode) {

    # Only works for years included in pums_variables data dictionary
    if (year %in% recode_years) {
      var_lookup <- pums_variables_filter %>%
        dplyr::select(.data$var_code, val = .data$val_min, .data$val_label)

      # Vector of variables that are possible to recode
      vars_to_recode <- pums_variables_filter %>%
        dplyr::filter(.data$recode) %>%
        dplyr::distinct(.data$var_code) %>%
        dplyr::pull()

      # Pivot to long format and join variable codes to lookup table with labels
      recoded_long <- suppressWarnings(
        dat %>%
          dplyr::select(.data$SERIALNO, dplyr::any_of(vars_to_recode)) %>%
          tidyr::pivot_longer(
            cols = -c(.data$SERIALNO),
            names_to = "var_code",
            values_to = "val"
          ) %>%
          dplyr::left_join(var_lookup, by = c("var_code", "val")) %>%
          dplyr::select(-.data$val)
      )

      # Create a "pivot spec" with nicer names for the labeled columns
      # https://tidyr.tidyverse.org/articles/pivot.html#wider-1
      spec <- recoded_long %>%
        tidyr::build_wider_spec(
          names_from = .data$var_code,
          values_from = .data$val_label
        ) %>%
        dplyr::mutate(.name = paste0(.data$var_code, "_label", ""))

      recoded_wide <- recoded_long %>%
        tidyr::pivot_wider_spec(spec)

      recode_v <- recoded_long %>% dplyr::pull(var_code) %>% unique()

      for (var in recode_v){

        order_v <- var_lookup %>%
          dplyr::filter(var_code==var) %>%
          dplyr::pull(val_label)

        var_label <- stringr::str_c(var, "_label")

        recoded_wide[[var_label]] <-
          readr::parse_factor(recoded_wide[[var_label]], ordered=TRUE, levels=order_v)

      }


      # Join recoded columns to API return
      dat <- dat %>%
        dplyr::left_join(recoded_wide, by = c("SERIALNO"))
    } else {
      message(paste("Recoding is currently supported for",
                    min(recode_years), "-", max(recode_years),
                    "data. Returning original data only."))
    }
  }
  return(dat)
}

load_data_flows <- function(geography, variables, key, year, state = NULL,
                            county = NULL, msa = NULL, show_call = FALSE) {

  base <- paste("https://api.census.gov/data", year, "acs", "flows", sep = "/")

  # collapse variables in a single string for api call
  vars_to_get <- paste0(variables, collapse = ",")

  # get all in a given geography unless specified
  for_area <- paste0(geography, ":*")
  in_area <- NULL

  # turn text states into fips if specified, else get all states
  # collapse into string for api call
  if (!is.null(state)) {
    state <- paste(map_chr(state, ~ validate_state(.x)), collapse = ",")
  } else {
    state <- "*"
  }

  # turn text counties into fips if specified, validate with specified state
  # collapse into string for api call
  # can only filter counties for one state
  if (!is.null(county)) {
    county <- paste(map_chr(county, ~ validate_county(state, .x)), collapse = ",")
  }

  # if msa codes specified, collapse into string for api call
  if (!is.null(msa)) {
    msa <- paste0(msa, collapse = ",")
  }

  # define counties (for) within states (in) for filtering in api call
  if (geography == "county") {
    if (!is.null(county)) {
    for_area <- paste0("county:", county)
    }
    if (!is.null(state)) {
      in_area <- paste0("state:", state)
    }
  }

  # if county is defined, construct for call using state and county
  # otherwise filter only at state level
  if (geography == "county subdivision") {
    if (!is.null(county)) {
      in_area <- paste0("state:", state, " ", "county:", county)
    } else {
      in_area <- paste0("state:", state)
    }
  }

  # if msas are specified construct for call to filter by msa
  if (geography == "metropolitan statistical area/micropolitan statistical area") {

    # before 2016 msa geography name in api is different
    if (year <= 2015) {
      geography <- "metropolitan statistical areas"
    }

    if (!is.null(msa)) {
      for_area <- paste0(geography, ":", msa)
    } else {
      for_area <- paste0(geography, ":*")
    }
  }

  # build query for api call
  # for calls where in_area is not defined, "in" will be NULL and not included in GET()
  query <- list(get = vars_to_get,
                "for" = for_area,
                "in" = in_area,
                key = key)

  # execute api call
  call <- GET(base, query = query)

  # print nicely formatted api call if requested
  if (show_call) {
    call_url <- gsub("&key.*", "", call$url)
    message(paste("Census API call:", utils::URLdecode(call_url)))
  }

  # Make sure call status returns 200, else, print the error message for the user.
  if (call$status_code != 200) {
    msg <- content(call, as = "text")

    if (grepl("The requested resource is not available", msg)) {
      stop("One or more of your requested variables is likely not available at the requested geography.  Please refine your selection.", call. = FALSE)
    } else {
      stop(sprintf("Your API call has errors.  The API message returned is %s.", msg), call. = FALSE)
    }
  }

  # convert json response to text
  content <- content(call, as = "text")

  if (grepl("You included a key with this request", content)) {
    stop("You have supplied an invalid or inactive API key. To obtain a valid API key, visit https://api.census.gov/data/key_signup.html. To activate your key, be sure to click the link provided to you in the email from the Census Bureau that contained your key.", call. = FALSE)
  }

  # convert response to json
  dat <- fromJSON(content)

  # first row of response are colnames
  colnames(dat) <- dat[1, ]
  dat <- dat[-1, ]

  # convert to tibble
  dat <- as_tibble(dat)

  # remove columns that contains geographies automatically returned by api - we already have this as GEOID1
  dat <- dat[, !(names(dat) %in% c("state", "county", "county subdivision",
                                   "metropolitan statistical area/micropolitan statistical area",
                                   "metropolitan statistical areas"))]

  # convert vars to numeric, skip geoids and names
  str_vars <- variables[grepl("_NAME|GEOID|^STATE|^COUNTY|^MCD|^METRO", variables)]
  num_vars <- variables[!variables %in% str_vars]
  dat[num_vars] <- lapply(dat[num_vars], as.numeric)

  return(dat)

}
walkerke/tidycensus documentation built on March 28, 2024, 7:04 p.m.