knitr::opts_chunk$set(echo=TRUE)
library(glptools)
glp_load_packages()
library(arrow)
library(labelled)

This document reads in and processes both Census data and IPUMS microdata to pull variables related to the digital divide.

This includes information on internet access (normal, high-speed, etc.) and device access. These questions have been included in the ACS since 2013.

First we're going to work with Census Data

This calls in all of the relevant variables. A bit of context about each table is included below, also.

# B28002 - internet access at the household level.
# Details what type of internet subscription people have, whether they have
# internet access without a subscription, or whether they have no internet
# access. This corresponds with CINETHH, CIHISPEED, and CIDIAL in IPUMS.
internet_household <- build_census_var_df("acs5", "B28002")

# B28001 - computer in household.
compdevice_household <- build_census_var_df("acs5", "B28001")

# B28003 - computer and internet access at the household level.
# Details whether they have an internet subscription (and what type) or not
# with a computer, or whether they have no computer.
compinternet_household <- build_census_var_df("acs5", "B28003")

# B28008 - computer and internet access at the individual level.
# Details whether they have an internet subscription (and what type) or not
# with a computer, or whether they have no computer.
compinternet_individual <- build_census_var_df("acs5", "B28008")


# B28004 - internet presence and type by income.
# Use ACS 1-year for FIPS (county) geographies and ACS 5-year for tracts;
# this table is pulled from "acs1".
incomeinternet_household <- build_census_var_df("acs1", "B28004")

# B28005 - age by computer and internet access (individual).
ageinternetcomp_individual <- build_census_var_df("acs5", "B28005")

# B28006 - computer and internet access by education.
eduinternetcomp_individual <- build_census_var_df("acs5", "B28006")

# B28009 - computer and internet access by race.
raceinternetcomp_individual <- build_census_var_df("acs5", "B28009")

Next we're going to work with these tables, including the age table: pull down the data, remove total columns, and process it to make map files.

# Overall Digital Access (no disaggregation; individual and household) ----
# Pull the tract-level computer/internet tables for households and individuals.
access_house <- get_census(compinternet_household, "tract")
access_indiv <- get_census(compinternet_individual, "tract")

# Individual digital access ----
# Build the category column that is handed to process_census() as cat_var.
# B28008_005 is the row that describes "having a computer with a fixed
# broadband internet subscription"; this excludes people who have a computer
# and only dial-up or cellular data.
# 005 covers both 006 and 007 (it is cumulative of those); 008 is
# dial-up/cellular data.
# Any other variable matches no case_when() branch and gets NA.
var_individ <- access_indiv %>%
  mutate(
    digitalaccess = case_when(
      variable %in% c("B28008_001E", "B28008_001M") ~ "total",
      variable %in% c("B28008_005E", "B28008_005M") ~ "hasaccess"
    )
  )

# Process census
clean_individ <- var_individ %>%
  process_census(
    cat_var = "digitalaccess",
    output_name = "digitalaccess"
  )

# Process map
# list2env() copies the elements returned by process_map() into the global
# environment (presumably digitalaccess_nodisagg_tract/_nh/_muw - confirm
# against process_map). map_access itself only holds list2env()'s return
# value, so it is removed straight away.
map_access <- clean_individ %>%
  process_map(
    "hasaccess",
    return_name = "digitalaccess_nodisagg",
    maps = c("tract", "nh", "muw")
  ) %>%
  list2env(.GlobalEnv)
rm(map_access)

# Household digital access comparison ----
# The percentages here are going to be slightly higher because there is no
# differentiation between types of broadband like there is in the individual
# table. This makes me feel more confident about individual; will run this by
# Harrison though. Here's the process for household just in case it's wanted.
# B28003_004E is the "computer with a broadband internet subscription" row.
# There are no subcategories: the only options with a computer are dial-up,
# broadband, or without internet. Another argument could be made that this is
# better because it is household-level and the IPUMS data is also household.
# Looking at the totals for the whole data (here:
# https://data.census.gov/cedsci/table?q=B28&d=ACS%201-Year%20Estimates%20Detailed%20Tables&tid=ACSDT1Y2019.B28008),
# the percentage comes out to 86% for household and 77% for individual - I feel
# individual is a better descriptor of what is going on and people's
# experiences.
# Any other variable matches no case_when() branch and gets NA.
var_house <- access_house %>%
  mutate(
    digitalaccess = case_when(
      variable %in% c("B28003_001E", "B28003_001M") ~ "total",
      variable %in% c("B28003_004E", "B28003_004M") ~ "hasaccess"
    )
  )

# Process census
clean_house <- var_house %>%
  process_census(
    cat_var = "digitalaccess",
    output_name = "digitalaccess"
  )

# Process map
# NOTE(review): unlike the individual and age sections, map_access_house is
# not removed after list2env(); confirm whether that is intentional.
map_access_house <- clean_house %>%
  process_map(
    "hasaccess",
    return_name = "digitalaccess_house",
    maps = c("tract", "nh", "muw")
  ) %>%
  list2env(.GlobalEnv)

# By Age ----
# Pull the tract-level age by computer/internet table (B28005).
age <- get_census(ageinternetcomp_individual, "tract")

# Clear out totals: these estimate (E) and margin-of-error (M) variables are
# dropped before the categorical variable is built.
totals <- c(
  "B28005_001E", "B28005_001M", "B28005_002E",
  "B28005_008E", "B28005_014E", "B28005_002M",
  "B28005_008M", "B28005_014M",
  "B28005_009E", "B28005_015E", "B28005_003E",
  "B28005_009M", "B28005_015M", "B28005_003M"
)

age_nototal <- age %>%
  subset(variable %not_in% totals)

# Make the categorical variable: TRUE when the row label mentions a broadband
# internet subscription, FALSE otherwise (NA labels stay NA).
# str_detect() already returns a logical vector, so no if_else() wrapper is
# needed.
age_nototal %<>%
  mutate(compinternet = str_detect(label, "broadband Internet subscription"))

# Process census
clean_age <- age_nototal %>%
  process_census(
    cat_var = "compinternet",
    output_name = "digitalaccess",
    age_groups = c("under_18", "18_64", "65_plus"),
    age_group_wide = TRUE
  ) # < one option

# Process map
# list2env() copies the elements returned by process_map() into the global
# environment; map_age only holds list2env()'s return value, so it is removed.
map_age <- clean_age %>%
  process_map(
    "under_18", "18_64", "65_plus",
    return_name = "digitalaccess_by_age",
    maps = c("tract", "nh", "muw")
  ) %>%
  list2env(.GlobalEnv)
rm(map_age)

# By Income ----
# By income we only have internet access, not internet and computer access.
income <- get_census(incomeinternet_household, "FIPS")

# Create one category column per income class. Within each column the class
# total rows are tagged "total", the rows counted as having access are tagged
# with the class name, and every other row is left NA.
income_access <- income %>%
  mutate(
    less10 = case_when(
      variable %in% c("B28004_002E", "B28004_002M") ~ "total",
      variable %in% c("B28004_004E", "B28004_004M") ~ "less10"
    ),
    `10_19` = case_when(
      variable %in% c("B28004_006E", "B28004_006M") ~ "total",
      variable %in% c("B28004_008E", "B28004_008M") ~ "10_19"
    ),
    `20_34` = case_when(
      variable %in% c("B28004_010E", "B28004_010M") ~ "total",
      variable %in% c("B28004_012E", "B28004_012M") ~ "20_34"
    ),
    `35_49` = case_when(
      variable %in% c("B28004_014E", "B28004_014M") ~ "total",
      variable %in% c("B28004_016E", "B28004_016M") ~ "35_49"
    ),
    `50_74` = case_when(
      variable %in% c("B28004_018E", "B28004_018M") ~ "total",
      variable %in% c("B28004_020E", "B28004_020M") ~ "50_74"
    ),
    plus_75 = case_when(
      variable %in% c("B28004_022E", "B28004_022M") ~ "total",
      variable %in% c("B28004_024E", "B28004_024M") ~ "plus_75"
    )
  )

# Run process_census() once per income class and accumulate the results in
# income_internet.
# NOTE(review): income_internet is not initialised in this file; presumably
# assign_col_join() assigns df when its first argument does not exist yet -
# confirm against glptools.
variables <- list("less10", "10_19", "20_34", "35_49", "50_74", "plus_75")

for (x in variables) {
  df <- income_access %>%
    process_census(cat_var = x)
  income_internet <- assign_col_join(income_internet, df)
  rm(df)
}

Now we're going to work with the IPUMS Microdata

The data we're calling in here has already been run through microdata.rmd

# Read the microdata. The path below is machine-specific: change it to
# wherever the file is stored locally on your computer.
acs_micro <- feather::read_feather(
  path = "/Users/mraisle/Documents/OneDrive - University of North Carolina at Chapel Hill/glpdata/data-raw/microdata/acs_micro_repwts.feather"
)

# Make indicator variables for the metrics we care about.
# The method taken here is a bit long-winded: case_when() worked best to get
# the 0/1/NA coding wanted for some variables, but TRUE/FALSE is how they
# should be evaluated for survey_by_demog, hence this approach (the 0/1
# columns are converted to logical afterwards).
# Rows are limited to years after 2012 and to PERNUM == 1.
acs_micro_internet <- acs_micro %>%
  filter(year > 2012) %>%
  filter(PERNUM == 1) %>% # only measure each household once
  mutate(
    MSA = as.character(MSA),

    # Any internet access: NA when CINETHH is 0, 1 when it is 1 or 2,
    # 0 when it is 3.
    int_acc = case_when(
      CINETHH == 0 ~ NA_real_,
      CINETHH == 1 | CINETHH == 2 ~ 1,
      CINETHH == 3 ~ 0
    ),

    # High-speed internet. Branch order matters: CINETHH == 0 is set to NA
    # first, then no internet (CINETHH == 3) or CIHISPEED == 0 is coded 0,
    # before the CIHISPEED ranges are checked.
    hspd_int = case_when(
      CINETHH == 0 ~ NA_real_,
      CINETHH == 3 | CIHISPEED == 0 ~ 0,
      CIHISPEED > 9 & CIHISPEED < 20 ~ 1,
      CIHISPEED == 20 ~ 0
    ),

    # Internet but not high-speed - this earlier attempt doesn't work (wrong
    # order), so a different configuration is used below.
    # regular_internet = case_when(
    #   int_acc == T ~ 1,
    #   CINETHH == 3 & CIHISPEED == 0 ~ 0,
    #   CIHISPEED > 9 & CIHISPEED < 20 ~ 0,
    # ),
    regular_internet = case_when(
      (hspd_int == 0 | is.na(hspd_int)) & int_acc == 1 ~ 1,
      hspd_int == 1 ~ 0,
      int_acc == 0 ~ 0
    ),

    # Device indicators: TRUE when the source variable is 1, NA when it is 0,
    # FALSE otherwise.
    computer = if_else(CILAPTOP == 1, TRUE, FALSE),
    computer = replace(computer, CILAPTOP == 0, NA),

    tablet = if_else(CITABLET == 1, TRUE, FALSE),
    tablet = replace(tablet, CITABLET == 0, NA),

    smrtphone = if_else(CISMRTPHN == 1, TRUE, FALSE),
    smrtphone = replace(smrtphone, CISMRTPHN == 0, NA),

    # Computer or tablet. A missing tablet value still counts as "no device"
    # when computer is FALSE.
    comp_tab = case_when(
      computer == 1 | tablet == 1 ~ 1,
      computer == 0 & tablet == 0 ~ 0,
      computer == 0 & is.na(tablet) ~ 0,
      TRUE ~ NA_real_
    ),

    # Computer, tablet, or smartphone.
    comp_tab_smrt = case_when(
      computer == 1 | tablet == 1 | smrtphone == 1 ~ 1,
      computer == 0 & tablet == 0 & smrtphone == 0 ~ 0,
      computer == 0 & is.na(tablet) & is.na(smrtphone) ~ 0,
      TRUE ~ NA_real_
    ),

    # Computer or tablet and high-speed internet (#1)
    internet_and_device1 = case_when(
      comp_tab == 1 & hspd_int == 1 ~ 1,
      comp_tab == 0 | hspd_int == 0 ~ 0,
      TRUE ~ NA_real_
    ),

    # High-speed internet and computer
    internet_and_computer = case_when(
      computer == 1 & hspd_int == 1 ~ 1,
      computer == 0 | hspd_int == 0 ~ 0,
      TRUE ~ NA_real_
    ),

    # Computer or tablet and non-high-speed internet (#2)
    nohiinternet_and_device2 = case_when(
      comp_tab == 1 & regular_internet == 1 ~ 1,
      comp_tab == 0 | regular_internet == 0 ~ 0,
      TRUE ~ NA_real_
    ),

    # Device but no internet or smartphone (#3)
    only_device3 = case_when(
      comp_tab == 1 & int_acc == FALSE &
        (smrtphone == FALSE | is.na(smrtphone)) ~ 1,
      comp_tab == 0 | int_acc == TRUE | smrtphone == TRUE ~ 0,
      TRUE ~ NA_real_
    ),

    # Smartphone and internet but no device (#4)
    smrt_and_internet4 = case_when(
      comp_tab == 0 & int_acc == TRUE & smrtphone == TRUE ~ 1,
      comp_tab == 1 | int_acc == FALSE |
        (smrtphone == FALSE | is.na(smrtphone)) ~ 0,
      TRUE ~ NA_real_
    ),

    # Internet and no device or phone (#5)
    only_internet5 = case_when(
      int_acc == TRUE & comp_tab_smrt == 0 ~ 1,
      int_acc == FALSE | comp_tab_smrt == 1 ~ 0,
      TRUE ~ NA_real_
    ),

    # No device or internet, but a smartphone (#6)
    only_smartphone6 = case_when(
      int_acc == FALSE & comp_tab == 0 & smrtphone == TRUE ~ 1,
      int_acc == TRUE | comp_tab == 1 |
        (smrtphone == FALSE | is.na(smrtphone)) ~ 0,
      TRUE ~ NA_real_
    ),

    # Nothing - no device, no internet, no phone (#7)
    nothing7 = case_when(
      int_acc == FALSE & comp_tab_smrt == 0 ~ 1,
      int_acc == TRUE | comp_tab_smrt == 1 ~ 0,
      TRUE ~ NA_real_
    ),

    # No internet, but a device and a phone (#8)
    device_smrt_only8 = case_when(
      int_acc == FALSE & comp_tab == 1 & smrtphone == TRUE ~ 1,
      int_acc == TRUE | comp_tab == 0 |
        (smrtphone == FALSE | is.na(smrtphone)) ~ 0,
      TRUE ~ NA_real_
    ),

    # Sanity-check column: sum of the eight numbered categories.
    total = internet_and_device1 + nohiinternet_and_device2 + only_device3 +
      smrt_and_internet4 + only_internet5 + only_smartphone6 + nothing7 +
      device_smrt_only8
  )

# Check that all the calculations turned out right by running
#   unique(acs_micro_internet$total)
# before the next line: the only values should be 1 or NA.
# The total column is no longer necessary after that check, so drop it.
acs_micro_internet <- acs_micro_internet %>%
  mutate(total = NULL)

# Disabled: make a column that holds the name of the group in the actual cell.
# (The first branch refers to internet_and_device1, the column created above.)
# acs_micro_internet %<>% mutate(
#   access_type = case_when(
#     internet_and_device1 == 1 ~ "internet_and_device1",
#     nohiinternet_and_device2 == 1 ~ "nohiinternet_and_device2",
#     only_device3 == 1 ~ "only_device3",
#     smrt_and_internet4 == 1 ~ "smrt_and_internet4",
#     only_internet5 == 1 ~ "only_internet5",
#     only_smartphone6 == 1 ~ "only_smartphone6",
#     nothing7 == 1 ~ "nothing7",
#     device_smrt_only8 == 1 ~ "device_smrt_only8",
#     TRUE ~ NA_character_))


# Convert the 0/1 indicator columns to logical (TRUE/FALSE) in one pass.
# To convert another indicator, add its column name to this vector.
logical_cols <- c(
  "int_acc", "internet_and_device1", "comp_tab_smrt", "comp_tab", "hspd_int",
  "regular_internet", "nohiinternet_and_device2", "only_device3",
  "smrt_and_internet4", "only_internet5", "only_smartphone6", "nothing7",
  "device_smrt_only8", "internet_and_computer"
)
acs_micro_internet[logical_cols] <-
  lapply(acs_micro_internet[logical_cols], as.logical)
rm(logical_cols)


# The raw microdata is not used again below.
rm(acs_micro)

Now we're going to process this using `survey_by_demog`, make data frames at the county and MSA level, and merge all the variables back into one data frame for saving.

# TODO: remember to set everything that includes tablet or smartphone to NA
# for 2013-2015.
# Variables to summarise.
variables <- list(
  "int_acc", "hspd_int", "regular_internet", "comp_tab", "comp_tab_smrt",
  "internet_and_device1", "internet_and_computer",
  "nohiinternet_and_device2", "only_device3", "smrt_and_internet4",
  "only_internet5", "only_smartphone6", "nothing7", "device_smrt_only8"
)

# Build the county data frame with all the variables, one survey_by_demog()
# call per variable, weighted by HHWT.
# NOTE(review): digitalaccess_county is not initialised in this file;
# presumably assign_col_join() assigns df when its first argument does not
# exist yet - confirm against glptools.
for (x in variables) {
  df <- survey_by_demog(acs_micro_internet, x, weight_var = "HHWT")
  digitalaccess_county <- assign_col_join(digitalaccess_county, df)
  rm(df)
}

# Disabled: set the smartphone-related categories to NA for every year up to
# and including 2015 at the county level.
# variables2016 <- list("device_smrt_only8", "nothing7", "only_device3",
#                       "only_internet5", "only_smartphone6",
#                       "smrt_and_internet4")
# for (x in variables2016) {
#   digitalaccess_county[[x]] <- ifelse(digitalaccess_county$year > 2015,
#                                       digitalaccess_county[[x]], NA)
# }

# Percent-only data frame - easier to work with for graphs this way.
# Keeps the rows whose var_type is "percent", then drops the var_type column.
percentdigitalaccess_county <- digitalaccess_county %>%
  filter(var_type == "percent") %>%
  mutate(var_type = NULL)

# Build the MSA data frame with all the variables, one survey_by_demog() call
# per variable with geog = "MSA", weighted by HHWT.
# NOTE(review): digitalaccess_MSA5yr is not initialised in this file;
# presumably assign_col_join() assigns df when its first argument does not
# exist yet - confirm against glptools.
for (x in variables) {
  df <- survey_by_demog(
    acs_micro_internet, x,
    geog = "MSA", weight_var = "HHWT"
  )
  digitalaccess_MSA5yr <- assign_col_join(digitalaccess_MSA5yr, df)
  rm(df)
}

# Disabled: set the smartphone-related categories to NA for every year up to
# and including 2015 at the MSA level (relies on the variables2016 list from
# the disabled county code).
# for (x in variables2016) {
#   digitalaccess_MSA5yr[[x]] <- ifelse(digitalaccess_MSA5yr$year > 2015,
#                                       digitalaccess_MSA5yr[[x]], NA)
# }

Finally, we're going to save all of these files to glpdata

``` {r Saving_Data, eval=FALSE}

# TODO: need to rename the map files to make them more clear

# Save the county, MSA, map, and income outputs as package data, replacing any
# existing copies (overwrite = TRUE).
usethis::use_data(
  digitalaccess_county, percentdigitalaccess_county, digitalaccess_MSA5yr,
  digitalaccess_by_age_muw, digitalaccess_by_age_nh,
  digitalaccess_by_age_tract,
  digitalaccess_nodisagg_muw, digitalaccess_nodisagg_nh,
  digitalaccess_nodisagg_tract,
  income_internet,
  overwrite = TRUE
)
```



greaterlouisvilleproject/glpdata documentation built on Nov. 2, 2023, 8:50 a.m.