inst/doc/srvyr-database.R

## ---- echo = FALSE------------------------------------------------------------
if (!require(RSQLite)) {
  message("Could not find RSQLite so could not run vignette.")
  knitr::opts_chunk$set(eval = FALSE)
}

if (!file.exists("acs_m.Rdata")) {
  message("Could not find db file so could not run vignette.")
  knitr::opts_chunk$set(eval = FALSE)
}

## -----------------------------------------------------------------------------
suppressMessages({
  library(survey)
  library(srvyr)
  library(dplyr)
  library(dbplyr)
  library(RSQLite)
})

# Load data (Currently only Alaska and Hawaii to keep file size small and with 
# limited variables, butcode that downloaded the files is available here
# https://github.com/gergness/srvyr/blob/main/vignettes/save_acs_data.R
# and could easily be adapted to download all states.)
load("acs_m.Rdata") # acs_m data

# Set up database and table
db <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
acs_m_db <- copy_to(db, acs_m, "acs_m", temporary = FALSE)

# Or, if the data was already stored in the database, you could do this
# acs_m_data <- tbl(db, sql("SELECT * FROM acs_m")) 

## -----------------------------------------------------------------------------
# Same results
acs_m %>% 
  group_by(sex) %>%
  summarize(hicov = mean(hicov))

acs_m_db %>% 
  group_by(sex) %>%
  summarize(hicov = mean(hicov))

# But smaller object size
object.size(acs_m)
object.size(acs_m_db)


## -----------------------------------------------------------------------------
acs_m %>% 
  group_by(sex) %>%
  summarize(hicov = mean(hicov == 1))

# Works in RSQlite, but didn't in the now defunct MonetDB
# acs_m_db %>% 
#   group_by(sex) %>%
#   summarize(hicov = mean(hicov == 1))
#
# > Error in .local(conn, statement, ...) :
# > Unable to execute statement....
# > ....

# Creating the variable separately works as an integer works though
acs_m_db %>% 
  group_by(sex) %>%
  mutate(hicov = ifelse(hicov == 1, 1L, 0L)) %>%
  summarize(hicov = mean(hicov))

## -----------------------------------------------------------------------------
acs_m %>% 
  group_by(agecat = cut(agep, c(0, 19, 35, 50, 65, 200))) %>%
  summarize(hicov = mean(hicov == 1))

# acs_m_db %>% 
#   group_by(agecat = cut(agep, c(0, 19, 35, 50, 65, 200))) %>%
#   summarize(hicov = mean(hicov == 1))
#
# > Error in .local(conn, statement, ...) :
# > Unable to execute statement....
# > ...

acs_m_db %>% 
  mutate(agecat = ifelse(agep < 19, "0-18", 
                         ifelse(agep >= 19 & agep < 35, "19-34", 
                                ifelse(agep >= 35 & agep < 50, "35-49", 
                                       ifelse(agep >= 50 & agep < 65, "50-64", 
                                              ifelse(agep >= 65, "65+", NA)))))) %>%
  group_by(agecat) %>% 
  summarize(hicov = mean(hicov))

## -----------------------------------------------------------------------------
acs_m_db_svy <- acs_m_db %>% 
  as_survey_rep(
    weight = pwgtp,
    repweights = matches("pwgtp[0-9]+") ,
    scale = 4 / 80,
    rscales = rep(1 , 80),
    mse = TRUE,
    type = "JK1", 
    variables = -c(matches("^pwgtp"))
  )

acs_m_db_svy

## -----------------------------------------------------------------------------
object.size(acs_m_db_svy)

## -----------------------------------------------------------------------------
# You can calculate the population of the united states #
# by state
acs_m_db_svy %>%
  mutate(one = 1L) %>% # Note that because of weird behavior of MonetDB, need to use 1L not just 1
  group_by(st) %>% 
  summarize(count = survey_total(one))

# Or the average age of downloaded states
acs_m_db_svy %>%
  summarize(agep = survey_mean(agep, na.rm = TRUE))

# Average age by state
acs_m_db_svy %>%
  group_by(st) %>% 
  summarize(agep = survey_mean(agep, na.rm = TRUE))

# percent uninsured - nationwide (of downloaded states)
acs_m_db_svy %>%
  mutate(hicov = as.character(hicov)) %>% 
  group_by(hicov) %>% 
  summarize(pct = survey_mean(na.rm = TRUE))

# by state
acs_m_db_svy %>%
  mutate(hicov = as.character(hicov)) %>% 
  group_by(st, hicov) %>% 
  summarize(pct = survey_mean(na.rm = TRUE))


# 25th, median, and 75th percentile of age of residents of the united states (downloaded states)
acs_m_db_svy %>%
  summarize(agep = survey_quantile(agep, c(0.25, 0.5, 0.75), na.rm = TRUE))


# Filter works, so we can restrict the acs_m object to females only
acs_m_db_svy_female <- acs_m_db_svy %>%
  filter(sex == 2)

# Now any of the above commands can be re-run using the acs_m_female object
# instead of the acs_m object in order to analyze females only

# This is equivalent to using acs_m, and applying the filter every time.

# average age - nationwide (of downloaded states), restricted to females
acs_m_db_svy_female %>%
  summarize(agep = survey_mean(agep, na.rm = TRUE))

# median age - nationwide (of downloaded states), restricted to females
acs_m_db_svy_female %>%
  summarize(agep = survey_median(agep, na.rm = TRUE))


# Note that though some R functions are translated by dplyr into SQL, not
# all of them are. For example, when constructing a new age category 
# variable in the dataset neither findIntervals nor cut work on databases, 
# so we have to spell out groups with ifelse()
acs_m_db_svy %>% 
  mutate(agecat = ifelse(agep < 19, "0-18", 
                         ifelse(agep >= 19 & agep < 35, "19-34", 
                                ifelse(agep >= 35 & agep < 50, "35-49", 
                                       ifelse(agep >= 50 & agep < 65, "50-64", 
                                              ifelse(agep >= 65, "65+", NA)))))) %>%
  group_by(agecat) %>% 
  summarize(pct = survey_mean(na.rm = TRUE))

## -----------------------------------------------------------------------------
acs_m_db_svy %>%
  select(agep, hicov, sex) %>%
  collect() %>%
  {survey::svyglm(hicov ~ sex + agep, .)} %>%
  summary()

Try the srvyr package in your browser

Any scripts or data that you put into this service are public.

srvyr documentation built on March 7, 2023, 6:39 p.m.