Nothing
## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----setup-packages, message=FALSE--------------------------------------------
# We set a random seed in this vignette only to ensure
# that our discussion will match the CV output;
# otherwise, each time we reran the vignette, we'd get different CV folds
set.seed(2022)
library(surveyCV)
data("NSFG_data")
library(survey)
data("api")
## -----------------------------------------------------------------------------
#stratified sample
cv.svy(apistrat, c("api00~ell",
"api00~ell+meals",
"api00~ell+meals+mobility"),
nfolds = 5, strataID = "stype", weightsID = "pw", fpcID = "fpc")
## -----------------------------------------------------------------------------
# one-stage cluster sample
cv.svy(apiclus1, c("api00~ell",
"api00~ell+meals",
"api00~ell+meals+mobility"),
nfolds = 5, clusterID = "dnum", weightsID = "pw", fpcID = "fpc")
## -----------------------------------------------------------------------------
# simple random sample
cv.svy(apisrs, c("api00~ell",
"api00~ell+meals",
"api00~ell+meals+mobility"),
nfolds = 5, fpcID = "fpc")
## -----------------------------------------------------------------------------
# complex sample from NSFG
library(splines)
cv.svy(NSFG_data, c("income ~ ns(age, df = 1)",
"income ~ ns(age, df = 2)",
"income ~ ns(age, df = 3)",
"income ~ ns(age, df = 4)",
"income ~ ns(age, df = 5)",
"income ~ ns(age, df = 6)"),
nfolds = 4,
strataID = "strata", clusterID = "SECU",
nest = TRUE, weightsID = "wgt")
## -----------------------------------------------------------------------------
#stratified sample
dstrat <- svydesign(id = ~1, strata = ~stype, weights = ~pw, data = apistrat, fpc = ~fpc)
cv.svydesign(formulae = c("api00~ell",
"api00~ell+meals",
"api00~ell+meals+mobility"),
design_object = dstrat, nfolds = 5)
# one-stage cluster sample
dclus1 <- svydesign(id = ~dnum, weights = ~pw, data = apiclus1, fpc = ~fpc)
cv.svydesign(formulae = c("api00~ell",
"api00~ell+meals",
"api00~ell+meals+mobility"),
design_object = dclus1, nfolds = 5)
# simple random sample
dsrs <- svydesign(id = ~1, data = apisrs, fpc = ~fpc)
cv.svydesign(formulae = c("api00~ell",
"api00~ell+meals",
"api00~ell+meals+mobility"),
design_object = dsrs, nfolds = 5)
## -----------------------------------------------------------------------------
NSFG.svydes <- svydesign(id = ~SECU, strata = ~strata, nest = TRUE,
weights = ~wgt, data = NSFG_data)
cv.svydesign(formulae = c("income ~ ns(age, df = 1)",
"income ~ ns(age, df = 2)",
"income ~ ns(age, df = 3)",
"income ~ ns(age, df = 4)",
"income ~ ns(age, df = 5)",
"income ~ ns(age, df = 6)"),
design_object = NSFG.svydes, nfolds = 4)
## -----------------------------------------------------------------------------
#stratified sample
glmstrat <- svyglm(api00 ~ ell+meals+mobility, design = dstrat)
cv.svyglm(glmstrat, nfolds = 5)
# one-stage cluster sample
glmclus1 <- svyglm(api00 ~ ell+meals+mobility, design = dclus1)
cv.svyglm(glmclus1, nfolds = 5)
# simple random sample
glmsrs <- svyglm(api00 ~ ell+meals+mobility, design = dsrs)
cv.svyglm(glmsrs, nfolds = 5)
## -----------------------------------------------------------------------------
NSFG.svyglm <- svyglm(income ~ ns(age, df = 3), design = NSFG.svydes)
cv.svyglm(glm_object = NSFG.svyglm, nfolds = 4)
## -----------------------------------------------------------------------------
NSFG.svyglm.logistic <- svyglm(LBW ~ ns(age, df = 3), design = NSFG.svydes,
family = quasibinomial())
cv.svyglm(glm_object = NSFG.svyglm.logistic, nfolds = 4)
## -----------------------------------------------------------------------------
cv.svydesign(formulae = c("LBW ~ ns(age, df = 1)",
"LBW ~ ns(age, df = 2)",
"LBW ~ ns(age, df = 3)",
"LBW ~ ns(age, df = 4)",
"LBW ~ ns(age, df = 5)",
"LBW ~ ns(age, df = 6)"),
design_object = NSFG.svydes, nfolds = 4,
method = "logistic")
cv.svy(NSFG_data, c("LBW ~ ns(age, df = 1)",
"LBW ~ ns(age, df = 2)",
"LBW ~ ns(age, df = 3)",
"LBW ~ ns(age, df = 4)",
"LBW ~ ns(age, df = 5)",
"LBW ~ ns(age, df = 6)"),
nfolds = 4,
strataID = "strata", clusterID = "SECU",
nest = TRUE, weightsID = "wgt",
method = "logistic")
## ---- eval = FALSE------------------------------------------------------------
# # Based on example("rpms"):
# # model the mean of retirement account value `IRAX` among households with
# # reported retirement account values > 0,
# # predicted from householder education, age, and urban/rural location
# library(rpms)
# data(CE)
#
# # Generate fold IDs that account for clustering in the survey design
# # for the IRAX>0 subset of the CE dataset
# nfolds <- 5
# CEsubset <- CE[which(CE$IRAX > 0), ]
# CEsubset$.foldID <- folds.svy(CEsubset, nfolds = nfolds, clusterID = "CID")
#
# # Use CV to tune the bin_size parameter of rpms_forest()
# bin_sizes <- c(10, 20, 50, 100, 250, 500)
#
# # Create placeholder for weighted Sums of Squared Errors
# SSEs <- rep(0, length(bin_sizes))
#
# for(ff in 1:nfolds) { # For every fold...
# # Use .foldID to split data into training and testing sets
# train <- subset(CEsubset, .foldID != ff)
# test <- subset(CEsubset, .foldID == ff)
#
# for(bb in 1:length(bin_sizes)) { # For every value of the tuning parameter...
# # Fit a new model
# rf <- rpms_forest(IRAX ~ EDUCA + AGE + BLS_URBN,
# data = train,
# weights = ~FINLWT21, clusters = ~CID,
# bin_size = bin_sizes[bb], f_size = 50)
# # Get predictions and squared errors
# yhat <- predict(rf, newdata = test)
# res2 <- (yhat - test$IRAX)^2
# # Sum up weighted SSEs, not MSEs yet,
# # b/c cluster sizes may differ across folds and b/c of survey weights
# SSEs[bb] <- SSEs[bb] + sum(res2 * test$FINLWT21)
# }
# }
#
# # Divide entire weighted sum by the sum of weights
# MSEs <- SSEs / sum(CEsubset$FINLWT21)
#
# # Show results
# cbind(bin_sizes, MSEs)
# #> bin_sizes MSEs
# #> [1,] 10 204246617270
# #> [2,] 20 202870633392
# #> [3,] 50 201393921358
# #> [4,] 100 201085838446
# #> [5,] 250 201825549231
# #> [6,] 500 204155844501
## ---- eval = FALSE------------------------------------------------------------
# CE.svydes <- svydesign(id = ~CID, weights = ~FINLWT21, data = CEsubset)
# # Use update() to add variables to a svydesign object
# CE.svydes <- update(CE.svydes,
# .foldID = folds.svydesign(CE.svydes, nfolds = nfolds))
# # Now the fold IDs are available as CE.svydes$variables$.foldID
# table(CE.svydes$variables$.foldID)
# #> 1 2 3 4 5
# #> 813 923 928 885 960
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.