nhanes: NHANES Data Retrieval

## ----loadlibs, eval = FALSE, message = FALSE, warning = FALSE-----------------
# library("nhanesA")
# 
# demoj = nhanes("DEMO_J")
# dim(demoj)

## ----ll_print, echo = FALSE---------------------------------------------------
cat(sprintf("[1] 9254   46"))

## ----merge_example, eval = FALSE----------------------------------------------
# ## merge DEMO_J and BXP_J using SEQN.
# bpxj = nhanes("BPX_J")
# data = merge(demoj, bpxj, by="SEQN")
# dim(data)

## ----merge_print, echo = FALSE------------------------------------------------
cat(sprintf("[1] 8704   66"))

## ----survey, eval = FALSE, warning = FALSE, message = FALSE-------------------
# library("survey")
# nhanesDesign <- svydesign(id = ~SDMVPSU,  # Primary Sampling Units (PSU)
#                           strata  = ~SDMVSTRA, # Stratification used in the survey
#                           weights = ~WTMEC2YR,   # Survey weights
#                           nest    = TRUE,      # Whether PSUs are nested within strata
#                           data    = data)
# 

## ----surveydesign, eval = FALSE, message = FALSE, warning = FALSE-------------
# 
# # subset survey design object
# dfsub = subset(nhanesDesign, data$RIDAGEYR>=40)
# 
# # subset the original dataset
# datasub = data[data$RIDAGEYR>=40,]
# 

## ----ethtables, eval = FALSE, message = FALSE, warning=FALSE------------------
# 
# ## mean on total data set
# 
# mean(datasub$BPXDI1, na.rm = TRUE)

## ----ethtables_print, echo = FALSE--------------------------------------------
cat(sprintf("[1] 73.04455"))

## ----eth_means, eval = FALSE--------------------------------------------------
# ##split the data by ethnicity and calculate mean of the unweighted data
# 
# unweighted_means <- sapply(split(datasub$BPXDI1, datasub$RIDRETH1), mean, na.rm=TRUE)
# unweighted_means
# 

## ----eth_means_print, echo = FALSE--------------------------------------------
df <- data.frame(matrix(1,nrow=1,ncol=5))
names(df) <- c('   Mexican American','   Other Hispanic','   Non-Hispanic White',
               '   Non-Hispanic Black','   Other Race - Including Multi-Racial')
df[1,] <- list('72.41000','72.97611','70.84130','75.71466','74.41311')

print(df,row.names=FALSE)

## ----svyby, eval = FALSE, message = FALSE, warning = FALSE--------------------
# adjmns = svymean(~BPXDI1, dfsub, na.rm=TRUE)
# adjmns

## ----svyby_print, echo = FALSE------------------------------------------------
df <- data.frame(matrix(1,nrow=1,ncol=3))
names(df) <- c('','mean', 'SE')
df[1,] <- list('BPXDI1', '73.237', '0.5597')

print(df,row.names=FALSE)

## ----eth_combined, eval = FALSE-----------------------------------------------
# # By ethnicity
# adjmnsbyEth = svyby(~BPXDI1, ~RIDRETH1, dfsub, svymean, na.rm=TRUE)
# 
# weighted_means <- as.numeric(adjmnsbyEth$BPXDI1)
# 
# combined_data <- cbind(unweighted_means, weighted_means)
# colnames(combined_data) <- c("Unweighted", "Weighted")
# combined_data
# 

## ----eth_combined_print, echo = FALSE-----------------------------------------
df_uw <- data.frame(matrix(1,nrow=5,ncol=3))
names(df_uw) <- list('','Unweighted', 'Weighted')
df_uw[1,] <- list('Mexican American','72.41000','74.03194')
df_uw[2,] <- list('Other Hispanic','72.97611','74.73756')
df_uw[3,] <- list('Non-Hispanic White','70.84130','72.45422')
df_uw[4,] <- list('Non-Hispanic Black','75.71466','75.71874')
df_uw[5,] <- list('Other Race - Including Multi-Racial','74.41311','74.60215')

print(df_uw,row.names=FALSE)

## ----bygender, eval = FALSE, message = FALSE, warning = FALSE-----------------
# # By Gender
# mns = sapply(split(datasub$BPXDI1, datasub$RIAGENDR), mean, na.rm=TRUE)
# 
# adjmnsbyGen = svyby(~BPXDI1, ~RIAGENDR, dfsub, svymean, na.rm=TRUE)
# 
# combined_data <- cbind(mns, adjmnsbyGen$BPXDI1)
# colnames(combined_data) <- c("Unweighted", "Weighted")
# combined_data
# 

## ----bygender_print, echo = FALSE---------------------------------------------
df <- data.frame(matrix(1,nrow=2,ncol=3))
names(df) <- c('','Unweighted', 'Weighted')
df[1,] <- list('Male', '74.67330', '75.31134')
df[2,] <- list('Female', '71.43385', '71.31453')

print(df,row.names=FALSE)

## ----svq1, eval = FALSE, message = FALSE, warning = FALSE---------------------
# 
# # For the unweighted data
# 
# quantile(datasub$BPXDI1, c(0.25,0.5,.75), na.rm = TRUE)

## ----echo = FALSE-------------------------------------------------------------
df <- data.frame(matrix(1,nrow=1,ncol=3))
names(df) <- c('25%', '50%', '75%')
df[1,] <- list('66', '74', '82')

print(df,row.names=FALSE)

## ----svq2, eval = FALSE-------------------------------------------------------
# # For the survey weighted data
# svyquantile(~BPXDI1, dfsub, quantiles = c(0.25,0.5,0.75), na.rm=TRUE)

## ----svq2_print, message = FALSE, echo = FALSE--------------------------------
df <- data.frame(matrix(1,nrow=9,ncol=1))
names(df) <- '$BPXDI1                                '
df[1,] <- '    quantile ci.2.5 ci.97.5        se'
df[2,] <- '0.25       66     66      68 0.4691643'
df[3,] <- '0.5        74     74      76 0.4691643'
df[4,] <- '0.75       80     80      82 0.4691643'
df[5,] <- ''
df[6,] <- 'attr(,"hasci")                       '
df[7,] <- '[1] TRUE                             '
df[8,] <- 'attr(,"class")                       '
df[9,] <- '[1] "newsvyquantile"                 '

print(df, row.names=FALSE)

## ----svq3, eval = FALSE-------------------------------------------------------
# # By Gender
# svyby(~BPXDI1, ~RIAGENDR, dfsub, svyquantile, quantiles = c(0.5), na.rm=TRUE)
# 

## ----svq3_print, echo = FALSE-------------------------------------------------
df <- data.frame(matrix(1,nrow=2,ncol=3))
names(df) <- c('RIAGENDR','BPXDI1','se.BPXDI1')
df[1,] <- list('Male','76','0.9383286')
df[2,] <- list('Female','72','0.4691643')

print(df, row.names=FALSE)

## ----eval = FALSE, message = FALSE, warning = FALSE---------------------------
# 
# # For the entire dataset
# svyvar(~BPXDI1, dfsub, quantiles = c(0.25,0.5,0.75), na.rm=TRUE)

## ----echo=FALSE---------------------------------------------------------------
df <- data.frame(matrix(1,nrow=1,ncol=3))
names(df) <- c('', 'variance', 'SE')
df[1,] <- list('BPXDI1', '173.68', '12.279')

print(df,row.names=FALSE)

## ----eval = FALSE, message = FALSE, warning = FALSE---------------------------
#   # By ethnicity
# svyby(~BPXDI1, ~RIDRETH1, dfsub, svyvar, na.rm=TRUE)

## ----echo=FALSE---------------------------------------------------------------
df <- data.frame(matrix(1,nrow=5,ncol=3))
names(df) <- c('RIDRETH1','BPXDI1','se')
df[1,] <- list('Mexican American','200.8781','45.43074')
df[2,] <- list('Other Hispanic','160.0157','23.79938')
df[3,] <- list('Non-Hispanic White','168.3504','18.30355')
df[4,] <- list('Non-Hispanic Black','202.3565','37.12593')
df[5,] <- list('Other Race - Including Multi-Racial','156.9968','17.26037')

print(df,row.names=FALSE)

## ----eval = FALSE, message = FALSE, warning = FALSE---------------------------
#   # By Gender
# svyby(~BPXDI1, ~RIAGENDR, dfsub, svyvar, na.rm=TRUE)
# 

## ----echo=FALSE---------------------------------------------------------------
df <- data.frame(matrix(1,nrow=2,ncol=3))
names(df) <- c('RIAGENDR', 'BPXDI1', 'se')
df[1,] <- list('Male', '141.2704', '10.52796')
df[2,] <- list('Female', '196.1199', '20.65706')

print(df,row.names=FALSE)

## ----eval = FALSE, message = FALSE, warning = FALSE---------------------------
# weighted_model <- svyglm(BPXDI1 ~ RIDAGEYR + RIDRETH1, design = dfsub, family = gaussian())
# summary(weighted_model)

## ----svyglm, echo = FALSE-----------------------------------------------------
cat(sprintf(paste('Call:',
            '\n',
            '\nsvyglm(formula = BPXDI1 ~ RIDAGEYR + RIDRETH1, design = dfsub,',
            '\n    family = gaussian())',
            '\n',
            '\nSurvey design:',
            '\nsubset(nhanesDesign, data$RIDAGEYR >= 40)',
            '\n',
            '\nCoefficients:',
            '\n                                            Estimate Std. Error t value',
            '\n(Intercept)                                 90.96516    1.24720  72.935',
            '\nRIDAGEYR                                    -0.31522    0.01853 -17.012',
            '\nRIDRETH1Other Hispanic                       1.43894    1.25518   1.146',
            '\nRIDRETH1Non-Hispanic White                   0.49810    0.73132   0.681',
            '\nRIDRETH1Non-Hispanic Black                   2.92332    1.09594   2.667',
            '\nRIDRETH1Other Race - Including Multi-Racial  1.38532    0.63780   2.172',
            '\n                                            Pr(>|t|)',
            '\n(Intercept)                                 5.73e-15 ***',
            '\nRIDAGEYR                                    1.04e-08 ***',
            '\nRIDRETH1Other Hispanic                        0.2783',
            '\nRIDRETH1Non-Hispanic White                    0.5113',
            '\nRIDRETH1Non-Hispanic Black                    0.0236 *',
            '\nRIDRETH1Other Race - Including Multi-Racial   0.0550 .',
            '\n---',
            "\nSignif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1",
            '\n',
            '\n(Dispersion parameter for gaussian family taken to be 162.6653)',
            '\n',
            '\nNumber of Fisher Scoring iterations: 2'
            )))

## ----fig.width=7, eval = FALSE, message = FALSE, warning = FALSE--------------
# library(ggplot2)
# 
# ## recalculating means (same as above)
# 
# unweighted_means <- sapply(split(datasub$BPXDI1, datasub$RIDRETH1), mean, na.rm=TRUE)
# weighted_means <- as.numeric(adjmnsbyEth$BPXDI1)
# 
# 
# plot_data <- data.frame(
#   Ethnicity = factor(rep(names(unweighted_means), 2),
#                      levels = names(unweighted_means)),
#   Means = c(unweighted_means, weighted_means),
#   Type = factor(c(rep("Unweighted", length(unweighted_means)),
#                   rep("Weighted", length(weighted_means))),
#                levels = c("Unweighted", "Weighted"))
# )
# 
# # Creating the plot
# ggplot(plot_data, aes(x = Ethnicity, y = Means, fill = Type)) +
#   geom_bar(stat = "identity", position = "dodge", width = 0.6) +
#   labs(title = "Comparison of Diastolic Blood Pressure by Ethnicity",
#        y = "Mean Diastolic Blood Pressure") +
#   scale_x_discrete(labels = c('Mex-Amer','Other Hisp','White', 'Black', 'Other')) +
#   scale_fill_manual(values = c("blue", "red")) +
#   theme_minimal() +
#   theme(legend.title = element_blank())
# 
# 

## ----bp_by_eth_plot, echo = FALSE, fig.width=7, message = FALSE, warning = FALSE----
unweighted_means <- as.numeric(df_uw$Unweighted)
names(unweighted_means) <- df_uw[[1]]
weighted_means <- as.numeric(df_uw$Weighted)
plot_data <- data.frame(
  Ethnicity = factor(rep(names(unweighted_means), 2),
                     levels = names(unweighted_means)),
  Means = c(unweighted_means, weighted_means),
  Type = factor(c(rep("Unweighted", length(unweighted_means)),
                  rep("Weighted", length(weighted_means))),
               levels = c("Unweighted", "Weighted"))
)

# Creating the plot
library(ggplot2)
ggplot(plot_data, aes(x = Ethnicity, y = Means, fill = Type)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.6) +
  labs(title = "Comparison of Diastolic Blood Pressure by Ethnicity",
       y = "Mean Diastolic Blood Pressure") +
  scale_x_discrete(labels = c('Mex-Amer','Other Hisp','White', 'Black', 'Other')) +
  scale_fill_manual(values = c("blue", "red")) +
  theme_minimal() +
  theme(legend.title = element_blank())

## ----fig.width=7, eval = FALSE, message = FALSE, warning = FALSE--------------
# 
# unweighted_model <- glm(BPXDI1 ~ RIDAGEYR + RIDRETH1, data = datasub, family = gaussian())
# weighted_model <- svyglm(BPXDI1 ~ RIDAGEYR + RIDRETH1, design = dfsub, family = gaussian())
# 
# # For the unweighted model
# unweighted_summary <- summary(unweighted_model)
# unweighted_coefs <- unweighted_summary$coefficients[, "Estimate"]
# unweighted_se <- unweighted_summary$coefficients[, "Std. Error"]
# 
# # For the weighted model
# weighted_summary <- summary(weighted_model)
# weighted_coefs <- as.numeric(weighted_summary$coefficients[, "Estimate"])
# weighted_se <- as.numeric(weighted_summary$coefficients[, "Std. Error"])
# 
# comparison <- data.frame(
#   Variable = names(unweighted_coefs),
#   Unweighted = unweighted_coefs,
#   Weighted = as.numeric(weighted_coefs)
# )
# 
# print(comparison)

## ----visregcompare, echo = FALSE----------------------------------------------
unwt_coef_names <- c("(Intercept)", "RIDAGEYR", "RIDRETH1Other Hispanic", "RIDRETH1Non-Hispanic White", "RIDRETH1Non-Hispanic Black", "RIDRETH1Other Race - Including Multi-Racial")
unweighted_coefs <- c(92.2848149, -0.3460552,  1.2325637,  0.7553912,  4.2996735,  2.0667520)
weighted_coefs <- c(90.9651590, -0.3152158,  1.4389441,  0.4981020,  2.9233165,  1.3853204)

comparison <- data.frame(
  Variable = unwt_coef_names,
  Unweighted = unweighted_coefs,
  Weighted = as.numeric(weighted_coefs)
)

print(comparison)

## ----visregress, eval = FALSE-------------------------------------------------
# unweighted_df <- data.frame(Variable = names(unweighted_coefs),
#                             Estimate = unweighted_coefs,
#                             SE = unweighted_se,
#                             Type = "Unweighted")
# 
# weighted_df <- data.frame(Variable = names(unweighted_coefs),
#                           Estimate = weighted_coefs,
#                           SE = weighted_se,
#                           Type = "Weighted")
# 
# plot_data <- rbind(unweighted_df, weighted_df)
# 
# ggplot(subset(plot_data, Variable!='(Intercept)'), aes(x = Estimate, y = reorder(Variable, Estimate), color = Type)) +
#   geom_point(position = position_dodge(0.5), size = 2.5) +
#   geom_errorbarh(aes(xmin = Estimate - SE, xmax = Estimate + SE),
#                  height = 0.2, position = position_dodge(0.5)) +
#   geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
#   labs(title = "Comparison of Regression Coefficients",
#        x = "Coefficient Value", y = "Predictors") +
#   theme_minimal() +
#   scale_color_manual(values = c("Unweighted" = "blue", "Weighted" = "red")) +
#   theme(legend.title = element_blank())
# 

## ----regression_coeff, fig.width=7, echo = FALSE, message = FALSE, warning = FALSE----


unweighted_coefs <- c(92.2848149, -0.3460552,  1.2325637,  0.7553912,  4.2996735,  2.0667520)
weighted_coefs <- c(90.9651590, -0.3152158,  1.4389441,  0.4981020,  2.9233165,  1.3853204)
names(unweighted_coefs) <- c("(Intercept)", "RIDAGEYR", "RIDRETH1Other Hispanic", "RIDRETH1Non-Hispanic White", "RIDRETH1Non-Hispanic Black", "RIDRETH1Other Race - Including Multi-Racial")

unweighted_se <- c(1.36340677, 0.02061825, 1.04001732, 0.79478016, 0.83753015, 0.86954266)
weighted_se <- c(1.24720135, 0.01852892, 1.25517931, 0.73131800, 1.09593669, 0.63780221)


unweighted_df <- data.frame(Variable = names(unweighted_coefs), 
                            Estimate = unweighted_coefs, 
                            SE = unweighted_se, 
                            Type = "Unweighted")

weighted_df <- data.frame(Variable = names(unweighted_coefs), 
                          Estimate = weighted_coefs, 
                          SE = weighted_se, 
                          Type = "Weighted")

plot_data <- rbind(unweighted_df, weighted_df)

ggplot(subset(plot_data, Variable!='(Intercept)'), aes(x = Estimate, y = reorder(Variable, Estimate), color = Type)) +
  geom_point(position = position_dodge(0.5), size = 2.5) +
  geom_errorbarh(aes(xmin = Estimate - SE, xmax = Estimate + SE),
                 height = 0.2, position = position_dodge(0.5)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  labs(title = "Comparison of Regression Coefficients",
       x = "Coefficient Value", y = "Predictors") +
  theme_minimal() +
  scale_color_manual(values = c("Unweighted" = "blue", "Weighted" = "red")) +
  theme(legend.title = element_blank())