knitr::opts_chunk$set(echo = TRUE)

Introduction

This document shows how to reproduce the NHANES analysis described in the paper "Scalar on Function Regression\: Estimation and Inference Under Complex Survey Designs".

Load Packages and Create NHANES Analysis Dataset

The NHANES data described in the application section was pre-processed and stored in the rnhanesdata R package.

## Check for packages needed to run analyses/install the rnhanesdata package.
pckgs <- c("devtools","ggplot2","tidyverse","fields","refund","mgcv","surveySoFR")
sapply(pckgs, function(x) if(!require(x,character.only=TRUE,quietly=TRUE)) {
        install.packages(x)
        require(x, character.only=TRUE)
})
rm(list=c("pckgs"))
## Install the rnhanesdata package and dependencies.
## This may take a few minutes because of the size of the data package.
if(!require("rnhanesdata")){
        install_github("andrew-leroux/rnhanesdata",build_vignettes = FALSE)
        require("rnhanesdata")
}

After installing necessary R packages, we next extract NHANES data from the package and organize them into an analyzable format.

## load activity count, wear/non-wear flag, demographic/lifestly, and mortality data
data("PAXINTEN_C"); data("PAXINTEN_D")
data("Flags_C"); data("Flags_D")
data("Covariate_C"); data("Covariate_D")
data("Mortality_2015_C");data("Mortality_2015_D")

## re-code activity counts which are considered "non-wear" to be 0
## this doesn't impact much data, most estimated non-wear times correspond to 0 counts anyway
PAXINTEN_C[,paste0("MIN",1:1440)] <- PAXINTEN_C[,paste0("MIN",1:1440)]*Flags_C[,paste0("MIN",1:1440)]
PAXINTEN_D[,paste0("MIN",1:1440)] <- PAXINTEN_D[,paste0("MIN",1:1440)]*Flags_D[,paste0("MIN",1:1440)]


## merge 2003-2004 and 2005-2006 waves' data
## select only a few key variables of interest
vars_covar <- c("Age","Gender","BMI_cat","Age","Cancer","Diabetes","CHD","CHF","MobilityProblem","SmokeCigs","DrinkStatus","Stroke")
PAXINTEN <- bind_rows(PAXINTEN_C, PAXINTEN_D)
Flags    <- bind_rows(Flags_C, Flags_D)
Covariate <- bind_rows(Covariate_C, Covariate_D) %>% 
        mutate(Age = RIDAGEEX/12) %>% 
        dplyr::select(SEQN, one_of(vars_covar),
                      WTMEC2YR, WTINT2YR,SDMVPSU, SDMVSTRA) 
Mortality <- bind_rows(Mortality_2015_C, Mortality_2015_D)
## clear up the workspace
rm(PAXINTEN_C, PAXINTEN_D, Flags_C, Flags_D, Covariate_C, Covariate_D, Mortality_2015_C, Mortality_2015_D)

## 1) subset to good days of data (>= 10 hours estiamted wear time) and good quality data indicators (PAXCAL/PAXSTAT)
## 2) subset to >= 3 days of good data
PAXINTEN$nmins_wear <- rowSums(Flags[,paste0("MIN",1:1440)], na.rm=TRUE)
PAXINTEN <- 
        PAXINTEN %>% 
        ## define a good day as >= 10 hours of wear + well calibrated devicve
        mutate(good_day = as.numeric(nmins_wear >= 600 & PAXCAL %in% 1 & PAXSTAT %in% 1)) %>%
        ## remove "bad" days
        filter(good_day %in% 1) %>% 
        dplyr::select(-PAXCAL, -PAXSTAT) %>% 
        ## calculate subject specific number of good days
        group_by(SEQN) %>% 
        mutate(n_good_days = sum(good_day),
               day = 1:n()) %>% 
        ungroup() %>% 
        ## only consider individuals with at least 3 days of good data
        filter(n_good_days >= 3) %>%
        mutate(dow_fac = factor(WEEKDAY, levels=1:7, labels=c("Sun","Mon","Tue","Wed","Thu","Fri","Sat"))) %>% 
        left_join(Covariate, by="SEQN") %>% 
        left_join(Mortality, by="SEQN") %>% 
        arrange(SEQN, day) %>% 
        drop_na(one_of(vars_covar)) %>% 
        filter(!Cancer %in% "Don't know", !Diabetes %in% "Don't know", !CHD %in% "Don't know",
               !CHF %in% "Don't know", !Stroke %in% "Don't know")
rm(Flags, Covariate, Mortality)

## Calculate PA profiles features
NJ  <- nrow(PAXINTEN)
uid <- unique(PAXINTEN$SEQN)
nid <- length(uid)

## extract just activity counts
X <- as.matrix(PAXINTEN[,paste0("MIN",1:1440)])
## recode the few missing values as 0
X[is.na(X)] <- 0
## get log activity counts and smooth via fpca
lX    <- log(1+X)
lX_sm <- fpca.face(lX, knots=50)$Yhat

## get subject specific average profiles
inx_rows  <- split(1:NJ, factor(PAXINTEN$SEQN, levels=unique(PAXINTEN$SEQN)))
lX_sm_avg <- t(vapply(inx_rows, function(x) colMeans(lX_sm[x,]), numeric(1440)))

## create matrices for "time of day" and numeric integration via Riemann integration
sind <- seq(0,1,len=1440)
smat <- matrix(1, nid, 1) %x% matrix(sind, 1, 1440)
lmat <- matrix(1/1440, nid, 1440)

## get dataframe with one row per subejct,
## create indicator for 5-year mortality (Katia, I'm assuming this is what you want -- if not let me know)
df_fit <- 
        PAXINTEN %>% 
        dplyr::select(-one_of(paste0("MIN",1:1440))) %>% 
        group_by(SEQN) %>% 
        slice(1) %>% 
        ungroup() %>%
        ## note that because we have at least 5-years of follow-up for everyone I don't worry 
        ## about checking for that, but we do replace accidental deaths occuring within 5 years as "no event"
        mutate(mortstat = replace(mortstat, (ucod_leading %in% "004") & (permth_exm/12 <= 5), 0),
               mort_5yr = as.numeric(mortstat == 1 & (permth_exm/12 <= 5) ))  %>% 
        ## remove columns which were associated with individual days
        dplyr::select(-one_of(c("day","dow_fac","good_day","nmins_wear")))

## merge PA data into the dataframe
df_fit$smat <- I(smat)                      # used for fitting FGLM (functional domain matrix)
df_fit$lX_lmat <- I(lX_sm_avg * lmat)       # used for fitting FGLM (numeric integration times functional predictor)
## remove people under the age of 50, reweight
df_fit <- 
        df_fit %>% 
        filter(Age >= 50, !is.na(mort_5yr)) %>% 
        reweight_accel()
rm(lmat, smat, inx_rows, nid, uid, NJ, X, lX, lX_sm, lX_sm_avg)

Fit Survey-Weighted Scalar-on-Function Regression Models

We next compare the inference results using three approaches described in the paper: Fisher Information Based Standard Errors, BRR, and Survey Weighted Bootstrap.

Fisher Information Based Standard Errors

## create dataframe used for extracting predicted coefficient
ns_pred <- 1000
sind_pred <- seq(0,1,len=ns_pred)
df_pred <-data.frame(lX_lmat=1, smat=sind_pred, df_fit[1,vars_covar])
## fit the functional generalized linear models (FGLM)
kt_fglm       <- 30
fit_SOFR_w <- gam(mort_5yr ~ Age + Gender + BMI_cat + Cancer + Diabetes + CHD + CHF + MobilityProblem + SmokeCigs + DrinkStatus + Stroke + 
                          s(smat, by=lX_lmat, k=kt_fglm, bs="cc"), data=df_fit, family=binomial(), weights=wtmec4yr_adj_norm, method="REML")
fit_SOFR_uw <- gam(mort_5yr ~  Age + Gender + BMI_cat + Cancer + Diabetes + CHD + CHF + MobilityProblem + SmokeCigs + DrinkStatus + Stroke + 
                           s(smat, by=lX_lmat, k=kt_fglm, bs="cc"), data=df_fit, family=binomial(), method="REML")
## get the estimated coefficient plus intercept term: \beta_0 + \gamma(s)   for s \in [0,1]
est_w  <- predict(fit_SOFR_w, newdata=df_pred, type='terms', se.fit=TRUE)
est_uw <- predict(fit_SOFR_uw, newdata=df_pred, type='terms', se.fit=TRUE)

BRR

## create a variable which represents the combination of strata and psu
df_fit$STRATA_PSU <- paste0(df_fit$SDMVSTRA, "_", df_fit$SDMVPSU)

## get unique strata/PSU combinations
usp   <- unique(df_fit$STRATA_PSU)
n_usp <- length(usp)
## get unique strata
us   <- unique(df_fit$SDMVSTRA)
n_us <- length(us)
## get number of subjects 
N <- nrow(df_fit)

## set seed for reproducibility
set.seed(100)

df_fit$strata_PSU <- df_fit$STRATA_PSU
df_fit$weights <- df_fit$wtmec4yr_unadj_norm

## get BRR estimated SEs
fit <- doBRR(df=df_fit, fit=fit_SOFR_w, uPSU=1:2,us=29:58, nbrr=100, weights="weights",strata="SDMVSTRA",PSU="SDMVPSU")
gamma_hat <- fit$coefficients[-c(1:17),-c(1:17)]
vG <- var(gamma_hat)
Phi <- predict(fit_SOFR_w, newdata=df_pred, type='lpmatrix')[,-c(1:17)]
vgamma_s <- Phi %*% vG %*% t(Phi)
se_BRR_gamma <- sqrt(diag(vgamma_s))

Survey Weighted Bootstrap

## get weighted bootstrap SEs
nboot <- 100
coef_mat_boot <- matrix(NA, nboot, ns_pred)
# pb_boot <- txtProgressBar(min=0, max=nboot,style=3)
inq_q_mat <- matrix(NA, nboot, dim(df_fit)[1])
for(q in 1:nboot){
        inx_q <- sample(1:dim(df_fit)[1], size=dim(df_fit)[1], prob=df_fit$wtmec4yr_adj_norm, replace=TRUE)
        inq_q_mat[q,] <- inx_q

        df_fit_q <- df_fit[inx_q,]
        fit_gnq <- gam(fit_SOFR_w$formula, family=binomial(),method="REML", data=df_fit_q)
        coef_mat_boot[q,] <- predict(fit_gnq, newdata=df_pred, type="iterms")[,"s(smat):lX_lmat"]
        # setTxtProgressBar(pb_boot, value=q)
}
se_boot_gamma <- apply(coef_mat_boot, 2, sd, na.rm=TRUE)

Results

After obtaining the standard error estimates using each method, we visualize the results on a single plot. The plot is shown as Figure 5 in the manuscript.

## point estimates of the functional coefficient and various SE estimates,
## combine into a dataframe in long format for plotting via ggplot
df_plt <- 
        data.frame("estimate" = c(rep(est_w$fit[,"s(smat):lX_lmat"], 3), est_uw$fit[,"s(smat):lX_lmat"]), 
                   "se" = c(est_w$se.fit[,"s(smat):lX_lmat"], 
                            se_BRR_gamma, 
                            se_boot_gamma,
                            est_uw$se.fit[,"s(smat):lX_lmat"]),
                   "weighted" = c(rep("weighted", 3*ns_pred),
                                  rep("unweighted", ns_pred)),
                   "estimator" = c(rep("Fisher Information",ns_pred),
                                   rep("BRR", ns_pred),
                                   rep("Weighted Bootstrap", ns_pred), 
                                   rep("Fisher Information", ns_pred)),
                   "domain" = rep(sind_pred, 4)
                   )

xinx <- (c(1,6,12,18,23)*60+1)/1440
xinx_lab <- c("01:00","06:00","12:00","18:00","23:00")
plt_1 <- 
        df_plt %>% 
        filter(weighted == "weighted") %>% 
        mutate(LB = estimate - qnorm(0.975)*se,
               UB = estimate + qnorm(0.975)*se) %>% 
        ggplot() + 
        geom_line(aes(x=domain, y=estimate), color="black") + 
        geom_line(aes(x=domain, y=LB,color=estimator,lty=estimator)) + 
        geom_line(aes(x=domain, y=UB,color=estimator,lty=estimator)) + 
        theme_classic()  + 
        geom_hline(yintercept=0, col='grey',lty=2,lwd=1) +
        scale_x_continuous(breaks=xinx, labels=xinx_lab) + 
        xlab("Time of Day (s)") + ggtitle("Estimated Functional Coefficient with 95% CIs") + 
        ylab(expression(hat(gamma)(s) %+-% z[0.975] ~ SE(hat(gamma)(s)))) + 
        theme(legend.position=c(0.5, 0.85)) + 
        labs(color="SE Estimator", lty="SE Estimator")
plt_1


andrew-leroux/surveySoFR documentation built on Feb. 18, 2022, 1:42 a.m.