gwqs-vignette.R
In gWQS: Generalized Weighted Quantile Sum Regression

## ----setup, include=FALSE-----------------------------------------------------
# knitr::opts_chunk$set(cache = TRUE)

library(cowplot)


## ---- echo=TRUE, eval=TRUE----------------------------------------------------

library(gWQS)
library(ggplot2)
library(knitr)
library(kableExtra)
library(reshape2)

# we save the names of the mixture variables in the variable "PCBs"
PCBs <- names(wqs_data)[1:34]
# we run the model and save the results in the variable "results2i"
results2i <- gwqs(yLBX ~ pwqs + nwqs, mix_name = PCBs, data = wqs_data, 
                  q = 10, validation = 0.6, b = 5, b1_pos = TRUE, rh = 5,
                  family = "gaussian", seed = 2016)


## ---- echo=TRUE, eval=TRUE----------------------------------------------------

summary(results2i)


## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  
#  # bar plot
#  gwqs_barplot(results2i)
#  # scatter plot y vs wqs
#  gwqs_scatterplot(results2i)
#  # scatter plot residuals vs fitted values
#  gwqs_fitted_vs_resid(results2i)
#  # boxplot of the weights estimated at each repeated holdout step
#  gwqs_boxplot(results2i)
#  

## ----model1, results='asis', fig.show='hold', fig.height=8, fig.width=8, echo=FALSE, fig.cap="Plots displayed for linear outcomes.", message=FALSE----

PCBs <- names(wqs_data)[1:34]
results2i <- gwqs(yLBX ~ pwqs + nwqs, mix_name = PCBs, data = wqs_data, 
                q = 10, validation = 0.6, b = 5, b1_pos = TRUE, rh = 5,
                family = "gaussian", seed = 2016)

w_ord <- order(results2i$final_weights$`Estimate pos`)
mean_weight_pos <- results2i$final_weights$`Estimate pos`[w_ord]
mean_weight_neg <- results2i$final_weights$`Estimate neg`[w_ord]
mix_name <- factor(results2i$final_weights$mix_name[w_ord], 
                   levels = results2i$final_weights$mix_name[w_ord])
data_plot <- data.frame(mean_weight = c(mean_weight_pos, mean_weight_neg), 
                        mix_name = rep(mix_name, 2),
                        index = factor(rep(c("pwqs", "nwqs"), each = length(w_ord)), 
                                       levels = c("pwqs", "nwqs")))
bar_plot_h <- ggplot(data_plot, aes(x = mix_name, y = mean_weight)) + 
  geom_bar(stat = "identity", color = "black") + theme_bw() +
  theme(axis.ticks = element_blank(),
        axis.title = element_blank(),
        axis.text.x = element_text(color='black'),
        legend.position = "none") + coord_flip() + ggtitle("A") +
  geom_hline(yintercept = 1/length(PCBs), linetype="dashed", color = "red") +
  facet_wrap(~ index)

yadj_vs_wqs <- ggplot(melt(results2i$y_wqs_df, measure.vars = c("pwqs", "nwqs")), aes(value, y_adj)) + 
  geom_point() + facet_wrap(~ variable) + xlab("wqs") + 
  stat_smooth(method = "loess", se = FALSE, linewidth = 1.5) + theme_bw() + ggtitle("B")

fit_df <- data.frame(fitted = fitted(results2i), 
                     resid = residuals(results2i, type = "response"))
res_vs_fitted <- ggplot(fit_df, aes(x = fitted, y = resid)) + geom_point() + 
  theme_bw() + xlab("Fitted values") + ylab("Residuals") + ggtitle("C")

wboxplot <- melt(results2i$wmat, varnames = c("rh", "mix_name"))
wboxplot$mix_name <- factor(wboxplot$mix_name, levels = results2i$final_weights$mix_name)
wboxplot$L1 <- factor(wboxplot$L1, levels = c("wmatpos", "wmatneg"), labels = c("pwqs", "nwqs"))
box_plot <- ggplot(wboxplot, aes(x = mix_name, y = value)) +
  geom_boxplot(outlier.shape = " ") + theme_bw() + facet_wrap(~ L1, ncol = 1) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ylab("Weight") +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 3) + 
  geom_hline(yintercept = 1/length(PCBs), linetype="dashed", color = "red") +
  geom_jitter(alpha = 0.3) + ggtitle("D")

plot_grid(bar_plot_h, yadj_vs_wqs, res_vs_fitted, box_plot, nrow = 2, ncol = 2)


## ---- echo=TRUE, eval=TRUE----------------------------------------------------

head(results2i$final_weights)


## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  
#  gwqs_summary_tab(results2i)
#  

## ----sum1, results='asis', echo=FALSE-----------------------------------------

gwqs_summary_tab(results2i, caption = "Summary results of the WQS regression for linear outcomes.")


## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  
#  mf_df <- as.data.frame(signif(coef(summary(results2i)), 3))
#  kable_styling(kable(mf_df, row.names = TRUE))
#  

## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  
#  gwqs_weights_tab(results2i)
#  

## ----w1, echo=FALSE, eval=TRUE------------------------------------------------

final_weight <- results2i$final_weights
final_weight[, -1] <- signif(final_weight[, -1], 3)
scroll_box(kable_styling(kable(final_weight, row.names = FALSE, caption = "Weights table of the WQS regression for linear outcomes.")), height = "400px")


## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  
#  final_weight <- results2i$final_weights
#  final_weight[, -1] <- signif(final_weight[, -1], 3)
#  kable_styling(kable(final_weight, row.names = FALSE))
#  

## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  # bar plot
#  w_ord <- order(results2i$final_weights$`Estimate pos`)
#  mean_weight_pos <- results2i$final_weights$`Estimate pos`[w_ord]
#  mean_weight_neg <- results2i$final_weights$`Estimate neg`[w_ord]
#  mix_name <- factor(results2i$final_weights$mix_name[w_ord],
#                     levels = results2i$final_weights$mix_name[w_ord])
#  data_plot <- data.frame(mean_weight = c(mean_weight_pos, mean_weight_neg),
#                          mix_name = rep(mix_name, 2),
#                          index = factor(rep(c("pwqs", "nwqs"), each = length(w_ord)),
#                                         levels = c("pwqs", "nwqs")))
#  ggplot(data_plot, aes(x = mix_name, y = mean_weight)) +
#    geom_bar(stat = "identity", color = "black") + theme_bw() +
#    theme(axis.ticks = element_blank(),
#          axis.title = element_blank(),
#          axis.text.x = element_text(color='black'),
#          legend.position = "none") + coord_flip() +
#    geom_hline(yintercept = 1/length(PCBs), linetype="dashed", color = "red") +
#    facet_wrap(~ index)
#  #
#  # scatter plot y vs wqs
#  ggplot(melt(results2i$y_wqs_df, measure.vars = c("pwqs", "nwqs")), aes(value, y_adj)) +
#    geom_point() + facet_wrap(~ variable) + xlab("wqs") +
#    stat_smooth(method = "loess", se = FALSE, linewidth = 1.5) + theme_bw()
#  #
#  # scatter plot residuals vs fitted values
#  fit_df <- data.frame(fitted = fitted(results2i),
#                       resid = residuals(results2i, type = "response"))
#  res_vs_fitted <- ggplot(fit_df, aes(x = fitted, y = resid)) + geom_point() +
#    theme_bw() + xlab("Fitted values") + ylab("Residuals")

## ---- echo=TRUE, eval=TRUE----------------------------------------------------

# we run the model setting the penalization term equal to 90
results2i_l90 <- gwqs(yLBX ~ pwqs + nwqs, mix_name = PCBs, data = wqs_data, 
                    q = 10, validation = 0.6, b = 5, b1_pos = TRUE, rh = 5,
                    family = "gaussian", seed = 2016, lambda = 90)

# we run the model setting the penalization term equal to 900
results2i_l900 <- gwqs(yLBX ~ pwqs + nwqs, mix_name = PCBs, data = wqs_data, 
                     q = 10, validation = 0.6, b = 5, b1_pos = TRUE, rh = 5,
                     family = "gaussian", seed = 2016, lambda = 900)

# we run the model setting the penalization term equal to 900
results2i_l9000 <- gwqs(yLBX ~ pwqs + nwqs, mix_name = PCBs, data = wqs_data, 
                      q = 10, validation = 0.6, b = 5, b1_pos = TRUE, rh = 5,
                      family = "gaussian", seed = 2016, lambda = 9000)


## ---- echo=TRUE, eval=TRUE----------------------------------------------------

lambda_AIC_2i <- data.frame(lambda = c(0, 90, 900, 9000),
                            AIC = c(results2i$fit$aic, results2i_l90$fit$aic, 
                                    results2i_l900$fit$aic, results2i_l9000$fit$aic))
kable(lambda_AIC_2i) %>% kable_styling()


## ---- echo=TRUE, eval=TRUE----------------------------------------------------

summary(results2i_l90)


## ---- results='asis', fig.show='hold', fig.height=8, fig.width=8, echo=TRUE, message=FALSE----

gwqs_barplot(results2i_l90)


## ---- echo=TRUE, eval=TRUE----------------------------------------------------

results1i <- gwqs(yLBX ~ wqs, mix_name = PCBs, data = wqs_data, 
                  q = 10, validation = 0.6, b = 5, b1_pos = TRUE, rh = 5,
                  family = "gaussian", seed = 2016)

# we run the model setting the penalization term equal to 90
results1i_l90 <- gwqs(yLBX ~ wqs, mix_name = PCBs, data = wqs_data, 
                    q = 10, validation = 0.6, b = 5, b1_pos = TRUE, rh = 5,
                    family = "gaussian", seed = 2016, lambda = 90)

# we run the model setting the penalization term equal to 900
results1i_l900 <- gwqs(yLBX ~ wqs, mix_name = PCBs, data = wqs_data, 
                     q = 10, validation = 0.6, b = 5, b1_pos = TRUE, rh = 5,
                     family = "gaussian", seed = 2016, lambda = 900)

# we run the model setting the penalization term equal to 900
results1i_l9000 <- gwqs(yLBX ~ wqs, mix_name = PCBs, data = wqs_data, 
                      q = 10, validation = 0.6, b = 5, b1_pos = TRUE, rh = 5,
                      family = "gaussian", seed = 2016, lambda = 9000)


## ---- echo=TRUE, eval=TRUE----------------------------------------------------

lambda_AIC_1i <- data.frame(lambda = c(0, 90, 900, 9000),
                            AIC = c(results1i$fit$aic, results1i_l90$fit$aic, 
                                    results1i_l900$fit$aic, results1i_l9000$fit$aic))
kable(lambda_AIC_1i) %>% kable_styling()


## ---- echo=TRUE, eval=TRUE----------------------------------------------------

summary(results1i_l90)


## ---- results='asis', fig.show='hold', fig.height=8, fig.width=8, echo=TRUE----

gwqs_barplot(results1i_l90)