In nickjcroucher/progressionEstimation: R package for the estimation of microbial progression rates

knitr::opts_chunk$set(
  collapse = TRUE,
  fig.width = 8,
  comment = "#>"
)
options(rmarkdown.html_vignette.check_title = FALSE)

# Load dependencies
require(tidyverse)
require(magrittr)
require(rstan)
require(bridgesampling)
require(loo)
require(cowplot)
require(ggrepel)
require(xlsx)

# Load stan package
library(progressionEstimation)
#pkgbuild::compile_dll()
#roxygen2::roxygenize(package.dir = "..")

# Define serotype categories and palettes
vaccine_types <- list(
  "PCV7" = c("4","6B","9V","14","18C","19F","23F"),
  "PCV10" = c("1","5","7F"),
  "PCV13" = c("6A","3","19A"),
  "PCV15" = c("22F","33F"),
  "PCV20" = c("8","10A","11A","12F","15B/C"),
  "PPV23" = c("2","9N","17F","20")
)

vaccine_colours <- c("No PCV" = "blue",
                     "PCV7" = "red",
                     "PCV10" = "orange",
                     "PCV13" = "coral2",
                     "PCV15" = "pink",
                     "PCV20" = "purple",
                     "PPV23" = "black")

epi_colors <- c(
  "carriage" = "skyblue",
  "disease" = "cornflowerblue"
)

model_name_labels <-
  c("null Poisson",
    "null negative binomial",
    "type-specific Poisson",
    "type-specific negative binomial",
    "study-adjusted Poisson",
    "study-adjusted negative binomial",
    "study-adjusted type-specific Poisson",
    "study-adjusted type-specific negative binomial"
  )

Description and filtering of the dataset

The dataset used in this analysis is contained in the S_pneumoniae_infant_serotype object, which includes data from r S_pneumoniae_infant_serotype %>% dplyr::select(study) %>% dplyr::n_distinct() studies on r S_pneumoniae_infant_serotype %>% dplyr::select(type) %>% dplyr::n_distinct() serotypes. The distribution of samples between each study can be plotted, if they are ranked by the mean number of carriage and disease isolates:

``` {r Plot distribution by study}

studies_by_size <- S_pneumoniae_infant_serotype %>% group_by(study) %>% dplyr::mutate(total_count = sum(carriage)+sum(disease)) %>% dplyr::select(study,total_count) %>% dplyr::distinct() %>% dplyr::arrange(desc(total_count)) %>% dplyr::select(study) %>% dplyr::pull()

S_pneumoniae_infant_serotype %<>% dplyr::mutate(study = factor(study, levels = studies_by_size))

full_epi_data_plot <- ggplot(S_pneumoniae_infant_serotype %>% tidyr::pivot_longer(c(carriage,disease), names_to = "Sample type", values_to = "count"), aes(x = study, y = count, colour = Sample type, fill = Sample type)) + geom_col() + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + scale_fill_manual(values = epi_colors) + scale_colour_manual(values = epi_colors)

The total number of observations for each serotype can be similarly plotted:

``` {r Plot distribution by serotype}

serotype_by_count <-
  S_pneumoniae_infant_serotype %>%
    group_by(type) %>%
    dplyr::mutate(total_count = sum(carriage)+sum(disease)) %>%
    dplyr::select(type,total_count) %>%
    dplyr::distinct() %>%
    dplyr::arrange(desc(total_count)) %>%
    dplyr::select(type) %>%
    dplyr::pull()

S_pneumoniae_infant_serotype %<>%
  dplyr::mutate(type = factor(type, levels = serotype_by_count))

serotype_frequency_plot <-
  ggplot(S_pneumoniae_infant_serotype %>%
           tidyr::pivot_longer(c(carriage,disease),
                               names_to = "Sample type",
                               values_to = "count"),
         aes(x = type,
             y = count,
             colour = `Sample type`,
             fill = `Sample type`)) +
    geom_col() +
    xlab("Serotype") +
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
    scale_fill_manual(values = epi_colors) +
    scale_colour_manual(values = epi_colors)

(combined_full_epi_data_plot <-
  cowplot::plot_grid(plotlist = list(full_epi_data_plot, serotype_frequency_plot),
                     nrow = 2,
                     ncol = 1,
                     labels = "AUTO")
)

ggsave(combined_full_epi_data_plot,
       file = "full_epi_data_plot.pdf",
       height = 12,
       width = 8
)

As many of these serotypes are rare, to facilitate model fitting, we exclude any serotype for which the maximum number of observations is below five:

``` {r Plot distribution by serotype in filtered dataset}

threshold_count <- 5

filtered_S_pneumoniae_infant_serotype <- S_pneumoniae_infant_serotype %>% dplyr::filter(carriage >= threshold_count | disease >= threshold_count)

%>%

dplyr::group_by(type)%>%

dplyr::mutate(study_count = n()) %>%

dplyr::ungroup() %>%

dplyr::filter(study_count > threshold_count) %>%

dplyr::group_by(study) %>%

dplyr::mutate(serotype_count = n()) %>%

dplyr::ungroup() %>%

dplyr::filter(serotype_count > threshold_count) %>%

dplyr::mutate(type = factor(type))

filtered_S_pneumoniae_infant_serotype %<>% dplyr::mutate(study = factor(study, levels = studies_by_size[studies_by_size %in% unique(filtered_S_pneumoniae_infant_serotype$study)]))

filtered_S_pneumoniae_infant_serotype %<>% dplyr::mutate(type = factor(type, levels = serotype_by_count[serotype_by_count %in% unique(filtered_S_pneumoniae_infant_serotype$type)]))

infant_serotype_invasiveness <- progressionEstimation::process_input_data(filtered_S_pneumoniae_infant_serotype)

filtered_epi_data_plot <- ggplot(filtered_S_pneumoniae_infant_serotype %>% tidyr::pivot_longer(c(carriage,disease), names_to = "Sample type", values_to = "count"), aes(x = study, y = count, colour = Sample type, fill = Sample type)) + geom_col() + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + scale_fill_manual(values = epi_colors) + scale_colour_manual(values = epi_colors)

filtered_serotype_frequency_plot <- ggplot(filtered_S_pneumoniae_infant_serotype %>% tidyr::pivot_longer(c(carriage,disease), names_to = "Sample type", values_to = "count"), aes(x = type, y = count, colour = Sample type, fill = Sample type)) + geom_col() + xlab("Serotype") + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + scale_fill_manual(values = epi_colors) + scale_colour_manual(values = epi_colors)

(combined_filtered_epi_data_plot <- cowplot::plot_grid(plotlist = list(filtered_epi_data_plot, filtered_serotype_frequency_plot), nrow = 2, ncol = 1, labels = "AUTO") )

ggsave(combined_filtered_epi_data_plot, file = "filtered_epi_data_plot.pdf", height = 12, width = 8 )

This filtered dataset comprises `r S_pneumoniae_infant_serotype %>% dplyr::select(type) %>% dplyr::n_distinct()` serotypes.

## Fitting the statistical models

The null model, in which each serotype $j$ causes invasive disease at the same rate in each location $i$, can be fitted using a Poisson distribution of disease isolates (**null Poisson** model):

$$d_{i,j} \sim Pois(\nu\rho_{i,j}N_{i}t_{i})$$

``` {r Null Poisson model fit, include = FALSE}

n_chains <- 2
n_iter <- 2.5e4
n_core <- 2

null_poisson_fit <- 
  progressionEstimation::fit_progression_rate_model(infant_serotype_invasiveness,
                                                    type_specific = FALSE,
                                                    location_adjustment = FALSE,
                                                    stat_model = "poisson",
                                                    num_chains = n_chains,
                                                    num_iter = n_iter,
                                                    num_cores = n_core)

The null model can be fitted using a negative binomial distribution of disease isolates (null negative binomial model):

$$d_{i,j} \sim NegBin(\nu\rho_{i,j}N_{i}t_{i},\phi)$$

``` {r Null negative binomial model fit, include = FALSE}

null_negbin_fit <- progressionEstimation::fit_progression_rate_model(infant_serotype_invasiveness, type_specific = FALSE, location_adjustment = FALSE, stat_model = "negbin", num_chains = n_chains, num_iter = n_iter, num_cores = n_core)

The Poisson model can be modified by allowing each serotype to have its own invasiveness, $\nu_{j}$ (**type-specific Poisson** model):

$$d_{i,j} \sim Pois(\nu_{j}\rho_{i,j}N_{i}t_{i})$$
``` {r Type-specific Poisson model fit, include = FALSE}

type_specific_poisson_fit <- 
  progressionEstimation::fit_progression_rate_model(infant_serotype_invasiveness,
                                                    type_specific = TRUE,
                                                    location_adjustment = FALSE,
                                                    stat_model = "poisson",
                                                    num_chains = n_chains,
                                                    num_iter = n_iter,
                                                    num_cores = n_core)

The negative binomial model can be modified in the same way (type-specific negative binomial model):

$$d_{i,j} \sim NegBin(\nu_{j}\rho_{i,j}N_{i}t_{i},\phi)$$

``` {r Type-specific negative binomial model fit, include = FALSE}

type_specific_negbin_fit <- progressionEstimation::fit_progression_rate_model(infant_serotype_invasiveness, type_specific = TRUE, location_adjustment = FALSE, stat_model = "negbin", num_chains = n_chains, num_iter = n_iter, num_cores = n_core)

Alternatively, the Poisson model can be modified by allowing invasiveness to be adjusted between locations by a scale factor, $\gamma_{i}$. In this model, all serotypes still have the same invasiveness (**adjusted Poisson** model):

$$d_{i,j} \sim Poisson(\gamma_{i}\nu\rho_{i,j}N_{i}t_{i})$$

``` {r Adjusted Poisson model fit, include = FALSE}

adjusted_poisson_fit <- 
  progressionEstimation::fit_progression_rate_model(infant_serotype_invasiveness,
                                                    type_specific = FALSE,
                                                    location_adjustment = TRUE,
                                                    stat_model = "poisson",
                                                    num_chains = n_chains,
                                                    num_iter = n_iter,
                                                    num_cores = n_core)

The negative binomial model can again be modified in the same way (adjusted negative binomial model):

$$d_{i,j} \sim NegBin(\gamma_{i}\nu\rho_{i,j}N_{i}t_{i},\phi)$$

``` {r Adjusted negative binomial model fit, include = FALSE}

adjusted_negbin_fit <- progressionEstimation::fit_progression_rate_model(infant_serotype_invasiveness, type_specific = FALSE, location_adjustment = TRUE, stat_model = "negbin", num_chains = n_chains, num_iter = n_iter, num_cores = n_core)

Both modifications can be combined in adjusted type-specific model fits. This can be used with the Poisson model of disease isolate counts (**adjusted type-specific Poisson** model):

$$d_{i,j} \sim Poisson(\gamma_{i}\nu_{j}\rho_{i,j}N_{i}t_{i})$$

``` {r Adjusted type-specific Poisson model fit, include = FALSE}

adjusted_type_specific_poisson_fit <- 
  progressionEstimation::fit_progression_rate_model(infant_serotype_invasiveness,
                                                    type_specific = TRUE,
                                                    location_adjustment = TRUE,
                                                    stat_model = "poisson",
                                                    num_chains = n_chains,
                                                    num_iter = n_iter,
                                                    num_cores = n_core)

This can also be used with the negative binomial model (adjusted type-specific negative binomial model):

$$d_{i,j} \sim NegBin(\gamma_{i}\nu_{j}\rho_{i,j}N_{i}t_{i},\phi)$$

``` {r Adjusted type-specific negative binomial model fit, include = FALSE}

adjusted_type_specific_negbin_fit <- progressionEstimation::fit_progression_rate_model(infant_serotype_invasiveness, type_specific = TRUE, location_adjustment = TRUE, stat_model = "negbin", num_chains = n_chains, num_iter = n_iter, num_cores = n_core)

To check the models have fitted correctly, we can plot the $\hat{R}$ values across MCMCs:

``` {r Plot r hat values}

fit_list <- list(null_poisson_fit,
                 null_negbin_fit,
                 type_specific_poisson_fit,
                 type_specific_negbin_fit,
                 adjusted_poisson_fit,
                 adjusted_negbin_fit,
                 adjusted_type_specific_poisson_fit,
                 adjusted_type_specific_negbin_fit)

plot_rhat <- function(fit) {
  rhat_plot <- plot(fit, plotfun = "rhat", binwidth = 0.00005) +
                theme(axis.text.x = element_text(angle = 90))
  return(rhat_plot)
}

rhat_plots <- lapply(fit_list, plot_rhat)

combined_rhat_plots <- cowplot::plot_grid(plotlist = rhat_plots, ncol = 4, nrow = 2, labels = "AUTO")

ggsave(combined_rhat_plots,
       file = "infant_combined_rhat_plots.pdf",
       width = 8,
       height = 6)

Convergence can also be checked using trace plots of the log likelihood values:

``` {r Plot log likelihood traces}

plot_lp <- function(fit) { pl <- rstan::traceplot(fit, pars = "lp__") no_leg_pl <- pl + theme(legend.position = "none", axis.text.x = element_text(angle = 90)) return(no_leg_pl) }

trace_plots <- lapply(fit_list, plot_lp)

combined_trace_plots <- cowplot::plot_grid(plotlist = trace_plots, ncol = 4, nrow = 2, labels = "AUTO")

ggsave(combined_trace_plots, file = "infant_combined_trace_plots.pdf", width = 8, height = 6)

## Compare model observations and predictions

For each model, the output can be processed, and the observations plotted against each model's predictions.

``` {r Process model outputs and plot predictions, fig.height = 18}

model_outputs <- lapply(fit_list,
                        progressionEstimation::process_progression_rate_model_output,
                        filtered_S_pneumoniae_infant_serotype)

prediction_plots <- lapply(model_outputs,
                           progressionEstimation::plot_case_carrier_predictions,
                           legend = FALSE,
                           n_label = 0)

no_leg_plot <- cowplot::plot_grid(plotlist = prediction_plots,
                                  ncol = 1,
                                  nrow = 8,
                                  labels = "AUTO",
                                  vjust = -0.01) +
      theme(plot.margin = unit(c(0.5,0,0,0), "cm"))

prediction_plot_legend <- progressionEstimation::plot_case_carrier_predictions(model_outputs[[1]], just_legend = TRUE) 

pred_v_obs_plot <- 
  cowplot::plot_grid(no_leg_plot,
                     prediction_plot_legend,
                     ncol = 1,
                     rel_heights = c(1,0.1))

ggsave(pred_v_obs_plot,
       file = "infant_serotypes_pred_v_obs_plot.pdf",
       height = 20,
       width = 10)

The deviation of the predicted and observed values can be simply evaluated using the distribution of absolute deviations:

``` {r Calculate absolute deviations}

obs_pred_df <- dplyr::bind_rows(model_outputs) %>% dplyr::select(model_name,carriage_abs_dev,disease_abs_dev,phi,phi_lower,phi_upper) %>% tidyr::pivot_longer(c(carriage_abs_dev,disease_abs_dev), names_to = "Observation", values_to = "Abs_dev") %>% dplyr::mutate(model_name = factor(model_name, levels = c( "null_poisson", "null_negbin", "type_specific_poisson", "type_specific_negbin", "adjusted_null_poisson", "adjusted_null_negbin", "adjusted_type_specific_poisson", "adjusted_type_specific_negbin" ) ) ) %>% dplyr::mutate(Observation = dplyr::case_when( Observation == "carriage_abs_dev" ~ "Carriage", Observation == "disease_abs_dev" ~ "Disease" ) )

(serotype_abs_dev_plot <- ggplot(obs_pred_df, aes(x = model_name, y = Abs_dev)) + #geom_boxplot(outlier.shape = NA) + geom_point(position = position_jitter(width = 0.25), alpha = 0.25, cex = 0.75, colour = "blue", pch = 4) + geom_violin(fill = NA) + scale_y_continuous(trans="log10") + theme_bw() + facet_grid(Observation ~ .) + scale_x_discrete(labels = model_name_labels) + ylab("Absolute deviation") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) )

ggsave(serotype_abs_dev_plot, file = "serotype_abs_dev_plot.pdf", width = 7, height = 7 )

The fits of the negative binomial can be evaluated through plotting the estimates of the precision parameter, $\phi$. As $\phi$ increases, the variance of the negative binomial distribution decreases, and the negative binomal model becomes more similar to the Poisson model.

``` {r Plot precision parameters for negative binomial models, fig.width = 6}
(phi_plot <-
  ggplot(obs_pred_df %>%
          dplyr::filter(!is.na(phi)) %>%
          dplyr::group_by(model_name) %>%
          dplyr::slice(1) %>%
          dplyr::ungroup(),
        aes(x = model_name,
            y = phi,
            ymin = phi_lower,
            ymax = phi_upper)) +
    geom_point() +
    geom_errorbar(width = 0.25) +
    theme_bw() +
    scale_y_continuous(trans="log10") +
    ylab(expression(phi)) +
    xlab("Model name") +
    scale_x_discrete(labels = model_name_labels[c(2,4,6,8)]) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
)

ggsave(phi_plot,
       file="serotype_phi_plot.pdf",
       height = 6,
       width = 6)

Run model comparison with leave-one-out cross-validation

The model likelihoods can be compared using leave-one-out cross-validation:

``` {r Comparison with LOO-CV using full log likelihood}

loo_comparison <- progressionEstimation::compare_model_fits_with_loo(fit_list, use_moments = FALSE, num_threads = n_core)

loo_comparison %>% kableExtra::kable() %>% kableExtra::kable_styling(latex_options = "scale_down")

write.csv(loo_comparison, file = "child_serotype_loo_comparison.csv", quote = FALSE, row.names = FALSE)

The Pareto distribution k values are too high for robust estimates of the ELPD. This reflects many of the points being 

``` {r Comparison with LOO-CV using disease distribution log likelihood}

disease_loo_comparison <- progressionEstimation::compare_model_fits_with_loo(fit_list,
                                                                     log_lik_param = "disease_log_lik",
                                                                     use_moments = FALSE,
                                                                     num_threads = n_core)

disease_loo_comparison %>%
  kableExtra::kable()  %>%
  kableExtra::kable_styling(latex_options = "scale_down")

write.csv(disease_loo_comparison,
          file = "child_disease_serotype_loo_comparison.csv",
          quote = FALSE,
          row.names = FALSE)

Run model comparison with Bayes factors

As the models can instead be compared with Bayes factors:

``` {r Comparison with BF}

bf_comparison <- progressionEstimation::compare_model_fits_with_bf(fit_list, num_iter = n_iter, num_threads = n_core)

bf_comparison %>% kableExtra::kable() %>% kableExtra::kable_styling(latex_options = "scale_down")

write.csv(bf_comparison, file = "child_disease_serotype_bf_comparison.csv", quote = FALSE, row.names = FALSE)

## Plot serotype invasiveness

The best-fitting model can then be applied to the overall dataset.

``` {r Plot invasiveness estimates from best-fitting model}

full_infant_serotype_invasiveness <- 
  progressionEstimation::process_input_data(
    S_pneumoniae_infant_serotype %<>%
      dplyr::mutate(study = factor(study, levels = studies_by_size)
      )
    )

full_adjusted_type_specific_negbin_fit <- 
  progressionEstimation::fit_progression_rate_model(full_infant_serotype_invasiveness,
                                                    type_specific = TRUE,
                                                    location_adjustment = TRUE,
                                                    stat_model = "negbin",
                                                    num_chains = n_chains,
                                                    num_iter = n_iter,
                                                    num_cores = n_core)

rhat_plot <- plot(full_adjusted_type_specific_negbin_fit, plotfun = "rhat", binwidth = 0.00005)

trace_plot <- rstan::traceplot(full_adjusted_type_specific_negbin_fit, pars = "lp__") +
                theme(axis.text.x = element_text(angle = 90))

best_fitting_model_validation_plot <- cowplot::plot_grid(plotlist = list(rhat_plot,trace_plot),
                   nrow = 1,
                   ncol = 2,
                   labels = "AUTO")

ggsave(best_fitting_model_validation_plot,
       file = "best_fitting_infant_model_validation_plot.pdf",
       width = 8,
       height = 6)

``` {r Plot invasiveness across all serotypes}

mixedrank = function(x) order(gtools::mixedorder(as.character(x)))

Annotate serotypes by status

best_fitting_output <- progressionEstimation::process_progression_rate_model_output(full_adjusted_type_specific_negbin_fit, S_pneumoniae_infant_serotype) %>% dplyr::mutate("classification" = dplyr::case_when( type %in% vaccine_types[["PCV7"]] ~ "PCV7", type %in% vaccine_types[["PCV10"]] ~ "PCV10", type %in% vaccine_types[["PCV13"]] ~ "PCV13", type %in% vaccine_types[["PCV15"]] ~ "PCV15", type %in% vaccine_types[["PCV20"]] ~ "PCV20", type %in% vaccine_types[["PPV23"]] ~ "PPV23", TRUE ~ "No PCV" ) ) %>% dplyr::mutate(classification = factor(classification, levels = c("PCV7", "PCV10", "PCV13", "PCV15", "PCV20", "PPV23", "No PCV")))

Order serotypes

serotype_levels <- best_fitting_output %>% dplyr::select(type,classification) %>% dplyr::distinct() %>% group_by(classification) %>% dplyr::arrange(mixedrank(type), .by_group = TRUE) %>% ungroup() %>% dplyr::select(type) %>% dplyr::pull()

best_fitting_output %<>% dplyr::mutate(type = factor(type, levels = serotype_levels))

Plot

(invasiveness_plot <- progressionEstimation::plot_progression_rates(best_fitting_output, unit_time = "year", type_name = "Serotype", colour_col = "classification", colour_palette = vaccine_colours, use_sample_size = TRUE))

ggsave(invasiveness_plot, file = "full_serotype_invasiveness_plot.pdf", height = 8, width = 8)

write.csv(best_fitting_output, file = "study_adjusted_type_specific_negbin_serotype_fit.csv", quote = F, row.names = F)

write.csv(best_fitting_output %>% dplyr::select(type,nu,nu_lower,nu_upper) %>% dplyr::rename(invasiveness = nu) %>% dplyr::rename(invasiveness_lower = nu_lower) %>% dplyr::rename(invasiveness_upper = nu_upper) %>% dplyr::distinct(), file = "Dataset_S1.csv", quote = F, row.names = F)

## Plot study-specific scale factors

``` {r Plot study-specific scale factor estimates from best-fitting model}

(scale_factor_plot <-
  progressionEstimation::plot_study_scale_factors(best_fitting_output))

ggsave(scale_factor_plot,
       file = "serotype_scale_factor_plot.png",
       width = 8,
       height = 6)

Compare a model fit that account for PCVs protecting against disease

If PCVs provide further protection against disease, relative to carriage, then their invasiveness may reduce post-PCV. Therefore we can treat them as different serotypes pre- and post-PCV.

``` {r Adjust invasiveness post-PCV}

pv_S_pneumoniae_infant_serotype <- S_pneumoniae_infant_serotype %>% dplyr::mutate(type = dplyr::case_when( type %in% vaccine_types[["PCV7"]] & grepl("post.PCV", study) ~ paste0(type,"_pv"), type == "6A" & grepl("post.PCV", study) ~ paste0(type,"_pv"), type %in% vaccine_types[["PCV10"]] & grepl("post.PCV10", study) ~ paste0(type,"_pv"), type %in% vaccine_types[["PCV10"]] & grepl("post.PCV13", study) ~ paste0(type,"_pv"), type %in% vaccine_types[["PCV13"]] & grepl("post.PCV13", study) ~ paste0(type,"_pv"), type == "6C" & grepl("post.PCV13", study) ~ paste0(type,"_pv"), TRUE ~ as.character(type) ) ) %>% dplyr::mutate(type = factor(type))

pv_S_pneumoniae_infant_serotype %<>% dplyr::mutate(study = factor(study, levels = studies_by_size))

pv_full_infant_serotype_invasiveness <- progressionEstimation::process_input_data(pv_S_pneumoniae_infant_serotype)

pv_full_adjusted_type_specific_negbin_fit <- progressionEstimation::fit_progression_rate_model(pv_full_infant_serotype_invasiveness, type_specific = TRUE, location_adjustment = TRUE, stat_model = "negbin", num_chains = n_chains, num_iter = n_iter, num_cores = n_core)

rhat_plot <- plot(pv_full_adjusted_type_specific_negbin_fit, plotfun = "rhat", binwidth = 0.00005)

trace_plot <- rstan::traceplot(pv_full_adjusted_type_specific_negbin_fit, pars = "lp__") + theme(axis.text.x = element_text(angle = 90, size = 8))

best_fitting_pv_model_validation_plot <- cowplot::plot_grid(plotlist = list(rhat_plot,trace_plot), nrow = 1, ncol = 2, labels = "AUTO")

ggsave(best_fitting_pv_model_validation_plot, file = "best_fitting_pv_infant_model_validation_plot.pdf", width = 8, height = 6)

``` {r Analyse the pv output}

pv_best_fitting_output <- 
  progressionEstimation::process_progression_rate_model_output(
    pv_full_adjusted_type_specific_negbin_fit,                                                 pv_S_pneumoniae_infant_serotype) %>%
      dplyr::mutate("classification" = dplyr::case_when(
          gsub("_pv","",type) %in% vaccine_types[["PCV7"]] ~ "PCV7",
          gsub("_pv","",type) %in% vaccine_types[["PCV10"]] ~ "PCV10",
          gsub("_pv","",type) %in% vaccine_types[["PCV13"]] ~ "PCV13",
          gsub("_pv","",type) %in% vaccine_types[["PCV15"]] ~ "PCV15",
          gsub("_pv","",type) %in% vaccine_types[["PCV20"]] ~ "PCV20",
          gsub("_pv","",type) %in% vaccine_types[["PPV23"]] ~ "PPV23",
          TRUE ~ "No PCV"
        )
      ) %>%
      dplyr::mutate(classification = factor(classification, levels = c("PCV7",
                                                                       "PCV10",
                                                                       "PCV13",
                                                                       "PCV15",
                                                                       "PCV20",
                                                                       "PPV23",
                                                                       "No PCV")))

# Order serotypes
pv_serotype_levels <-
  pv_best_fitting_output %>%
    dplyr::select(type,classification) %>%
    dplyr::distinct() %>%
    group_by(classification) %>%
    dplyr::arrange(mixedrank(type),
                   .by_group = TRUE) %>%
    ungroup() %>%
    dplyr::select(type) %>%
    dplyr::pull()

pv_best_fitting_output %<>%
  dplyr::mutate(type = factor(type, levels = pv_serotype_levels))

# Plot
(pv_invasiveness_plot <-
  progressionEstimation::plot_progression_rates(pv_best_fitting_output,
                                                unit_time = "year",
                                                type_name = "Serotype",
                                                colour_col = "classification",
                                                colour_palette = vaccine_colours,
                                                use_sample_size = TRUE))

(joint_invasiveness_plot <-
    cowplot::plot_grid(plotlist = list(invasiveness_plot + theme(legend.position = "none"), pv_invasiveness_plot),
                       nrow = 2,
                       ncol = 1,
                       rel_heights = c(1,1.2),
                       labels = "AUTO"
                       )
)

ggsave(joint_invasiveness_plot,
       file = "joint_infant_serotype_plot.pdf",
       height = 10,
       width = 9)

``` {r Plot study-specific scale factor estimates from best-fitting model, adjust for vaccine}

(pv_scale_factor_plot <- progressionEstimation::plot_study_scale_factors(pv_best_fitting_output))

## Comparison with odds ratio calculation

``` {r Calculate odds ratios}

require(ggpubr)
require(metafor)

get_serotype_stats <- function(serotype, df) {
  sero_df <-
    df %>%
      tidyr::pivot_longer(c(carriage,disease),
                          names_to = "clinical_manifestation",
                          values_to = "observation") %>%
      dplyr::group_by(study) %>%
      dplyr::mutate(matching_type = dplyr::if_else(type == serotype,1,0)) %>%
      dplyr::mutate(matching_type_disease =
                      sum(observation[clinical_manifestation == "disease" & matching_type == 1])) %>%
      dplyr::mutate(matching_type_carriage =
                      sum(observation[clinical_manifestation == "carriage" & matching_type == 1])) %>%
      dplyr::mutate(non_matching_type_disease =
                      sum(observation[clinical_manifestation == "disease" & matching_type == 0])) %>%
      dplyr::mutate(non_matching_type_carriage =
                      sum(observation[clinical_manifestation == "carriage" & matching_type == 0])) %>%
      dplyr::ungroup() %>%
    dplyr::select(study,matching_type_disease,matching_type_carriage,non_matching_type_disease,non_matching_type_carriage) %>%
    dplyr::distinct() %>%
    dplyr::mutate(serotype = serotype) %>%
    dplyr::filter(matching_type_carriage > 0 | matching_type_disease > 0)
  return(sero_df)
}

per_serotype_estimate <- function(test_serotype, df, method_type = "FE") {
  per_serotype_out <- metafor::rma(yi = yi,
                                   vi = vi,
                                   data = df,
                                   slab = study,
                                   subset = serotype == test_serotype,
                                   method = method_type,
                                   control=list(stepadj=0.5, maxiter=10000))
  summary_df <-
    data.frame(
      "Analysis" = method_type,
      "Serotype" = test_serotype,
      "OR" = as.numeric(per_serotype_out$b),
      "OR_upper" = per_serotype_out$ci.ub,
      "OR_lower" = per_serotype_out$ci.lb,
      "tau_squared" = per_serotype_out$tau2,
      "i_squared" = per_serotype_out$I2,
      "h_squared" = per_serotype_out$H2
    )
  return(summary_df)
}

serotype_or_df <- dplyr::bind_rows(lapply(
                                   serotype_levels,
                                   get_serotype_stats,
                                   S_pneumoniae_infant_serotype)
)

serotype_effect_size_df <- metafor::escalc(measure = "OR",
                                            ai = matching_type_disease,
                                            bi = matching_type_carriage,
                                            ci = non_matching_type_disease,
                                            di = non_matching_type_carriage,
                                            slab = study,
                                            data = serotype_or_df)

per_serotype_out <- dplyr::bind_rows(
                    lapply(
                      serotype_levels,
                      per_serotype_estimate,
                      serotype_effect_size_df
                    ),
                    lapply(
                      serotype_levels,
                      per_serotype_estimate,
                      serotype_effect_size_df,
                      method_type = "REML"
                    )
)

sample_sizes <-
  best_fitting_output %>%
      dplyr::group_by(type) %>%
      dplyr::mutate(num_samples = sum(carriage+disease)) %>%
      dplyr::mutate(Sample_count = dplyr::case_when(
                        num_samples < 11 ~ "0-10",
                        num_samples < 51 ~ "11-50",
                        num_samples < 101 ~ "51-100",
                        TRUE ~ ">100"
                      )
                    ) %>%
      dplyr::ungroup() %>%
      dplyr::select(type, Sample_count) %>%
      dplyr::distinct() %>%
      dplyr::mutate(Sample_count = factor(Sample_count,
                                          levels = c("0-10",
                                                     "11-50",
                                                     "51-100",
                                                     ">100")))

# Combine the invasiveness estimates
combined_invasiveness_df <-
  dplyr::bind_rows(
    per_serotype_out %>%
      dplyr::select(Analysis, Serotype, OR, OR_upper, OR_lower) %>%
      dplyr::rename(Invasiveness = OR) %>%
      dplyr::rename(Invasiveness_upper = OR_upper) %>%
      dplyr::rename(Invasiveness_lower = OR_lower),
    best_fitting_output %>%
      dplyr::select(model_name, nu, nu_lower, nu_upper, type) %>%
      dplyr::distinct() %>%
      dplyr::rename(Analysis = model_name) %>%
      dplyr::rename(Serotype = type) %>%
      dplyr::rename(Invasiveness = nu) %>%
      dplyr::rename(Invasiveness_upper = nu_upper) %>%
      dplyr::rename(Invasiveness_lower = nu_lower)
  ) %>%
  tidyr::pivot_wider(id_cols = Serotype,
                     names_from = Analysis,
                     values_from = c(Invasiveness,Invasiveness_upper,Invasiveness_lower),
                     names_sep = "_") %>%
    dplyr::mutate("classification" = dplyr::case_when(
        Serotype %in% vaccine_types[["PCV7"]] ~ "PCV7",
        Serotype %in% vaccine_types[["PCV10"]] ~ "PCV10",
        Serotype %in% vaccine_types[["PCV13"]] ~ "PCV13",
        Serotype %in% vaccine_types[["PCV15"]] ~ "PCV15",
        Serotype %in% vaccine_types[["PCV20"]] ~ "PCV20",
        Serotype %in% vaccine_types[["PPV23"]] ~ "PPV23",
        TRUE ~ "No PCV"
      )
    ) %>%
  dplyr::left_join(sample_sizes, by = c("Serotype" = "type"))

# Plot comparison of invasiveness estimates
(fixed_effects_comparison <-
  ggplot(combined_invasiveness_df,
       aes(x = Invasiveness_adjusted_type_specific_negbin,
           xmin = Invasiveness_lower_adjusted_type_specific_negbin,
           xmax = Invasiveness_upper_adjusted_type_specific_negbin,
           y = Invasiveness_FE,
           ymin = Invasiveness_lower_FE,
           ymax = Invasiveness_upper_FE,
           colour = classification)) +
  geom_errorbar(alpha = 0.25) +
  geom_errorbarh(alpha = 0.25) +
  geom_point() +
  scale_x_continuous(trans = "log10") +
  theme_bw() +
  scale_color_manual(values = vaccine_colours) +
  theme(legend.position = "bottom") +
  ylab("Invasiveness estimated from log odds ratio") +
  xlab("Invasivess estimate from adjusted type-specific negative binomial (cases per carrier per year)") +
  facet_wrap(~Sample_count) +
  stat_cor(aes(x = Invasiveness_adjusted_type_specific_negbin,
               y = Invasiveness_FE),
           inherit.aes = FALSE,
           method = "kendall",
           cor.coef.name = "tau") +
  ggrepel::geom_label_repel(data = combined_invasiveness_df %>%
                                    dplyr::filter((Invasiveness_adjusted_type_specific_negbin < 0.0005 & Invasiveness_FE > 0) | Serotype == "46"),
                            aes(label = Serotype)
                            )
)

``` {r Plot comparison with mixed effects}

Plot comparison of invasiveness estimates

(random_effects_comparison <- ggplot(combined_invasiveness_df, aes(x = Invasiveness_adjusted_type_specific_negbin, xmin = Invasiveness_lower_adjusted_type_specific_negbin, xmax = Invasiveness_upper_adjusted_type_specific_negbin, y = Invasiveness_REML, ymin = Invasiveness_lower_REML, ymax = Invasiveness_upper_REML, colour = classification)) + geom_errorbar(alpha = 0.25) + geom_errorbarh(alpha = 0.25) + geom_point() + scale_x_continuous(trans = "log10") + theme_bw() + scale_color_manual(values = vaccine_colours) + theme(legend.position = "bottom") + ylab("Invasiveness estimated from log odds ratio") + xlab("Invasivess estimate from adjusted type-specific negative binomial (cases per carrier per year)") + facet_wrap(~Sample_count) + stat_cor(aes(x = Invasiveness_adjusted_type_specific_negbin, y = Invasiveness_REML), inherit.aes = FALSE, method = "kendall", cor.coef.name = "tau") + ggrepel::geom_label_repel(data = combined_invasiveness_df %>% dplyr::filter((Invasiveness_adjusted_type_specific_negbin < 0.0005 & Invasiveness_FE > 0) | Serotype == "46"), aes(label = Serotype) ) )

combined_odds_ratio_plot <- cowplot::plot_grid( plotlist = list(fixed_effects_comparison + theme(legend.position = "none"), random_effects_comparison), ncol = 1, nrow = 2, rel_heights = c(1,1.2), labels = "AUTO" )

ggsave(combined_odds_ratio_plot, file = "combined_odds_ratio_plot.pdf", width = 8, height = 12)

This comparison can be extended to location-specific invasiveness factors, which incorporate the uncertainty of the study-specific adjustment factor:

``` {r Comparison with location-specific invasiveness}

netherland_pre_pcv_invasiveness_df <-
  progressionEstimation::get_type_invasiveness_for_study(S_pneumoniae_infant_serotype,
                         full_adjusted_type_specific_negbin_fit,
                         study = "Netherlands.post.PCV10")

 netherland_pre_pcv_invasiveness_df %<>%
      dplyr::mutate("classification" = dplyr::case_when(
          gsub("_pv","",type) %in% vaccine_types[["PCV7"]] ~ "PCV7",
          gsub("_pv","",type) %in% vaccine_types[["PCV10"]] ~ "PCV10",
          gsub("_pv","",type) %in% vaccine_types[["PCV13"]] ~ "PCV13",
          gsub("_pv","",type) %in% vaccine_types[["PCV15"]] ~ "PCV15",
          gsub("_pv","",type) %in% vaccine_types[["PCV20"]] ~ "PCV20",
          gsub("_pv","",type) %in% vaccine_types[["PPV23"]] ~ "PPV23",
          TRUE ~ "No PCV"
        )
      ) %>%
      dplyr::mutate(classification = factor(classification, levels = c("PCV7",
                                                                       "PCV10",
                                                                       "PCV13",
                                                                       "PCV15",
                                                                       "PCV20",
                                                                       "PPV23",
                                                                       "No PCV")))

# Order serotypes
serotype_levels <-
  netherland_pre_pcv_invasiveness_df %>%
    dplyr::select(type,classification) %>%
    dplyr::distinct() %>%
    group_by(classification) %>%
    dplyr::arrange(mixedrank(type),
                   .by_group = TRUE) %>%
    ungroup() %>%
    dplyr::select(type) %>%
    dplyr::pull()

netherland_pre_pcv_invasiveness_df %<>%
  dplyr::mutate(type = factor(type, levels = serotype_levels))

# Plot
(netherlands_invasiveness_plot <-
  progressionEstimation::plot_progression_rates(netherland_pre_pcv_invasiveness_df,
                                                unit_time = "year",
                                                type_name = "Serotype",
                                                colour_col = "classification",
                                                colour_palette = vaccine_colours,
                                                use_sample_size = TRUE))

This subset of serotypes can be compared with the odds ratios estimated using fixed effects:

``` {r Comparison of Netherlands invasiveness with odds ratios calculated with fixed effects}

netherlands_or_comparison_df <- netherland_pre_pcv_invasiveness_df %>% dplyr::rename(Serotype = type) %>% dplyr::rename(Netherlands_invasiveness = nu) %>% dplyr::rename(Netherlands_invasiveness_lower = nu_lower) %>% dplyr::rename(Netherlands_invasiveness_upper = nu_upper) %>% dplyr::select(-classification) %>% dplyr::left_join(combined_invasiveness_df, by = c("Serotype"))

Plot comparison of invasiveness estimates

(netherlands_fixed_or_comparison_plot <- ggplot(netherlands_or_comparison_df, aes(x = Netherlands_invasiveness, xmin = Netherlands_invasiveness_lower, xmax = Netherlands_invasiveness_upper, y = Invasiveness_FE, ymin = Invasiveness_lower_FE, ymax = Invasiveness_upper_FE, colour = classification)) + geom_errorbar(alpha = 0.25) + geom_errorbarh(alpha = 0.25) + geom_point() + scale_x_continuous(trans = "log10") + theme_bw() + scale_color_manual(values = vaccine_colours) + theme(legend.position = "bottom") + ylab("Invasiveness estimated from log odds ratio") + xlab("Invasivess estimate from adjusted type-specific negative binomial (cases per carrier per year)") + facet_wrap(~Sample_count) + stat_cor(aes(x = Invasiveness_adjusted_type_specific_negbin, y = Invasiveness_FE), inherit.aes = FALSE, method = "kendall", cor.coef.name = "tau") )

The same comparison can be undertaken against odds ratios calculated with random effects:

``` {r Comparison of Netherlands data with odds ratios calculated with random effects}

# Plot comparison of invasiveness estimates
(netherlands_random_or_comparison_plot <-
  ggplot(netherlands_or_comparison_df,
         aes(x = Netherlands_invasiveness,
             xmin = Netherlands_invasiveness_lower,
             xmax = Netherlands_invasiveness_upper,
             y = Invasiveness_REML,
             ymin = Invasiveness_lower_REML,
             ymax = Invasiveness_upper_REML,
             colour = classification)) +
    geom_errorbar(alpha = 0.25) +
    geom_errorbarh(alpha = 0.25) +
    geom_point() +
    scale_x_continuous(trans = "log10") +
    theme_bw() +
    scale_color_manual(values = vaccine_colours) +
    theme(legend.position = "bottom") +
    ylab("Invasiveness estimated from log odds ratio") +
    xlab("Invasivess estimate from adjusted type-specific negative binomial (cases per carrier per year)") +
    facet_wrap(~Sample_count) +
    stat_cor(aes(x = Invasiveness_adjusted_type_specific_negbin,
                 y = Invasiveness_REML),
             inherit.aes = FALSE,
             method = "kendall",
             cor.coef.name = "tau")
)

combined_netherlands_or_comparison_plot <-
  cowplot::plot_grid(plotlist = list(netherlands_fixed_or_comparison_plot,
                                     netherlands_random_or_comparison_plot),
                     nrow = 1,
                     ncol = 2,
                     labels = "AUTO")

ggsave(combined_netherlands_or_comparison_plot,
       file = "netherlands_or_comparison.pdf",
       height = 8,
       width = 14)

Comparison with carriage duration

``` {r Carriage duration comparison}

carriage_duration.df <- data.frame( "type" = c("19F","23F","6B","14","21","19B","18C","29","3","4","24F","NT","5","6A","6C"), "carriage_duration" = c(46.9,21,16.2,7.2,1.6,-0.1,-1.9,-4.3,-4.5,-7.2,-8.5,-12.3,-18.6,0,0) )

carriage_duration.df %<>% dplyr::inner_join(pv_best_fitting_output, by = "type") %>% dplyr::select(type,carriage_duration,classification,nu,nu_lower,nu_upper) %>% dplyr::distinct()

(invasiveness_v_duration_plot <- ggplot(carriage_duration.df, aes(x = carriage_duration, y = nu, ymin = nu_lower, ymax = nu_upper, colour = classification)) + geom_point() + geom_errorbar() + scale_colour_manual(values = vaccine_colours) + ggrepel::geom_label_repel(aes(label = type)) + scale_y_continuous(trans = "log10") + ylab(expression(nu)) + xlab("Carriage duration relative to serotype 6A/C") + theme_bw() )

ggsave(invasiveness_v_duration_plot, file = "invasiveness_v_duration.pdf", height = 8, width = 8)

```

nickjcroucher/progressionEstimation documentation built on April 15, 2023, 3:53 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

nickjcroucher/progressionEstimation
R package for the estimation of microbial progression rates

In nickjcroucher/progressionEstimation: R package for the estimation of microbial progression rates

Description and filtering of the dataset

%>%

dplyr::group_by(type)%>%

dplyr::mutate(study_count = n()) %>%

dplyr::ungroup() %>%

dplyr::filter(study_count > threshold_count) %>%

dplyr::group_by(study) %>%

dplyr::mutate(serotype_count = n()) %>%

dplyr::ungroup() %>%

dplyr::filter(serotype_count > threshold_count) %>%

dplyr::mutate(type = factor(type))

Run model comparison with leave-one-out cross-validation

Run model comparison with Bayes factors

Annotate serotypes by status

Order serotypes

Plot

Compare a model fit that account for PCVs protecting against disease

Plot comparison of invasiveness estimates

Plot comparison of invasiveness estimates

Comparison with carriage duration

R Package Documentation

Browse R Packages

We want your feedback!

nickjcroucher/progressionEstimation R package for the estimation of microbial progression rates

In nickjcroucher/progressionEstimation: R package for the estimation of microbial progression rates

Description and filtering of the dataset

%>%

dplyr::group_by(type)%>%

dplyr::mutate(study_count = n()) %>%

dplyr::ungroup() %>%

dplyr::filter(study_count > threshold_count) %>%

dplyr::group_by(study) %>%

dplyr::mutate(serotype_count = n()) %>%

dplyr::ungroup() %>%

dplyr::filter(serotype_count > threshold_count) %>%

dplyr::mutate(type = factor(type))

Run model comparison with leave-one-out cross-validation

Run model comparison with Bayes factors

Annotate serotypes by status

Order serotypes

Plot

Compare a model fit that account for PCVs protecting against disease

Plot comparison of invasiveness estimates

Plot comparison of invasiveness estimates

Comparison with carriage duration

R Package Documentation

Browse R Packages

We want your feedback!

nickjcroucher/progressionEstimation
R package for the estimation of microbial progression rates