# Global knitr chunk options applied to every code chunk in this document.
knitr::opts_chunk$set(
  collapse = TRUE,   # merge each chunk's source and text output into one block
  comment = "#>",    # prefix printed output lines with "#>"
  message = TRUE,    # show messages emitted by the code in the rendered output
  error = TRUE       # show errors in the output instead of aborting the render
)

Univariate distribution checks

This section reports a series of univariate summary checks of the CRASH-2 dataset.

library(here)
library(tidyverse)
library(Hmisc)
source(here("R", "ida_plot_univar.R"))  ## function to plot univariate summaries. 

# Predictor names grouped by measurement type; the vectors are used to select
# columns for the grouped summaries.
# NOTE: `pyhiso_vars` is a misspelling of "physio"; the name is kept as-is
# because other chunks refer to the object by this name.
injury_vars <- c(
  "injurytime",
  "injurytype"
)
pyhiso_vars <- c(
  "sbp",
  "hr",
  "rr",
  "gcs",
  "cc"
)
demog_vars <- c(
  "age",
  "sex"
)

# Read the saved analysis data from <project root>/data/a_crash2.rda; the
# chunks below expect this to provide the `a_crash2` object.
load(file = here::here("data", "a_crash2.rda"))

Data set overview

Using the Hmisc describe function, we provide an overview of the data set. The descriptive report also provides histograms of continuous variables. For ease of scanning the information, we group the report by measurement type.

Demographic variables

# Hmisc::describe() overview of the demographic predictors, rendered as HTML.
a_crash2 %>%
  dplyr::select(dplyr::all_of(demog_vars)) %>%
  Hmisc::describe(
    descript = "Demographic variables"
  ) %>%
  Hmisc::html(
    size = 80
  )

Physiological measurements

# Hmisc::describe() overview of the physiological predictors, rendered as HTML.
a_crash2 %>%
  dplyr::select(dplyr::all_of(pyhiso_vars)) %>%
  Hmisc::describe(
    descript = "Physiological measurements"
  ) %>%
  Hmisc::html(
    size = 80
  )

Characteristics of injury

# Hmisc::describe() overview of the injury-related predictors, rendered as HTML.
a_crash2 %>%
  dplyr::select(dplyr::all_of(injury_vars)) %>%
  Hmisc::describe(
    descript = "Characteristics of injury"
  ) %>%
  Hmisc::html(
    size = 80
  )

Categorical variables

We now provide a closer visual examination of the categorical predictors.

# Dot-and-line chart of the percentage of subjects in each level of the
# categorical predictors (sex, injury type), one facet per variable.
a_crash2 %>%
  dplyr::select(sex, injurytype) %>%
  # Convert every column to a factor and turn missing values into an explicit
  # "NA" level so they are counted and drawn like any other category.
  # across() replaces the superseded mutate_all().
  dplyr::mutate(
    dplyr::across(
      dplyr::everything(),
      function(x) {
        forcats::fct_explicit_na(forcats::as_factor(x), na_level = "NA")
      }
    )
  ) %>%
  tidyr::pivot_longer(
    dplyr::everything(),
    names_to = "var",
    values_to = "value",
    values_drop_na = FALSE
  ) %>%
  dplyr::group_by(var, value) %>%
  # Keep the result grouped by `var`: the percentages below must be computed
  # within each variable. Stating `.groups` makes that dependency explicit and
  # avoids the regrouping message.
  dplyr::summarize(N = dplyr::n(), .groups = "drop_last") %>%
  dplyr::mutate(
    freq = N / sum(N),
    pct = round((freq * 100), 1),
    axis_lab = paste0(value, ' ', '(N = ', N, ')'),
    var_label = dplyr::case_when(
      var == "sex" ~ "Sex",
      var == "injurytype" ~ "Injury type"
    )
  ) %>%
  # Grouping is no longer needed once the within-variable percentages exist.
  dplyr::ungroup() %>%
  ggplot(aes(
    # Order categories by their percentage within the plot.
    x = reorder(axis_lab, pct),
    y = pct,
    label = pct
  )) +
  geom_text(nudge_y = 7) +
  geom_pointrange(
    aes(ymin = 0, ymax = pct),
    alpha = 1,
    size = 1,
    color = "grey"
  ) +
  geom_point(
    color = "firebrick2",
    alpha = 0.6,
    size = 3
  ) +
  ylab("Percentage (%)") +
  scale_y_continuous(limits = c(0, 100)) +
  labs(caption = "Number of subjects with a non-missing value reported in brackets.\nNA = missing") +
  facet_wrap(~ var_label, ncol = 1, scales = "free_y") +
  coord_flip() +
  theme_minimal(base_size = 12) +
  theme(
    axis.title.y = element_blank(),
    panel.grid.minor = element_blank()
  )

Categorical ordinal plots

The Glasgow coma score, an ordinal categorical variable, is also displayed separately.

# Dot-and-line chart of the percentage of subjects at each Glasgow coma score.
a_crash2 %>%
  # Count subjects per score in one step instead of reshaping a single column
  # and summarising by group (which also emitted a regrouping message).
  # Rows come back in factor-level order, with missing values as their own row.
  dplyr::count(value = as_factor(gcs), name = "N") %>%
  dplyr::mutate(
    freq = N / sum(N),
    pct = round((freq * 100), 1),
    # Labels become factor levels in row order, so the axis follows the score
    # order rather than the alphabetical order of the label text.
    axis_lab = as_factor(paste0(value, ' ', '(N = ', N, ')'))
  ) %>%
  ggplot(aes(
    x = axis_lab,
    y = pct,
    label = pct
  )) +
  geom_text(nudge_y = 7) +
  geom_pointrange(
    aes(ymin = 0, ymax = pct),
    alpha = 1,
    size = 1,
    color = "grey"
  ) +
  geom_point(
    color = "firebrick2",
    alpha = 0.6,
    size = 3
  ) +
  ylab("Percentage (%)") +
  scale_y_continuous(limits = c(0, 100)) +
  labs(
    title = "Glasgow coma score (point scale)",
    caption = "Number of subjects with a non-missing value reported in brackets.\nNA = missing"
  ) +
  coord_flip() +
  theme_minimal(base_size = 12) +
  theme(
    axis.title.y = element_blank(),
    panel.grid.minor = element_blank()
  )

Continuous variables

We now provide a closer visual examination of the continuous predictors.

# Histograms of the continuous predictors, one facet per variable with free
# axes. The narrow fixed bin width (0.5, centred on 0) keeps individual values
# visible in the bars.
a_crash2 %>%
  dplyr::select(age, sbp, rr, cc, hr, injurytime) %>%
  # Coerce every column to plain numeric so the columns can be stacked into a
  # single value column. across() replaces the superseded mutate_all().
  dplyr::mutate(dplyr::across(dplyr::everything(), as.numeric)) %>%
  tidyr::pivot_longer(
    dplyr::everything(),
    names_to = "var",
    values_to = "value"
  ) %>%
  dplyr::mutate(
    var_label = dplyr::case_when(
      var == "age" ~ "Age (years)",
      var == "sbp" ~ "Systolic blood pressure (mmHg)",
      var == "hr" ~ "Heart rate (1/min)",
      var == "rr" ~ "Respiratory rate (1/min)",
      var == "cc" ~ "Central capillary refill time (seconds)",
      # Unit corrected from "days": this document's own section heading for
      # injurytime is "Hours since injury".
      var == "injurytime" ~ "Injury time (hours)"
    )
  ) %>%
  ggplot(aes(value)) +
  ylab("Number of subjects") +
  geom_histogram(
    binwidth = 0.5,
    center = 0,
    alpha = 0.6,
    fill = "firebrick2"
  ) +
  facet_wrap(~ var_label, scales = "free", ncol = 2) +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    panel.grid.minor = element_blank()
  )

There is evidence of digit preference, which we explore further with targeted summaries. More detailed univariate summaries for the variables of interest are provided below.

Age

## ida_plot_univar() (sourced from R/ida_plot_univar.R) plots a strip plot,
## histogram and boxplot, including the five-number summary, for one variable.
ida_plot_univar(a_crash2, "age")

There are five patients under the age of 17 (the age threshold in the study's inclusion criteria), including one patient aged 1.

Blood pressure

# Univariate summary plot for systolic blood pressure (sbp).
# NOTE(review): bin_width is presumably the histogram bin width used inside
# ida_plot_univar() -- confirm in R/ida_plot_univar.R.
ida_plot_univar(a_crash2, "sbp", bin_width = 0.7)

Respiratory rate

# Univariate summary plot for respiratory rate (rr).
# NOTE(review): n_dodge presumably staggers axis labels over several rows to
# avoid overlap -- confirm in R/ida_plot_univar.R.
ida_plot_univar(a_crash2, "rr", n_dodge = 2)

Heart rate

# Univariate summary plot for heart rate (hr), using the helper's defaults.
ida_plot_univar(a_crash2, "hr")

Central capillary refill time

# Univariate summary plot for central capillary refill time (cc).
# NOTE(review): bin_width is presumably the histogram bin width used inside
# ida_plot_univar() -- confirm in R/ida_plot_univar.R.
ida_plot_univar(a_crash2, "cc", bin_width = 0.5)

Hours since injury

# Univariate summary plot for time since injury (injurytime).
# NOTE(review): bin_width is presumably the histogram bin width and n_dodge
# presumably staggers axis labels -- confirm in R/ida_plot_univar.R.
ida_plot_univar(a_crash2, "injurytime", n_dodge = 3, bin_width = 0.25)

Section session info

# Print the R version, platform and attached/loaded package versions used to
# build this section, for reproducibility.
sessionInfo()


bailliem/ida22 documentation built on Jan. 2, 2022, 12:14 a.m.