In jlucasmckay/JLmisc: A Package of Miscellaneous R Functions

knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 7, fig.align = "center", echo = FALSE)

Datasets used for this analysis

library(JLmisc)
pctEmpty = function(x) scales::percent((sum(is.na(x))/sum(length(x))))
m = pctEmpty(dystonia$`Biomarker Level`)
n = sum(!is.na(dystonia$`Biomarker Level`))
nMiss = sum(is.na(dystonia$`Biomarker Level`))

All data in these analyses were taken from the dystonia dataset bundled with the JLmisc R package. This dataset was assembled from .csv files provided by Laura Scorr, M.D.

Biomarker data

Analyses considered measurements of r length(unique(dystonia$Assay)) unique biomarkers in r length(unique(dystonia$Sample)) individuals. Biomarkers were measured in duplicate. The initial sample available for analysis included r n unique measurements. In addition, N=r nMiss additional measurements were missing, and were assummed to be missing at random for purposes of analysis. Biomarker data were inspected visually to confirm the absence of outlier measurements. Duplicate observations were then averaged so that the level of each biomarker for each participant was represented by a single measurement.^[ In this dataset, r m of the individual readings were missing. Biomarkers for which only one observation of a given patient was available were flagged to enable post-hoc sensitivity analyses if appropriate. In the dataset, the data are saved two ways. In both cases, the average of the two replicates is used if possible. If only one replicate is available, `Biomarker Level` takes that value; `Biomarker Level (Duplication Required)` is empty. ]

Analytic plan

Descriptive statistics for each biomarker were calculated for each biomarker stratified in two ways: 1) stratification by Cervical Dystonia (CD) vs. Control, and, 2) stratification by Cervical Dystonia + Hypothyroidism (CD/Hypo) vs. CD vs. Control.

Variation in average expression of each biomarker with the presence of CD were assessed with linear models.^[function lm in R software] Main models included CD as a dichotomous variable. No formal adjustment for multiple comparisons was performed in primary analyses. P values adjusted for False Discovery Rate are presented for reference. Addtional linear models were calculated incorporating an interaction term coding for the additive effect of hypothyroid disorder in addition to that of CD.

library(tidyverse)
library(modelr)
library(tidyverse)
library(broom)

`%contains%` = function(s, pattern) grepl(pattern, s)
`%omits%` = function(s, pattern) !grepl(pattern, s)

dys = left_join(
  dystonia %>% group_by(CD, Group, Assay, Sample) %>% summarize(`Biomarker Level` = mean(`Biomarker Level`, na.rm = TRUE)) %>%  ungroup(),
  dystonia %>% group_by(CD, Group, Assay, Sample) %>% summarize(`Biomarker Level (Duplication Required)` = mean(`Biomarker Level`, na.rm = FALSE)) %>%  ungroup()
) %>% mutate(`Assay Name` = make.names(Assay))

dysWide = dys %>% select(CD, Group, Sample, `Assay Name`, `Biomarker Level`) %>% group_by(CD, Group, Sample) %>% pivot_wider(names_from = `Assay Name`, values_from = `Biomarker Level`)
biomarkerVariableNames = names(dysWide)[-1:-3]

# this model takes a dataset and returns the p value of a t-test comparing any CD to control.
cdVsControlModel = function(d = dys %>% filter(biomarker == "IL.10")){
  # add call to relevel to make the reference level first
  mod = lm(`Biomarker Level` ~ CD, data = d %>% ungroup() %>% mutate(CD = relevel(CD, "Control"))) %>% tidy()
  mod %>% pull(p.value) %>% tail(1)
}

# this model takes a dataset and returns two p values: any CD to control, and an interaction term for presence of hypothyroid.
interactionModel = function(d = dys %>% filter(biomarker == "IL.10")){
  # create variables for any CD and an interaction term for hypothyroid. 
  d = d %>%
    ungroup() %>% 
    mutate(CD = Group %contains% "CD") %>%
    mutate(Hypo = Group %contains% "Hypo") %>% 
    mutate(Group = relevel(Group, "Control"))
  # threeGroupModel = lm(`Biomarker Level` ~ Group, data = d) %>% tidy()
  mod = lm(`Biomarker Level` ~ CD + Hypo, data = d) %>% tidy()
  mod %>% pull(p.value) %>% tail(2)
}

# create a nested data structure
byBiomarker = dys %>% group_by(`Assay Name`) %>% nest()

# apply the linear model to each biomarker (this is a two-sample t-test)
byBiomarker = byBiomarker %>% mutate(cdVsControlPValue = purrr::map(data, cdVsControlModel)) %>% select(`Assay Name`, cdVsControlPValue, everything()) %>% mutate(cdVsControlPValue = unlist(cdVsControlPValue))

# Adjust the p values with FDR
byBiomarker$cdVsControlPValueFDR = p.adjust(byBiomarker$cdVsControlPValue, method = "fdr")

# rename the p values to be more descriptive
byBiomarker = byBiomarker %>% rename(pTTest = cdVsControlPValue) %>% rename(pTTestFDR = cdVsControlPValueFDR)

# apply the interaction model to each biomarker (this is a two-sample t-test)
byBiomarker = byBiomarker %>% mutate(interactionPValue = purrr::map(data, interactionModel)) %>% mutate(pLinearModelCD = interactionPValue[[1]][1]) %>% mutate(pLinearModelHypo = interactionPValue[[1]][2])

# sort and rename variables 
results = byBiomarker %>% select(`Assay Name`, pTTest, pTTestFDR, pLinearModelCD, pLinearModelHypo) %>% 
  arrange(pTTest) %>% 
  rename(
    `CD vs. Control` = pTTest,
    `CD vs. Control, FDR` = pTTestFDR,
    `CD vs. Control, interaction model` = pLinearModelCD,
    `CD/Hypothyroid vs. CD, interaction model` = pLinearModelHypo
  )

Graphical comparison of biomarkers across Cervical Dystonia (CD) and control samples

# calculate which group is higher for each biomarker
groupLevels = dys %>%
  group_by(`Assay Name`, CD) %>% summarize(maxVal = max(`Biomarker Level`, na.rm = TRUE)) %>% ungroup() %>%
  group_by(`Assay Name`) %>% summarize(greaterVal = max(maxVal), lesserGroup = CD[which.min(maxVal)]) %>% mutate(CD = factor(lesserGroup)) %>% rename(`Biomarker Level` = greaterVal) %>%
  full_join(results %>% mutate(pStr = paste0(
    "pt=",round(`CD vs. Control`,3),"\n",
    "pCD=",round(`CD vs. Control, interaction model`,3),"\n",
    "pHyp=",round(`CD/Hypothyroid vs. CD, interaction model`,3))
  ) %>% select(`Assay Name`, pStr) %>% rename(label = pStr))

twoGroupPlot = function(biomarkersToPlot = head(results$`Assay Name`, 6)){

  dataset = dys %>% filter(`Assay Name` %in% biomarkersToPlot) %>% mutate(`Assay Name` = ordered(`Assay Name`, levels = biomarkersToPlot))
  labels = groupLevels %>% filter(`Assay Name` %in% biomarkersToPlot) %>% mutate(`Assay Name` = ordered(`Assay Name`, levels = biomarkersToPlot))

  # %>% mutate(biomarker = factor(biomarker, levels = biomarkersToPlot, ordered = TRUE))
  ggplot(
    dataset,
    aes(x = CD, y = `Biomarker Level`, color = CD)) + 
    geom_boxplot(outlier.shape = NA, na.rm = TRUE) + 
    geom_jitter(width = 0.2, na.rm = TRUE, mapping = aes(shape = Group)) +
    facet_wrap(~`Assay Name`, scales = "free") + 
    theme_minimal() +
    theme(axis.text.x = element_blank()) +
    geom_text(aes(label = label), data = labels, size = 2, show.legend = FALSE, color = "black", hjust = "right", vjust = "top", position = position_nudge(x = +0.4))
}
twoGroupPlot()

Linear model numerical results

library(kableExtra)
results %>% 
  mutate_if(is.numeric, function(x) round(x,3)) %>% 
  kable() %>%
  kable_styling() %>% 
  add_header_above(c(" " = 1, "P Values, Two-Level Model" = 2, "P Values, Interaction Model" = 2))

Descriptive statistics

Stratification by CD vs. Control

suppressMessages(library(table1))
table1Formula = paste(paste("~", paste(biomarkerVariableNames, collapse = " + ")),"| factor(CD)") %>% as.formula()
table1(table1Formula, data = dysWide, overall = NULL)

Stratification by CD/Hypo vs. CD vs. Control

suppressMessages(library(table1))
table1Formula = paste(paste("~", paste(biomarkerVariableNames, collapse = " + ")),"| factor(Group)") %>% as.formula()
table1(table1Formula, data = dysWide, overall = NULL)