knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

# run this to update the pdf, as well
# rmarkdown::render("vignettes/varameta.Rmd", "all")
# for reproducibility
set.seed(39)

# packages
library(varameta)
library(tidyverse)
library(simeta)
library(panda)

# move to parameter when done with chunk-wise?
sample_size <- 10

Objective of the varameta:: package

The varameta:: package aims to bridge the toolchain gap from a dataset containing medians to a software package, such as metafor:: [@viechtbauerConductingMetaanalysesMetafor2010], for meta-analysis.

panda("varameta:: is for meta-analysing medians")

A minimal demonstration of calculating the variance of the sample median

panda("show me the code!")

Compute standard error of the sample median

# get a sample
a_sample <- rlnorm(sample_size,-1, 0.1)

# get standard error of the sample median
effect_se(
  centre = median(a_sample),
  spread = IQR(a_sample),
  n = length(a_sample),
  centre_type = "median",
  spread_type = "iqr"
)

Vectorised calculations for dataframes

Here we borrow a function from the companion simeta:: package. See below for details.

# generate random meta-analysis dataset
# one row per study
(ma_sample <- sim_stats() %>%
  # filter down to one group per study
  dplyr::filter(group == "control"))


ma_sample %>%
  # append a column with the standard error of the median for each study
  mutate(effect_se = pmap_dbl(
    list(centre = effect,
         spread = effect_spread,
         n = n),
    effect_se,
    centre_type = "median",
    spread_type = "iqr"
  ))

Calculate the standard error of mean or median based on effect, spread, and sample size

A wrapper function effect_se provides a quick method of calculating the error of an effect based on its measure of effect, spread, sample size.

Consider a randomly generated sample.

(a_sample <- rlnorm(sample_size,-1, 0.1))

# summary statistics for this sample
a_sample %>% log() %>% summary()

Taken from a lognormal distribution $\mathrm{lognormal}(-1, 0.1^2)$.

# taken from lognormal distribution
tibble(x = c(0, 1)) %>%
  ggplot(aes(x = x)) +
  geom_density(
    data = tibble(x = a_sample),
    aes(x = x),
    alpha = 0.5,
    fill = "darkgrey",
    colour = "darkgrey"
  ) +
  stat_function(
    fun = dlnorm,
    args = list(meanlog = -1,
                sdlog = 0.1),
    linetype = "dotted"
  ) + 
  labs(title = stringr::str_wrap("Sample from log-normal density", 30),
       subtitle = stringr::str_wrap("Sample density in grey fill; true density, black dotted line"),
       caption = "lognormal(-1, 0.01)")
# for comparison we shall store the true distribution stats
true_stats <- list(
  mean = exp(-1 + 0.01/2),
  median = exp(-1),
  variance = (exp(0.01) - 1) * exp(2*-1 + 0.01)
)

Estimate the standard error of the sample median

With the varameta:: package we execute the following code to estimate the standard error of the sample median, with sample summary statistics: median; IQR (interquartile range); length for sample size.

# standard error based on the median, interquartile range, and sample size
effect_se(
  centre = median(a_sample),
  spread = IQR(a_sample),
  n = length(a_sample),
  centre_type = "median",
  spread_type = "iqr"
)

The calculation can be adapted for the range.

# standard error based on the median, range, and sample size
effect_se(
  centre = median(a_sample),
  spread = abs(diff(range(a_sample))),
  n = length(a_sample),
  centre_type = "median",
  spread_type = "range"
)
panda("show me the math!")

This calculation

todo: eqn to finish off section! words around eqns to finish off section blah blah see companion manuscript..

[ \mathrm v(M) := \frac 1 {4n\left[ g\left(M; \hat{\theta}\right)\right]^2} ]

[ \hat \mu := \log(M). ]

[ G^{-1}(p; \mu, \sigma) = \exp(\sigma\Phi^{-1}(p) + \mu). ]

[ \hat{\sigma}^{(1)} := \frac 1 {\Phi^{-1}\left(\frac 3 4\right)} \log\left(\frac {\mathrm{iqr} e^{- \hat \mu} \pm \sqrt{\mathrm{iqr}^2 e^{-2 \hat \mu} + 4} } {2}\right) ]

[ \hat{\sigma}^{(2)} := \frac 1 {\Phi^{-1}\left(\frac {n - \frac 1 2} n\right)} \log\left[\frac {(x_{[n]} - x_{[1]}) e^{- \hat \mu} \pm \sqrt{(x_{[n]} - x_{[1]})^2 e^{-2\hat \mu} + 4} } {2}\right]. ]

Calculate the standard error of the sample mean

Now, we wish to calculate the standard error $s/\sqrt n$ of the sample mean, calcualted with the the sample standard deviation and the squared-root of the sample size.

# mean and sd
effect_se(
  centre = mean(a_sample),
  spread = sd(a_sample),
  n = length(a_sample),
  centre_type = "mean",
  spread_type = "sd"
)

# compare
sd(a_sample) /
  sqrt(length(a_sample))

# mean and var
effect_se(
  centre = mean(a_sample),
  spread = var(a_sample),
  n = length(a_sample),
  centre_type = "mean",
  spread_type = "var"
)

# compare
sqrt(var(a_sample) /
  length(a_sample))

Vectorised calculations for meta-analysis datasets

The functions of the varameta:: package integrate with tidyverse:: [@wickham_tidyverse_2017]. In this section, we show details for how to caculate the standard error of the effect when there are medians, but also when there are means included.

panda("vectorisation is my favourite part of R", panda = 14)

Here's how we might calculate the standard error for when all reported statistics are medians.

# borrowing from sister package simeta:: to simulate a dataset
(meta_data <- sim_stats() %>% 
  dplyr::filter(group == "control"))

# each row of the dataset is a study

# add a column with the standar error for each study:
meta_data %>% 
  mutate(
    effect_se = pmap_dbl(
      list(centre = effect, spread = effect_spread, n = n),
      effect_se,
      centre_type = "median",
      spread_type = "iqr"
    )
  )

# todo function this (after report)

This can be adapted for more complex situations by treating the centre_type and spread_type arguments as dynamic variables that can change from study to study. For this we turn to a real-life dataset, taken from a study in d-dimer levels in pre-eclampsia [@pinheiroDdimerPreeclampsiaSystematic2012].

pinheiro_data %>% 
  kableExtra::kable() %>% 
  kableExtra::kable_styling()

In these data, each row contains study and year variables to identify which manuscript the data were drawn from, and the measures of centre and spread reported. For control, (j = c), and treatment (or intervention), (j = t), groups, reported summary statistcs are provided: sample size n_j, median m_j, spread in reported form s_j and as a range s_j_d.

Suppose we are interested in the log-ratio of sample medians for this dataset.

todo: do calculation

Other estimators

panda(panda = 49,
      "varameta:: provides several estimators for the variances of the sample median")

When developing the estimator provided for the variance of the sample median, several other estimators were coded and included in the varameta:: package. In this section, we provide information about how compare and understand the estimators, and what inputs they work on.

Estimator key

The estimators provided in the varameta:: package each use a particular set of summary statistics as inputs.

minimum | first quartile | median | third quartile | maximum | sample size -|-|-|-|-|- (a)|(q_1)|(m)|(q_3)|(b)|(n)

The output is a variance estimation for each, but for all but effect_se, these provide estimates for mean and standard deviation, from the quartiles reported.

publication | input statistics | mean estimator | sd estimator - | -| - | - Prendergast (forthcoming, 2020) |(C_3 := {q_1, m, q_3; n}) and (C_1 := {a, m, b; n}) | NA | effect_se [@hozoEstimatingMeanVariance2005] | (C_1 := {a, m, b; n}) | hozo_mean | hozo_se [@blandEstimatingMeanStandard2014] | (C_2 := {a, q_1, m, q_3, b; n})| bland_mean | bland_se [@wanEstimatingSampleMean2014] | (C_1 := {a, m, b; n})| wan_mean_C1 | wan_se_C1 [@wanEstimatingSampleMean2014] | (C_2 := {a, q_1, m, q_3, b; n})| wan_mean_C2 | wan_se_C2 [@wanEstimatingSampleMean2014]| (C_3 := {q_1, m, q_3; n})| wan_mean_C3 | wan_se_C3

The References section has details of all publications referred to in the above table.

Hozo et al.'s estimators

Hozo et al. [@hozoEstimatingMeanVariance2005] provide a method for estimating the sample mean and the standard deviation from the sample minimum, maximum, and sample size.

hozo_mean(
  min(a_sample),
  median(a_sample),
  max(a_sample)
)

hozo_se(
  min(a_sample),
  median(a_sample),
  max(a_sample),
  n = length(a_sample)
)

Bland's estimators

Wan et al.'s estimators

These estimators take various sample quartiles and return estimates for the mean and the standard deviation.

Wan et al. provide estimators for the same summary sample sets as Hozo et al. and Bland's methods, as well as a novel method for when the interquartile range, but not the range, is available. All estimators require the sample size for the estimate of the standard error of the mean.

For comparison, we recall the true statistics we calculated earlier.

# mean for the distribution sampled from
true_stats %>% pluck("mean")

# variance for the distribution sampled from
true_stats %>% pluck("variance")

The estimator for set $C_1 = {a, m, b; n}$ requires the range and the median. This is their improvement on Hozo's method.

wan_mean_C1(
  min(a_sample),
  median(a_sample),
  max(a_sample)
)

wan_se_C1(
  min(a_sample),
  max(a_sample),
  n = length(a_sample)
)

The second estimator requires the same summary statistics, (C_2 := {a, q_1, m, q_3, b; n}), as Bland's method.

wan_mean_C2(
  min(a_sample),
  quantile(a_sample, 0.25),
  median(a_sample),
  quantile(a_sample, 0.75),
  max(a_sample)
)

wan_se_C2(
  min(a_sample),
  quantile(a_sample, 0.25),
  median(a_sample),
  quantile(a_sample, 0.75),
  max(a_sample),
  n = length(a_sample)
)

The third method requries the summary statistics (C_3 := {q_1, m, q_3; n}), a case not considered by Hozo or Bland.

wan_mean_C3(
  quantile(a_sample, 0.25),
  median(a_sample),
  quantile(a_sample, 0.75)
)

wan_se_C3(
  quantile(a_sample, 0.25),
  median(a_sample),
  quantile(a_sample, 0.75),
  n = length(a_sample)
)

References



softloud/varameta documentation built on Feb. 8, 2023, 12:12 a.m.