In bsaul/elktoeChemistry: Data and code for Elktoe Chemistry project

knitr::opts_chunk$set(
  collapse = TRUE,
  cache    = FALSE,
  comment = "#>"
)

Preparing analysis data

library(elktoeChemistry)
library(dplyr)
valve_data   <- readRDS(params$inputs$valve_data)
element_info <- readRDS(params$inputs$element_info)
lod          <- readRDS(params$inputs$lod)

Settings

@ Signal Filters

A list containing the chosen signal filters used as sensitivity analyses:

signal_filters <- list(

  base   = function(x, ds) identity(x),

  mad_10 = function(x, ds) {
    pracma::hampel(x, k = 10, t0 = 3)$y
  },

  avg5_trunc_3sd = function(x, ds){
    rm <- zoo::rollmean(c(rep(0, 4), x), k = 5)
    rs <- zoo::rollapply(c(rep(0, 4), x), width = 5, FUN = sd)
    outliers <- abs(x - rm) > 3*sd(x)
    x[outliers] <- rm[outliers]
    x
  },

  gam = function(x, ds){
    ds$y <- x
    p <- mgcv::gam(log(y) ~ s(distance, bs = "ts") + layer, data = ds)
    as.numeric(exp(predict(p)))
  }
)

Create analysis data function

The output of the following workflow produces a function that returns data ready to be analyzed. This function has four arguments:

contrast determines how the experimental units are groups. Options are:
- "all": each site is an exposure level
- "nobaseline": each site is an exposure level, baseline units are removed
- "baseline_v_sites": baseline units are compared to sites as a single unit
- "sites": compares Little Tennessee vs Tuckasegee, grouping sites within each river
- "litn": compares sites within the Little Tennessee only
- "tuck": compares sites within the Tuckasegee only
group_by_valve: an indicator of whether to group observations by valve (i.e. by specimen) (TRUE) or by transect (FALSE). Defaults to FALSE.
test_data_FUN: a function(data){...} applied within the pipeline used to create test statistic data. See example.
...: dots are passed to filter_analysis_data to filter observations by specimens, elements, analysis_group, transect layers, or signal (see above).

valve_data %>%
  dplyr::ungroup() %>%
  ## Keep only necessary variables ####
  dplyr::select(
    # identifiers
    id, transect, river, site, site_num, species, 
    # baseline covariates
    starts_with("baseline"), n_annuli, drawer,
    # outcomes
    dead, prop_weight_lost, moribund, final_status,
    # analysis groupings
    starts_with("agrp"),
    # chemistry and measurements
    chemistry, distance, lod
  ) %>%

  ## Make character variables easier to type ####
  dplyr::mutate(
    species = if_else(species == "A. raveneliana", "Arav", "Lfas")
  ) %>%

  ## Censor concentrations at lower limit of detection ####
  # Drop unneeded variables (e.g. censoring)
  dplyr::mutate(
    chemistry  = purrr::map2(
      .x = chemistry,
      .y = lod,
      .f = ~ {
        apply_lod(.x, .y) %>% dplyr::select(-ends_with("_censor"))
      })
  ) %>%

  ## Clean up distance data ####
  # Keep only needed variables
  dplyr::mutate(
    distance = purrr::map(
      .x = distance,
      .f = ~ dplyr::select(.x, distance, layer, annuli, obs)
    )
  )  %>%
  ## Apply signal filters ####
  dplyr::mutate(
    chemistry = purrr::map2(
      .x = chemistry,
      .y = distance,
      .f = function(elements, distance){
        purrr::map(
          .x = elements[-1],
          .f = function(el){
            purrr::map(
              .x = signal_filters,
              .f = ~ .x(el, distance))
          }
        )
      }
    )
  ) %>%

  ## Append LOD data to chemistry data ####
  # used for IPCW estimators
  # Note that at this stage, this simply creates a copy of the LOD data for 
  # each observation; however, at the next stage (conversion to different scale),
  # the lod will vary depending on the layer.
  dplyr::mutate(
    chemistry = purrr::map2(
      .x = chemistry,
      .y = lod,
      .f = function(ch, ld){
        purrr::imap(
          .x = ch,
          .f = ~ {
            lod_dat <- rep(ld[ld$element == .y, "lod", drop = TRUE], length(ch[[1]][[1]]))
            c(.x, list(lod = lod_dat))
          } 
        )
      }
    )
  ) %>%

  ## Convert values to mmmol per Ca mol ####
  dplyr::mutate(
    chemistry = purrr::map2(
      .x = chemistry,
      .y = distance,
      .f = function(ch, ds){
        cawtpct <- dplyr::if_else(ds[["layer"]] == "ncr", 39.547395, 40.078)
        # apply to each element
        purrr::imap(
          .x = ch,
          .f = function(vals, element){
            # apply to each signal
            mass <- element_info[element_info[["element"]] == element, "mass", drop = TRUE]
            purrr::map(
              .x = vals,
              .f = ~ ppm_to_mmol_camol(ppm = .x, gmol = mass, ca_wt_pct = cawtpct)
            )
          }
        )
      }
    )
  ) %>%

  ## Add the transect filter functions #### 
  dplyr::mutate(
    transect_filter = purrr::map(distance, ~ create_transect_filter(.x))
  )  %>%
  create_analysis_data_preparer() ->
  analysis_data 

saveRDS(analysis_data, file = params$outputs$analysis)

Example usage of analysis data `function`

res <- 
analysis_data(
  contrast = "all",
  test_data_FUN = function(data) data,
  agrp      = "all",
  elements  = "Ba_ppm_m138", # "all" returns all elements
  signals   = c("base", "gam"), 
  group_by_valve = TRUE,
  transect_opts  = list(
    .layers    = "psm",
    .min_n_obs = 15L
  )
)