title: "Modeling Example for BioDataCore Workflow" author: "Mir Henglin" date: "r Sys.Date()" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Modeling Setup for BioDataCore Workflow} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8}


# Data Generation

First, let's generate a fake dataset to do our modeling on:

set.seed(12345)
library(dplyr)
library(biodatacoreUtils)
library(purrr)
library(biodatacoreMTM)
library(tidyr)

dat$metabolites <- dat <-  list()


dat$metabolites$data <- 
data_frame(
id_no = 1:100,
mzid_1 = c(runif(90, 0, 10000), rep(0, 10)), 
mzid_2 = c(runif(50, 1000, 20000), rep(0, 50)), 
mzid_3 = c(runif(90, 0, 1000), rep(0, 10))
)

dat$metabolites$info <-
data_frame(
label = glue::glue('mzid_{1:3}'),
metabolite_type = c('Eicosanoids', 'Eicosanoids', 'Bile Acids'),
mz = runif(3, 200, 700),
RT = runif(3, 1, 7)
)

dat$clinical <-
data_frame(
id_no = 1:100,
sex = rbinom(100, 1, 0.5),
age = runif(100, 18, 85),
diab =  c(rbinom(80, 1, 0.25), rep(NA, 20)),
sbp = runif(100, 80, 160)
)

# shuffle to distribute NAs
dat$metabolites$data %<>% mutate_all(~sample(., length(.)))
dat$clinical         %<>% mutate_all(~sample(., length(.)))

dat$merged <-
left_join(dat$metabolites$data, dat$clinical, by = 'id_no')


walk(dat, ~print(head(.)))

Data cleaning

Lets say we want to look at the effects of Eicosanoids on Diab and SBP, controlling for age and sex. Lets get a dataset that contains only those variables.

eics <-
  dat$metabolites$info %>%
  filter(metabolite_type == 'Eicosanoids') %>% 
  pull(label)

control_vars <-
  c('age', 'sex')

responses <-
  c('sbp', 'diab')

modeling_dat <- 
  dat$merged %>%
  select(!!responses, !!control_vars, !!eics)

head(modeling_dat)

The first thing we should do is check that each of our variables are of the proper form. We expect the Diabetes and Sex variables to be factors. We expect age, sbp to be numeric and non-negative. We expect that the metabolite variables metabolite variables to be numeric, and to have no NAs or 0s in them. We can use the check_at function to accomplish all of these things easily

factor_vars <- c('diab', 'sex')
positive_numeric_vars <- c('sbp', 'age', eics)

checks <-
  list(
    factors = list(vars = factor_vars, f = is.factor),
    numeric = list(vars = positive_numeric_vars, f = is.numeric),
    positive = list(vars = positive_numeric_vars, f = ~is_positive(., na_rm = TRUE)),
    no_na = list(vars = eics, f = ~!anyNA(.)),
    no_zeros = list(vars = eics, f = ~!any(. == 0))
  )



check_results <- check_at(modeling_dat, zip(checks)[[1]], zip(checks)[[2]])


# 
# # Convert diab and sex to factors
# modeling_dat %<>%
#   mutate_at(vars(diab, sex), as.factor)

We can take a peek at the results using dplyr::glimpse

glimpse(check_results)

As we can see, not all checks are met! We need to convert diab and sex to factors. And we need to handle the 0s in the mzid variables.

modeling_dat %<>%
  mutate_at(factor_vars, as.factor)

Convention for BDC data anlalyses says that missing values in metabolite varaibles should be imputed with one-quarter the minimum non-zero value.

min_over_4 <- function(x) {
  min(x, na.rm = TRUE) / 4
}

modeling_dat %<>%
  mutate_at(eics, fill_zero, .fill = NA) %>%
  mutate_at(eics, fill_na, .fill = min_over_4)

Now we check again.

check_results <- check_at(modeling_dat, zip(checks)[[1]], zip(checks)[[2]])
glimpse(check_results)

# make this more informative

Alternatively, using code:

all(unlist(check_results))

Other preprocessing:

modeling_dat %<>%
  mutate_at(eics, log) %>%
  mutate_if(is.numeric, scale2)

modeling_dat

Done!

Model Specification

As a reminder, we want to look at the effects of Eicosanoids on Diab and SBP, controlling for age and sex. If we wrote those modeling formulas out by hand, it would be messy, and require a lot of repeated typing. That is a recipe for mistakes. Instead, we will use biodatacoreUtils::cross_at and biodatacoreUtils::f_cross_sub to create a list of formulas.

models <- list(sample_model = response ~ control_vars + eic)

substitution_rules <-
  cross_at(list(response = responses, eic = eics, control_vars = control_vars), 1:2)


models <- f_cross_sub(models, substitution_rules)

models

Another way. More obvious organization, in exchange for a little more typing.

models <- list(
  sbp = sbp ~ control_vars + eic,
  diabetes = diab ~ control_vars + eic
)


substitution_rules <-
  cross_at(list(eic = eics, control_vars = control_vars), 1) 

models <- f_cross_sub(models, substitution_rules)

models

Modeling

Now that we have our dataset and formulas, we can model!

.model <- function(fo, data) {
  response <- deparse2(rlang::f_lhs(fo))
  fam <- if (response == 'sbp') gaussian() else binomial()
  out <- glm(fo, family = fam, data)

  fam_sub <- parse(text = glue::glue('{fam$family}(link = {fam$link})'))[[1]]
  out$call <- biodatacoreUtils::substitute_q(out$call, list(fo = fo, fam = fam_sub, data = substitute(data)))
  out
}


linear_models <-
  modify_depth(models, 2, ~.model(., modeling_dat))


linear_models

Tidying

tidy_models <- 
  linear_models %>%
  modify_depth(2, tidy2)

contexts <- 
  linear_models %>%
  modify_depth(2, contextualize)


tidy_models
contexts
bindr <- 
  . %>%
  map(bind_rows, .id = 'model_number') %>% 
  bind_rows(.id = 'model_name')

tidy_models %<>% bindr
contexts %<>% bindr

out <- left_join(tidy_models, contexts)

out

Add Ons

out %<>%
  add_bonf_pv(p_value = 'p.value') %>%
  add_fdr_pv(p_value = 'p.value') %>%
  add_nlog10_pv(p_value = 'p.value')

out


biodatacore/biodatacoreMTM documentation built on May 12, 2019, 8:41 a.m.