title: "Modeling Example for BioDataCore Workflow"
author: "Mir Henglin"
date: "r Sys.Date()
"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Modeling Setup for BioDataCore Workflow}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
# Data Generation
First, let's generate a fake dataset to do our modeling on:
set.seed(12345) library(dplyr) library(biodatacoreUtils) library(purrr) library(biodatacoreMTM) library(tidyr) dat$metabolites <- dat <- list() dat$metabolites$data <- data_frame( id_no = 1:100, mzid_1 = c(runif(90, 0, 10000), rep(0, 10)), mzid_2 = c(runif(50, 1000, 20000), rep(0, 50)), mzid_3 = c(runif(90, 0, 1000), rep(0, 10)) ) dat$metabolites$info <- data_frame( label = glue::glue('mzid_{1:3}'), metabolite_type = c('Eicosanoids', 'Eicosanoids', 'Bile Acids'), mz = runif(3, 200, 700), RT = runif(3, 1, 7) ) dat$clinical <- data_frame( id_no = 1:100, sex = rbinom(100, 1, 0.5), age = runif(100, 18, 85), diab = c(rbinom(80, 1, 0.25), rep(NA, 20)), sbp = runif(100, 80, 160) ) # shuffle to distribute NAs dat$metabolites$data %<>% mutate_all(~sample(., length(.))) dat$clinical %<>% mutate_all(~sample(., length(.))) dat$merged <- left_join(dat$metabolites$data, dat$clinical, by = 'id_no') walk(dat, ~print(head(.)))
Lets say we want to look at the effects of Eicosanoids on Diab and SBP, controlling for age and sex. Lets get a dataset that contains only those variables.
eics <- dat$metabolites$info %>% filter(metabolite_type == 'Eicosanoids') %>% pull(label) control_vars <- c('age', 'sex') responses <- c('sbp', 'diab') modeling_dat <- dat$merged %>% select(!!responses, !!control_vars, !!eics) head(modeling_dat)
The first thing we should do is check that each of our variables are of the proper form. We expect the Diabetes and Sex variables to be factors. We expect age, sbp to be numeric and non-negative. We expect that the metabolite variables metabolite variables to be numeric, and to have no NAs or 0s in them. We can use the check_at
function to accomplish all of these things easily
factor_vars <- c('diab', 'sex') positive_numeric_vars <- c('sbp', 'age', eics) checks <- list( factors = list(vars = factor_vars, f = is.factor), numeric = list(vars = positive_numeric_vars, f = is.numeric), positive = list(vars = positive_numeric_vars, f = ~is_positive(., na_rm = TRUE)), no_na = list(vars = eics, f = ~!anyNA(.)), no_zeros = list(vars = eics, f = ~!any(. == 0)) ) check_results <- check_at(modeling_dat, zip(checks)[[1]], zip(checks)[[2]]) # # # Convert diab and sex to factors # modeling_dat %<>% # mutate_at(vars(diab, sex), as.factor)
We can take a peek at the results using dplyr::glimpse
glimpse(check_results)
As we can see, not all checks are met! We need to convert diab
and sex
to factors. And we need to handle the 0
s in the mzid variables.
modeling_dat %<>% mutate_at(factor_vars, as.factor)
Convention for BDC data anlalyses says that missing values in metabolite varaibles should be imputed with one-quarter the minimum non-zero value.
min_over_4 <- function(x) { min(x, na.rm = TRUE) / 4 } modeling_dat %<>% mutate_at(eics, fill_zero, .fill = NA) %>% mutate_at(eics, fill_na, .fill = min_over_4)
Now we check again.
check_results <- check_at(modeling_dat, zip(checks)[[1]], zip(checks)[[2]]) glimpse(check_results) # make this more informative
Alternatively, using code:
all(unlist(check_results))
Other preprocessing:
modeling_dat %<>% mutate_at(eics, log) %>% mutate_if(is.numeric, scale2) modeling_dat
Done!
As a reminder, we want to look at the effects of Eicosanoids on Diab and SBP, controlling for age and sex. If we wrote those modeling formulas out by hand, it would be messy, and require a lot of repeated typing. That is a recipe for mistakes. Instead, we will use biodatacoreUtils::cross_at
and biodatacoreUtils::f_cross_sub
to create a list of formulas.
models <- list(sample_model = response ~ control_vars + eic) substitution_rules <- cross_at(list(response = responses, eic = eics, control_vars = control_vars), 1:2) models <- f_cross_sub(models, substitution_rules) models
Another way. More obvious organization, in exchange for a little more typing.
models <- list( sbp = sbp ~ control_vars + eic, diabetes = diab ~ control_vars + eic ) substitution_rules <- cross_at(list(eic = eics, control_vars = control_vars), 1) models <- f_cross_sub(models, substitution_rules) models
Now that we have our dataset and formulas, we can model!
.model <- function(fo, data) { response <- deparse2(rlang::f_lhs(fo)) fam <- if (response == 'sbp') gaussian() else binomial() out <- glm(fo, family = fam, data) fam_sub <- parse(text = glue::glue('{fam$family}(link = {fam$link})'))[[1]] out$call <- biodatacoreUtils::substitute_q(out$call, list(fo = fo, fam = fam_sub, data = substitute(data))) out } linear_models <- modify_depth(models, 2, ~.model(., modeling_dat)) linear_models
tidy_models <- linear_models %>% modify_depth(2, tidy2) contexts <- linear_models %>% modify_depth(2, contextualize) tidy_models contexts
bindr <- . %>% map(bind_rows, .id = 'model_number') %>% bind_rows(.id = 'model_name') tidy_models %<>% bindr contexts %<>% bindr out <- left_join(tidy_models, contexts) out
out %<>% add_bonf_pv(p_value = 'p.value') %>% add_fdr_pv(p_value = 'p.value') %>% add_nlog10_pv(p_value = 'p.value') out
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.