#' Make study function
#'
#' This is the main study function and runs the entire study.
#' @param data_path Path to data set. Should be a character vector of length 1. Defaults to c("../data/mdf.csv")
#' @param bs_samples The number of bootstrap samples to be generated as int. Defaults to 10
#' @export
make.study <- function(
data_path = c("./extdata/sample.csv"),
bs_samples = 1000
)
{
## Set seed for reproducability
set.seed(123)
## Load all required packages (remove when turned into package)
load.required.packages()
## Import study data
study_data <- read.csv(data_path, stringsAsFactors = FALSE)
## Drop obsevations collected before all centres started collecting triage
## category data and observations later than one month prior to creating
## this dataset
study_data <- drop.observations(study_data, test = TRUE)
## Get data dictionary
data_dictionary <- get.data.dictionary()
## Keep only variables relevant to this study
study_data <- keep.relevant.variables(study_data, data_dictionary)
## Define 999 as missing
study_data[study_data == 999] <- NA
## Prepare study data using the data dictionary
study_data <- prepare.study.data(study_data, data_dictionary, test = TRUE)
## Set patients to dead if dead at discharge or at 24 hours
## and alive if coded alive and admitted to other hospital
study_data <- set.to.outcome(study_data)
## Replace age >89 with 90 and make age numeric
study_data$age[study_data$age == ">89"] <- "90"
study_data$age <- as.numeric(study_data$age)
## Collapse mechanism of injury
study_data <- collapse.moi(study_data)
## Add time between injury and arrival and drop date and time variables from
## study data
study_data <- add.time.between.injury.and.arrival(study_data, data_dictionary)
## Apply exclusion criteria, i.e. drop observations with missing outcome
## data and save exclusions to results list
results <- list() # List to hold results
study_data <- apply.exclusion.criteria(study_data)
## Create missing indicator variables and save table of number of missing
## values per variable
study_data <- add.missing.indicator.variables(study_data)
## Prepare data for SuperLearner predictions
prepped_data <- prep.data.for.superlearner(study_data, test = TRUE)
## Create table of sample characteristics
tables <- create.table.of.sample.characteristics(prepped_data, data_dictionary)
results$table_of_sample_characteristics <- tables$formatted
results$raw_table_of_sample_characteristics <- tables$raw
## Transform factors into dummy variables
prepped_data <- to.dummy.variables(prepped_data)
## Train and review SuperLearner on study sample
study_sample <- predictions.with.superlearner(prepped_data)
## Bootstrap samples
samples <- generate.bootstrap.samples(study_data,
bs_samples)
## Prepare samples
prepped_samples <- prep.bssamples(samples)
## Train and review SuperLearner on bootstrap samples
samples <- train.predict.bssamples(prepped_samples)
## Create list of analysis to conduct
funcList <- list(list(func = 'model.review.AUROCC',
model_or_pe = c('pred_cat',
'tc'),
diffci_or_ci = "diff"),
list(func = 'model.review.reclassification',
model_or_pe = c('NRI+',
'NRI'),
diffci_or_ci = "ci"))
## Generate confidence intervals around point estimates from funcList
CIs <- lapply(funcList,
function(i) generate.confidence.intervals(study_sample,
func = get(i$func),
model_or_pointestimate = i$model_or_pe,
samples = samples,
diffci_or_ci = i$diffci_or_ci))
## Set names of cis
names(CIs) <- c('AUROCC',
'reclassification')
## Compile manuscript
compile.manuscript(results, "superlearner_vs_clinicians_manuscript")## Set seed
set.seed(14813)
## Load all required packages (remove when turned into package)
load.required.packages()
## Import study data
study_data <- read.csv(data_path, stringsAsFactors = FALSE)
## Get data dictionary
data_dictionary <- get.data.dictionary()
## Keep only variables relevant to this study
study_data <- keep.relevant.variables(study_data, data_dictionary)
## Define 999 as missing
study_data[study_data == 999] <- NA
## Prepare study data using the data dictionary
study_data <- prepare.study.data(study_data, data_dictionary)
## Set patients to dead if dead at discharge or at 24 hours
## and alive if coded alive and admitted to other hospital
study_data <- set.to.outcome(study_data)
## Replace age >89 with 90 and make age numeric
study_data$age[study_data$age == ">89"] <- "90"
study_data$age <- as.numeric(study_data$age)
## Collapse mechanism of injury
study_data <- collapse.moi(study_data)
## Add time between injury and arrival and drop date and time variables from
## study data
study_data <- add.time.between.injury.and.arrival(study_data, data_dictionary)
## Apply exclusion criteria, i.e. drop observations with missing outcome
## data and save exclusions to results list
results <- list() # List to hold results
study_data <- apply.exclusion.criteria(study_data)
## Create missing indicator variables and save table of number of missing
## values per variable
study_data <- add.missing.indicator.variables(study_data)
## Prepare data for SuperLearner predictions
prepped_sample <- prep.data.for.superlearner(study_data)
## Create table of sample characteristics
tables <- create.table.of.sample.characteristics(prepped_sample, data_dictionary)
results$table_of_sample_characteristics <- tables$formatted
results$raw_table_of_sample_characteristics <- tables$raw
results$n_training_sample <- nrow(prepped_sample$sets$x_train)
results$n_test_sample <- nrow(prepped_sample$sets$x_review)
## Transform factors into dummy variables
prepped_sample <- to.dummy.variables(prepped_sample)
## Save original sample to disk
saveRDS(prepped_sample, "original_sample.rds")
## Train and review SuperLearner on study sample
study_sample <- predictions.with.superlearner(prepped_sample,
save_breaks = TRUE,
save_all_predictions = TRUE,
save_superlearner = TRUE,
sample = FALSE,
gridsearch_parallel = TRUE,
n_cores = 4,
log = TRUE,
write_to_disk = TRUE,
clean_start = TRUE)
## Save point estimates to disk
saveRDS(study_sample, "point_estimates.rds")
## Bootstrap samples
bootstrap_samples <- generate.bootstrap.samples(study_data,
bs_samples)
## Prepare samples
prepped_samples <- prep.bssamples(bootstrap_samples)
## Save prepped samples to disk
saveRDS(prepped_samples, "bootstrap_samples.rds")
## Train and review SuperLearner on boostrap samples
samples <- train.predict.bssamples(prepped_samples,
parallel = TRUE,
n_cores = 4,
log = TRUE,
boot = TRUE,
write_to_disk = TRUE)
## Save bootstrapped estimates
saveRDS(samples, "bootstrapped_estimates.rds")
## Create list of analyses to conduct
## Base analysis settings
base <- list(study_sample = study_sample,
outcome = "outcome_test")
## Define main auc analysis
auc_main <- c(list(func = 'model.review.AUROCC',
model_or_pe = c("tc", "pred_cat_test"),
diffci_or_ci = "diff"),
base)
## And main nri analysis
nri_main <- c(list(func = 'model.review.reclassification',
model_or_pe = c("NRI+", "NRI-"),
diffci_or_ci = "ci"),
base)
## Define analysis to get point estimates from training set
auc_train <- auc_main
auc_train$model_or_pe <- c("pred_con_train", "pred_cat_train")
auc_train$diffci_or_ci <- "none"
auc_train$outcome <- "outcome_train"
## And analysis to get point estimate of pred_con in test set
auc_test <- auc_main
auc_test$model_or_pe <- "pred_con_test"
auc_test$diffci_or_ci <- "none"
## Put all analysis in one list
funcList <- list(auc_main = auc_main,
nri_main = nri_main,
auc_train = auc_train,
auc_test = auc_test)
## Generate confidence intervals around point estimates from funcList
pe_and_ci <- lapply(funcList,
function(i) generate.confidence.intervals(study_sample = study_sample,
func = get(i$func),
model_or_pointestimate = i$model_or_pe,
samples = samples,
diffci_or_ci = i$diffci_or_ci,
outcome_name = i$outcome))
## Extract from pe_and_ci and put in results
results <- c(results, extract.from.pe.and.ci(pe_and_ci))
## Create classification tables
results <- c(results, create.classification.tables(study_sample))
## Create roc plots
create.ROCR.plots(study_sample, "ROC")
## Create precision/recall curve
create.ROCR.plots(study_sample, "prec_rec")
## Create roc plots for all models
create.ROCR.all(prepped_sample)
## Create mortality plot
create.mortality.plot(study_sample)
## Generate coefficiets table for all models
results$coeff_risk_table <- coefficients.table(prepped_sample)
## Save results to disk
saveRDS(results, "results.rds")
## Compile manuscript
compile.manuscript("plos_superlearner_vs_clinicians_manuscript")
## Compile supporting information
compile.supporting.information("S3_Fig")
compile.supporting.information("S4_Table")
## Message
message("All done")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.