title: "Create model_grid"
author: Benny Salo
output: github_document
date: "2019-02-14"
Here we create the data frame `model_grid`. It describes, and helps keep track of, all the models we want to run. We will select subsets of this model grid when training with different algorithms.
We first create a model grid for the main analyses and then a model grid for the analyses with single RITA factors as predictors. We then combine these two into a single grid.
rm(list = ls())
devtools::load_all(".") # Loading recidivismsl
library(dplyr)
Define the possible factor levels for the model characteristics:
# Outcomes the models predict.
outcomes <- c("General recidivism", "Violent recidivism")

# All predictor sets used anywhere in the grid: the four sets of the main
# analyses followed by the six single predictors of the dimension analyses.
predictors <- c(
  "Rita-items",
  "Static",
  "All at start of sentence",
  "All including term",
  "Alcohol problem",
  "Resistance to change",
  "Employment problems",
  "Problems managing economy",
  "Aggressiveness",
  "Current drug use and its effects"
)

# Algorithms used to train the models.
model_type <- c("Logistic regression", "Elastic net", "Random forest")

# The two groups of analyses a model can belong to.
analysis <- c("Main analyses", "Dimension analyses")
model_grid_mains
# Grid for the main analyses: every combination of outcome, predictor set and
# model type, all labelled "Main analyses".
analysis_m <- "Main analyses"
predictors_m <- c(
  "Rita-items",
  "Static",
  "All at start of sentence",
  "All including term"
)

# The arguments are deliberately unnamed, so the columns get expand.grid()'s
# default names (Var1 ... Var4).
model_grid_mains <- expand.grid(
  outcomes, predictors_m, model_type, analysis_m,
  stringsAsFactors = FALSE
)
model_grid_dims
# Grid for the dimension analyses: each single predictor is combined with each
# outcome, using logistic regression only.
predictors_d <- c(
  "Alcohol problem",
  "Resistance to change",
  "Employment problems",
  "Problems managing economy",
  "Aggressiveness",
  "Current drug use and its effects"
)
model_type_d <- "Logistic regression"
analysis_d <- "Dimension analyses"

# The arguments are deliberately unnamed, so the columns get expand.grid()'s
# default names (Var1 ... Var4).
model_grid_dims <- expand.grid(
  outcomes, predictors_d, model_type_d, analysis_d,
  stringsAsFactors = FALSE
)
Combine and name columns
# Stack the two grids row-wise, then replace expand.grid()'s default column
# names with descriptive ones.
model_grid <- dplyr::bind_rows(model_grid_mains, model_grid_dims)
names(model_grid) <- c("outcome", "predictors", "model_type", "analysis")
Write a column with compact model names of the style "outc_pred_modty". The name is assembled from three parts (outc_, pred_, and modty) that are pasted together into one.
# Build a compact model name by pasting together an abbreviation for the
# outcome, one for the predictor set, and one for the model type.

# The model name begins with the abbreviation of the outcome (gen_ or vio_)
outc_ <- c(
  "General recidivism" = "gen_",
  "Violent recidivism" = "vio_"
)

# continues with an abbreviation for the predictor set
pred_ <- c(
  "Alcohol problem"                  = "alcohol_",
  "Resistance to change"             = "change_",
  "Employment problems"              = "employment_",
  "Problems managing economy"        = "economy_",
  "Aggressiveness"                   = "aggression_",
  "Current drug use and its effects" = "drugs_",
  "Rita-items"                       = "rita_",
  "Static"                           = "stat_",
  "All at start of sentence"         = "bgnn_",
  "All including term"               = "allp_"
)

# and ends with the model type / algorithm
modty <- c(
  "Logistic regression" = "glm",
  "Elastic net"         = "glmnet",
  "Random forest"       = "rf"
)

# Look up the three parts row by row and paste them straight into the new
# column, so no auxiliary columns have to be created and removed again.
model_grid$model_name <- paste0(
  outc_[model_grid$outcome],
  pred_[model_grid$predictors],
  modty[model_grid$model_type]
)
Add a column of character strings naming the outcome variable (lhs).
# Map each outcome label to the name of its outcome variable and store the
# result in the lhs column.
lhs <- c(
  "General recidivism" = "reoffenceThisTerm",
  "Violent recidivism" = "newO_violent"
)
model_grid$lhs <- lhs[model_grid$outcome]
Add a column holding the predictor variable names (rhs) for each model. First, extract the sets of predictors from the predefined variable table.
# Pull the predictor names for each set out of variable_table by matching on
# the Role column.

# Static predictors only
predset_static <- variable_table$Variable[
  variable_table$Role == "predictor_static"
]

# Dynamic predictors only (used for the "Rita-items" predictor set)
predset_RITA <- variable_table$Variable[
  variable_table$Role == "predictor_dynamic"
]

# Static and dynamic predictors together
predset_begin <- variable_table$Variable[
  variable_table$Role %in% c("predictor_static", "predictor_dynamic")
]

# Static, dynamic and term predictors together
predset_all <- variable_table$Variable[
  variable_table$Role %in% c(
    "predictor_static",
    "predictor_dynamic",
    "predictor_term"
  )
]
Then add these sets to the column `rhs` in `model_grid`.
# Lookup from predictor-set label to the variable names in that set. The four
# sets of the main analyses are character vectors taken from variable_table;
# each dimension analysis uses a single variable.
rhs <- list(
  "Rita-items"                       = predset_RITA,
  "Static"                           = predset_static,
  "All at start of sentence"         = predset_begin,
  "All including term"               = predset_all,
  "Alcohol problem"                  = "sum_alcohol_problems",
  "Resistance to change"             = "sum_resistance_change",
  "Employment problems"              = "sum_employment_probl",
  "Problems managing economy"        = "sum_economy_problems",
  "Aggressiveness"                   = "sum_aggressiveness",
  "Current drug use and its effects" = "sum_current_drug_probl"
)

# One list element per row, so rhs becomes a list column.
model_grid$rhs <- rhs[model_grid$predictors]
Assertions
# Sanity checks on the finished grid. Each stopifnot() aborts the script if
# the grid does not have the expected shape.

# Total rows should be:
#   Main analyses:      2 outcomes x 4 predictor sets x 3 model types = 24
#   Dimension analyses: 2 outcomes x 6 predictors     x 1 model type  = 12
#   Total 36 rows
stopifnot(nrow(model_grid) == 36)

# Dimension analyses should have only 1 predictor
dim_an <- model_grid %>% filter(analysis == "Dimension analyses")
stopifnot(all(lengths(dim_an$rhs) == 1))

# When rita items are predictors there should be 52 of them
rita_an <- model_grid %>% filter(stringr::str_detect(model_name, "_rita_"))
stopifnot(all(lengths(rita_an$rhs) == 52))

# When static items are predictors there should be 24 of them
stat_an <- model_grid %>% filter(stringr::str_detect(model_name, "_stat_"))
stopifnot(all(lengths(stat_an$rhs) == 24))

# When model name has 'bgnn' in the middle the number of predictors should be
# 52 + 24
bgnn_ans <- model_grid %>% filter(stringr::str_detect(model_name, "_bgnn_"))
stopifnot(all(lengths(bgnn_ans$rhs) == 52 + 24))

# When model name has 'allp' in the middle the number of predictors should be
# 52 + 24 + 5
allp_ans <- model_grid %>% filter(stringr::str_detect(model_name, "_allp_"))
stopifnot(all(lengths(allp_ans$rhs) == 52 + 24 + 5))

# No missing values in any column. This must be any(), not all(): negating
# all() would let the check pass as long as a single column is free of NA.
stopifnot(!any(vapply(model_grid, anyNA, logical(1))))
Explicitly set order of levels in factors.
# Convert outcome, predictors and model_type to factors with an explicit
# level order, instead of the default alphabetical order.
model_grid$outcome <- factor(
  model_grid$outcome,
  levels = c("General recidivism", "Violent recidivism")
)

model_grid$predictors <- factor(
  model_grid$predictors,
  levels = c(
    "All including term",
    "All at start of sentence",
    "Static",
    "Rita-items",
    "Aggressiveness",
    "Alcohol problem",
    "Employment problems",
    "Current drug use and its effects",
    "Problems managing economy",
    "Resistance to change"
  )
)

model_grid$model_type <- factor(
  model_grid$model_type,
  levels = c("Logistic regression", "Elastic net", "Random forest")
)
Save and make available in /data
# Save model_grid as package data (data/model_grid.rda), overwriting any
# existing file.
usethis::use_data(model_grid, overwrite = TRUE)
## ✔ Saving 'model_grid' to 'data/model_grid.rda'
Extract model names of main analyses. Used to filter out results for main analyses in /analyses_of_results
# Collect the model names of the main analyses as a plain (unnamed) character
# vector and save it as package data.
model_names_main <- model_grid %>%
  filter(analysis == "Main analyses") %>%
  pull(model_name) %>%
  unname()

usethis::use_data(model_names_main, overwrite = TRUE)
## ✔ Saving 'model_names_main' to 'data/model_names_main.rda'
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.