#'---
#'title: "Elastic Net SearcheR Datasets"
#'subtitle: "ensr package version `r packageVersion('ensr')`"
#'author: "Peter E. DeWitt"
#'date: "`r Sys.Date()`"
#'output:
#' rmarkdown::html_vignette
#'bibliography: references.bib
#'vignette: >
#' %\VignetteEngine{knitr::rmarkdown}
#' %\VignetteIndexEntry{ensr-datasets}
#' %\VignetteEncoding{UTF-8}
#'---
#+ label=setup, include = FALSE
library(knitr)
knitr::opts_chunk$set(collapse = TRUE)
library(qwraps2)
options(qwraps2_markup = "markdown")
#'
#' This vignette provides descriptions of the example data sets in the `r Rpkg(ensr)`
#' package. We provide information about how the data sets were built and also
#' about each individual variable.
#'
#' The packages needed to constuct the data sets are:
library(data.table)
library(magrittr)
library(qwraps2)
library(digest)
set.seed(42)
#/* if interactive, i.e., if active development, use devtools to load the
#package. Otherwise, and as will show in the vignette, load and attache the
#ensr namespace
if (interactive()) {
devtools::load_all()
} else {
#*/
library(ensr)
#/*
}
#*/
#'
#/*
# ------------------------- Traumatic Brain Injuries ---------------------------
#*/
#'
#' # Traumatic Brain Injuries
#'
#' A synthetic data set based based on a Traumatic Brain Injury (TBI) study
#' has been included with the `r Rpkg(ensr)` package. All data points in this data
#' set have been randomly generated and have no association with any real
#' patient.
#'
#' The columns of this data set are:
#'
#' | Column(s) | Meaning |
#' | :------------------------ | :------------------------------------------------------------------------------------------------------ |
#' | `age` | Age of the patient, in days, at time of hospital admission |
#' | `female` | An integer flag for sex, 1 == female, 0 == male. |
#' | `los` | Length of stay: number of days between hospital admission and discharge.
#' | `pcode1`, ..., `pcode6` | Indicator columns for the presence or absence of a procedure code from a billing database. |
#' | `ncode1`, ..., `ncode6` | Indicator columns for the presence or absence of the same procedure code, but from a trauma database. |
#' | `injury1`, ..., `injury3` | Indicator columns for the presence or absence of three different types of TBI. |
#'
#'
#' Construction of the synthetic data set:
TBI_SUBJECTS <- 1323L
tbi <- data.table(age = round(rweibull(TBI_SUBJECTS, shape = 1.9, scale = 2000)),
female = rbinom(TBI_SUBJECTS, 1, 0.10),
los = round(rexp(TBI_SUBJECTS, rate = 1/13)))
#'
#' The `pcode` and `ncode` variables are difficult to simulate. There are 64
#' permutations for each set. We built two vectors with relative frequencies of
#' the codes.
pcodes <-
c(`0.0.0.0.0.0` = 1230, `1.0.0.0.0.0` = 130, `0.1.0.0.0.0` = 40,
`1.1.0.0.0.0` = 7, `0.0.1.0.0.0` = 120, `1.0.1.0.0.0` = 4, `0.1.1.0.0.0` = 20,
`1.1.1.0.0.0` = 2, `0.0.0.1.0.0` = 4, `1.0.0.1.0.0` = 4, `0.1.0.1.0.0` = 20,
`1.1.0.1.0.0` = 4, `0.0.1.1.0.0` = 20, `1.0.1.1.0.0` = 2, `0.1.1.1.0.0` = 2,
`1.1.1.1.0.0` = 7, `0.0.0.0.1.0` = 5, `1.0.0.0.1.0` = 4, `0.1.0.0.1.0` = 10,
`1.1.0.0.1.0` = 9, `0.0.1.0.1.0` = 10, `1.0.1.0.1.0` = 7, `0.1.1.0.1.0` = 1,
`1.1.1.0.1.0` = 7, `0.0.0.1.1.0` = 10, `1.0.0.1.1.0` = 6, `0.1.0.1.1.0` = 5,
`1.1.0.1.1.0` = 8, `0.0.1.1.1.0` = 5, `1.0.1.1.1.0` = 2, `0.1.1.1.1.0` = 6,
`1.1.1.1.1.0` = 9, `0.0.0.0.0.1` = 40, `1.0.0.0.0.1` = 7, `0.1.0.0.0.1` = 40,
`1.1.0.0.0.1` = 10, `0.0.1.0.0.1` = 10, `1.0.1.0.0.1` = 1, `0.1.1.0.0.1` = 20,
`1.1.1.0.0.1` = 5, `0.0.0.1.0.1` = 10, `1.0.0.1.0.1` = 2, `0.1.0.1.0.1` = 6,
`1.1.0.1.0.1` = 1, `0.0.1.1.0.1` = 8, `1.0.1.1.0.1` = 1, `0.1.1.1.0.1` = 7,
`1.1.1.1.0.1` = 2, `0.0.0.0.1.1` = 9, `1.0.0.0.1.1` = 1, `0.1.0.0.1.1` = 1,
`1.1.0.0.1.1` = 2, `0.0.1.0.1.1` = 6, `1.0.1.0.1.1` = 9, `0.1.1.0.1.1` = 6,
`1.1.1.0.1.1` = 4, `0.0.0.1.1.1` = 2, `1.0.0.1.1.1` = 9, `0.1.0.1.1.1` = 3,
`1.1.0.1.1.1` = 1, `0.0.1.1.1.1` = 7, `1.0.1.1.1.1` = 4, `0.1.1.1.1.1` = 4,
`1.1.1.1.1.1` = 3)
ncodes <-
c(`0.0.0.0.0.0` = 1240, `1.0.0.0.0.0` = 200, `0.1.0.0.0.0` = 130,
`1.1.0.0.0.0` = 4, `0.0.1.0.0.0` = 90, `1.0.1.0.0.0` = 9, `0.1.1.0.0.0` = 20,
`1.1.1.0.0.0` = 7, `0.0.0.1.0.0` = 20, `1.0.0.1.0.0` = 3, `0.1.0.1.0.0` = 10,
`1.1.0.1.0.0` = 8, `0.0.1.1.0.0` = 7, `1.0.1.1.0.0` = 9, `0.1.1.1.0.0` = 8,
`1.1.1.1.0.0` = 8, `0.0.0.0.1.0` = 6, `1.0.0.0.1.0` = 9, `0.1.0.0.1.0` = 10,
`1.1.0.0.1.0` = 3, `0.0.1.0.1.0` = 9, `1.0.1.0.1.0` = 7, `0.1.1.0.1.0` = 6,
`1.1.1.0.1.0` = 8, `0.0.0.1.1.0` = 6, `1.0.0.1.1.0` = 2, `0.1.0.1.1.0` = 9,
`1.1.0.1.1.0` = 6, `0.0.1.1.1.0` = 2, `1.0.1.1.1.0` = 2, `0.1.1.1.1.0` = 5,
`1.1.1.1.1.0` = 7, `0.0.0.0.0.1` = 20, `1.0.0.0.0.1` = 6, `0.1.0.0.0.1` = 4,
`1.1.0.0.0.1` = 1, `0.0.1.0.0.1` = 2, `1.0.1.0.0.1` = 4, `0.1.1.0.0.1` = 7,
`1.1.1.0.0.1` = 6, `0.0.0.1.0.1` = 4, `1.0.0.1.0.1` = 1, `0.1.0.1.0.1` = 8,
`1.1.0.1.0.1` = 6, `0.0.1.1.0.1` = 1, `1.0.1.1.0.1` = 4, `0.1.1.1.0.1` = 2,
`1.1.1.1.0.1` = 7, `0.0.0.0.1.1` = 3, `1.0.0.0.1.1` = 7, `0.1.0.0.1.1` = 7,
`1.1.0.0.1.1` = 9, `0.0.1.0.1.1` = 5, `1.0.1.0.1.1` = 1, `0.1.1.0.1.1` = 6,
`1.1.1.0.1.1` = 5, `0.0.0.1.1.1` = 2, `1.0.0.1.1.1` = 3, `0.1.0.1.1.1` = 1,
`1.1.0.1.1.1` = 5, `0.0.1.1.1.1` = 6, `1.0.1.1.1.1` = 5, `0.1.1.1.1.1` = 4,
`1.1.1.1.1.1` = 2)
#'
#' We generated the code variables by taking a random sample of the names of the
#' above objects, splitting the names by the `.`, and coercing the result to a
#' data.table.
pcodes <-
sample(names(pcodes), TBI_SUBJECTS, replace = TRUE, prob = pcodes) %>%
strsplit("\\.") %>%
lapply(as.integer) %>%
do.call(rbind, .) %>%
as.data.table %>%
setNames(., paste0("pcode", 1:6))
pcodes
ncodes <-
sample(names(ncodes), TBI_SUBJECTS, replace = TRUE, prob = ncodes + 0.1) %>%
strsplit("\\.") %>%
lapply(as.integer) %>%
do.call(rbind, .) %>%
as.data.table %>%
setNames(., paste0("ncode", 1:6))
ncodes
#'
#' The last step in building the predictor variables is to bind the three
#' data.tables together:
tbi <- cbind(tbi, pcodes, ncodes)
#'
#' The three injury categories are constructed by fitting a simplified
#' logistic regression model. The regression coefficient vectors are defined
#' next.
injury1_coef <-
matrix(c(-5.80494, -3.03946e-05, 0.773355, 0.00556597, -1.04436, -0.849594,
0.770122, 0.968153, 16.6315, 0.411286, 4.07926, 5.16926, 2.84976,
-17.1038, -14.7382, 4.30086),
ncol = 1)
injury2_coef <-
matrix(c(-427.083, 0.0742405, -342.072, -8.09704, 299.132, 991.75, -85.0155,
-170.344, -57.408, 123.201, 161.41, -568.483, -767.944, 706.95,
10.2964, 148.551),
ncol = 1)
injury3_coef <-
matrix(c(-3.54036, -0.00054294, 1.75962, -0.0475071, -17.515, -60.4276,
4.58458, 0.58551, -19.7312, -1.16923, 2.16091, 63.7699, 39.3569,
-37.8554, -14.1002, -14.8469),
ncol = 1)
injury_expression <-
quote(
c("injury1", "injury2", "injury3") :=
list(
sapply(qwraps2::invlogit(cbind(1, as.matrix(tbi)) %*% injury1_coef),
function(p) rbinom(1, 1, p)),
sapply(qwraps2::invlogit(cbind(1, as.matrix(tbi)) %*% injury2_coef),
function(p) rbinom(1, 1, p)),
sapply(qwraps2::invlogit(cbind(1, as.matrix(tbi)) %*% injury3_coef),
function(p) rbinom(1, 1, p))
)
)
tbi[, eval(injury_expression)]
#'
#' To prove that the `tbi` object constructed in this vignette is the same as
#' the one provided in via calling `data(tbi, package = "ensr")` we show the
#' sha1 for both data sets here:
#/*
if (!grepl("data-raw", getwd())) {
#*/
ensr_env <- new.env()
data(tbi, package = "ensr", envir = ensr_env)
all.equal(digest::sha1(tbi),
digest::sha1(ensr_env$tbi))
#/*
}
#*/
#/*
# This bit builds the documentation and the data set for the package. It needs
# to be evaluated from the data-raw directory during the package build process.
if (!interactive()) {
cat("
# ------------------- file generated by automated process ----------------------
# --------------------------- DO NOT EDIT BY HAND ------------------------------
# --------- make edits to the vignette-spinners/ensr-datasets.R file -----------
#' Synthetic Data Set for Traumatic Brain Injuries
#'
#' This data is synthetic, that is, it is random data generated to have similar
#' properties to a data set used for studying traumatic brain injuries. The
#' \\code{pcode1} ... \\code{pcode6}, \\code{ncode1} ... \\code{ncode6} columns
#' are indicators for procedure or billing codes associated with a hospital
#' stay for TBI.
#'
#' @format a data.table with ", nrow(tbi), "rows and ", ncol(tbi), "columns.
#' Each row of the \\code{tbi} data.table is a unique subject.
#' \\describe{
#' \\item{age}{age, in days}
#' \\item{female}{indicator for sex, 1 == female, 0 == male}
#' \\item{los}{length of stay in the hosptial}
#' \\item{pcode1}{indicator for if the patient had pcode1}
#' \\item{pcode2}{indicator for if the patient had pcode2}
#' \\item{pcode3}{indicator for if the patient had pcode3}
#' \\item{pcode4}{indicator for if the patient had pcode4}
#' \\item{pcode5}{indicator for if the patient had pcode5}
#' \\item{pcode6}{indicator for if the patient had pcode6}
#' \\item{ncode1}{indicator for if the patient had ncode1}
#' \\item{ncode2}{indicator for if the patient had ncode2}
#' \\item{ncode3}{indicator for if the patient had ncode3}
#' \\item{ncode4}{indicator for if the patient had ncode4}
#' \\item{ncode5}{indicator for if the patient had ncode5}
#' \\item{ncode6}{indicator for if the patient had ncode6}
#' \\item{injury1}{First of three specific types of TBI}
#' \\item{injury2}{Second of three specific types of TBI}
#' \\item{injury3}{Third of three specific types of TBI}
#' }
#'
#' @seealso \\code{vignette(\"ensr-datasets\", package = \"ensr\")}
#'
\"tbi\"
",
file = "./R/tbi.R")
usethis::use_data(tbi, overwrite = TRUE)
}
#*/
#'
#/*
# ------------------- Water Percolation Through a Landfill ---------------------
#*/
#'
#' # Water Percolation Through A Landfill
#'
#' The file `landfill.csv` contains results from a computer simulation of water
#' percolation through five layers in a landfill over one year. The raw data
#' file is provided so the end user can explore their own pre-processing of the
#' data.
#/*
if (grepl("data-raw", getwd())) {
landfill <- fread(file = "./inst/extdata/landfill.csv", sep = ",")
} else {
#*/
landfill <-
fread(file = system.file("extdata/landfill.csv", package = "ensr"), sep = ",")
#/*
}
#*/
#'
#' This landfill consists of five layers:
#'
#' 1. Topsoil
#' 2. Geomembrane Liner
#' 3. Clay
#' 4. Weathered Lavery Till (WLT)
#' 5. Unweathered Lavery Till (ULT)
#'
#' The outcomes modeled by the computer program are:
#+label="regex_names", include = FALSE
regex_names <- function(pattern) {
paste(grep(pattern, names(landfill), value = TRUE), collapse = ", ")
}
#'
#' | Outcome | Variable Name(s) |
#' | :-------- | :---------------- |
#' | evaporation | `r regex_names("^evap")` |
#' | runoff | `r regex_names("^runoff")` |
#' | percolation | `r regex_names("^perc_")` |
#' | drainage | `r regex_names("^drain_")` |
#' | water content | `r regex_names("^wc_")` |
#'
#' The set of predictors included in the data set are:
#'
#' | Predictor | Variable Name(s) | Notes |
#' | :-------- | :---------------- | :---- |
#' | Porosity | `r regex_names("_porosity$")` | |
#' | van Genuchten $\alpha$ | `r regex_names("_alpha$")` | related to the inverse of the air entry suction |
#' | van Genuchten $\theta_r$ | `r regex_names("_thetar$")` | residual water content |
#' | van Genuchten N | `r regex_names("_n$")` | pore size |
#' | Saturated hydraulic conductivity | `r regex_names("_ks$")` | |
#' | Relative Humidity | `r regex_names("^rh$")` | |
#' | Leaf Area Index | `r regex_names("^lai$")` | |
#' | Grow Start | `r regex_names("growstart")` | First day of the growing season |
#' | Grow End | `r regex_names("growend")` | Last day of the growing season |
#' | Wind | `r regex_names("wind")` | Average wind speed in mph |
#' | Solar Radiation | `r regex_names("^rm\\w")` | Mean of daily solar radiation for wet days, or dry days |
#' | Amplitude of daily solar radiation | `r regex_names("^ar$")` | |
#' | SCS AMCII Curve Number | `r regex_names("cn")` | |
#' | Plant root beta | `r regex_names("^beta$")` | Used for calculating evaporation zone depth |
#' | Linear Defects | `r regex_names("^liner_defects$")` | Number of defects per acre of liner |
#' | Linear Pinholes | `r regex_names("^liner_pinholes$")` | Number of pinholes per acre of liner |
#' | 100 year average Solar Radiation | `r regex_names("weather_solrad")` | |
#' | 100 year average Precipitation | `r regex_names("weather_precip")` | |
#' | 100 year average Temperature | `r regex_names("weather_temp")` | |
#'
#' ## Landfill Data Preparation
#'
#' The elastic net method suggests that all predictors be centered and scaled.
#' In the case of multi-variable responses, it is recommended that the outcomes
#' be centered and scaled as well. For calls to `glmnet` and `cv.glmnet`, the argument
#' `standardized = TRUE` (default) will center and scale the values. If you have
#' already centered and scaled the data you may prefer to set `standardized =
#' FALSE`.
#'
#' If you want to explicitly standardize your data, the base R function `scale`
#' is adequate for most situations. However, with the landfill data the simple
#' centering and scaling could be considered inappropriate.
#' `scale` will use the mean and standard deviation of the input.
#' However, the landfill data contains measurements with the same values. Centering
#' and scaling should be based on the mean and standard deviation of just the unique
#' values. The ensr function `standardize` will, by default, standardize a numeric vector
#' based on just the unique values using either the mean and standard deviation or the
#' median and interquartile range (IQR).
#'
#' Here is an example. There are `r qwraps2::frmt(nrow(landfill))` rows in the
#' landfill data set. There are `r length(unique(landfill$topsoil_alpha))`
#' unique values for the van Genuchten $\alpha$ value. Here is how the
#' standardization results are affected by non-unique predictor values:
nrow(landfill)
length(unique(landfill$topsoil_alpha))
# Standard scaling, using the mean and standard deviation for the full vector.
scaled_topsoil_alpha_v1 <- as.vector(scale(landfill$topsoil_alpha))
scaled_topsoil_alpha_v2 <-
(landfill$topsoil_alpha - mean(landfill$topsoil_alpha)) / sd(landfill$topsoil_alpha)
all.equal(target = scaled_topsoil_alpha_v1,
current = scaled_topsoil_alpha_v2,
check.attributes = FALSE)
# Center and scale using the mean and standard deviation of only the unique
# values of the vector.
scaled_topsoil_alpha_v3 <-
as.vector(scale(landfill$topsoil_alpha,
center = mean(unique(landfill$topsoil_alpha)),
scale = sd(unique(landfill$topsoil_alpha))))
scaled_topsoil_alpha_v4 <- standardize(landfill$topsoil_alpha)
all.equal(target = scaled_topsoil_alpha_v3,
current = scaled_topsoil_alpha_v4,
check.attributes = FALSE)
# Center and scale using the median and IQR.
scaled_topsoil_alpha_v5 <-
as.vector(scale(landfill$topsoil_alpha,
center = median(unique(landfill$topsoil_alpha)),
scale = IQR(unique(landfill$topsoil_alpha))))
scaled_topsoil_alpha_v6 <-
standardize(landfill$topsoil_alpha,
stats = list(center = "median", scale = "IQR"))
all.equal(target = scaled_topsoil_alpha_v5,
current = scaled_topsoil_alpha_v6,
check.attributes = FALSE)
#'
#' For the full landfill data set we can center and scale all the columns using:
landfill <- standardize(landfill)
#'
#' Note that the values used for centering and scaling are retained as
#' attributes for each variable in the data set.
str(landfill[, 1:2] )
#'
#' The `landfill` object generated in this vignette is the same as the
#' one generated using `data(landfill, package = "ensr")`.
#/*
if (!grepl("data-raw", getwd())) {
#*/
data(landfill, package = "ensr", envir = ensr_env)
all.equal(digest::sha1(landfill),
digest::sha1(ensr_env$landfill))
#/*
}
#*/
#'
#/*
# This bit builds the documentation and the data set for the package. It needs
# to be evaluated from the data-raw directory during the package build process.
if (!interactive()) {
cat("
# ------------------- file generated by automated process ----------------------
# --------------------------- DO NOT EDIT BY HAND ------------------------------
# --------- make edits to the vignette-spinners/ensr-datasets.R file -----------
#' Water Percolation Through A Landfill
#'
#' A computer simulation of water moving through a landfill. Detailed
#' explanation for the variables and the construction of the data set is found
#' in
#' \\code{vignette(\"ensr-datasets\", package = \"ensr\")}
#'
#'
#' @seealso \\code{vignette(\"ensr-datasets\", package = \"ensr\")}
#'
\"landfill\"
",
file = "./R/landfill.R")
usethis::use_data(landfill, overwrite = TRUE)
}
#*/
#/*
# --------------------------- Session Information ------------------------------
#*/
#'
#'
#' # Session Info
print(sessionInfo(), local = FALSE)
#/*
# ------------------------------- End of File ----------------------------------
#*/
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.