#' @title Create NONMEM ready datasets for simulated clinical PK trials
#'
#' @author Sandy Floren
#'
#' @description
#' The `cohort()` function is used to generate NONMEM-ready datasets for
#' clinical PK trial simulations, either by sampling from real datasets or
#' generating synthetic data.
#'
#' @section Real Data:
#' To sample from an existing dataset, pass a dataframe or file name to
#' `data`, and leave `param` unspecified.
#'
#' Data will be filtered by the criteria specified in `include`, which must
#' use only variable names present in the data. `n` individuals will then be
#' sampled randomly.
#'
#' @section Synthetic Data:
#' To generate synthetic data, leave `data` unspecified and pass the details of
#' the distributions from which you wish to sample to `param`. Currently, all
#' variables are assumed to be independent.
#'
#' @section Dosing and Observation Events:
#' After sampling is finished, `cohort()` will create duplicate rows for each
#' individual corresponding to the timepoints specified in
#' `dose_times` and `obs_times`, and create a column "EVID" distinguishing
#' between dosing and observation events.
#'
#' Finally, the function (or fixed amount) passed to `amt` is used to calculate
#' the dose for each individual at each dosing event. If using a function, it
#' must be Vectorized, defined in the calling environment, and its arguments
#' must match the names of variables used in your data. See Example 6 for
#' details.
#'
#' @returns
#' A [tibble::tibble()] in NONMEM format with dosing and observation records..
#'
#' @param data A data frame or data frame extension containing only one row per individual.
#' @param include A character string in the form of a logical R statement, to
#' specify the inclusion criteria for this cohort. For example,
#' if "WT" and "HT" are variables corresponding to weight in kg
#' and height in cm, respectively, you can sample only
#' individuals below 50kg and below 150cm by writing:
#' include = "WT < 50 & HT < 150"
#' @param n Optionally, the number of patients enroll. When using existing data,
#' `cohort()` will randomly sample `n` patients from your data.
#' @param obs_times A numeric vector of observation times.
#' @param dose_times A numeric vector of dosing times.
#' @param amt Either a numeric fixed dose, or a function that computes a
#' dose based on variables in your data (or which were specified
#' in param). If using a function, it must be Vectorized, and the
#' names of the variables in the function must match the names of
#' the variables in data (or param). This function should be
#' defined in the environment in which cohort is called.
#' @param param If creating synthetic data, this is where you specify the
#' distribution and parameters to use for random sampling. Supply
#' a list with named fields, each of which corresponds to a
#' variable. The value of each field should be another list,
#' containing (in order):
#' -the name of an R stats function for random sampling, e.g.
#' "rnorm", "runif", "rlnorm", etc.
#' -the arguments to the above function (except "n", which you
#' will have already specified).
#' For example, to create a normally-distributed random variable
#' called "WT" with mean 16.3 and standard deviation 2.5, and a
#' binomially distributed random variable called "HIV" with p = 0.34, write:
#' param = list("WT" = list("rnorm", 16.3, 2.5), "HIV" = list("rbinom", 1,
#' 0.34))
#' @param original_id When `TRUE`, the default, `cohort()` will keep the same
#' IDs as the input data. To create new IDs starting with 1, use
#' `original_id = TRUE`.
#' @param pop_size Optional. When generating synthetic data, `cohort()` will
#' generate a population of size pop_size * n, and then randomly sample n
#' individuals from it. The default value is 10.
#' @param replace Optional. Whether to sample with replacement. Default: FALSE.
#' @param keep Optional. Character vector of column names that you do not want
#' converted to numeric.
#' @param tad Optional. Whether to calculate time after dose (TAD).
#'
#' @examples
#' pop_example
#'
#' # 1. Sampling 20 individuals, above 10 kg and below 120 cm, with a fixed dose of
#' # 200 mg, observing every 4 hours for one day and dosing at times 0, 5, and 12.
#' # Note that the data has columns called "WT" and "HT".
#'
#' inc <- "WT > 10 & HT < 120"
#' ot <- seq(0, 24, by = 4)
#' dt <- c(0, 5, 12)
#'
#' cohort(
#' pop_example,
#' include = inc,
#' n = 20,
#' obs_times = ot,
#' dose_times = dt,
#' amt = 200
#' )
#'
#'
#' # 2. Simulating data. We assume WT and HT are normally distributed random
#' # variables, with means and standard deviations of 16 and 3.4 for WT and 132
#' # and 13.6 for HT.
#'
#' p1 <-
#' list(
#' "WT" = list("rnorm", 16, 3.4),
#' "HT" = list("rnorm", 132, 13.6)
#' )
#'
#' cohort(
#' param = p1,
#' include = inc,
#' n = 20,
#' pop_size = 1000,
#' obs_times = ot,
#' dose_times = dt,
#' amt = 200,
#' original_id = FALSE
#' )
#'
#' # 3. As in (2), except we now define a dosing function.
#'
#' dose_fun <- function(WT) {
#' ifelse(WT < 16, 150,
#' ifelse(WT < 20, 200, 250)
#' )
#' }
#'
#' cohort(
#' param = p1,
#' include = inc,
#' n = 20,
#' pop_size = 500,
#' obs_times = ot,
#' dose_times = dt,
#' original_id = FALSE,
#' amt = dose_fun
#' )
#' @export
# TODO: Make dose_times optional
# TODO: Remove type checking (?)
cohort <- function(data = NULL,
include = NULL,
n = NULL,
obs_times = NULL,
dose_times = NULL,
amt = NULL,
param = NULL,
original_id = TRUE,
pop_size = NULL,
replace = FALSE,
keep = NULL,
tad = FALSE) {
# check inputs
if (is.null(data) & is.null(param)) {
stop("`data` and `param` cannot both be NULL.")
}
if (!is.null(data) & !is.null(param)) {
stop("specify one of `data` or `param`, but not both.")
}
if (!is.null(param) & (!is.list(param) | is.null(names(param)))) {
stop("`param` requires a list with named fields.")
}
if (is.null(amt) | (!is.function(amt) & !is.numeric(amt))) {
stop("`amt` must be of type numeric or function.")
}
if (!is.numeric(obs_times)) {
stop("`obs_times` must be of type numeric.")
}
if (!is.numeric(dose_times)) {
stop("`dose_times` must be of type numeric.")
}
# initialize df
df <- NULL
# construct time vector and evid vector
times <- c(dose_times, obs_times)
evid <- c(rep(1, length(dose_times)), rep(0, length(obs_times)))
n_timepoints <- length(times)
# determine whether to use real or synthetic data
if (!is.null(data)) {
# use real data
df <- data
} else {
# use synthetic data
if (original_id != FALSE) {
stop("Error: cannot use orginial ids with synthetic data.")
}
df <- dplyr::tibble("ID" = 1:pop_size)
# convert to uppercase and truncate to 4 letters if needed
names(param) <- toupper(names(param))
# names(param) <- lapply(names(param), substring, 1, 4)
# check param names
if (length(dplyr::intersect(
names(param),
c("ID", "TIME", "EVID", "AMT", "DV")
)) > 0) {
stop('Error: `param` cannot contain any of "ID", "TIME", "EVID", "AMT", or
"DV".')
}
# sample distributions for each variable in param
for (var in names(param)) {
# extract name of sampling function, e.g. rnorm
fun <- param[[var]][[1]]
# extract function arguments
fun_args <- c(pop_size, utils::tail(param[[var]], -1))
# sample pop_size times from the distribution
df[var] <- do.call(fun, fun_args)
}
}
if (is.null(n)) {
n <- nrow(df)
}
# compute pop_size
if (is.integer(pop_size)) {
pop_size <- pop_size * n
} else {
pop_size <- 10 * n
}
# convert df to numeric
df <- dplyr::mutate(df, dplyr::across(
.fns = as.numeric, .cols =
!dplyr::any_of({{ keep }})
))
# deal with original ids
if (original_id != FALSE) {
if (is.character(original_id) & original_id %in% colnames(df)) {
dplyr::rename(df, "ID" = dplyr::all_of(original_id))
}
}
dplyr::rename(df, "ID" = dplyr::matches("^id$", ignore.case = TRUE))
if (!"ID" %in% colnames(df)) {
stop("Error: data does not contain column named `id`
(case insensitive).")
}
if (!is.null(include)) {
# convert include criteria to R expression
include <- str2expression(include)
if (!is.logical(eval(include, envir = df))) {
stop(
"Error: `include` requires an expression that evaluates to TRUE or
FALSE, e.g.`AGE >= 6`."
)
}
# apply include filter
df <- dplyr::filter(df, eval(include, envir = df))
}
# ensure one row per ID
df <- dplyr::group_by(df, .data$ID)
df <- dplyr::slice(df, 1)
df <- dplyr::ungroup(df)
if (original_id == FALSE) {
df$ID <- 1:nrow(df)
}
# sample randomly
df <- dplyr::slice_sample(df, n = n, replace = replace)
if (replace & !original_id) {
df$ID <- 1:nrow(df)
}
# update n, in case fewer than n patients fit the requirements for include
n <- nrow(df)
# duplicate rows before creating time columns
df <- dplyr::slice(df, rep(1:n, each = n_timepoints))
# create TIME, EVID, and DV columns
df$TIME <- rep(times, n)
df$EVID <- rep(evid, n)
df$DV <- 0
# sort by ID and TIME
df <- dplyr::arrange(df, .data$ID, .data$TIME)
# assign dose amounts
if (is.numeric(amt)) {
df <- dplyr::mutate(df, AMT = dplyr::if_else(.data$EVID == 1, amt, 0))
} else {
# extract formals from dosing function and apply where EVID = 1
dose_args <- as.list(dplyr::syms(names(formals(amt))))
df <- dplyr::mutate(df,
AMT = dplyr::if_else(.data$EVID == 1,
do.call(amt, eval(dose_args)), 0
)
)
}
df <- dplyr::relocate(df, .data$ID, .data$TIME, .data$EVID, .data$AMT, .data$DV)
# calculate TAD
if (tad) {
df <- tad(df)
}
df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.