#' @name latrend-data
#' @title Longitudinal dataset representation
#' @description The [latrend estimation functions][latrend-estimation] expect univariate
#' longitudinal data that can be represented in a `data.frame` with one row per trajectory observation:
#' * Trajectory identifier: `numeric`, `character`, or `factor`
#' * Observation time: `numeric`
#' * Observation value: `numeric`
#'
#' In principle, any type of longitudinal data structure is supported,
#' given that it can be transformed to the required `data.frame` format using the generic [trajectories] function.
#' Support can be added by implementing the [trajectories] function for the respective signature.
#' This means that users can implement their own data adapters as needed.
#'
#' @section Included longitudinal datasets:
#' The following datasets are included with the package:
#' * [latrendData]
#' * [PAP.adh]
#' * [PAP.adh1y]
NULL
#' @title Artificial longitudinal dataset comprising three classes
#' @description
#' An artificial longitudinal dataset comprising 200 trajectories belonging to one of 3 classes.
#' Each trajectory deviates in intercept and slope from its respective class trajectory.
#' @format A `data.frame` comprising longitudinal observations from 200 trajectories.
#' Each row represents the observed value of a trajectory at a specific moment in time.
#' \describe{
#' \item{Id}{`integer`: The trajectory identifier.}
#' \item{Time}{`numeric`: The measurement time, between 0 and 2.}
#' \item{Y}{`numeric`: The observed value at the respective time `Time` for trajectory `Id`.}
#' \item{Class}{`factor`: The reference class.}
#' }
#'
#' ```{r}
#' data(latrendData)
#' head(latrendData)
#' ```
#'
#' @source This dataset was generated using [generateLongData].
#' @seealso [latrend-data] [generateLongData]
#' @examples
#' data(latrendData)
#'
#' if (require("ggplot2")) {
#' plotTrajectories(latrendData, id = "Id", time = "Time", response = "Y")
#'
#' # plot according to the reference class
#' plotTrajectories(latrendData, id = "Id", time = "Time", response = "Y", cluster = "Class")
#' }
"latrendData"
#' @name PAP.adh
#' @title Weekly Mean PAP Therapy Usage of OSA Patients in the First 3 Months
#' @description
#' A simulated longitudinal dataset comprising 301 patients with obstructive sleep apnea (OSA)
#' during their first 91 days (13 weeks) of PAP therapy.
#' The longitudinal patterns were inspired by the adherence patterns reported by Yi et al. (2022),
#' interpolated to weekly hours of usage.
#' @format A `data.frame` comprising longitudinal data of 500 patients, each having 26 observations
#' over a period of 1 year.
#' Each row represents a patient observation interval (two weeks), with columns:
#' \describe{
#' \item{Patient}{`integer`: The patient identifier, where each level represents a simulated patient.}
#' \item{Week}{`integer`: The week number, starting from 1.}
#' \item{UsageHours}{`numeric`: The mean hours of usage in the respective week.
#' Greater than or equal to zero, and typically around 4-6 hours.}
#' \item{Group}{`factor`: The reference group (i.e., adherence pattern) from which this patient was generated.}
#' }
#'
#' \insertRef{yi2022identifying}{latrend}
#' @seealso [latrend-data] [PAP.adh1y]
#' @examples
#' data(PAP.adh)
#'
#' if (require("ggplot2")) {
#' plotTrajectories(PAP.adh, id = "Patient", time = "Week", response = "UsageHours")
#'
#' # plot according to cluster ground truth
#' plotTrajectories(
#' PAP.adh,
#' id = "Patient",
#' time = "Week",
#' response = "UsageHours",
#' cluster = "Group"
#' )
#' }
"PAP.adh"
#' @name PAP.adh1y
#' @title Biweekly Mean PAP Therapy Adherence of OSA Patients over 1 Year
#' @description
#' A simulated longitudinal dataset comprising 500 patients with obstructive sleep apnea (OSA)
#' during their first year on CPAP therapy.
#' The dataset contains the patient usage hours, averaged over 2-week periods.
#'
#' The daily usage data underlying the downsampled dataset was simulated based on 7 different adherence patterns.
#' The defined adherence patterns were inspired by the adherence patterns identified by Aloia et al. (2008),
#' with slight adjustments
#' @format A `data.frame` comprising longitudinal data of 500 patients, each having 26 observations over a period of 1 year.
#' Each row represents a patient observation interval (two weeks), with columns:
#' \describe{
#' \item{Patient}{`factor`: The patient identifier, where each level represents a simulated patient.}
#' \item{Biweek}{`integer`: Two-week interval index. Starts from 1.}
#' \item{MaxDay}{`integer`: The last day used for the aggregation of the respective interval, `integer`}
#' \item{UsageHours}{`numeric`: The mean hours of usage in the respective week.
#' Greater than or equal to zero, and typically around 4-6 hours.}
#' \item{Group}{`factor`: The reference group (i.e., adherence pattern) from which this patient was generated.}
#' }
#'
#' @note This dataset is only intended for demonstration purposes.
#' While the data format will remain the same, the data content is subject to change in future versions.
#' @source This dataset was generated based on the cluster-specific descriptive statistics table
#' provided in Aloia et al. (2008),
#' with some adjustments made in order to improve cluster separation for demonstration purposes.
#'
#' \insertRef{aloia2008time}{latrend}
#' @seealso [latrend-data]
#' @examples
#' data(PAP.adh1y)
#'
#' if (require("ggplot2")) {
#' plotTrajectories(PAP.adh1y, id = "Patient", time = "Biweek", response = "UsageHours")
#'
#' # plot according to cluster ground truth
#' plotTrajectories(
#' PAP.adh1y,
#' id = "Patient",
#' time = "Biweek",
#' response = "UsageHours",
#' cluster = "Group"
#' )
#' }
"PAP.adh1y"
#' @export
#' @title Generate longitudinal test data
#' @param sizes Number of strata per cluster.
#' @param fixed Fixed effects formula.
#' @param random Random effects formula.
#' @param cluster Cluster effects formula.
#' @param id Name of the strata.
#' @param data Data with covariates to use for generation. Stratified data may be specified by
#' adding a grouping column.
#' @param clusterNames A `character` vector denoting the names of the generated clusters.
#' @param fixedCoefs Coefficients matrix for the fixed effects.
#' @param clusterCoefs Coefficients matrix for the cluster effects.
#' @param randomScales Standard deviations matrix for the size of the variance components (random effects).
#' @param rrandom Random sampler for generating the variance components at location 0.
#' @param noiseScales Scale of the random noise passed to rnoise. Either scalar or defined per cluster.
#' @param rnoise Random sampler for generating noise at location 0 with the respective scale.
#' @param shuffle Whether to randomly reorder the strata in which they appear in the data.frame.
#' @param seed Optional seed to set for the PRNG. The set PRNG state persists after the function completes.
#' @seealso [latrend-data]
#' @examples
#' longdata <- generateLongData(
#' sizes = c(40, 70), id = "Id",
#' cluster = ~poly(Time, 2, raw = TRUE),
#' clusterCoefs = cbind(c(1, 2, 5), c(-3, 4, .2))
#' )
#'
#' if (require("ggplot2")) {
#' plotTrajectories(longdata, response = "Value", id = "Id", time = "Time")
#' }
generateLongData = function(
sizes = c(40, 60),
fixed = Value ~ 1,
cluster = ~ 1 + Time,
random = ~ 1,
id = getOption('latrend.id'),
data = data.frame(Time = seq(0, 1, by = .1)),
fixedCoefs = 0,
clusterCoefs = cbind(c(-2, 1), c(2, -1)),
randomScales = cbind(.1, .1),
rrandom = rnorm,
noiseScales = c(.1, .1),
rnoise = rnorm,
clusterNames = LETTERS[seq_along(sizes)],
shuffle = FALSE,
seed = NULL
) {
if (!is.null(seed)) {
set.seed(seed)
}
nClus = length(sizes)
assert_that(nClus >= 1, all(sizes > 0))
assert_that(is.character(id) && nchar(id) > 0)
assert_that(is.character(clusterNames), length(clusterNames) == nClus)
assert_that(is.logical(shuffle))
assert_that(is.data.frame(data), all(vapply(data, is.numeric, FUN.VALUE = FALSE)))
## Fixed effects
assert_that(is.formula(fixed), hasSingleResponse(fixed))
nIds = sum(sizes)
ids = seq_len(nIds)
nObs = nrow(data)
clusters = rep(clusterNames, sizes) %>% factor(levels = clusterNames)
nTotalObs = nIds * nrow(data)
response = getResponse(fixed)
rowIds = rep(ids, each = nObs)
Xf = model.matrix(dropResponse(fixed), data)
if (ncol(Xf) > 0) {
assert_that(is.null(fixedCoefs) || is.numeric(fixedCoefs) && is.null(dim(fixedCoefs)))
Xfi = model.matrix(update(fixed, NULL ~ . - 1), data) #design matrix without intercept
assert_that(
ncol(Xf) == length(fixedCoefs),
msg = 'Missing or too many coefficients specified for fixed effects.'
)
fixedValues = Xf %*% fixedCoefs
alldata = data.table(
Id = rowIds,
Class = as.integer(clusters)[rowIds],
Mu.fixed = fixedValues[rep(seq_len(nObs), nIds)],
Xfi[rep(seq_len(nObs), nIds), , drop = FALSE],
data[, setdiff(names(data), colnames(Xfi)), drop = FALSE]
)
} else {
alldata = data.table(
Id = rowIds,
Class = as.integer(clusters)[rowIds],
Mu.fixed = 0,
data
)
}
## Cluster effects
assert_that(
is.formula(cluster),
!hasResponse(cluster),
is.numeric(clusterCoefs),
is.matrix(clusterCoefs)
)
assert_that(ncol(clusterCoefs) == nClus)
Xc = model.matrix(cluster, alldata)
assert_that(
ncol(Xc) == nrow(clusterCoefs),
msg = 'Missing or too many coefficients specified for cluster effects.'
)
alldata[, Mu.class := rowSums(Xc * t(clusterCoefs)[Class, ])]
## Random effects
assert_that(is.formula(random), !hasResponse(random))
Xr = model.matrix(random, alldata)
if (ncol(Xr) > 0) {
if (!is.matrix(randomScales)) {
randomScales = matrix(randomScales,
nrow = length(randomScales),
ncol = nClus)
}
assert_that(
is.matrix(randomScales),
ncol(randomScales) == nClus,
nrow(randomScales) == ncol(Xr)
)
# generate id-specific scales
idScales = t(randomScales)[rep(1:nClus, sizes), ] %>%
matrix(ncol = nrow(randomScales))
assert_that(nrow(idScales) == nIds)
idCoefs = rrandom(nIds * nrow(randomScales), 0, idScales) %>%
matrix(ncol = nrow(randomScales))
assert_that(nrow(idCoefs) == nIds)
alldata[, Mu.random := rowSums(Xr * idCoefs[Id, ])]
alldata[, Mu := Mu.fixed + Mu.class + Mu.random]
} else {
alldata[, Mu := Mu.fixed + Mu.class]
}
## Noise
assert_that(is.numeric(noiseScales))
if (nClus > 1 && length(noiseScales) == 1) {
noiseScales = rep(noiseScales, nClus)
}
assert_that(length(noiseScales) == nClus)
alldata[, Value := Mu + rnoise(.N, 0, noiseScales[Class])]
## Finalize
if (shuffle) {
newOrder = sample.int(nIds)
alldata = alldata[order(newOrder[Id])]
}
setnames(alldata, 'Value', response)
setnames(alldata, 'Id', id)
setcolorder(alldata, c(id, 'Class', setdiff(colnames(Xf), '(Intercept)'), response))
alldata[, Class := factor(clusterNames[Class], levels = clusterNames)]
return(alldata[])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.