Nothing
#' @name latrend-estimation
#' @title Overview of **`lcMethod`** estimation functions
#' @description This page presents an overview of the different functions that are available for estimating one or more [longitudinal cluster methods][lcMethod].
#' All functions are prefixed by *"latrend"*.
#' @section *latrend* estimation functions:
#' * **[latrend()]**: estimate a [method][lcMethod] on a [longitudinal dataset][latrend-data], returning the resulting [model][lcModel].
#' * **[latrendBatch()]**: estimate multiple [methods][lcMethod] on multiple [longitudinal datasets][latrend-data], returning a [list of models][lcModels-class].
#' * **[latrendRep()]**: repeatedly estimate a [method][lcMethod] on a [longitudinal dataset][latrend-data], returning a [list of models][lcModels-class].
#' * **[latrendBoot()]**: repeatedly estimate a [method][lcMethod] on bootstrapped [longitudinal dataset][latrend-data], returning a [list of models][lcModels-class].
#' * **[latrendCV()]**: repeatedly estimate a [method][lcMethod] using cross-validation on a [longitudinal dataset][latrend-data], returning a [list of models][lcModels-class].
#' @section Parallel estimation:
#' The functions involving repeated estimation support parallel computation. [See here.][latrend-parallel]
#' @seealso [latrend-package] [lcMethod-estimation]
NULL
#' @export
#' @title Cluster longitudinal data using the specified method
#' @description [An overview of the latrend package and its capabilities can be found here][latrend-package].
#'
#' The `latrend()` function fits a specified longitudinal cluster [method][lcMethod] to the given data comprising the trajectories.
#'
#' This function runs all steps of the standardized [method estimation procedure][lcMethod-estimation], as implemented by the given `lcMethod` object.
#' The result of this procedure is the estimated [lcModel].
#' @param method An [lcMethod] object specifying the longitudinal cluster method to apply, or the name (as `character`) of the `lcMethod` subclass to instantiate.
#' @param data The data of the trajectories to which to estimate the method for.
#' Any inputs supported by [trajectories()] can be used, including `data.frame` and `matrix`.
#' @param ... Any other arguments to update the `lcMethod` definition with.
#' @param envir The `environment` in which to evaluate the method arguments via [compose()].
#' If the `data` argument is of type `call` then this environment is also used to evaluate the `data` argument.
#' @param verbose The level of verbosity. Either an object of class `Verbose` (see [R.utils::Verbose] for details),
#' a `logical` indicating whether to show basic computation information,
#' a `numeric` indicating the verbosity level (see [Verbose]),
#' or one of `c('info', 'fine', 'finest')`.
#' @details If a seed value is specified in the `lcMethod` object or arguments to `latrend`, this seed is set using `set.seed` prior to the [preFit] step.
#' @return A [lcModel] object representing the fitted solution.
#' @examples
#' data(latrendData)
#' method <- lcMethodLMKM(Y ~ Time, id = "Id", time = "Time")
#' model <- latrend(method, data = latrendData)
#'
#' model <- latrend("lcMethodLMKM", formula = Y ~ Time, id = "Id", time = "Time", data = latrendData)
#'
#' model <- latrend(method, data = latrendData, nClusters = 3, seed = 1)
#' @family longitudinal cluster fit functions
latrend = function(
method,
data,
...,
envir = NULL,
verbose = getOption('latrend.verbose')
) {
method = as.lcMethod(method)
assert_that(!missing(data))
envir = .selectEnvironment(method, parent.frame(), envir)
verbose = as.Verbose(verbose)
argList = list(...)
argList$envir = envir
newmethod = do.call(update, c(object = method, argList))
environment(newmethod) = envir
header(verbose, sprintf('Longitudinal clustering using: %s', getName(newmethod)))
cat(verbose, c('Method arguments:', as.character(newmethod)[-1]))
ruler(verbose)
# compose
enter(verbose, 'Evaluating the method arguments.', level = verboseLevels$fine, suffix = '')
cmethod = compose(newmethod, envir = envir)
exit(verbose, level = verboseLevels$finest)
id = idVariable(cmethod)
time = timeVariable(cmethod)
response = responseVariable(cmethod)
# transform data
enter(verbose, 'Checking and transforming the training data format.', suffix = '')
modelData = trajectories(
data,
id = id,
time = time,
response = response,
envir = envir
)
exit(verbose, level = verboseLevels$finest)
enter(verbose, 'Validating method arguments.', level = verboseLevels$fine, suffix = '')
validate(cmethod, modelData)
exit(verbose, level = verboseLevels$finest)
# prepare
enter(verbose, 'Preparing the training data for fitting')
modelEnv = prepareData(
method = cmethod,
data = modelData,
verbose = verbose
)
exit(verbose, level = verboseLevels$finest)
fitTiming = .enterTimed(verbose, 'Fitting the method')
mc = match.call.all()
model = .fitLatrendMethod(
cmethod,
modelData,
envir = modelEnv,
mc = mc,
verbose = verbose
)
environment(model) = envir
.exitTimed(fitTiming, msg = 'Done fitting the method (%s)')
# done
ruler(verbose)
model
}
.fitLatrendMethod = function(method, data, envir, mc, verbose) {
assert_that(
is_class_defined(method),
is.lcMethod(method),
is.data.frame(data),
is.call(mc),
is.environment(envir) || is.null(envir)
)
if (hasName(method, 'seed')) {
cat(verbose, sprintf('Setting seed %s.', as.character(method$seed)))
set.seed(method$seed)
}
suppressFun = ifelse(
as.logical(verbose),
force,
function(...) capture.output(suppressMessages(...))
)
suppressFun({
# preFit
modelEnv = preFit(
method = method,
data = data,
envir = envir,
verbose = verbose
)
# fit
model = fit(
method = method,
data = data,
envir = modelEnv,
verbose = verbose
)
})
assert_that(is_class_defined(model))
model@call = do.call(
call,
c('latrend', method = quote(getCall(method)), data = quote(mc$data))
)
model@call['envir'] = list(mc$envir)
# postFit
suppressFun({
model = postFit(
method = method,
data = data,
model = model,
envir = modelEnv,
verbose = verbose
)
})
model
}
#' @export
#' @title Cluster longitudinal data repeatedly
#' @description Performs a repeated fit of the specified latrend model on the given data.
#' @inheritParams latrend
#' @param .rep The number of repeated fits.
#' @param .errorHandling Whether to `"stop"` on an error, or to `"remove'` evaluations that raised an error.
#' @param .seed Set the seed for generating the respective seed for each of the repeated fits.
#' @param .parallel Whether to use parallel evaluation. See \link{latrend-parallel}.
#' @details This method is faster than repeatedly calling [latrend] as it only prepares the data via `prepareData()` once.
#' @return A `lcModels` object containing the resulting models.
#' @examples
#' data(latrendData)
#' method <- lcMethodLMKM(Y ~ Time, id = "Id", time = "Time")
#' models <- latrendRep(method, data = latrendData, .rep = 5) # 5 repeated runs
#'
#' models <- latrendRep(method, data = latrendData, .seed = 1, .rep = 3)
#' @family longitudinal cluster fit functions
latrendRep = function(
method,
data,
.rep = 10,
...,
.errorHandling = 'stop',
.seed = NULL,
.parallel = FALSE,
envir = NULL,
verbose = getOption('latrend.verbose')
) {
method = as.lcMethod(method)
envir = .selectEnvironment(method, parent.frame(), envir)
assert_that(
!missing(data),
is.count(.rep),
is.flag(.parallel)
)
verbose = as.Verbose(verbose)
argList = list(...)
argList$envir = envir
newmethod = do.call(update, c(object = method, argList))
environment(newmethod) = envir
header(verbose, sprintf('Repeated (%d) longitudinal clustering using "%s"', .rep, getName(method)))
cat(verbose, c('Method arguments:', as.character(newmethod)[-1]))
ruler(verbose)
mc = match.call.all()
# compose
enter(verbose, 'Evaluating the method arguments.', suffix = '', level = verboseLevels$fine)
cmethod = compose(newmethod, envir = envir)
exit(verbose, level = verboseLevels$finest)
# seed
if (hasName(cmethod, 'seed') && length(cmethod$seed) > 0L) {
warning('The supplied lcMethod object defines a seed, which would result in repeated identical results. This seed will be ignored.
Use the .seed argument of latrendRep() to generate different seeds for the repetitions in a reproducible way.')
}
cat(verbose, sprintf('Generating method seeds for seed = %s.', as.character(.seed)))
localRNG(seed = .seed, {
repSeeds = sample.int(.Machine$integer.max,
size = .rep,
replace = FALSE)
})
id = idVariable(cmethod)
time = timeVariable(cmethod)
response = responseVariable(cmethod)
assert_that(
is.character(idVariable(cmethod)),
is.character(timeVariable(cmethod)),
is.character(responseVariable(cmethod))
)
# transform data
enter(verbose, 'Checking the training data and ensuring the standard data format.', suffix = '')
modelData = trajectories(
data,
id = id,
time = time,
response = response,
envir = envir
)
exit(verbose, level = verboseLevels$finest)
enter(verbose, 'Validating the method arguments.', suffix = '', level = verboseLevels$fine)
validate(cmethod, modelData)
exit(verbose, level = verboseLevels$finest)
enter(verbose, 'Preparing the training data for fitting')
prepEnv = prepareData(
method = cmethod,
data = modelData,
verbose = verbose
)
exit(verbose, level = verboseLevels$finest)
`%infix%` = ifelse(.parallel, `%dopar%`, `%do%`)
fitTiming = .enterTimed(verbose, 'Fitting the methods')
models = foreach(
i = seq_len(.rep),
iseed = repSeeds,
.combine = c,
.errorhandling = .errorHandling
) %infix% {
enter(verbose, sprintf('Fitting model %d/%d (%d%%)', i, .rep, signif(i / .rep * 100, 2)))
assert_that(is_class_defined(cmethod))
imethod = update(cmethod, seed = iseed, .eval = TRUE)
model = .fitLatrendMethod(
imethod,
data = modelData,
envir = prepEnv,
mc = mc,
verbose = verbose
)
environment(model) = envir
exit(verbose, level = verboseLevels$finest)
model
}
.exitTimed(fitTiming, msg = 'Done fitting the methods (%s)')
ruler(verbose)
as.lcModels(models)
}
# latrend-derived ####
#' @export
#' @title Cluster longitudinal data for a list of method specifications
#' @description Fit a list of longitudinal cluster methods on one or more datasets.
#' @details Methods and datasets are evaluated and validated prior to any fitting. This ensures that the batch estimation fails as early as possible in case of errors.
#' @inheritParams latrend
#' @param methods A `list` of `lcMethod` objects.
#' @param data The dataset(s) to which to fit the respective `lcMethod` on.
#' Either a `data.frame`, `matrix`, `list` or an expression evaluating to one of the supported types.
#' Multiple datasets can be supplied by encapsulating the datasets using `data = .(df1, df2, ..., dfN)`.
#' Doing this results in a more readable `call` associated with each fitted `lcModel` object.
#' @param cartesian Whether to fit the provided methods on each of the datasets. If `cartesian=FALSE`, only a single dataset may be provided or a list of data matching the length of `methods`.
#' @param parallel Whether to enable parallel evaluation. See \link{latrend-parallel}. Method evaluation and dataset transformation is done on the calling thread.
#' @param seed Sets the seed for generating a seed number for the methods.
#' Seeds are only set for methods without a seed argument or `NULL` seed.
#' @param errorHandling Whether to `"stop"` on an error, or to `"remove'` evaluations that raised an error.
#' @param envir The `environment` in which to evaluate the `lcMethod` arguments.
#' @return A `lcModels` object.
#' In case of a model fit error under `errorHandling = pass`, a `list` is returned.
#' @examples
#' data(latrendData)
#' refMethod <- lcMethodLMKM(Y ~ Time, id = "Id", time = "Time")
#' methods <- lcMethods(refMethod, nClusters = 1:2)
#' models <- latrendBatch(methods, data = latrendData)
#'
#' # different dataset per method
#' models <- latrendBatch(
#' methods,
#' data = .(
#' subset(latrendData, Time > .5),
#' subset(latrendData, Time < .5)
#' )
#' )
#'
#' @seealso lcMethods
#' @family longitudinal cluster fit functions
latrendBatch = function(
methods,
data,
cartesian = TRUE,
seed = NULL,
parallel = FALSE,
errorHandling = 'stop',
envir = NULL,
verbose = getOption('latrend.verbose')
) {
if (!is.list(methods)) {
methods = list(methods)
}
assert_that(
is.list(methods),
all(vapply(methods, inherits, 'lcMethod', FUN.VALUE = FALSE)),
msg = 'methods argument must be a list of lcMethod objects'
)
assert_that(
all(lengths(methods) > 0),
msg = sprintf(
'the lcMethod object(s) in the "methods" argument at index %s do not have any arguments.',
paste0(
which(lengths(methods) == 0),
' (',
vapply(methods[lengths(methods) == 0], class, FUN.VALUE = ''),
')',
collapse = ', '
)
)
)
assert_that(
!missing(data),
is.flag(cartesian),
is.flag(parallel)
)
envir = .selectEnvironment(methods[[1]], parent.frame(), envir)
verbose = as.Verbose(verbose)
nMethods = length(methods)
mc = match.call()[-1]
dataCall = mc$data
if (is.call(dataCall) && dataCall[[1]] == '.') {
# data = .(d1, d2, dN)
dataList = as.list(dataCall[-1])
} else {
# data = varName OR data = expr
dataEval = eval(dataCall, envir = parent.frame())
assert_that(length(dataEval) >= 1)
if (is(dataEval, 'list')) {
dataList = lapply(as.numeric(seq_along(dataEval)), function(d)
substitute(dataObj[[d]], list(dataObj = dataCall, d = d)))
} else {
dataList = list(dataCall)
}
}
nData = length(dataList)
assert_that(
cartesian || nData %in% c(1, nMethods),
msg = 'number of datasets must be 1 or match the number of specified methods'
)
header(verbose, sprintf('Longitudinal clustering of %d dataset(s) using %d method(s)', nData, nMethods))
# compose methods
enter(verbose, sprintf('Evaluating method arguments of %d methods', nMethods), level = verboseLevels$fine, suffix = '')
methods = lapply(methods, compose, envir = envir)
exit(verbose, level = verboseLevels$finest)
# generate method and data lists
if (cartesian) {
allMethods = methods[rep(seq_len(nMethods), nData)]
allDataOpts = dataList[rep(seq_len(nData), nMethods)]
} else if (nMethods == nData) {
allMethods = methods
allDataOpts = dataList
} else {
# replicate methods and data such that they are equal length
allMethods = methods[rep_len(seq_len(nMethods), length.out = max(nMethods, nData))]
allDataOpts = dataList[rep_len(seq_len(nData), length.out = max(nMethods, nData))]
}
assert_that(length(allMethods) == length(allDataOpts))
nModels = length(allMethods)
# transform data
enter(verbose, sprintf('Checking and transforming the data format for %d datasets.', nData), suffix = '', level = verboseLevels$fine)
allData = lapply(allDataOpts, eval, envir = envir)
allModelData = Map(
function(data, method) {
trajectories(
data,
id = idVariable(method),
time = timeVariable(method),
response = responseVariable(method),
envir = envir
)
},
allData,
allMethods
)
assert_that(
is.list(allModelData),
length(allModelData) == nModels,
all(vapply(allModelData, is.data.frame, FUN.VALUE = TRUE))
)
exit(verbose, level = verboseLevels$finest)
# validate methods on data
enter(verbose, sprintf('Validating methods against the datasets (%d checks).', nModels), level = verboseLevels$fine, suffix = '')
Map(validate, allMethods, allModelData)
exit(verbose, level = verboseLevels$finest)
# resolve method seeds
# a method's seed is considered set when a non-empty value is set.
seedMask = !vapply(
allMethods,
function(m) hasName(m, 'seed') && length(m$seed) > 0L,
FUN.VALUE = TRUE
)
if (any(seedMask)) {
cat(verbose, sprintf('Generating seeds for %d methods that do not have a pre-set seed argument.', sum(seedMask)))
localRNG(seed = seed, {
allSeeds = sample.int(.Machine$integer.max, size = nModels, replace = FALSE)
})
# update methods that don't have a seed argument
allMethods[seedMask] = Map(
function(method, seed) update(method, seed = seed, .eval = TRUE),
allMethods[seedMask],
allSeeds[seedMask]
)
} else if (length(seed) > 0L) {
# all methods already have a seed
warning('latrendBatch() seed argument was set but all methods already have a pre-defined seed. No methods to seed.')
}
# generate calls
allCalls = vector('list', length(allMethods))
for (i in seq_along(allMethods)) {
allCalls[[i]] = do.call(
call,
c(
'latrend',
method = allMethods[[i]],
data = quote(allDataOpts[[i]]),
# seed does not need to be set because its part of the method
envir = quote(envir),
verbose = quote(verbose)
)
)
}
`%infix%` = ifelse(parallel, `%dopar%`, `%do%`)
penv = parent.frame()
# latrend
fitTiming = .enterTimed(verbose, sprintf('Fitting %d models', nModels))
ruler(verbose)
models = foreach(
i = seq_along(allMethods),
modelMethod = allMethods,
modelData = allModelData,
modelCall = allCalls,
.packages = 'latrend',
.errorhandling = errorHandling) %infix%
{
modelTiming = .enterTimed(
verbose,
sprintf(
'Fitting model %d/%d (%d%%) using %s',
i,
nModels,
round(i / nModels * 100),
getName(modelMethod)
)
)
on.exit(expr = .exitTimed(modelTiming), add = TRUE)
cat(verbose, as.character(modelMethod, prefix = '- '))
prepEnv = local({
enter(verbose, 'Preparing the training data for fitting', suffix = '', level = verboseLevels$fine)
on.exit(expr = exit(verbose, level = verboseLevels$finest), add = TRUE)
prepareData(method = modelMethod, data = modelData, verbose = verbose)
})
model = .fitLatrendMethod(
method = modelMethod,
data = modelData,
mc = modelCall,
envir = prepEnv,
verbose = verbose
)
model
}
.exitTimed(fitTiming)
# handle model results
errorMask = !vapply(models, is.lcModel, FUN.VALUE = TRUE)
if (any(errorMask)) {
# some list entries are not lcModel
nError = sum(errorMask)
cat(verbose, sprintf('Done, but errors occurred in %d out of %d methods', nError, nModels))
ruler(verbose)
warning(sprintf(
'Returning "list" object instead of "lcModels" object for latrendBatch()
because %d method estimations produced an error',
nError
))
return (models)
} else if (length(models) < nModels ) {
# fewer models were obtained than expected
nError = nModels - length(models)
cat(verbose, sprintf('Done, but errors occurred in %d out of %d methods', nError, nModels))
ruler(verbose)
return (as.lcModels(models))
} else {
# no errors
ruler(verbose)
return (as.lcModels(models))
}
}
#' @export
#' @title Cluster longitudinal data using bootstrapping
#' @description Performs bootstrapping, generating samples from the given data at the id level, fitting a lcModel to each sample.
#' @inheritParams latrend
#' @param data A `data.frame`.
#' @param samples The number of bootstrap samples to evaluate.
#' @param seed The seed to use. Optional.
#' @inheritParams latrendBatch
#' @return A `lcModels` object of length `samples`.
#' @examples
#' data(latrendData)
#' method <- lcMethodLMKM(Y ~ Time, id = "Id", time = "Time")
#' bootModels <- latrendBoot(method, latrendData, samples = 10)
#'
#' bootMAE <- metric(bootModels, name = "MAE")
#' mean(bootMAE)
#' sd(bootMAE)
#' @family longitudinal cluster fit functions
#' @family validation methods
latrendBoot = function(
method,
data,
samples = 50,
seed = NULL,
parallel = FALSE,
errorHandling = 'stop',
envir = NULL,
verbose = getOption('latrend.verbose')
) {
assert_that(is.lcMethod(method), msg = 'method must be lcMethod object (e.g., lcMethodKML("Y") )')
assert_that(!missing(data), msg = 'data must be specified')
assert_that(is.data.frame(data), msg = 'data must be data.frame')
assert_that(is.count(samples))
verbose = as.Verbose(verbose)
header(
verbose,
sprintf(
'Longitudinal cluster estimation using %d bootstrap samples',
samples
)
)
ruler(verbose)
mc = match.call()
# generate seeds
localRNG(seed = seed, {
sampleSeeds = sample.int(
.Machine$integer.max,
size = samples,
replace = FALSE
)
})
id = idVariable(method)
assert_that(has_name(data, id))
# fit models
methods = replicate(samples, method, simplify = FALSE)
dataCalls = lapply(sampleSeeds, function(s)
enquote(substitute(
bootSample(data, id, s),
env = list(
data = mc$data,
id = id,
s = s
)
)))
dataCall = do.call(call, c(name = '.', dataCalls))
cl = do.call(
call,
list(
name = 'latrendBatch',
methods = methods,
data = enquote(dataCall),
cartesian = FALSE,
parallel = parallel,
errorHandling = errorHandling,
envir = quote(envir),
verbose = verbose
)
)
models = eval(cl, envir = parent.frame())
models
}
# Cross validation ####
#' @export
#' @title Cluster longitudinal data over k folds
#' @description Apply k-fold cross validation for internal cluster validation.
#' Creates k random subsets ("folds") from the data, estimating a model for each of the k-1 combined folds.
#' @inheritParams latrend
#' @inheritParams latrendBoot
#' @param data A `data.frame`.
#' @param folds The number of folds. Ten folds by default.
#' @return A `lcModels` object of containing the `folds` training models.
#' @examples
#' data(latrendData)
#' method <- lcMethodLMKM(Y ~ Time, id = "Id", time = "Time")
#'
#' if (require("caret")) {
#' model <- latrendCV(method, latrendData, folds = 5, seed = 1)
#'
#' model <- latrendCV(method, subset(latrendData, Time < .5), folds = 5)
#' }
#' @family longitudinal cluster fit functions
#' @family validation methods
latrendCV = function(
method,
data,
folds = 10,
seed = NULL,
parallel = FALSE,
errorHandling = 'stop',
envir = NULL,
verbose = getOption('latrend.verbose')
) {
.checkPackageInstalled('caret')
assert_that(!missing(data), msg = 'data must be specified')
assert_that(is.data.frame(data), msg = 'data must be data.frame')
assert_that(is.count(folds))
verbose = as.Verbose(verbose)
header(verbose, sprintf('Longitudinal clustering with %d-fold cross validation', folds))
ruler(verbose)
if (is.null(seed)) {
seed = sample.int(.Machine$integer.max, size = 1)
}
id = idVariable(method)
assert_that(has_name(data, id))
mc = match.call()
dataFoldCalls = lapply(as.numeric(1:folds), function(fold) {
enquote(substitute(
trainFold(data, fold = fold, id, folds, seed),
env = list(
data = mc$data,
id = id,
fold = fold,
folds = folds,
seed = seed
)
))
})
dataCall = do.call(call, c('.', dataFoldCalls))
models = do.call(
latrendBatch,
list(
method = method,
data = dataCall,
parallel = parallel,
errorHandling = errorHandling,
verbose = verbose
),
envir = parent.frame()
)
models
}
#' @export
#' @title Create the training data for each of the k models in k-fold cross validation evaluation
#' @param data A `data.frame` representing the complete dataset.
#' @param folds The number of folds. By default, a 10-fold scheme is used.
#' @param id The trajectory identifier variable.
#' @param seed The seed to use, in order to ensure reproducible fold generation at a later moment.
#' @return A `list` of `data.frame` of the `folds` training datasets.
#' @family validation methods
#' @examples
#' data(latrendData)
#' method <- lcMethodLMKM(Y ~ Time, id = "Id", time = "Time")
#'
#' if (require("caret")) {
#' trainFolds <- createTrainDataFolds(latrendData, folds = 5, id = "Id", seed = 1)
#'
#' foldModels <- latrendBatch(method, data = trainFolds)
#' testDataFolds <- createTestDataFolds(latrendData, trainFolds)
#' }
createTrainDataFolds = function(
data,
folds = 10L,
id = getOption('latrend.id'),
seed = NULL
) {
assert_that(
is.count(folds),
folds > 1L,
is.data.frame(data),
has_name(data, id)
)
ids = unique(data[[id]])
localRNG(seed = seed, {
foldIdsList = caret::createFolds(
seq_along(ids),
k = folds,
list = TRUE,
returnTrain = TRUE
) %>%
lapply(function(i) ids[i])
})
dataList = lapply(foldIdsList, function(foldIds) {
.trajSubset(data, ids = foldIds, id = id)
})
dataList
}
#' @export
#' @title Create the test fold data for validation
#' @inheritParams createTrainDataFolds
#' @param trainData A `data.frame` representing the training data, which should be a subset of `data`.
#' @seealso createTrainDataFolds
#' @family validation methods
#' @examples
#' data(latrendData)
#'
#' if (require("caret")) {
#' trainDataList <- createTrainDataFolds(latrendData, id = "Id", folds = 10)
#' testData1 <- createTestDataFold(latrendData, trainDataList[[1]], id = "Id")
#' }
createTestDataFold = function(data, trainData, id = getOption('latrend.id')) {
assert_that(is.data.frame(trainData))
trainIds = unique(trainData[[id]])
allIds = unique(data[[id]])
assert_that(all(trainIds %in% allIds))
testIds = setdiff(allIds, trainIds)
.trajSubset(data, ids = testIds, id = id)
}
#' @export
#' @title Create all k test folds from the training data
#' @inheritParams createTestDataFold
#' @param trainDataList A `list` of `data.frame` representing each of the data training folds. These should be derived from `data`.
#' @param ... Arguments passed to [createTestDataFold].
#' @family validation methods
#' @examples
#' data(latrendData)
#'
#' if (require("caret")) {
#' trainDataList <- createTrainDataFolds(latrendData, folds = 10, id = "Id")
#' testDataList <- createTestDataFolds(latrendData, trainDataList)
#' }
createTestDataFolds = function(data, trainDataList, ...) {
lapply(
trainDataList,
function(trainData) createTestDataFold(data = data, trainData = trainData, ...)
)
}
# Data helper functions ####
#' @export
#' @name lcModel-data-filters
#' @rdname lcModel-data-filters
#' @title Data filters for lcModel
#' @description The data filters are applied by [latrend] prior to model estimation. These filters are used in [latrendBoot] and [latrendCV].
#' @inheritParams trajectories
#' @param data The `data.frame` representing the model dataset.
#' @param seed Optional seed for ensuring reproducibility.
#' @return A subset of `data` of type `data.frame`.
#' @family validation methods
bootSample = function(data, id, seed = NULL) {
assert_that(is.data.frame(data), has_name(data, id))
ids = unique(data[[id]])
localRNG(seed = seed, {
sampleIdx = sample.int(length(ids), replace = TRUE)
})
.trajSubset(data, ids = ids[sampleIdx], id = id)
}
#' @export
#' @rdname lcModel-data-filters
#' @param fold The fold to select.
#' @param folds Total number of folds to create.
#' @family validation methods
#' @keywords internal
trainFold = function(data, fold, id, folds, seed) {
assert_that(
is.data.frame(data),
has_name(data, id),
!is.null(seed)
)
ids = unique(data[[id]])
localRNG(seed = seed, {
foldIdx = caret::createFolds(
seq_along(ids),
k = folds,
list = TRUE,
returnTrain = TRUE
)[[fold]]
})
.trajSubset(data, ids = ids[foldIdx], id = id)
}
#' @export
#' @rdname lcModel-data-filters
#' @family validation methods
testFold = function(data, fold, id, folds, seed) {
trainData = trainFold(
data,
id = id,
fold = fold,
folds = folds,
seed = seed
)
createTestDataFold(data, trainData = trainData, id = id)
}
#' @title Select trajectories
#' @description Create a subset of the data with the given trajectories
#' @param data The longitudinal dataset, a `data.frame`
#' @param ids The trajectory identifiers, `vector`
#' @param id The name of the id column
#' @keywords internal
.trajSubset = function(data, ids, id) {
assert_that(
is.data.frame(data),
has_name(data, id),
length(ids) > 0L,
noNA(ids),
all(ids %in% data[[id]])
)
rowIds = data[[id]]
newdata = data[rowIds %in% ids, ]
if (is.factor(data[[id]])) {
newdata[[id]] = droplevels(newdata[[id]])
}
newdata
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.