R/modelPartition.R
In latrend: A Framework for Clustering Longitudinal Data

Documented in lcModelPartition

#' @include modelApprox.R
setClass(
  'lcModelPartition',
  representation(
    name = 'character',
    center = 'function',
    trajectoryClusterIndices = 'integer',
    converged = 'ANY'
  ),
  contains = 'lcApproxModel'
)

#' @export
#' @title Create a lcModel with pre-defined partitioning
#' @description Represents an arbitrary partitioning of a set of trajectories.
#' As such, this model has no predictive capabilities. The cluster trajectories are represented by the specified center function (mean by default).
#' @inheritParams lcMethodStratify
#' @param data A `data.frame` representing the trajectory data.
#' @param trajectoryAssignments A `vector` of cluster membership per trajectory, a `data.frame` with an id column and `"Cluster"` column, or the name of the cluster membership column in the `data` argument..
#' For `vector` input, the type must be `factor`, `character`, or `integer` (`1` to `nClusters`).
#' The order of the trajectory, and thus the respective assignments, is determined by the id column of the data.
#' Provide a `factor` id column for the input data to ensure that the ordering is as you aspect.
#' @param nClusters The number of clusters. Should be `NA` for trajectory assignments of type `factor`.
#' @param clusterNames The names of the clusters, or a function with input `n` outputting a `character vector` of names.
#' If unspecified, the names are determined from the `trajectoryAssignments` argument.
#' @param method Optional `lcMethod` object that was used for fitting this model to the data.
#' @param converged Set the converged state.
#' @param model An optional object to attach to the `lcModelPartition` object, representing the internal model that was used for obtaining the partition.
#' @param envir The `environment` associated with the model. Used for evaluating the assigned `data` object by [model.data.lcModel].
#' @examples
#' # comparing a model to the ground truth using the adjusted Rand index
#' data(latrendData)
#' method <- lcMethodLMKM(Y ~ Time, id = "Id", time = "Time")
#' model <- latrend(method, latrendData, nClusters = 3)
#'
#' # extract the reference class from the Class column
#' trajLabels <- aggregate(Class ~ Id, head, 1, data = latrendData)
#' trajLabels$Cluster <- trajLabels$Class
#' refModel <- lcModelPartition(latrendData, response = "Y", trajectoryAssignments = trajLabels)
#'
#' if (require("mclustcomp")) {
#'   externalMetric(model, refModel, "adjustedRand")
#' }
lcModelPartition = function(
  data,
  response,
  trajectoryAssignments,
  nClusters = NA,
  clusterNames = character(),
  time = getOption('latrend.time'),
  id = getOption('latrend.id'),
  name = 'part',
  center = meanNA,
  method = NULL,
  converged = TRUE,
  model = NULL,
  envir = parent.frame()
) {
  assert_that(
    is.data.frame(data),
    has_name(data, response),
    has_name(data, time),
    has_name(data, id),
    is.character(clusterNames),
    noNA(clusterNames),
    is.scalar(nClusters),
    is.na(nClusters) || nClusters >= 1L,
    is.na(nClusters) || length(clusterNames) %in% c(0, nClusters),
    noNA(trajectoryAssignments)
  )

  if (is.na(nClusters) && length(clusterNames) > 0) {
    nClusters = length(clusterNames)
  }

  if (length(trajectoryAssignments) == 1 &&
      is.character(trajectoryAssignments) &&
      uniqueN(data[[id]]) > 1) {
    # cluster column name
    assert_that(has_name(data, trajectoryAssignments), msg = 'cluster column specified in "trajectoryAssignments" argument is not present in data')
    trajectoryAssignments = aggregate(data[[trajectoryAssignments]], by = list(Id = data[[id]]), FUN = head, 1) %>%
      set_names(c(id, 'Cluster'))
  }

  if (is.data.frame(trajectoryAssignments)) {
    # data.frame input
    assert_that(
      has_name(trajectoryAssignments, id),
      has_name(trajectoryAssignments, 'Cluster'),
      all(trajectoryAssignments[[id]] %in% unique(data[[id]]))
    )

    newOrder = match(trajectoryAssignments[[id]], levels(factor(data[[id]])))
    assert_that(
      noNA(newOrder),
      uniqueN(newOrder) == length(newOrder)
    )

    trajectoryAssignments = trajectoryAssignments$Cluster[newOrder]
  }

  # vector input
  assert_that(
    length(trajectoryAssignments) == uniqueN(data[[id]]),
    msg = 'Number of trajectory assignments does not match the number of trajectories (ids) in the data'
  )

  if (is.numeric(trajectoryAssignments)) {
    # integer
    assert_that(all(vapply(trajectoryAssignments, is.count, FUN.VALUE = TRUE)), msg = 'numeric input must be integer from 1,2,3,...')

    if (is.na(nClusters)) {
      numClus = max(trajectoryAssignments)
    } else {
      assert_that(max(trajectoryAssignments) <= nClusters)
      numClus = nClusters
    }
  } else if (is.character(trajectoryAssignments)) {
    # character
    if (is.na(nClusters)) {
      numClus = uniqueN(trajectoryAssignments)
    } else {
      assert_that(uniqueN(trajectoryAssignments) <= nClusters)
      numClus = nClusters
    }

    if (length(clusterNames) == 0L) {
      trajectoryAssignments = factor(trajectoryAssignments)
      clusterNames = levels(trajectoryAssignments)
    } else {
      assert_that(all(trajectoryAssignments %in% clusterNames))
      trajectoryAssignments = factor(trajectoryAssignments, levels = clusterNames)
    }
  } else if (is.factor(trajectoryAssignments)) {
    # factor
    assert_that(
      is.na(nClusters) || nClusters == nlevels(trajectoryAssignments),
      msg = 'nClusters should be NA or match the number of levels of the trajectoryAssignments argument'
    )
    if (length(clusterNames) == 0L) {
      clusterNames = levels(trajectoryAssignments)
    } else {
      assert_that(nlevels(trajectoryAssignments) == length(clusterNames))
      trajectoryAssignments = factor(trajectoryAssignments, levels = clusterNames, labels = clusterNames)
    }
    numClus = nlevels(trajectoryAssignments)
  } else {
    stop('unsupported input type for trajectoryAssignments argument')
  }


  intAssignments = as.integer(trajectoryAssignments)
  assert_that(
    min(intAssignments) >= 1L,
    max(intAssignments) <= numClus
  )

  if (length(clusterNames) == 0L) {
    clusterNames = make.clusterNames(numClus)
  }
  assert_that(length(clusterNames) == numClus)

  mc = match.call()
  partModel = new(
    'lcModelPartition',
    call = mc,
    data = data,
    name = name,
    clusterNames = clusterNames,
    center = center,
    trajectoryClusterIndices = intAssignments,
    id = id,
    time = time,
    response = response,
    estimationTime = 0,
    converged = converged
  )

  if (!is.null(method)) {
    partModel@method = method
  }

  if (!is.null(model)) {
    partModel@model = model
  }

  environment(partModel) = envir

  partModel
}

#. clusterTrajectories
#' @rdname interface-custom
#' @param center The function to use to compute the cluster trajectory center at the respective moment in time.
setMethod('clusterTrajectories', 'lcModelPartition',
  function(
    object,
    at = time(object),
    center = object@center,
    approxFun = approx,
    ...
  ) {
  if (length(at) > 0) {
    return(callNextMethod())
  }

  if (is.null(center)) {
    center = meanNA
  }

  assert_that(
    is.function(center),
    length(formalArgs(center)) > 0,
    msg = 'center argument must be a function accepting an input vector'
  )

  data = as.data.table(model.data(object))
  response = responseVariable(object)
  time = timeVariable(object)
  trajClusters = trajectoryAssignments(object)
  rowClusters = trajClusters[make.idRowIndices(object)]
  assert_that(length(rowClusters) == nrow(data))

  # compute cluster trajectories at all moments in time
  clusTrajs = data[,
    .(Value = center(get(..response))),
    keyby = .(Cluster = rowClusters, Time = .time),
    env = list(.time = time)
  ]

  if (uniqueN(trajClusters) < nClusters(object)) {
    emptyMask = clusterSizes(object) == 0
    warning(
      sprintf(
        'Cannot compute cluster trajectory for cluster(s) "%s": no trajectories were assigned to the cluster(s).',
        paste0(clusterNames(object)[emptyMask], collapse = '", "')
      )
    )
    # add missing clusters
    emptyClusTraj = clusTrajs[, .(Time = unique(Time), Value = NaN)]
    clusTrajs = rbind(
      clusTrajs,
      data.table(
        Cluster = rep(
          setdiff(seq_len(nClusters(object)), unique(rowClusters)),
          each = nrow(emptyClusTraj)
        ),
        emptyClusTraj
      )
    )
  }

  setnames(clusTrajs, 'Value', response)
  setnames(clusTrajs, 'Time', time)

  as.data.frame(clusTrajs)
})


#. converged ####
#' @rdname interface-custom
setMethod('converged', 'lcModelPartition', function(object, ...) {
  if (methods::.hasSlot(object, 'converged')) {
    object@converged
  } else {
    TRUE
  }
})


# . getName ####
#' @rdname interface-custom
setMethod('getName', 'lcModelPartition', function(object, ...) object@name)

# . getShortName ####
#' @rdname interface-custom
setMethod('getShortName', 'lcModelPartition', function(object, ...) object@name)


#. postprob ####
#' @rdname interface-custom
setMethod('postprob', 'lcModelPartition', function(object, ...) {
  pp = postprobFromAssignments(object@trajectoryClusterIndices, k = nClusters(object))
  colnames(pp) = clusterNames(object)
  return(pp)
})