R/crop.R

Defines functions summary_crop.summary.medic summary_crop.timing_atc_group summary_crop.timing_trajectory summary_crop.comedication_count summary_crop.medication_frequency summary_crop.cluster_frequency summary_crop

Documented in summary_crop summary_crop.cluster_frequency summary_crop.comedication_count summary_crop.medication_frequency summary_crop.summary.medic summary_crop.timing_atc_group summary_crop.timing_trajectory

#' Crop Clustering Summary
#'
#' Functions for cropping summarized cluster data.
#'
#' @param object The summary object to be cropped.
#' @param top_n integer. In the case of `cluster_frequency` it is the number of
#'   clusters to keep. In the case of `medication_frequency` it is the number of
#'   medications to keep. If `inf`, all clusters or medications are kept.
#' @param min_count integer. The minimum count of a cluster or medication to
#'   keep it in the summary. If 0, the default, the minimum count is zero, i.e.
#'   there is not a minimum count.
#' @param min_percent numeric. The minimum percentage of a cluster or medication
#'   to keep it in the summary. If 0, the default, the minimum percentage is
#'   zero, i.e. there is not a minimum percentage.
#' @param scope character. The scope of the summary crops `top_n`, `min_count`
#'  and `min_percent`. The options are "cluster" and "global". The default is
#'  "cluster". If "cluster", the crop is based on the percentage of medication
#'  in the cluster. If "global", the crop is based on the percentage of all
#'  medication.
#' @param sample_n_individual a logical or integer. If FALSE, no individual
#'   timing trajectories are sampled. If integer, `sample_n_individual` is the
#'   number of individual timing trajectories to sample. To sample all
#'   individual timing trajectories, set `sample_n_individual` to `Inf`.
#' @param weighted_sample a logical, but only used if `sample_n_individual` is
#'   an integer. If TRUE, the individual timing trajectories are sampled
#'   weighted by the number of medications in the individual timing trajectory.
#'   If FALSE, the individual timing trajectories are sampled uniformly.
#' @param which A character vector specifying which summaries to crop. The
#'   options are "cluster_frequency", "medication_frequency",
#'   "comedication_count", "timing_trajectory", and "timing_atc_group". The
#'   default is "all".
#' @param ... Additional arguments to be passed to the specific method.
#'
#' @details
#'
#' # `cluster_frequency` summary crop
#' Extracts the top `top_n` clusters by count. If `top_n` is `Inf`, all clusters
#' are kept. If `min_count` is greater than 0, clusters with a count less than
#' `min_count` are removed. If `min_percent` is greater than 0, clusters with a
#' percentage less than `min_percent` are removed. The remaining clusters are
#' grouped into a "Remaining" cluster.
#'
#' # `medication_frequency` summary crop
#' Extracts the top `top_n` medications by count. If `top_n` is `Inf`, all
#' medications are kept. If `min_count` is greater than 0, medications with a
#' count less than `min_count` are removed. If `min_percent` is greater than 0,
#' medications with a percentage less than `min_percent` are removed. The
#' remaining medications are grouped into a "Remaining" cluster.
#'
#' The `scope` argument determines the scope of the crop. If `scope` is
#' "cluster", the crop is based on the percentage of medication in the cluster.
#' If `scope` is "global", the crop is based on the percentage of all
#' medication.
#'
#' # `comedication_count` summary crop
#' TO DO
#'
#' # `timing_trajectory` summary crop
#' Samples `sample_n_individual` individual timing trajectories. If
#' `sample_n_individual` is `Inf`, all individual timing trajectories are kept.
#' If `weighted_sample` is `TRUE`, the individual timing trajectories are
#' sampled weighted by the number of medications in the individual timing
#' trajectory.
#'
#' # `timing_atc_group` summary crop
#' Samples `sample_n_individual` individual timing trajectories. If
#' `sample_n_individual` is `Inf`, all individual timing trajectories are kept.
#' If `weighted_sample` is `TRUE`, the individual timing trajectories are
#' sampled weighted by the number of medications in the individual timing
#' trajectory.
#'
#' # `summary.medic` summary crop
#' Crops multiple summaries. The `which` argument is a character vector
#' specifying which summaries to crop. The options are "cluster_frequency",
#' "medication_frequency", "comedication_count", "timing_trajectory", and
#' "timing_atc_group". If `which` is "all", all summaries are cropped.
#'
#' The `...` argument is passed to the specific methods, e.g. `top_n` and
#' `min_count` are passed to `cluster_frequency` and `medication_frequency`.
#'
#' @seealso \code{\link{summary}}, \code{\link{cluster_frequency}},
#' \code{\link{medication_frequency}}, \code{\link{comedication_count}},
#' \code{\link{timing_trajectory}}, \code{\link{timing_atc_group}}
#'
#' @return A summary object, which is a modified version of the input summary
#' object.
#'
#' @examples
#' clust <- medic(
#'   complications,
#'   id = id,
#'   atc = atc,
#'   k = 3:5,
#'   timing = first_trimester:third_trimester
#' )
#'
#'
#' # Crop the cluster frequency summary
#' clust |>
#'   cluster_frequency() |>
#'   summary_crop(top_n = 3)
#'
#' clust |>
#'   summary() |>
#'   summary_crop(which = "cluster_frequency", top_n = 3)
#'
#'
#' # Crop the medication frequency summary
#' clust |>
#'   medication_frequency() |>
#'   summary_crop(top_n = 3)
#'
#' clust |>
#'   summary() |>
#'   summary_crop(which = "medication_frequency", top_n = 3)
#'
#'
#' # Crop the co-medication count summary
#' clust |>
#'   comedication_count() |>
#'   summary_crop(min_count = 10)
#'
#' clust |>
#'   summary() |>
#'   summary_crop(which = "comedication_count", min_count = 10)
#'
#'
#' # crop the timing trajectory summary
#' clust |>
#'   timing_trajectory() |>
#'   summary_crop()
#'
#' clust |>
#'   summary() |>
#'   summary_crop(which = "timing_trajectory")
#'
#'
#' # crop the timing ATC group summary
#' clust |>
#'   timing_atc_group() |>
#'   summary_crop()
#'
#' clust |>
#'   summary() |>
#'   summary_crop(which = "timing_atc_group")
#'
#' # crop multiple summaries
#' clust |>
#'   summary() |>
#'   summary_crop(
#'     which = c("cluster_frequency", "medication_frequency"),
#'     top_n = 3
#'   )
#'
#' @rdname summary_crop
#' @export
summary_crop <- function(object, ...) {
  UseMethod("summary_crop", object)
}

#' @rdname summary_crop
#' @export
summary_crop.cluster_frequency <- function(
  object,
  top_n = 5L,
  min_count = 0,
  min_percent = 0,
  ...
) {

  cluster_levels <- c(levels(object$Cluster), "Remaining")

  res <- object |>
    dplyr::group_by(.data$Clustering) |>
    dplyr::mutate(
      Cluster = dplyr::if_else(
        top_n < rank(-.data$Count, ties.method = "first") - 1 |
          .data$Percent < min_percent |
          .data$Count < min_count,
        "Remaining",
        .data$Cluster
      )
    ) |>
    dplyr::group_by(.data$Clustering, .data$Cluster) |>
    dplyr::summarise(
      Count = sum(.data$Count),
      Percent = sum(.data$Percent),
      .groups = "drop"
    ) |>
    dplyr::mutate(
      Cluster = factor(
        .data$Cluster,
        levels = intersect(cluster_levels, .data$Cluster)
      )
    ) |>
    dplyr::arrange(.data$Clustering, .data$Cluster)

  class(res) <- c("cluster_frequency", class(res))
  return(res)
}


#' @rdname summary_crop
#' @export
summary_crop.medication_frequency <- function(
  object,
  top_n = 5L,
  min_count = 0,
  min_percent = 0,
  scope = "cluster",
  ...
) {

  scope_name <- switch(
    scope,
    "cluster" = "Percent of Medication in Cluster",
    "global" = "Percent of All Medication",
    stop("'scope' must be either 'cluster' or 'global'.")
  )

  group_name <- switch(
    scope,
    "cluster" = c("Clustering", "Cluster"),
    "global" = "Clustering"
  )

  atc_name <- attr(object, "atc")

  cluster_levels <- c(levels(object$Cluster), "Remaining")

  selected_top <- object |>
    dplyr::group_by(!!!dplyr::syms(group_name)) |>
    dplyr::mutate(
      remaining = top_n < rank(-.data$Count, ties.method = "first") |
        !!dplyr::sym(scope_name) < min_percent |
        .data$Count < min_count
    ) |>
    dplyr::ungroup() |>
    dplyr::select(!!!group_name, !!atc_name, "remaining") |>
    dplyr::distinct()

  res <- object |>
    dplyr::left_join(selected_top, by = c(group_name, atc_name)) |>
    dplyr::mutate(
      "{atc_name}" := dplyr::if_else(              # nolint: object_name_linter.
        .data$remaining,
        "Remaining",
        !!dplyr::sym(atc_name)
      ),
      "Percent of ATC code" = dplyr::if_else( #
        .data$remaining,                      # We may be able to make something
        NA_real_,                             # a bit smarter, but this is fine
        .data$`Percent of ATC code`           # for now...
      )                                       #
    ) |>
    dplyr::group_by(.data$Clustering, .data$Cluster, !!dplyr::sym(atc_name)) |>
    dplyr::summarise(
      dplyr::across(
        dplyr::all_of(
          c(
            "Count",
            "Percent of Medication in Cluster",
            "Percent of All Medication",
            "Percent of ATC code"
          )
        ),
        sum
      ),
      .groups = "drop"
    ) |>
    dplyr::mutate(
      Cluster = factor(
        .data$Cluster,
        levels = intersect(cluster_levels, .data$Cluster)
      )
    ) |>
    dplyr::arrange(.data$Clustering, .data$Cluster)

  class(res) <- c("medication_frequency", class(res))
  return(res)
}


#' @rdname summary_crop
#' @export
summary_crop.comedication_count <- function(object, ...) {
  cat("\nTO DO: summary_crop.comedication_count\n")
  return(object)
}

#' @rdname summary_crop
#' @export
summary_crop.timing_trajectory <- function(
  object,
  sample_n_individual = 100L,
  weighted_sample = TRUE,
  ...
) {

  res <- object
  res$individual <- res$individual |>
    dplyr::group_by(.data$Clustering, .data$Cluster) |>
    dplyr::slice_sample(
      n = sample_n_individual,
      weight_by = if (weighted_sample) .data$Count
    )

  class(res) <- c("timing_trajectory", class(res))
  return(res)
}

#' @rdname summary_crop
#' @export
summary_crop.timing_atc_group <- function(
  object,
  sample_n_individual = 100L,
  weighted_sample = TRUE,
  min_count = 0L,
  ...
) {

  # Do we need more simplification tools options here?

  res <- object

if(0L < sample_n_individual) {
  res$individual <- res$individual |>
    dplyr::group_by(.data$Clustering, .data$Cluster, .data$`ATC Groups`) |>
    dplyr::slice_sample(
      n = sample_n_individual,
      weight_by = if (weighted_sample) {
        .data$`Number of Medications with Timing Trajectory`
      }
    )
}

if (0L < min_count) {
  res$average <- res$average |>
    dplyr::mutate(
      dplyr::across(
        attr(res, "timing"),
        ~dplyr::if_else(
          `Number of Individuals in ATC group` < min_count,
          NA_real_,
          .
        )
      )
    )
}

  class(res) <- c("timing_atc_group", class(res))
  return(res)
}

#' @rdname summary_crop
#' @export
summary_crop.summary.medic <- function(object, which = "all", ...) {

  summary_options <- c(
    "cluster_frequency",
    "medication_frequency",
    "comedication_count",
    "timing_trajectory",
    "timing_atc_group"
  )

  if (any(! which %in% c("all", summary_options))) {
    stop(
      "'which' must be 'all' or a subset of\n",
      paste0("'", summary_options, "'", collapse = ", ")
    )
  }

  chosen_crops <- if (any(which == "all")) summary_options else which

  check_nulls <- sapply(chosen_crops, function(cc) is.null(object[[cc]]))
  if (any(check_nulls)) {
    stop(
      "The following summaries are missing from 'object':\n",
      paste0("'", names(check_nulls)[check_nulls], "'", collapse = ", ")
    )
  }

  for (cc in chosen_crops) {
    object[[cc]] <- summary_crop(object[[cc]], ...)
  }

  return(object)
}

Try the tame package in your browser

Any scripts or data that you put into this service are public.

tame documentation built on April 12, 2025, 1:40 a.m.