R/cluster.R

#' Cluster data based on mass and retention time
#'
#' Cluster the data using hierarchical clustering with the
#' \code{\link[stats]{hclust}} function. The data are clustered with the
#' normalized recalibrated mass and the normalized aligned retention times.
#'
#' @param x A \code{data.table} output from the \code{recalibrate_mass}
#'   function.
#'
#' @param errors A \code{list} output from the \code{calc_error} function. The
#'   first element of the list contains the standard deviation and median of the
#'   mass measurement error for each data set. The second element is the
#'   standard deviation of the mass measurement error across all data sets. The
#'   third element is the standard deviation of the retention time in seconds
#'   across all data sets.
#'
#' @param method A character string indicating what agglomeration method should
#'   be used in the hclust function. See \code{\link[stats]{hclust}} for more
#'   details.
#'
#' @param height An number specifying the height at which the tree created by
#'   the hclust function should be cut. See \code{\link[stats]{hclust}} for more
#'   details.
#'
#' @param min_size An integer value indicating the minimum number of points a
#'   cluster must have. All clusters with fewer members than min_size will be
#'   reclassified as "noise" points. The cluster assignment for noise points is
#'   0.
#'
#' @return A \code{data.table} with the cluster assignment for each observation.
#'
#' @md
#'
#' @importFrom magrittr %>%
#'
#' @author Evan A Martin
#'
#' @export
#'
cluster <- function (x, errors, method, height, min_size) {

  return (
    x %>%
      dplyr::group_by(Gene) %>%
      # Remove rows corresponding to Genes with only one observation because
      # hclust will throw an error unless there are at least two observations.
      dplyr::add_count(name = "obs") %>%
      dplyr::filter(obs > 1) %>%
      dplyr::select(-obs) %>%
      dplyr::ungroup() %>%
      # Normalize the mass according to the ppm error that was computed in the
      # calc_error function. The normalized recalibrated mass will be used for
      # clustering. This means the h argument in the cutree function will
      # correspond to the standard deviation.
      dplyr::mutate(
        NormRecalMass = log10(RecalMass) / log10(1 + errors$ppm_sd / 1e6)
      ) %>%
      # Normalize the aligned rt according to the rt error that was computed in
      # the calc_error function. The normalized rt will be used for clustering.
      # This means the h argument in the cutree function will correspond to the
      # standard deviation.
      dplyr::mutate(NormRTalign = RTalign / errors$rt_sd) %>%
      dplyr::ungroup() %>%
      dplyr::nest_by(Gene) %>%
      dplyr::mutate(
        cluster = list(
          stats::cutree(
            stats::hclust(stats::dist(dplyr::select(data,
                                                    NormRTalign,
                                                    NormRecalMass)),
                          method = method),
            h = height
          )
        )
      ) %>%
      # Find all clusters that have fewer than min_size members. In the next step
      # these points will be reclassified as "noise" points.
      dplyr::mutate(noise = list(tibble::tibble(clust = cluster) %>%
                                   dplyr::group_by(clust) %>%
                                   dplyr::tally() %>%
                                   dplyr::filter(n < min_size) %>%
                                   dplyr::pull(clust))) %>%
      # Convert all clusters with fewer than min_size members to cluster 0.
      dplyr::mutate(cluster = list(replace(cluster, cluster %in% noise, 0))) %>%
      dplyr::select(Gene, data, cluster) %>%
      tidyr::unnest(cols = c(data, cluster)) %>%
      dplyr::select(-NormRecalMass, -NormRTalign) %>%
      dplyr::ungroup()
  )

}
evanamartin/TopPICR documentation built on Dec. 9, 2022, 8:05 p.m.