# Default knitr chunk options for the chunks that follow.
knitr::opts_chunk$set(
  # text output
  comment = "#>",
  collapse = TRUE,
  # figure output
  dpi = 320,
  out.width = "100%",
  fig.path = "man/figures/README-"
)

R-CMD-check Coverage status CRAN status JOSS DOI USC IMAGE

partition

partition is a fast and flexible framework for agglomerative partitioning. partition uses an approach called Direct-Measure-Reduce to create new variables that maintain the user-specified minimum level of information. Each reduced variable is also interpretable: the original variables map to one and only one variable in the reduced data set. partition is flexible, as well: how variables are selected to reduce, how information loss is measured, and the way data is reduced can all be customized.

Installation

You can install partition from CRAN with:

# Released version, from CRAN.
install.packages("partition")

Or you can install the development version of partition from GitHub with:

# Needs the remotes package; uncomment the next line if it is not installed.
# install.packages("remotes")
remotes::install_github("USCbiostats/partition")

Example

library(partition)

# seed the RNG before simulating the example data
set.seed(1234)
df <- simulate_block_data(
  c(3, 4, 5),
  lower_corr = 0.4,
  upper_corr = 0.6,
  n = 100
)

# reject any reduction whose information falls below 0.6
prt <- partition(df, threshold = 0.6)
prt

# the reduced data
partition_scores(prt)

# mapping keys: how the original variables map to the reduced ones
mapping_key(prt)

unnest_mappings(prt)

# lower the information threshold to 0.5 and partition with k-means instead
partition(df, threshold = 0.5, partitioner = part_kmeans())

# custom partitioner: part_icc with rowMeans swapped in as the reducer
part_icc_rowmeans <- replace_partitioner(
  part_icc,
  reduce = as_reducer(rowMeans)
)
partition(df, threshold = 0.6, partitioner = part_icc_rowmeans)

partition also supports a number of ways to visualize partitions and permutation tests; these functions all start with plot_*(). These functions all return ggplots and can thus be extended using ggplot2.

# plot_*() functions return ggplots, so a ggplot2 theme can be added with `+`
plot_stacked_area_clusters(df) +
  ggplot2::theme_minimal(base_size = 14)

Performance

partition has been meticulously benchmarked and profiled to improve performance, and key sections are written in C++ or use C++-based packages. Using a data frame with 1 million rows on a 2017 MacBook Pro with 16 GB RAM, here's how each of the built-in partitioners performs:

# simulate a large (n = 1e6) data set for benchmarking
large_df <- simulate_block_data(
  c(3, 4, 5),
  lower_corr = 0.4,
  upper_corr = 0.6,
  n = 1e6
)

# time each built-in partitioner at a threshold of 0.3;
# `icc` is the default partitioner, so it needs no `partitioner` argument
basic_benchmarks <- microbenchmark::microbenchmark(
  icc    = partition(large_df, threshold = 0.3),
  kmeans = partition(large_df, threshold = 0.3, partitioner = part_kmeans()),
  minr2  = partition(large_df, threshold = 0.3, partitioner = part_minr2()),
  pc1    = partition(large_df, threshold = 0.3, partitioner = part_pc1()),
  stdmi  = partition(large_df, threshold = 0.3, partitioner = part_stdmi())
)
library(microbenchmark)
library(ggplot2)

# The benchmarks are re-run and written to basic_benchmarks.rds only when
# params$invalidate_cache is true; otherwise the cached results are read back.
# (`params` is presumably the R Markdown document parameters -- confirm in the
# document header.)
if (params$invalidate_cache) {
  large_df <- simulate_block_data(
    c(3, 4, 5),
    lower_corr = .4,
    upper_corr = .6,
    n = 1e6
  )

  basic_benchmarks <- microbenchmark::microbenchmark(
    icc = partition(large_df, .3),
    kmeans = partition(large_df, .3, partitioner = part_kmeans()),
    minr2 = partition(large_df, .3, partitioner = part_minr2()),
    pc1 = partition(large_df, .3, partitioner = part_pc1()),
    stdmi = partition(large_df, .3, partitioner = part_stdmi())
  )

  readr::write_rds(basic_benchmarks, "basic_benchmarks.rds")
} else {
  basic_benchmarks <- readr::read_rds("basic_benchmarks.rds")
}

# reorder the expression labels by their timings before plotting
basic_benchmarks$expr <- forcats::fct_reorder(
  basic_benchmarks$expr,
  basic_benchmarks$time
)

# Layers are added with `+`. `%+%` is ggplot2's operator for replacing a plot's
# data and is only found when ggplot2 is attached; `+` gives the same plot here.
ggplot2::autoplot(basic_benchmarks) +
  ggplot2::stat_ydensity(color = "#0072B2", fill = "#0072B2BF") +
  ggplot2::theme_minimal()

ICC vs K-Means

As the number of features (columns) in the data set grows beyond the number of observations (rows), the default ICC method scales more linearly than K-Means-based methods. While K-Means is often faster at lower dimensions, it becomes slower as the features outnumber the observations. For example, using three data sets with increasing numbers of columns, K-Means starts out as the fastest and becomes progressively slower, although in this case it is still comparable to ICC:

# three data sets with the same number of rows (n = 100) and increasingly
# many columns
narrow_df <- simulate_block_data(
  3:5,
  lower_corr = 0.4,
  upper_corr = 0.6,
  n = 100
)
wide_df <- simulate_block_data(
  rep(3:10, 2),
  lower_corr = 0.4,
  upper_corr = 0.6,
  n = 100
)
wider_df <- simulate_block_data(
  rep(3:20, 4),
  lower_corr = 0.4,
  upper_corr = 0.6,
  n = 100
)

# time the default ICC partitioner against k-means on each data set
icc_kmeans_benchmarks <- microbenchmark::microbenchmark(
  icc_narrow    = partition(narrow_df, threshold = 0.3),
  icc_wide      = partition(wide_df, threshold = 0.3),
  icc_wider     = partition(wider_df, threshold = 0.3),
  kmeans_narrow = partition(narrow_df, threshold = 0.3, partitioner = part_kmeans()),
  kmeans_wide   = partition(wide_df, threshold = 0.3, partitioner = part_kmeans()),
  kmeans_wider  = partition(wider_df, threshold = 0.3, partitioner = part_kmeans())
)
# The benchmarks are re-run and written to icc_kmeans_benchmarks.rds only when
# params$invalidate_cache is true; otherwise the cached results are read back.
# (`params` is presumably the R Markdown document parameters -- confirm in the
# document header.)
if (params$invalidate_cache) {
  narrow_df <- simulate_block_data(3:5, lower_corr = .4, upper_corr = .6, n = 100)
  wide_df <- simulate_block_data(rep(3:10, 2), lower_corr = .4, upper_corr = .6, n = 100)
  wider_df <- simulate_block_data(rep(3:20, 4), lower_corr = .4, upper_corr = .6, n = 100)

  icc_kmeans_benchmarks <- microbenchmark::microbenchmark(
    icc_narrow = partition(narrow_df, .3),
    icc_wide = partition(wide_df, .3),
    icc_wider = partition(wider_df, .3),
    kmeans_narrow = partition(narrow_df, .3, partitioner = part_kmeans()),
    kmeans_wide = partition(wide_df, .3, partitioner = part_kmeans()),
    kmeans_wider  = partition(wider_df, .3, partitioner = part_kmeans())
  )

  readr::write_rds(icc_kmeans_benchmarks, "icc_kmeans_benchmarks.rds")
} else {
  icc_kmeans_benchmarks <- readr::read_rds("icc_kmeans_benchmarks.rds")
}

# label each timing "icc" or "kmeans", taken from its expression name, so the
# plot can be faceted by method
icc_kmeans_benchmarks$type <- stringr::str_extract(
  icc_kmeans_benchmarks$expr,
  "icc|kmeans"
)

# Layers are added with `+`. `%+%` is ggplot2's operator for replacing a plot's
# data and is only found when ggplot2 is attached; `+` gives the same plot and
# does not depend on an earlier library(ggplot2) call.
ggplot2::autoplot(icc_kmeans_benchmarks) +
  ggplot2::stat_ydensity(color = "#0072B2", fill = "#0072B2BF") +
  ggplot2::facet_wrap(~type, ncol = 1, scales = "free_y") +
  ggplot2::theme_minimal()

For more information, see our paper in Bioinformatics, which discusses these issues in more depth [@R-partition].

Contributing

Please read the Contributor Guidelines prior to submitting a pull request to partition. Also note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms.

References



USCbiostats/partition documentation built on Feb. 3, 2024, 3:38 a.m.