# global knitr chunk options for the README: merge source and output into a
# single block, prefix printed output with "#>", and save figures under
# man/figures/ with a "README-" prefix at full width and 320 dpi
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "man/figures/README-",
  out.width = "100%",
  dpi = 320
)
partition is a fast and flexible framework for agglomerative partitioning. partition uses an approach called Direct-Measure-Reduce to create new variables that maintain the user-specified minimum level of information. Each reduced variable is also interpretable: the original variables map to one and only one variable in the reduced data set. partition is flexible, as well: how variables are selected to reduce, how information loss is measured, and the way data is reduced can all be customized.
You can install partition from CRAN with:
# install the released version of partition from CRAN
install.packages("partition")
Or you can install the development version of partition from GitHub with:
# the remotes package is needed to install from GitHub; uncomment the next
# line to install it first
# install.packages("remotes")
remotes::install_github("USCbiostats/partition")
library(partition)
set.seed(1234)

# simulate 100 rows of data made up of three correlated blocks of
# 3, 4, and 5 variables
df <- simulate_block_data(c(3, 4, 5), lower_corr = .4, upper_corr = .6, n = 100)

# don't accept reductions where information < .6
prt <- partition(df, threshold = .6)
prt

# return reduced data
partition_scores(prt)

# access mapping keys
mapping_key(prt)

unnest_mappings(prt)

# use a lower information threshold (.5) with the K-means partitioner
partition(df, threshold = .5, partitioner = part_kmeans())

# use a custom partitioner: the ICC partitioner with its reducer replaced
# by rowMeans
part_icc_rowmeans <- replace_partitioner(
  part_icc,
  reduce = as_reducer(rowMeans)
)
partition(df, threshold = .6, partitioner = part_icc_rowmeans)
partition also supports a number of ways to visualize partitions and permutation tests; these functions all start with plot_*(). These functions all return ggplots and can thus be extended using ggplot2.
# the plot is a ggplot, so ggplot2 layers and themes can be added with `+`
plot_stacked_area_clusters(df) +
  ggplot2::theme_minimal(base_size = 14)
partition has been meticulously benchmarked and profiled to improve performance, and key sections are written in C++ or use C++-based packages. Using a data frame with 1 million rows on a 2017 MacBook Pro with 16 GB RAM, here's how each of the built-in partitioners performs:
# simulate a data set with 1 million rows
large_df <- simulate_block_data(
  c(3, 4, 5),
  lower_corr = 0.4,
  upper_corr = 0.6,
  n = 1e6
)

# time each built-in partitioner on the same data with the same threshold
basic_benchmarks <- microbenchmark::microbenchmark(
  icc = partition(large_df, threshold = 0.3),
  kmeans = partition(large_df, threshold = 0.3, partitioner = part_kmeans()),
  minr2 = partition(large_df, threshold = 0.3, partitioner = part_minr2()),
  pc1 = partition(large_df, threshold = 0.3, partitioner = part_pc1()),
  stdmi = partition(large_df, threshold = 0.3, partitioner = part_stdmi())
)
library(microbenchmark)
library(ggplot2)

# reuse the benchmark results stored in basic_benchmarks.rds unless the
# `invalidate_cache` parameter asks for them to be recomputed and re-saved
if (!params$invalidate_cache) {
  basic_benchmarks <- readr::read_rds("basic_benchmarks.rds")
} else {
  large_df <- simulate_block_data(
    c(3, 4, 5),
    lower_corr = 0.4,
    upper_corr = 0.6,
    n = 1e6
  )

  basic_benchmarks <- microbenchmark::microbenchmark(
    icc = partition(large_df, threshold = 0.3),
    kmeans = partition(large_df, threshold = 0.3, partitioner = part_kmeans()),
    minr2 = partition(large_df, threshold = 0.3, partitioner = part_minr2()),
    pc1 = partition(large_df, threshold = 0.3, partitioner = part_pc1()),
    stdmi = partition(large_df, threshold = 0.3, partitioner = part_stdmi())
  )

  readr::write_rds(basic_benchmarks, "basic_benchmarks.rds")
}

# reorder the benchmarked expressions by their timings before plotting
basic_benchmarks$expr <- forcats::fct_reorder(
  .f = basic_benchmarks$expr,
  .x = basic_benchmarks$time
)

ggplot2::autoplot(basic_benchmarks) %+%
  ggplot2::stat_ydensity(color = "#0072B2", fill = "#0072B2BF") +
  ggplot2::theme_minimal()
As the features (columns) in the data set become greater than the number of observations (rows), the default ICC method scales more linearly than K-Means-based methods. While K-Means is often faster at lower dimensions, it becomes slower as the features outnumber the observations. For example, using three data sets with increasing numbers of columns, K-Means starts as the fastest and gets increasingly slower, although in this case it is still comparable to ICC:
# three data sets with 100 rows each and an increasing number of columns
narrow_df <- simulate_block_data(
  3:5,
  lower_corr = 0.4,
  upper_corr = 0.6,
  n = 100
)
wide_df <- simulate_block_data(
  rep(3:10, 2),
  lower_corr = 0.4,
  upper_corr = 0.6,
  n = 100
)
wider_df <- simulate_block_data(
  rep(3:20, 4),
  lower_corr = 0.4,
  upper_corr = 0.6,
  n = 100
)

# time the default (ICC) and K-means partitioners on each data set
icc_kmeans_benchmarks <- microbenchmark::microbenchmark(
  icc_narrow = partition(narrow_df, threshold = 0.3),
  icc_wide = partition(wide_df, threshold = 0.3),
  icc_wider = partition(wider_df, threshold = 0.3),
  kmeans_narrow = partition(
    narrow_df,
    threshold = 0.3,
    partitioner = part_kmeans()
  ),
  kmeans_wide = partition(
    wide_df,
    threshold = 0.3,
    partitioner = part_kmeans()
  ),
  kmeans_wider = partition(
    wider_df,
    threshold = 0.3,
    partitioner = part_kmeans()
  )
)
# reuse the benchmark results stored in icc_kmeans_benchmarks.rds unless the
# `invalidate_cache` parameter asks for them to be recomputed and re-saved
if (!params$invalidate_cache) {
  icc_kmeans_benchmarks <- readr::read_rds("icc_kmeans_benchmarks.rds")
} else {
  narrow_df <- simulate_block_data(
    3:5,
    lower_corr = 0.4,
    upper_corr = 0.6,
    n = 100
  )
  wide_df <- simulate_block_data(
    rep(3:10, 2),
    lower_corr = 0.4,
    upper_corr = 0.6,
    n = 100
  )
  wider_df <- simulate_block_data(
    rep(3:20, 4),
    lower_corr = 0.4,
    upper_corr = 0.6,
    n = 100
  )

  icc_kmeans_benchmarks <- microbenchmark::microbenchmark(
    icc_narrow = partition(narrow_df, threshold = 0.3),
    icc_wide = partition(wide_df, threshold = 0.3),
    icc_wider = partition(wider_df, threshold = 0.3),
    kmeans_narrow = partition(
      narrow_df,
      threshold = 0.3,
      partitioner = part_kmeans()
    ),
    kmeans_wide = partition(
      wide_df,
      threshold = 0.3,
      partitioner = part_kmeans()
    ),
    kmeans_wider = partition(
      wider_df,
      threshold = 0.3,
      partitioner = part_kmeans()
    )
  )

  readr::write_rds(icc_kmeans_benchmarks, "icc_kmeans_benchmarks.rds")
}

# label each benchmark with its partitioner ("icc" or "kmeans"), taken from
# the expression name, so the plot can be faceted by method
icc_kmeans_benchmarks$type <- stringr::str_extract(
  string = icc_kmeans_benchmarks$expr,
  pattern = "icc|kmeans"
)

ggplot2::autoplot(icc_kmeans_benchmarks) %+%
  ggplot2::stat_ydensity(color = "#0072B2", fill = "#0072B2BF") +
  ggplot2::facet_wrap(~type, ncol = 1, scales = "free_y") +
  ggplot2::theme_minimal()
For more information, see our paper in Bioinformatics, which discusses these issues in more depth [@R-partition].
Please read the Contributor Guidelines prior to submitting a pull request to partition. Also note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.