R/data.R
In ComplexUpset: Create Complex UpSet Plots Using 'ggplot2' Components

Documented in create_upset_abc_example upset_data

#' @importFrom utils stack
NULL

NOT_IN_KNOWN_SETS = 'Outside of known sets'

sanitize_names = function(variables_names) {
    sanitized_names = c()
    for (name in variables_names) {
        if (grepl('-', name, fixed=TRUE)) {
            original_name = name
            name = gsub('-', '_', name)
            if (name %in% variables_names) {
            stop(paste(
                'The group names contain minus characters (-) which prevent intersections names composition;',
                'offending group:', original_name, 'please substitute these characters using gsub and try again.'
            ))
            }
        }
        sanitized_names = c(sanitized_names, name)
        }
    sanitized_names
}


encode_names = function(variables_names, avoid) {
    sapply(
        # using rank ensures that alphabetic order is retained in case of equal degrees
        # and that re-ordering columns in the dataframe will not lead to flickering of the result
        as.character(rank(variables_names)),
        function (name) {
            while (any(name %in% avoid)) {
                name = paste0(name, 'x')
            }
            name
        }
    )
}


names_of_members = function(row) {
  # the original implementation used which()
  #    members = names(which(row))
  # but which() is doing a few more things that are not needed;
  # it is an equivalent to seq_along(x)[!is.na(x) & x] + names assignment
  # and those steps can be omitted

  members = names(row)[row]
  if (length(members) != 0) {
      paste(members, collapse='-')
  } else {
      # this optimization may not be beneficial for dense matrices,
      # but we could add a heuristic that checks if the matrix is dense
      NOT_IN_KNOWN_SETS
  }
}


get_intersection_members = function(x) {
    strsplit(x, '-', fixed=TRUE)
}


gather = function(data, idvar, col_name, value_name='value') {
    not_idvar = colnames(data)
    not_idvar = not_idvar[not_idvar != idvar]
    result <- stack(data, select=not_idvar)
    result$group <- as.factor(rep(data[[idvar]], times=ncol(data) - 1))
    colnames(result) = c(value_name, col_name, idvar)
    result
}


compute_matrix = function(intersections_as_groups, sorted_groups) {

     matrix = sapply(
        intersections_as_groups,
        function(i_groups) {
            sorted_groups %in% i_groups
        },
        simplify=FALSE
    )

    matrix_data = as.data.frame(matrix, row.names=sorted_groups, check.names=FALSE)
    matrix_data
}



check_argument = function(
    value,
    allowed,
    description
) {
    if (!(value %in% allowed)) {
        stop(
            paste0(
                description,
                ' has to be one of: ',
                paste(allowed, collapse=' or '),
                ', not "',
                value,
                '"'

            )
        )
    }
}


check_sort = function(
    sort_order,
    allowed = c('descending', 'ascending'),
    what = 'order'
) {

    if (sort_order == FALSE) {
        return(TRUE)
    }

    check_argument(
        sort_order,
        allowed,
        paste('Sort', what)
    )

    TRUE
}


get_sort_order = function(data, sort_order) {
    check_sort(sort_order)

    if (sort_order == 'descending') {
        do.call(order, data)
    } else {
        do.call(order, lapply(data, function(x) {-x}))
    }
}


calculate_degree = function(x) {
    values = lengths(get_intersection_members(x))
    values[x == NOT_IN_KNOWN_SETS] = 0
    values
}


trim_intersections = function(
    intersections_by_size, min_size=0, max_size=Inf,
    min_degree=0, max_degree=Inf,
    n_intersections=NULL
) {
    intersections_by_size = intersections_by_size[
        (intersections_by_size >= min_size)
        &
        (intersections_by_size <= max_size)
    ]
    if (min_degree > 0 || max_degree != Inf) {
        degrees = calculate_degree(names(intersections_by_size))
        intersections_by_size = intersections_by_size[
            (degrees >= min_degree)
            &
            (degrees <= max_degree)
        ]
    }

    if (!is.null(n_intersections)) {
        intersections_by_size = tail(
            sort(intersections_by_size),
            n_intersections
        )
    }

    intersections_by_size
}


binary_grid = function(n, m) {
    if (m == 0) {
        return (matrix(rep(0, n), byrow=TRUE, nrow=1))
    }
    if (n == m) {
        return (matrix(rep(1, n), byrow=TRUE, nrow=1))
    }
    m_minus_n = m - n
    paths = list(
        c(0, rep(NA, n-1)),
        c(1, rep(NA, n-1))
    )
    sums = c(0, 1)
    for (level in 2:n) {
        upper_threshold = level + m_minus_n

        is_worth_adding_0 = (sums <= m) & (upper_threshold <= sums)
        is_worth_adding_1 = (sums <= m - 1) & (upper_threshold - 1 <= sums)

        x = paths[is_worth_adding_0]
        y = paths[is_worth_adding_1]

        for (i in 1:length(x)) {
            x[[i]][[level]] = 0
        }
        for (i in 1:length(y)) {
            y[[i]][[level]] = 1
        }
        paths = c(x, y)
        sums = c(sums[is_worth_adding_0], sums[is_worth_adding_1] + 1)
    }
    matrix(unlist(paths), byrow=TRUE, nrow=length(paths))
}

all_intersections_matrix = function(intersect, observed_intersections_matrix, min_degree, max_degree) {
    if (max_degree == Inf) {
        intersections_matrix = do.call(expand.grid, rep(list(0:1), length(intersect)))
    } else {
        if (max_degree > length(intersect)) {
            warning('provided `max_degree` was greater than the number of sets, reducing `max_degree` to the number of sets')
            max_degree = length(intersect)
        }
        intersections_matrix = do.call(rbind, lapply(min_degree:max_degree, function(degree) {
            binary_grid(n=length(intersect), m=degree)
        }))

        # need to add observed intersections too, otherwise the observations in intersections with other degrees would disappear
        # see https://github.com/krassowski/complex-upset/issues/89
        intersections_matrix = rbind(intersections_matrix, observed_intersections_matrix)
        intersections_matrix = intersections_matrix[!duplicated(intersections_matrix), ]
    }

    colnames(intersections_matrix) = intersect
    rownames(intersections_matrix) = apply(intersections_matrix == TRUE, 1, names_of_members)
    intersections_matrix = as.matrix(intersections_matrix)
    intersections_matrix
}


timer = NULL
profile = FALSE

note_time = function(text) {
    if (!profile) {
        return (NULL)
    }
    old = timer
    timer <<- Sys.time()
    if (!is.null(old))
        cat(paste(timer - old, text, '\n'))
}


intersection_vector_to_id = function (intersection_vector, sanitized_labels, sets_ordering_in_ids) {
    not_in_known_map = NOT_IN_KNOWN_SETS
    names(not_in_known_map) = NOT_IN_KNOWN_SETS
    sanitizer_map = c(sanitized_labels, not_in_known_map)
    sets = unname(sanitizer_map[intersection_vector])
    sets_ordering_in_ids = c(
        sets_ordering_in_ids,
        NOT_IN_KNOWN_SETS
    )
    paste(sets_ordering_in_ids[sets_ordering_in_ids %in% sets], collapse='-')
}


#' Prepare data for UpSet plots
#'
#' @param data a dataframe including binary columns representing membership in classes
#' @param intersect which columns should be used to compose the intersection
#' @param min_size minimal number of observations in an intersection for it to be included
#' @param max_size maximal number of observations in an intersection for it to be included
#' @param min_degree minimal degree of an intersection for it to be included
#' @param max_degree maximal degree of an intersection for it to be included
#' @param n_intersections the exact number of the intersections to be displayed; n largest intersections that meet the size and degree criteria will be shown
#' @param keep_empty_groups whether empty sets should be kept (including sets which are only empty after filtering by size)
#' @param warn_when_dropping_groups whether a warning should be issued when empty sets are being removed
#' @param warn_when_converting whether a warning should  be issued when input is not boolean
#' @param sort_sets whether to sort the rows in the intersection matrix (descending sort by default); one of: `'ascending'`, `'descending'`, `FALSE`
#' @param sort_intersections whether to sort the columns in the intersection matrix (descending sort by default); one of: `'ascending'`, `'descending'`, `FALSE`
#' @param sort_intersections_by the mode of sorting, the size of the intersection (cardinality) by default; one of: `'cardinality'`, `'degree'`, `'ratio'`, or any combination of these (e.g. `c('degree', 'cardinality')`)
#' @param sort_ratio_numerator the mode for numerator when sorting by ratio
#' @param sort_ratio_denominator the mode for denominator when sorting by ratio
#' @param group_by the mode of grouping intersections; one of: `'degree'`, `'sets'`
#' @param mode region selection mode for sorting and trimming by size. See `get_size_mode()` for accepted values.
#' @param size_columns_suffix suffix for the columns to store the sizes (adjust if conflicts with your data)
#' @param encode_sets whether set names (column in input data) should be encoded as numbers (set to TRUE to overcome R limitations of max 10 kB for variable names for datasets with huge numbers of sets); default TRUE for upset() and FALSE for upset_data()
#' @param intersections whether only the intersections present in data (`observed`, default), or all intersections (`all`) should be computed; using all intersections for a high number of sets is not computationally feasible - use `min_degree` and `max_degree` to narrow down the selection; this is only useful for modes different from the default exclusive intersection. You can also provide a list with a custom selection of intersections (order is respected when you set `sort_intersections=FALSE`)
#' @param max_combinations_datapoints_n a fail-safe limit preventing accidental use of `intersections='all'` with a high number of sets and observations

#' @export
upset_data = function(
    data, intersect, min_size=0, max_size=Inf, min_degree=0, max_degree=Inf,
    n_intersections=NULL,
    keep_empty_groups=FALSE,
    warn_when_dropping_groups=FALSE,
    warn_when_converting='auto',
    sort_sets='descending',
    sort_intersections='descending',
    sort_intersections_by='cardinality',
    sort_ratio_numerator='exclusive_intersection',
    sort_ratio_denominator='inclusive_union',
    group_by='degree',
    mode='exclusive_intersection',
    size_columns_suffix='_size',
    encode_sets=FALSE,
    # 10^10 fail-safe will allow for up to:
    # - for degree == 2: 500 sets x 100 observations, or 100 sets x 10 000 observations
    # - for degree <= 3: 150 sets x 100 observations, or 49 sets x 10 000 observations
    max_combinations_datapoints_n=10^10,
    intersections='observed'
) {
    # Check arguments
    mode = solve_mode(mode)

    if (length(intersections) == 1) {
        check_argument(
            intersections,
            allowed=c('observed', 'all'),
            description='intersections'
        )
        specific_intersections = FALSE
    } else {
        specific_intersections = TRUE
        if (!is.list(intersections)) {
            warning(paste0(
                '`intersections` is not `observed`, `all`, nor a list of vectors;',
                ' did you mean to use `list(c("A"), c("B"), c("A", "B"))`',
                ' instead of `c(c("A"), c("B"), c("A", "B"))`?'
            ))
        }
    }

    check_argument(
        group_by,
        allowed=c('degree', 'sets'),
        description='group_by'
    )

    check_sort(sort_sets)

    for (by in sort_intersections_by) {
        check_sort(by, allowed=c('cardinality', 'degree', 'ratio'), what='method')
    }

    intersect = unlist(intersect)

    if (specific_intersections) {
        sets_from_manual_intersections = setdiff(
            unique(unlist(intersections)),
            NOT_IN_KNOWN_SETS
        )
        sets_from_intersect = unique(intersect)
        missing_sets = setdiff(sets_from_manual_intersections, sets_from_intersect)
        if (length(missing_sets) != 0) {
            correct_missing_sets = base::intersect(
                colnames(data),
                missing_sets
            )
            incorrect_missing_sets = base::setdiff(
                missing_sets,
                colnames(data)
            )

            if (length(incorrect_missing_sets) != 0) {
                stop(
                    paste(
                        'Sets provided in `intersections` are missing in both `intersect` and in `data`:',
                        paste(incorrect_missing_sets, collapse=', ')
                    )
                )
            } else {
                warning(
                    paste(
                        'Following sets provided in `intersections` are missing in `intersect`:',
                        paste(missing_sets, collapse=', ')
                    )
                )
            }

            intersect = c(intersect, correct_missing_sets)
        }
    }
    if (length(intersect) == 1) {
        stop('Needs at least two indicator variables')
    }

    # Transform data
    note_time('initialised')

    if ('tbl' %in% class(data) | 'data.table' %in% class(data)) {
        data = as.data.frame(data)
    }

    # convert to logical if needed
    is_column_logical = sapply(data[, intersect], is.logical)
    if (any(!is_column_logical)) {
        non_logical = names(is_column_logical[is_column_logical == FALSE])

        if (warn_when_converting == 'auto') {
            unique_values = unique(
                as.vector(
                    as.matrix(
                        data[, non_logical]
                    )
                )
            )
            if (setequal(unique_values, c(0, 1))) {
                warn_when_converting = FALSE
            } else {
                warn_when_converting = TRUE
            }
        }
        if (warn_when_converting) {
            warning(paste('Converting non-logical columns to binary:', paste(non_logical, collapse=', ')))
        }

        data[, non_logical] = sapply(data[, non_logical], as.logical)
    }

    if (any(is.na(data[, intersect]))) {
        warning('Detected missing values in the columns indicating sets, coercing to FALSE')
        data[, intersect][is.na(data[, intersect])] = FALSE
    }

    intersect_in_order_of_data = colnames(data)[colnames(data) %in% intersect]

    non_sanitized_labels = intersect
    to_avoid = colnames(data)[!(colnames(data) %in% intersect)]

    if (encode_sets) {
        colnames(data)[colnames(data) %in% intersect] <- encode_names(intersect_in_order_of_data, avoid=to_avoid)
        intersect = unlist(encode_names(intersect, avoid=to_avoid))
    } else {
        colnames(data)[colnames(data) %in% intersect] <- sanitize_names(intersect_in_order_of_data)
        intersect = sanitize_names(intersect)
    }
    names(non_sanitized_labels) = intersect

    sanitized_labels = names(non_sanitized_labels)
    names(sanitized_labels) = non_sanitized_labels

    # sanitize or encode names of intersections selection/order
    if (specific_intersections) {
        intersections = sapply(intersections, function(intersection) {
            intersection_vector_to_id(
                intersection,
                sanitized_labels=sanitized_labels,
                sets_ordering_in_ids=intersect
            )
        })
    }

    note_time('converted data')

    data$intersection = apply(data[intersect], 1, names_of_members)

    unique_members_matrix = data[!duplicated(data$intersection), intersect]
    rownames(unique_members_matrix) = apply(unique_members_matrix, 1, names_of_members)

    # TODO: maybe use + to convert to numeric for speed (is it faster?)?
    unique_members_matrix = apply(unique_members_matrix, 1, as.numeric)

    observed_intersections_matrix = t(unique_members_matrix)

    if (specific_intersections) {

        if (mode == 'exclusive_intersection') {
            observed_intersections = rownames(observed_intersections_matrix)
            non_observed_exclusive_but_requested = setdiff(
                intersections,
                observed_intersections
            )

            translate_to_labels = function(endcoded_intersections) {
                sapply(
                    sapply(endcoded_intersections, get_intersection_members),
                    function(members) {
                        if (encode_sets) {
                            members = as.integer(members)
                        }
                        paste(non_sanitized_labels[members], collapse='-')
                    }
                )
            }

            if (length(non_observed_exclusive_but_requested) == length(intersections)) {
                non_observed_exclusive_but_requested_labels = translate_to_labels(
                    non_observed_exclusive_but_requested
                )
                observed_intersections_labels = translate_to_labels(
                    observed_intersections
                )
                warning(
                    paste0(
                        'None of the requested exclusive intersections is observed in the data:',
                        '\n  - requested: ',
                        paste(non_observed_exclusive_but_requested_labels, collapse =', '),
                        '\n  - available for exclusive intersection mode: ',
                        paste(observed_intersections_labels, collapse =', ')
                    )
                )
            }
        }

        # while this might seem strange to have duplicates, it would be a valid use case
        # e.g. to add a reference intersection multiple time for ease of comparison

        unique_intersections = unique(intersections)
        intersections_members = get_intersection_members(unique_intersections)

        sets_from_manual_intersections = setdiff(
            unique(unlist(intersections_members)),
            NOT_IN_KNOWN_SETS
        )

        # TODO: this is slow and memory hungry; ideally we would only get the relevant intersection straight away!
        possible_intersections = all_intersections_matrix(intersect, NULL, 0, Inf)

        relevant_intersections = rownames(possible_intersections[
            rowSums(possible_intersections[, sets_from_manual_intersections]) > 0,
        ])
        possible_intersections_members = get_intersection_members(relevant_intersections)

        # + to convert to numeric for consistency
        intersections_matrix = t(+sapply(
            possible_intersections_members,
            function(i) {
                intersect %in% i
            }
        ))
        colnames(intersections_matrix) = intersect
        rownames(intersections_matrix) = relevant_intersections

        unique_members_matrix = t(intersections_matrix)
        product_matrix = tcrossprod(intersections_matrix)
    } else if (intersections == 'observed') {
        intersections_matrix = observed_intersections_matrix
        colnames(intersections_matrix) = intersect
        product_matrix = intersections_matrix %*% unique_members_matrix
    } else if (intersections == 'all') {
        effective_max_degree = min(length(intersect), max_degree)

        combinations_n = sum(sapply(min_degree:effective_max_degree, function(m) choose(length(intersect), m)))
        datapoints_n = nrow(data) * ncol(data) * combinations_n

        if (datapoints_n > max_combinations_datapoints_n)  {
            degrees_text = ifelse(
                min_degree == max_degree,
                paste0(' equal ', min_degree),
                paste0('s between ', min_degree, ' and ', effective_max_degree)
            )

            advice_message = paste0(
                'The number of combinations with degree', degrees_text,
                ' (', formatC(combinations_n, format='e', digits=1), ') multiplied by the number of observations',
                ' (', nrow(data), ') and columns (', ncol(data), ') accounts to an upper bound of ',
                formatC(datapoints_n, format='e', digits=1), ' datapoints;',
                ' such a high number may lead to out of memory errors (depending on the available RAM size).',
                ' Please adjust `min_degree` and `max_degree`, remove unused columns, or',
                ' adjust `max_combinations_datapoints_n` (if you wish to proceed anyways).',
                '\nNote: filtering by size (`min_size` and/or `max_size`) or setting `n_intersections`',
                ' reduces the memory requirements and if you already do that',
                ' it may be safe to increase `max_combinations_datapoints_n`.'
            )
            stop(advice_message)
        }

        intersections_matrix = all_intersections_matrix(intersect, observed_intersections_matrix, min_degree, max_degree)
        unique_members_matrix = t(intersections_matrix)
        # note: tcrossprod is significantly faster than: intersections_matrix %*% unique_members_matrix
        product_matrix = tcrossprod(intersections_matrix)
    }
    note_time('calculated intersections')

    exclusive_intersection = table(data$intersection)
    observed_intersections = names(exclusive_intersection)
    exclusive_intersection = as.numeric(exclusive_intersection)
    names(exclusive_intersection) = observed_intersections

    product_matrix[product_matrix == 0] = -1

    if (NOT_IN_KNOWN_SETS %in% rownames(product_matrix) && NOT_IN_KNOWN_SETS %in% colnames(product_matrix)) {
        product_matrix[NOT_IN_KNOWN_SETS, ] = -1
        product_matrix[, NOT_IN_KNOWN_SETS] = -1
        product_matrix[NOT_IN_KNOWN_SETS, NOT_IN_KNOWN_SETS] = 0
    }

    exclusive_intersection_counts = exclusive_intersection[colnames(product_matrix)]
    inclusive_union = (product_matrix >= 0) * exclusive_intersection_counts

    observed_intersections_degrees = colSums(unique_members_matrix)
    desired_intersections_degrees = rowSums(intersections_matrix)

    exclusive_union = ((product_matrix >= 0) & (product_matrix >= observed_intersections_degrees)) * exclusive_intersection_counts

    if (NOT_IN_KNOWN_SETS %in% colnames(product_matrix)) {
        desired_intersections_degrees[NOT_IN_KNOWN_SETS] = 0
    }

    intersection_condition = t(t(product_matrix) >= desired_intersections_degrees)
    inclusive_intersection = intersection_condition * exclusive_intersection_counts

    if (!specific_intersections && intersections != 'observed') {
        exclusive_condition = t(t(product_matrix) == observed_intersections_degrees) & (product_matrix == observed_intersections_degrees)
        exclusive_intersection = exclusive_condition * exclusive_intersection_counts
        exclusive_intersection[is.na(exclusive_intersection)] = 0
        exclusive_intersection = colSums(exclusive_intersection)
    }
    note_time('calculated intersection sizes')

    inclusive_intersection[is.na(inclusive_intersection)] = 0
    exclusive_union[is.na(exclusive_union)] = 0
    inclusive_union[is.na(inclusive_union)] = 0

    sizes = list(
        exclusive_intersection=exclusive_intersection,
        inclusive_intersection=colSums(inclusive_intersection),
        exclusive_union=colSums(exclusive_union),
        inclusive_union=colSums(inclusive_union)
    )

    if (specific_intersections) {
        # add empty intersections if specified see:
        # - https://github.com/krassowski/complex-upset/issues/99
        # - https://github.com/krassowski/complex-upset/issues/104
        # - https://github.com/krassowski/complex-upset/issues/101
        for (kind in names(sizes)) {
            empty_intersections_to_include = setdiff(
                intersections,
                names(sizes[[kind]])
            )
            if (length(empty_intersections_to_include)) {
                sizes_of_empties = rep(0, length(empty_intersections_to_include))
                names(sizes_of_empties) = empty_intersections_to_include

                sizes[[kind]] = c(
                    sizes[[kind]],
                    sizes_of_empties
                )
            }
        }
    }

    intersections_by_size = sizes[[mode]]

    if (min_size > 0 || max_size != Inf || min_degree > 0 || max_degree != Inf || !is.null(n_intersections)) {
        intersections_by_size_trimmed = trim_intersections(
            intersections_by_size,
            min_size=min_size,
            max_size=max_size,
            min_degree=min_degree,
            max_degree=max_degree,
            n_intersections=n_intersections
        )
        if (length(intersections_by_size_trimmed) == 0) {

            if (min_size > 0) {
                tip = paste(': the maximal size for `min_size` for this dataset is', max(intersections_by_size))
            } else if (min_degree > 0) {
                degrees = calculate_degree(names(intersections_by_size))
                tip = paste(': the maximal degree for `min_degree` for this dataset is', max(degrees))
            } else if (!is.null(n_intersections) && n_intersections < 1) {
                tip = paste0(': provide `n_intersections` >= 1 (you provoided: ', n_intersections, ')')
            } else if (max_size < 1) {
                tip = paste0(': provide `max_size` >= 1 (you provoided: ', max_size, ')')
            } else if (max_degree < 0) {
                # note: max_degree = 0 returns observations that are not in any of the known sets
                tip = paste0(': provide `max_degree` >= 0 (you provoided: ', max_degree, ')')
            } else {
                tip = ''
            }

            stop(paste0('No intersections left after filtering', tip))
        }
    }

    if (min_size > 0 || max_size != Inf || !is.null(n_intersections)) {
        regions_to_include = colnames(inclusive_union)[
            colnames(inclusive_union) %in% names(intersections_by_size_trimmed)
        ]
    } else {
        regions_to_include = colnames(inclusive_union)
    }


    rownames(inclusive_union) = rownames(product_matrix)
    selected_intersections = intersect(colnames(inclusive_union), observed_intersections)

    original_data_indices = 1:nrow(data)
    indices_by_exclusive_intersection = split(original_data_indices, data$intersection)

    inclusive_union_indices = lapply(regions_to_include, function(region) {
        counts = inclusive_union[selected_intersections[selected_intersections != region], region]
        non_empty_subregions = names(counts[counts != 0])

        unlist(unname(indices_by_exclusive_intersection[non_empty_subregions]))
    })

    ## assert sapply(indices, length)) == colSums(inclusive_union[, union_to_be_added])

    lengths = sapply(inclusive_union_indices, length)
    all_indices = c(original_data_indices, unlist(inclusive_union_indices))
    offsets = cumsum(c(length(original_data_indices), lengths))
    names(offsets) = c(regions_to_include, NaN)


    # the initial length(original_data_indices) entries are only for regions of exclusive intersections
    # and indices here do not need any additional addressing offset. Following indices are for regions
    # that are not exclusive and require additional offest as follows:
    rownames(inclusive_intersection) = rownames(product_matrix)

    inclusive_intersections_counts = inclusive_intersection[
        intersect(colnames(inclusive_intersection), observed_intersections), , drop=FALSE
    ]
    names(inclusive_union_indices) = regions_to_include

    inlusive_intersection_ids = unlist(unname(sapply(regions_to_include, function(region) {
        counts = inclusive_intersections_counts[, region]
        non_empty_subregions = names(counts[counts != 0])

        indices_in_input_space = unlist(unname(indices_by_exclusive_intersection[non_empty_subregions]))

        additional_indices = which(inclusive_union_indices[[region]] %in% indices_in_input_space)
        offsets[[region]] + additional_indices
    })))

    rownames(exclusive_union) = rownames(product_matrix)

    exclusive_intersections_counts = exclusive_union[
        intersect(colnames(exclusive_union), observed_intersections), , drop=FALSE
    ]

    exclusive_union_ids = unlist(unname(sapply(regions_to_include, function(region) {
        counts = exclusive_intersections_counts[, region]
        non_empty_subregions = names(counts[counts != 0])

        indices_in_input_space = unlist(unname(indices_by_exclusive_intersection[non_empty_subregions]))

        additional_indices = which(inclusive_union_indices[[region]] %in% indices_in_input_space)
        offsets[[region]] + additional_indices
    })))


    data = data[all_indices, ]
    data$exclusive_intersection = data$intersection[all_indices]

    data$intersection = c(
        data$intersection[original_data_indices],
        rep(regions_to_include, times=lengths)
    )
    exclusive_intersection_indices = original_data_indices
    data$in_exclusive_intersection = c(
        rep(c(1, 0), times=c(length(exclusive_intersection_indices), sum(lengths)))
    )
    data$in_inclusive_union = 1

    data[, 'in_inclusive_intersection'] = data$in_exclusive_intersection
    data[inlusive_intersection_ids, 'in_inclusive_intersection'] = 1
    # note: new_indices = 1:nrow(data); new_indices %in% all_inlusive_intersection_ids is slightly slower
    # assuming all_inlusive_intersection_ids = c(exclusive_intersection_indices, inlusive_intersection_ids)
    # assert max(all_inlusive_intersection_ids) < nrow(data)
    # assert !any(duplicated(all_inlusive_intersection_ids))
    # assert length(all_inlusive_intersection_ids) == sum(colSums(inclusive_intersection))
    # assert sum(data$in_inclusive_intersection) == sum(colSums(inclusive_intersection))

    data[, 'in_exclusive_union'] = data$in_exclusive_intersection
    data[exclusive_union_ids, 'in_exclusive_union'] = 1
    note_time('calculated modes')

    plot_intersections_subset = names(intersections_by_size)
    plot_sets_subset = intersect

    if (min_size > 0 || max_size != Inf || min_degree > 0 || max_degree != Inf || !is.null(n_intersections)) {

        # once the unused intersections are removed, we need to decide
        # if the groups not participating in any of the intersections should be kept or removed
        if (!keep_empty_groups) {
            # see: https://github.com/krassowski/complex-upset/issues/90
            itersect_data = data.frame(
                intersections_matrix[names(intersections_by_size_trimmed), ] == 1,
                check.names=FALSE,
                check.rows=FALSE
            )

            is_non_empty = sapply(itersect_data, any)
            empty_groups = names(itersect_data[!is_non_empty])

            if (length(empty_groups) != 0 && warn_when_dropping_groups) {
                to_display = ifelse(
                    length(empty_groups) <= 5,
                    paste('Dropping empty groups:', paste(empty_groups, collapse=', ')),
                    paste('Dropping', length(empty_groups), 'empty groups')
                )
                warning(to_display)
            }
            intersect_subset = intersect[!(intersect %in% empty_groups)]
        } else {
            intersect_subset = intersect
        }

        intersections_by_size = intersections_by_size_trimmed
        for (mode in names(sizes)) {
            sizes[[mode]] = sizes[[mode]][names(sizes[[mode]]) %in% names(intersections_by_size_trimmed)]
        }

        intersect = intersect_subset

        plot_intersections_subset = names(intersections_by_size_trimmed)
        plot_sets_subset = intersect_subset
    }
    if (specific_intersections) {
        plot_intersections_subset = plot_intersections_subset[plot_intersections_subset %in% intersections]
    }
    note_time('trimmed')

    stacked = stack(data[original_data_indices, ], intersect)
    stacked$id = rep(original_data_indices, length(intersect))
    stacked = stacked[stacked$values == TRUE, ]

    # Note: we do want to include the additional attributes as those provide info for filling set sizes
    metadata = data[
        match(
            stacked$id,
            original_data_indices
        ),
        setdiff(colnames(data), intersect),
        drop=FALSE
    ]

    stacked = cbind(stacked, metadata)

    names(stacked)[names(stacked) == 'ind'] = 'group'
    groups_by_size = table(stacked$group)
    groups_by_size[NOT_IN_KNOWN_SETS] = sum(data[original_data_indices, 'intersection'] == NOT_IN_KNOWN_SETS)

    note_time('stacked')

    if (sort_sets != FALSE) {
        groups_by_size = groups_by_size[get_sort_order(list(groups_by_size), sort_sets)]
    } else {
        groups_by_size = groups_by_size[names(groups_by_size)]
    }

    sorted_groups_with_not_in_known_sets = names(groups_by_size)
    sorted_groups = sorted_groups_with_not_in_known_sets[
        sorted_groups_with_not_in_known_sets != NOT_IN_KNOWN_SETS
    ]

    sort_order = NULL

    if (sort_intersections != FALSE) {

        sort_values = lapply(
            sort_intersections_by,
            function(by) {
                if (by == 'cardinality') {
                    sort_value = intersections_by_size
                } else if (by == 'degree') {
                    original_intersections_names = names(intersections_by_size)
                    sort_value = calculate_degree(original_intersections_names)
                    names(sort_value) = original_intersections_names
                } else if (by == 'ratio') {
                    sort_value = (
                        sizes[[sort_ratio_numerator]][names(intersections_by_size)]
                        /
                        sizes[[sort_ratio_denominator]][names(intersections_by_size)]
                    )
                }
                sort_value
            }
        )

        sort_order = get_sort_order(sort_values, sort_intersections)
    } else if (specific_intersections) {
        sort_order = rev(match(intersections, names(intersections_by_size)))
    }

    if (!is.null(sort_order)) {
        intersections_by_size = intersections_by_size[sort_order]

         for (mode in names(sizes)) {
            sizes[[mode]] = sizes[[mode]][names(intersections_by_size)]
        }
    }

    unique_sorted_intersections = names(intersections_by_size)
    rm(intersections_by_size)
    note_time('sorted')

    unique_intersection_members = get_intersection_members(unique_sorted_intersections)
    names(unique_intersection_members) = unique_sorted_intersections

    if (group_by == 'degree') {
        sorted_intersections = unique_sorted_intersections
    } else if (group_by == 'sets') {
        # failed refactoring attempt 1 note:
        # returning a (named) list with lapply and rbind has comparable (marginally worse)
        # time performance and worse memory performance
        # failed refactoring attempt 2 note:
        # using outer does not work here as difficult to vectorize just yet

        intersections_indices = list()
        new_intersections_ids = list()
        old_intersections_ids = list()
        lead_groups = list()
        i = 0
        new_indices = 1:nrow(data)
        indices_by_intersection = split(new_indices, data$intersection)

        for (group in sorted_groups_with_not_in_known_sets) {

            for (intersection in names(unique_intersection_members)) {
                i_groups = unique_intersection_members[[intersection]]

                if (group %in% i_groups) {
                    i = i + 1

                    old_intersections_ids[[i]] = intersection
                    lead_groups[[i]] = group
                    intersections_indices[[i]] = indices_by_intersection[[intersection]]
                    new_intersections_ids[[i]] = paste(c(group, i_groups[i_groups != group]), collapse='-')
                }
            }
        }

        lengths = sapply(intersections_indices, length)

        new_intersections_ids = unlist(new_intersections_ids)
        old_intersections_ids = unlist(old_intersections_ids)

        plot_intersections_subset = new_intersections_ids[old_intersections_ids %in% plot_intersections_subset]
        sorted_intersections = new_intersections_ids

        for (mode in names(sizes)) {
            sizes[[mode]][new_intersections_ids] = sizes[[mode]][old_intersections_ids]
        }

        data = data[unlist(intersections_indices), ]
        data$intersection = unlist(rep(new_intersections_ids, times=lengths))
        data$group_by_group = unlist(rep(lead_groups, times=lengths))

        unique_intersection_members = unique_intersection_members[old_intersections_ids]
        names(unique_intersection_members) = new_intersections_ids
    }
    note_time('grouped')

    intersections_as_groups = unique_intersection_members

    matrix_data = compute_matrix(intersections_as_groups, sorted_groups)

    group = rownames(matrix_data)

    matrix_frame = gather(
        cbind(group, matrix_data),
        'group',
        'intersection',
        'value'
    )

    if (group_by == 'sets') {
        # the set (group) by which the intersections were grouped is stored as the first element of "intersection"
        # extract first element of intersection:

        intersection_to_group = lead_groups
        names(intersection_to_group) = new_intersections_ids
        matrix_frame$group_by_group = unlist(intersection_to_group[as.character(matrix_frame$intersection)])
    }

    # restore the previous column names
    colnames(data)[colnames(data) %in% intersect] <- non_sanitized_labels[intersect]

    for (mode in names(sizes)) {
        column_name = paste0(mode, size_columns_suffix)
        data[[column_name]] = as.numeric(
            sizes[[mode]][data$intersection]
        )
    }

  note_time('finished')

  list(
    with_sizes=data,
    sets_ordering_in_ids=intersect,
    presence=stacked,
    matrix=matrix_data,
    matrix_frame=matrix_frame,
    sorted=list(
      groups=sorted_groups,
      intersections=sorted_intersections
    ),
    sizes=sizes,
    plot_intersections_subset=plot_intersections_subset,
    plot_sets_subset=plot_sets_subset,
    sanitized_labels=sanitized_labels,
    non_sanitized_labels=non_sanitized_labels
  )
}

#' Create an example dataset with three sets: A, B and C
#'
#' @export
create_upset_abc_example = function() {
    data.frame(
        # 1) 50 in A only, 2) 50 in B only, 3) 200 in C only
        # 4) 10 in A-B only, 5) 6 in A-C only, 6) 6 in B-C only
        # 7) 1 in A-B-C only, 8) 2 in neither
        A = c(
            # 1) 50 in A only
            rep(T, 50),
            # 2) 50 in B only
            rep(F, 50),
            # 3) 200 in C only
            rep(F, 200),
            # 4) 10 in A-B only
            rep(T, 10),
            # 5) 6 in A-C only
            rep(T, 6),
            # 6) 6 in B-C only
            rep(F, 6),
            # 7) 1 in A-B-C only
            rep(T, 1),
            # 8) 2 in neither
            rep(F, 2)
        ),
        B = c(
            # 1) 50 in A only
            rep(F, 50),
            # 2) 50 in B only
            rep(T, 50),
            # 3) 200 in C only
            rep(F, 200),
            # 4) 10 in A-B only
            rep(T, 10),
            # 5) 6 in A-C only
            rep(F, 6),
            # 6) 6 in B-C only
            rep(T, 6),
            # 7) 1 in A-B-C only
            rep(T, 1),
            # 8) 2 in neither
            rep(F, 2)
        ),
        C = c(
            # 1) 50 in A only
            rep(F, 50),
            # 2) 50 in B only
            rep(F, 50),
            # 3) 200 in C only
            rep(T, 200),
            # 4) 10 in A-B only
            rep(F, 10),
            # 5) 6 in A-C only
            rep(T, 6),
            # 6) 6 in B-C only
            rep(T, 6),
            # 7) 1 in A-B-C only
            rep(T, 1),
            # 8) 2 in neither
            rep(F, 2)
        )
    )
}