NanoMethViz: Visualise methylation data from Oxford Nanopore sequencing

Documented in cluster_regions

#' Cluster regions by K-means
#'
#' Cluster regions by k-means based on their methylation profiles. In order to
#' cluster using k-means the methylation profile of each region is interpolated
#' and sampled at fixed points. The first 10 principal components are used for
#' the k-means clustering. The clustering is best behaved in regions of similar
#' width and CpG density.
#'
#' @param x the NanoMethResult object.
#' @param regions a table of regions containing at least columns chr, strand,
#'   start and end.
#' @param centers number of centers for k-means, identical to the number of
#'   output clusters.
#' @param grid_method the method for generating the sampling grid. The default
#'   option "density" attempts to create a grid with similar density as the
#'   data, "uniform" creates a grid of uniform density.
#'
#' @return the table of regions given by the 'regions' argument with the column
#'   'cluster' added.
#'
#' @importFrom stats quantile approxfun prcomp kmeans sd
#' @importFrom graphics hist
#' @export
#'
#' @examples
#' nmr <- load_example_nanomethresult()
#' gene_anno <- exons_to_genes(NanoMethViz::exons(nmr))
#' # uniform grid due to low number of input features
#' gene_anno_clustered <- cluster_regions(nmr, gene_anno, centers = 2, grid_method = "uniform")
#' plot_agg_regions(nmr, gene_anno_clustered, group_col = "cluster")
cluster_regions <- function(x, regions, centers = 2, grid_method = c("density", "uniform")) {
    grid_method <- match.arg(grid_method)

    methy_df <- tibble(
        regions,
        methy = query_methy(x, regions$chr, regions$start, regions$end, simplify = FALSE)
    )

    # rescale position values
    for (i in seq_len(nrow(methy_df))) {
        start <- methy_df$start[i]
        end <- methy_df$end[i]
        methy_df$methy[[i]] <- if (methy_df$strand[i] == "-") {
            # re-sort by position due to flip
            methy_df$methy[[i]] %>%
                mutate(pos = 1 - ((.data$pos - start) / (end - start))) %>%
                arrange(.data$pos)
        } else {
            # include * case
            methy_df$methy[[i]] %>%
                mutate(pos = (.data$pos - start) / (end - start))
        }
    }

    # take grid size as twice the 90% percentile
    grid_length <- 2 * floor(quantile(purrr::map_int(methy_df$methy, ~length(unique(.x$pos))), 0.9))

    if (grid_method == "uniform") {
        grid <- seq(0, 1, length.out = grid_length)
    } else if (grid_method == "density") {
        unique_pos <- unlist(purrr::map(methy_df$methy, ~unique(.x$pos)))
        hist_out <- hist(unique_pos, plot = FALSE, breaks = grid_length)
        epdf <- approxfun(hist_out$mids, hist_out$density / sum(hist_out$density), rule = 2)
        epdf_vals <- epdf(seq(0, 1, length.out = grid_length))

        ecdf_vals <- cumsum(epdf_vals) / max(cumsum(epdf_vals))
        inv_cdf <- suppressWarnings(
            approxfun(ecdf_vals, seq(0, 1, length.out = grid_length), rule = 2, ties = mean)
        )

        grid <- inv_cdf(seq(0, 1, length.out = grid_length))
    }

    sample_methy_grid <- function(x, grid) {
        summaried_methy <- x %>%
            group_by(.data$pos) %>%
            summarise(methy_prob = median(e1071::sigmoid(.data$statistic))) %>%
            ungroup()

        smoothed_f <- approxfun(
            summaried_methy$pos,
            summaried_methy$methy_prob,
            rule = 2
        )

        smoothed_f(grid)
    }

    methy_matrix <- map(methy_df$methy, sample_methy_grid, grid = grid) %>%
        unlist() %>%
        matrix(nrow = length(methy_df$methy), byrow = TRUE)

    km_pca <- kmeans(methy_matrix, centers = centers)

    row_means_split <- function(x, f) {
        out <- list()
        unique_f <- unique(f)

        for (i in seq_along(unique_f)) {
            out[[i]] <- colMeans(x[which(f == i), , drop = FALSE])
        }

        out
    }

    cluster_means <- row_means_split(methy_matrix, km_pca$cluster)

    out <- regions %>%
        mutate(cluster = km_pca$cluster)

    out$cluster_dev <- numeric(nrow(out))
    for (i in seq_len(nrow(out))) {
        out$cluster_dev[i] <- sum((methy_matrix[i, ] - cluster_means[[out$cluster[i]]])^2) / ncol(methy_matrix)
    }

    out %>%
        mutate(cluster = factor(.data$cluster))
}