extraChIPs: Additional functions for working with ChIP-Seq data

Documented in .makeFinalProfileHeatmap

#' @title Draw a coverage Profile Heatmap
#'
#' @description Plot a coverage Profile Heatmap across multiple ranges
#'
#' @details
#' Convenience function for plotting coverage heatmaps across a common set of
#' ranges, shared between one or more samples. These are most commonly the
#' coverage values from merged samples within a treatment group. THe input
#' data structure is based on that obtained from \link{getProfileData}, and can
#' be provided either as a GRanges object (generally for one sample) or as a
#' GRangesList.
#'
#' A 'profile DataFrame' here refers to a data.frame (or tibble, or DataFrame)
#' with a coverage value in one column that corresponds to a genomic bin of a
#' fixed size denoted in another, as generated by \link{getProfileData}.
#' Given that multiple ranges are most likely to be drawn, each profile data
#' frame must be the same size in terms of the number of bins, each of which
#' represent a fixed number of nucleotides. At a minimum this is a two column
#' data frame although getProfileData will provide three columns for each
#' specified genomic region.
#'
#' If using a GRangesList, each list element will be drawn as a separate panel
#' by default. These panels will appear in the same order as the list elements
#' of the GRangesList, although this can easily be overwritten by passing a
#' column name to the facetX argument. The default approach will add the
#' original element names as the column "name" which can be seen in the $data
#' element of any resultant ggplot object produced by this function.
#'
#' @param object A GRanges or GRangesList object
#' @param profileCol Column name specifying where to find the profile DataFrames
#' @param xValue,fillValue Columns within the profile DataFrames for heatmaps
#' @param facetX,facetY Columns used for faceting across the x- or y-axis
#' respectively
#' @param colour Column used for colouring lines in the summary panel. Defaults
#' to any column used for facetY
#' @param linetype Column used for linetypes in the summary panel
#' @param summariseBy Function for creating the summary plot in the top panel.
#' If set to 'none', no summary plot will be drawn. Otherwise the top panel will
#' contain a line-plot representing this summary value for each x-axis bin
#' @param xLab,yLab,fillLab Labels for plotting aesthetics. Can be overwritten
#' using labs() on any returned object
#' @param relHeight The relative height of the top summary panel.
#' Represents the fraction of the plotting area taken up by the summary panel.
#' @param summaryLabelSide Side to place y-axis for the summary plot in the top
#' panel
#' @param respectLevels logical(1) If FALSE, facets along the y-axis will be
#' arranged in descending order of signal, otherwise any original factor levels
#' will be retained
#' @param sortFilter If calling on a GRangesList, a method for subsetting the
#' original object (e.g. 1:2). If calling on a GRanges object should be and
#' expression able to be parsed as a filtering expression using
#' \link[rlang]{eval_tidy}. This is applied when sorting the range order down
#' the heatmap such that ranges can be sorted by one or specific samples, or
#' all. Ranges will always be sorted such that those with the strongest signal
#' are at the top of the plot
#' @param maxDist Maximum distance from the centre to find the strongest signal
#' when arranging the ranges
#' @param labelFunX Function for formatting x-axis labels
#' @param ... Passed to \link[ggplot2]{facet_grid} internally. Can be utilised
#' for switching panel strips or passing a labeller function
#'
#' @return
#' A `ggplot2` object, able to be customised using standard `ggplot2` syntax
#'
#' @examples
#' \donttest{
#' library(rtracklayer)
#' fl <- system.file(
#' "extdata", "bigwig", c("ex1.bw", "ex2.bw"), package = "extraChIPs"
#' )
#' bwfl <- BigWigFileList(fl)
#' names(bwfl) <- c("ex1", "ex2")
#'
#' gr <- GRanges(
#'   c(
#'     "chr10:103880281-103880460", "chr10:103892581-103892760",
#'     "chr10:103877281-103877460"
#'    )
#' )
#' pd <- getProfileData(bwfl, gr)
#' plotProfileHeatmap(pd, "profile_data") +
#'   scale_fill_viridis_c(option = "inferno", direction = -1) +
#'   labs(fill = "Coverage")
#' }
#'
#' @name plotProfileHeatmap
#' @rdname plotProfileHeatmap-methods
#' @import methods
#' @export
setGeneric(
    "plotProfileHeatmap",
    function(object, ...) standardGeneric("plotProfileHeatmap")
)
#' @rdname plotProfileHeatmap-methods
#' @import methods
#' @importFrom forcats fct_inorder
#' @importFrom rlang ensym
#' @export
setMethod(
    "plotProfileHeatmap", signature = signature(object = "GenomicRangesList"),
    function(
        object,
        profileCol = "profile_data", xValue = "bp", fillValue = "score",
        facetX = NULL, facetY = NULL, colour = facetY, linetype = NULL,
        summariseBy = c("mean", "median", "min", "max", "none"),
        xLab = xValue, yLab = NULL, fillLab = fillValue, labelFunX = waiver(),
        relHeight = 0.3, sortFilter = NULL, maxDist = 100, ...
    ) {

        ## All elements of the list should usually have identical ranges,
        ## however, this is not enforced given that scenarios can exist where
        ## this is not required

        ## Set the List element names as a column, then use these as the default
        ## for facetting along the x-axis
        if (is.null(names(object)))
            names(object) <- paste0("X.", seq_along(object))
        ## Set the samples to subset by when sorting
        if (is.null(sortFilter)) sortFilter <- names(object)
        if (!is.character(sortFilter)) {
            sortFilter <- names(object)[sortFilter]
        } else {
            sortFilter <- match.arg(sortFilter, names(object), several.ok = TRUE)
        }

        gr <- unlist(object)
        stopifnot(is(gr, "GRanges"))
        gr$name <- fct_inorder(names(gr))
        names(gr) <- NULL
        ## Handle variables including unquoted ones
        if (is.null(facetX)) facetX <- "name"
        xValue <- as.character(ensym(xValue))
        fillValue <- as.character(ensym(fillValue))
        if (!is.null(facetY)) facetY <- as.character(ensym(facetY))
        if (!is.null(colour)) colour <- as.character(ensym(colour))
        if (!is.null(linetype)) linetype <- as.character(ensym(linetype))
        plotProfileHeatmap(
            object = gr, profileCol = profileCol, xValue = xValue,
            fillValue = fillValue, facetX = facetX, facetY = facetY,
            colour = colour, linetype = linetype, summariseBy = summariseBy,
            xLab = xLab, yLab = yLab, fillLab = fillLab, labelFunX = labelFunX,
            relHeight = relHeight, sortFilter = name %in% sortFilter,
            maxDist = maxDist, ...
        )
    }
)
#' @import methods
#' @importFrom S4Vectors mcols
#' @importFrom tidyr unnest nest
#' @importFrom rlang !! sym ensym eval_tidy enquo quo_is_null
#' @importFrom dplyr arrange desc bind_cols
#' @importFrom forcats fct_rev fct_inorder
#' @rdname plotProfileHeatmap-methods
#' @export
setMethod(
    "plotProfileHeatmap", signature = signature(object = "GenomicRanges"),
    function(
        object,
        profileCol = "profile_data", xValue = "bp", fillValue = "score",
        facetX = NULL, facetY = NULL, colour = facetY, linetype = NULL,
        summariseBy = c("mean", "median", "min", "max", "none"),
        xLab = xValue, yLab = NULL, fillLab = fillValue, labelFunX = waiver(),
        relHeight = 0.3, summaryLabelSide = "left", respectLevels = FALSE,
        sortFilter = NULL, maxDist = 100, ...
    ) {

        ## Check the profile data.frames for identical dims & required cols
        df <- mcols(object)
        profileCol <- match.arg(profileCol, colnames(df))
        stopifnot(.checkProfileDataFrames(df[[profileCol]], xValue, fillValue))

        ## Handle variables including unquoted ones
        xValue <- as.character(ensym(xValue))
        fillValue <- as.character(ensym(fillValue))
        if (!is.null(facetX))
            facetX <- unlist(lapply(facetX, \(x) as.character(ensym(x))))
        if (!is.null(facetY))
            facetY <- unlist(lapply(facetY, \(x) as.character(ensym(x))))
        if (!is.null(colour)) colour <- as.character(ensym(colour))
        if (!is.null(linetype)) linetype <- as.character(ensym(linetype))
        filter <- enquo(sortFilter)

        ## Check the other columns exist
        keepCols <- setdiff(colnames(df), profileCol)
        specCols <- unique(unlist(c(facetX, facetY, colour, linetype)))
        if (!is.null(specCols)) stopifnot(all(specCols %in% keepCols))

        ## Tidy the data
        tbl <- as_tibble(granges(object))
        if (length(keepCols) > 0)
            tbl <- bind_cols(tbl, as_tibble(df[keepCols]))
        pfl <- setNames(df[[profileCol]], seq_along(df[[profileCol]]))
        pf <- unlist(pfl, use.names = TRUE)
        pf_tbl <- as_tibble(pf, rownames = "id")
        pf_tbl$id <- gsub("^([0-9]+).+", "\\1", pf_tbl$id)
        pf_nest <- nest(pf_tbl, "profiles" = all_of(colnames(pfl[[1]])))
        tbl$profiles <- pf_nest$profiles
        tbl <- unnest(tbl, !!sym("profiles"))
        ## Ensure the ranges are shown with the strongest signal at the top
        ## Use the 4 bins aronud the centre to pick the strongest signal
        tbl4sort <- dplyr::filter(tbl, abs(!!sym(xValue)) <= maxDist)
        if (!quo_is_null(filter)) {
            keep <- eval_tidy(filter, tbl4sort)
            tbl4sort <- tbl4sort[keep,]
        }
        lv <- unique(arrange(tbl4sort, desc(!!sym(fillValue)))$range)
        tbl$range <- fct_rev(factor(tbl$range, levels = lv))
        tbl <- arrange(tbl, desc(!!sym("range")))
        if (!is.null(facetY) & !respectLevels)
            tbl[facetY] <- lapply(tbl[facetY], \(x) fct_inorder(as.character(x)))

        ## Pass to the private function
        summariseBy <- match.arg(summariseBy)
        .makeFinalProfileHeatmap(
            data = tbl, x = xValue, y = "range", fill = fillValue,
            colour = colour, linetype = linetype, facet_x = facetX,
            facet_y = facetY, summary_fun = summariseBy,
            rel_height = relHeight, x_lab = xLab, y_lab = yLab,
            fill_lab = fillLab, lab_fun_x = labelFunX,
            label_side = summaryLabelSide, ...
        )
    }
)


#' @title Make a profile heatmap
#' @description Make a profile heatmap with optional summary panel at the top
#' @details The workhorse function for generating the final heatmap
#' Expects a single data.frame in long form with requisite columns
#' @return A ggplot2 object
#' @param data A data.frame or tibble in long form
#' @param x,y The values mapped to the x & y axes
#' @param fill The column used for heatmap colours
#' @param colour,linetype Columns used for the summary plot in the top panel
#' @param facet_x,facet_y Columns used to facet the plot along these axes
#' @param summary_fun Function used to create the summary value at each position
#' @param rel_height The relative height of the top panel
#' @param x_lab,y_lab,fill_lab _labels added to x/y-axes & the fill legend
#' @param ... Passed to facet_grid
#'
#' @import ggside
#' @importFrom dplyr group_by summarise
#' @importFrom rlang !! sym !!! syms quos
#' @importFrom stats as.formula
#' @import ggplot2
#' @keywords internal
.makeFinalProfileHeatmap <- function(
        data, x = NULL, y = NULL, fill = NULL, colour = NULL, linetype = NULL,
        facet_x = NULL, facet_y = NULL,
        summary_fun = c("mean", "median", "min", "max", "none"),
        rel_height = 0.3, x_lab = NULL, y_lab = NULL, fill_lab = NULL,
        lab_fun_x = waiver(), label_side = c("left", "right", "none"), ...
) {

    ## data should be a simple data.frame or tibble used to make the final plot
    all_vars <- c(x, y, fill, colour, linetype, facet_x, facet_y)
    stopifnot(is(data, "data.frame"))
    args <- colnames(data)
    stopifnot(all(all_vars %in% args))
    if (!is.null(fill)) fill <- sym(fill)
    if (!is.null(colour)) colour <- sym(colour)
    if (!is.null(linetype)) linetype <- sym(linetype)
    if (!is.null(x)) x <- sym(x)
    if (!is.null(y)) y <- sym(y)
    if (!is.null(facet_x)) facet_x <- syms(facet_x)
    if (!is.null(facet_y)) facet_y <- syms(facet_y)
    label_side <- match.arg(label_side)
    summary_fun <- match.arg(summary_fun)

    ## The basic plot
    x_axis <- scale_x_discrete(expand = rep(0, 4))
    if (is.numeric(data[[x]]))
        x_axis <-  scale_x_continuous(expand = rep(0, 4), labels = lab_fun_x)
    p <- ggplot(
        data,
        aes(
            {{ x }}, {{ y }}, fill = {{ fill }}, colour = {{ colour }},
            linetype = {{ linetype }}
        )
    ) +
        geom_raster() + x_axis +
        scale_y_discrete(breaks = NULL, expand = expansion(c(0, 0))) +
        labs(x = x_lab, y = y_lab, fill = fill_lab)

    ## Given that ggside does not currently create a legend for parameters only
    ## used in these side panels, add some dummy lines here in the main panel.
    ## This will ensure a legend appears for colour or linetype
    if (!is.null(colour) | !is.null(linetype))
        p <- p + geom_segment(
            aes(
                {{ x }}, {{ y }}, xend = {{ x }}, yend = {{ y }},
                colour = {{ colour }}, linetype = {{ linetype }}
            ),
            data = data[1,], inherit.aes = FALSE
        )
    ## Only add the top summary if this is called
    if (summary_fun != "none") {
        brks <- waiver()
        if (label_side == "none") {
            brks <- NULL
            label_side <- "left"
        }
        f <- match.fun(summary_fun)
        grp_vars <- c(x, facet_x, facet_y, colour, linetype)
        grp_vars <- unlist(lapply(grp_vars, as.character))
        summ_df <- summarise(data, y = f(!!fill), .by = all_of(grp_vars))
        p <- p + geom_xsideline(
            aes({{ x }}, y, colour = {{ colour }}, linetype = {{ linetype }}),
            data = summ_df, inherit.aes = FALSE
        ) +
            ggside(collapse = "x") +
            scale_xsidey_continuous(
                expand = c(0.1, 0, 0.1, 0), position = label_side, breaks = brks
            ) +
            theme(
                panel.grid.minor = element_blank(),
                ggside.panel.scale = rel_height[[1]]
            )
    }
    ## Add the faceting information
    if (!is.null(facet_x) | !is.null(facet_y)) {
        p <- p + facet_grid(
            cols = quos(!!!facet_x), rows = quos(!!!facet_y),
            space = "free_y", scales = "free_y", ...
        )
    }
    p

}

#' @importFrom methods is
#' @importFrom IRanges commonColnames
.checkProfileDataFrames <- function(df, x, fill) {

    msg <- c()

    ## Expects a SplitDataFrameList, or similar
    ## Needs to check all have the same dimensions & the same column names
    if (is(df, "CompressedSplitDataFrameList")) {
        ## CompressedSplitDataFrameList objects already have the colnames &
        ## DataFrame stuff checked. Check these first for speed
        nr <- vapply(df, nrow, integer(1))
        if (length(unique(nr)) != 1)
            msg <- c(msg, "All elements must have the same number of rows\n")
        if (!all(c(x, fill) %in% commonColnames(df)))
            msg <- c(
                msg,
                paste(
                    "Column", setdiff(c(x, fill), commonColnames(df)),
                    "is missing"
                )
            )
        if (!is.null(msg)) {
            message(msg)
            return(FALSE)
        }
        return(TRUE)
    }

    if (!is(df, "list_OR_List"))
        msg <- c(msg, "Object must be a list_OR_List\n")
    isDF <- vapply(df, is, logical(1), class2 = "DataFrame")
    is_df <- vapply(df, is, logical(1), class2 = "data.frame")
    if (!all(isDF | is_df))
        msg <- c(
            msg, "Each list element must contain DataFrame or data.frame\n"
        )
    if (!is.null(msg)) {
        message(msg)
        return(FALSE)
    }
    ## Now check the actual data frames
    dims <- vapply(df, dim, integer(2))
    if (length(unique(dims[1, ])) > 1)
        msg <- c(msg, "Each data element must have the same number of rows\n")
    if (length(unique(dims[2, ])) > 1)
        msg <- c(
            msg, "Each data element must have the same number of columns\n"
        )
    all_names <- vapply(df, colnames, character(dims[2, 1]))
    same_cols <- apply(all_names, 1, function(df) length(unique(df)) == 1)
    if (!x %in% all_names[,1])
        msg <- c(msg, paste("Column", x, "is missing\n"))
    if (!fill %in% all_names[,1])
        msg <- c(msg, paste("Column", fill, "is missing"))
    if (!all(same_cols))
        msg <- c(msg, "All elements must have the same column names\n")
    if (!is.null(msg)) {
        message(msg)
        return(FALSE)
    }
    TRUE

}