R/func__visualisation__heatMapPAM.R

Defines functions heatMapPAM

Documented in heatMapPAM

#' @title An expansion of ggtree's gheatmap function for displaying an allelic
#' presence-absence matrix.
#'
#' @description To append a heatmap of a presence-absence matrix (PAM) to the
#' right side of a phylogenetic tree. This function is modified from the function
#' gheatmap in the R package ggtree (github.com/GuangchuangYu/ggtree) through
#' replacing scale_fill_gradient with scale_fill_discrete. As a result, columns
#' in the heat map can be coloured by group information. This function is useful
#' when users need draw a heat map of gene or allele contents across strains.
#'
#' For each entry in the PAM, this function assumes that the absence status is coded as zero
#' and the presence status is coded as one.
#'
#' @param p A tree view from the function ggtree in the package ggtree
#' @param data A binary presence-absence matrix (PAM)
#' @param col_colours Either a vector of colour codes named by column names of the PAM,
#' or a single colour code (for example, "red") for presence status in all columns.
#' @param null_colour The colour for absence status.
#' @param presence_label The label for presence status in the legend. Default: Presence.
#' This parameter only works when col_colours equals a single colour code.
#' @param absence_label The label for absence status in the legend. Default: Absence.
#' This parameter also only works when col_colours equals a single colour code.
#' @param border_colour Colour of heatmap cell border
#' @param cluster_cols A logical argument determining if columns of PAM will be
#' clustered or not. Default: FALSE. Clustering will not run when there are less
#' than three columns in the PAM.
#' @param cluster_method Method for clustering. Default: binary.
#' @param cluster_distance The distance used for clustering columns. Default: binary.
#' @param rev_cols A logical argument specifying whether to reverse the column order
#' when it is necessary for a better visualisation.
#' @param colnames Logical, add matrix colnames or not
#' @param colnames_position One of 'bottom' or 'top'
#' @param colnames_angle Angle of column names
#' @param colnames_level Levels of colnames
#' @param set_label_colours Whether colour column names in the same way as painting columns.
#' By default, this option is turned off, which result in all column names to be printed
#' in black.
#' @param colnames_offset_x x offset for column names
#' @param colnames_offset_y y offset for column names
#' @param font.size Font size of matrix colnames
#' @param hjust hjust for column names (0: align left, 0.5: align center, 1: align righ)
#' @param offset Offset of heatmap to tree
#' @param width Total width of heatmap, compare to width of tree
#' @param show_legend A logical argument determining whether to show the legend.
#'
#' @return A list of three elements: p, a tree view that can be plotted directly;
#' mapping: a data frame mapping allele names to colours; data, a data frame of
#' weighted presence-absence status.
#'
#' @examples htmap <- heatMapPAM(...)
#' plot(htmap$p)
#'
#' @export
#' @author Guangchuang Yu, Yu Wan (\email{wanyuac@@126.com})
#
# First edition of this function: 5 Sep 2018; the latest edition: 8 Aug 2022
# Licence: Artistic License 2.0 (follow the licence of the package ggtree)

heatMapPAM <- function(p, data, col_colours = "black", null_colour = "grey90",
                       presence_label = "Presence", absence_label = "Absence",
                       border_colour = "white", cluster_cols = FALSE,
                       cluster_method = "single", cluster_distance = "binary",
                       rev_cols = FALSE, colnames = TRUE,
                       colnames_position = "bottom", colnames_angle = 0,
                       colnames_level = NULL, set_label_colours = FALSE,
                       colnames_offset_x = 0, colnames_offset_y = 0,
                       font.size = 4, hjust = 0.5, offset = 0, width = 1,
                       show_legend = FALSE) {
    # The first two packages are dependencies of the package ggtree.
    require(ggplot2)
    require(tidyr)  # for the function gather
    require(tibble)
    require(magrittr)  # for operators "%<>%" and "%>%"(github.com/GuangchuangYu/ggtree/blob/master/R/operator.R)
    require(ggtree)

    colnames_position %<>% match.arg(c("bottom", "top"))
    variable <- value <- lab <- y <- NULL

    ## convert relative width of the PAM to width of each cell
    width <- width * (p$data$x %>% range(na.rm = TRUE) %>% diff) / ncol(data)
    isTip <- x <- y <- variable <- value <- from <- to <- NULL

    df <- p$data
    df <- df[df$isTip, ]
    start <- max(df$x, na.rm = TRUE) + offset

    # Column-wise clustering
    if (cluster_cols && ncol(data) > 2) {
        hc <- hclust(d = dist(t(data), method = cluster_distance),
                     method = cluster_method)
        data <- data[, hc$order]  # reorder columns
        clustered <- TRUE
    } else {
        clustered <- FALSE
    }

    # Inverse columns for improving visualisation
    if (rev_cols) {
        data <- data[, ncol(data) : 1]
    }

    # weight cells before converting the PAM (dd) into a data frame
    n_colours <- length(col_colours)
    is_multi_colour <- n_colours > 1 && !is.null(names(col_colours))
    if (is_multi_colour) {
        colours_uniq <- sort(unique(as.character(col_colours)), decreasing = FALSE)  # colours for positive values in PAM. Column names are discarded here.
        colour_codes <- 1 : length(colours_uniq)  # codes for colours
        names(colour_codes) <- colours_uniq  # e.g., colour_codes becomes c("red" = 1, "blue" = 2, ...)
        column_names <- colnames(data)  # Since matrix product loses column names (allele names), we need to save them beforehand.
        if (clustered) {
            col_colours <- col_colours[column_names]  # rearrange colour codes when columns of the PAM have been clustered
        } else if (n_colours > length(column_names))  {
            col_colours <- col_colours[column_names]
        }
        data <- data %*% diag(as.integer(colour_codes[as.character(col_colours)]))  # convert colour characters into integer codes
        colnames(data) <- column_names
        names(colours_uniq) <- as.character(colour_codes)
        colours_uniq <- append(c("0" = null_colour), colours_uniq)

        # Convert values in the matrix to factors so as to map colours to the levels
        dd <- as.data.frame(data)
        for (i in names(dd)) {
            dd[, i] <- factor(dd[, i], levels = c(0, as.integer(colour_codes)),
                              labels = c("0", as.character(colour_codes)))
        }
    } else {
        dd <- as.data.frame(data)

        # Here, I convert binary values to factors in order to show the legend as binary rather than continuous.
        for (i in names(dd)) {
            dd[, i] <- factor(dd[, i], levels = c(0, 1), labels = c("0", "1"))
        }
    }

    i <- order(df$y)

    ## handle collapsed tree
    ## https://github.com/GuangchuangYu/ggtree/issues/137
    i <- i[!is.na(df$y[i])]

    lab <- df$label[i]
    ## https://github.com/GuangchuangYu/ggtree/issues/182
    dd <- dd[match(lab, rownames(dd)), , drop = FALSE]
    dd$y <- sort(df$y)
    dd$lab <- lab

    # tibble::set_tidy_names(dd) solves the problem of the error: "Can't bind data because some arguments have the same name"
    # see https://github.com/tidyverse/tidyr/issues/472
    dd <- gather(data = tibble::set_tidy_names(dd), key = variable, value = value, -c(lab, y))

    i <- which(dd$value == "")
    if (length(i) > 0) {
        dd$value[i] <- NA
    }
    if (is.null(colnames_level)) {
        dd$variable <- factor(dd$variable, levels = colnames(data))
    } else {
        dd$variable <- factor(dd$variable, levels = colnames_level)
    }
    V2 <- start + as.numeric(dd$variable) * width

    # Create a data frame for label attributes
    mapping <- data.frame(from = as.character(dd$variable), to = V2, stringsAsFactors = FALSE)  # from: label texts
    mapping <- unique(mapping)
    paint_labels <- set_label_colours && is_multi_colour
    if (paint_labels) {
        mapping$col <- as.character(col_colours[mapping$from])  # variable label colour
        mapping$col <- factor(mapping$col, levels = as.character(colours_uniq),
                              labels = names(colours_uniq))
    }
    mapping$from = as.factor(mapping$from)  # change back to factors

    # Create the coloured tile matrix
    dd$x <- V2
    dd$width <- width
    dd[[".panel"]] <- factor("Tree")
    if (is.null(border_colour)) {
        p2 <- p + geom_tile(data = dd, aes(x, y, fill = value), width = width,
                            inherit.aes = FALSE)
    } else {
        p2 <- p + geom_tile(data = dd, aes(x, y, fill = value), width = width,
                            colour = border_colour, inherit.aes = FALSE)
    }

    # Print column names
    if (colnames) {
        if (colnames_position == "bottom") {
            y <- 0
        } else {
            y <- max(p$data$y) + 1
        }
        mapping$y <- y
        mapping[[".panel"]] <- factor("Tree")
        if (paint_labels) {
            p2 <- p2 + geom_text(data = mapping,
                                 aes(x = to, y = y, label = from, colour = col),
                                 size = font.size, inherit.aes = FALSE,
                                 angle = colnames_angle, nudge_x = colnames_offset_x,
                                 nudge_y = colnames_offset_y, hjust = hjust)
        } else {  # Labels are printed in black.
            p2 <- p2 + geom_text(data = mapping,
                                 aes(x = to, y = y, label = from),
                                 size = font.size, inherit.aes = FALSE,
                                 angle = colnames_angle, nudge_x = colnames_offset_x,
                                 nudge_y = colnames_offset_y, hjust = hjust)
        }
    }

    # Scale colours for both tiles and column labels
    if (is_multi_colour) {
        # This command does not scale label colours when paint_labels = FALSE.
        # Class: title of the legend
        # scale_fill_manual sets tile colours; scale_colour_manual sets label colours
        p2 <- p2 + scale_fill_manual(name = "Class",
                                     breaks = c("0", as.character(colour_codes)),
                                     values = colours_uniq, na.value = NA)
            # Bug identified (8/8/2022): loss of tip labels after scalling colours
            # + scale_colour_manual(name = "Class",
            #                    breaks = c("0", as.character(colour_codes)),
            #                    values = colours_uniq, na.value = NA)
    } else {
        p2 <- p2 + scale_fill_manual(name = "Class", breaks = c("1", "0"),
                               values = c("1" = col_colours, "0" = null_colour),
                               labels = c(presence_label, absence_label),
                               na.value = NA)
        #p2 <- p2 + scale_fill_gradient(low = null_colour, high = col_colours, na.value = NA)  # reserve this command for continuous variable
    }

    # Print legend
    if (show_legend) {
        p2 <- p2 + theme(legend.position = "right", legend.title = element_blank())
    }

    attr(p2, "mapping") <- mapping

    return(list(p = p2, mapping = mapping, data = dd))
}
wanyuac/GeneMates documentation built on Aug. 12, 2022, 7:37 a.m.