R/plotEpibed.R

Defines functions .plotMethAvg .createLegend .methByReadPlot .setQlTheme .makePlotData plotEpibed

Documented in plotEpibed

#' Plot the results of tabulateEpibed() as a quasi-lollipop plot.
#'
#' NOTE: If you run tabulateEpibed() in NOMe mode with include_empty_reads = TRUE,
#' then it is highly suggested you run plotEpibed() with show_all_points = TRUE also.
#' If not, it is possible that reads in the CG and GC plots will not line up, depending
#' on if there are empty reads in one plot, but not the other.
#'
#' @param mat Input matrix that comes out of tabulateEpibed()
#' @param plot_meth_avg Whether to also plot the average methylation state (default: FALSE)
#' @param show_readnames Whether to show the read names (default: TRUE)
#' @param show_positions Whether to show the genomic positions (default: TRUE)
#' @param show_all_points Whether to show all points (i.e., empty reads, unknown, and filtered sites) (default: FALSE)
#' @param show_legend Whether to include legend in plots (default: TRUE)
#' @param meth_color What color should the methylated states be (default: '#FF2400')
#' @param unmeth_color What color should the unmethylated states be (default: '#6495ED')
#' @param na_color What color should the NA values be (default: 'grey')
#' @param size What size the points should be drawn as (default: 6)
#' @param n_reads Maximum number of reads to plot, if force = FALSE then takes first n_reads (default: 20)
#' @param n_loci Maximum number of loci to plot, if force = FALSE then takes first n_loci (default: 10)
#' @param force Force plotting of all reads and loci in input table (default: FALSE)
#'
#' @return An epibed ggplot object or list of ggplot objects if plot_read_avg is TRUE
#'
#' @import ggplot2
#' @importFrom reshape2 melt
#' @importFrom cowplot plot_grid
#'
#' @export
#'
#' @examples
#'
#' epibed.nome <- system.file("extdata", "hct116.nome.epibed.gz",
#'                            package="biscuiteer")
#' epibed.nome.gr <- readEpibed(epibed = epibed.nome, is.nome = TRUE,
#'                              genome = "hg19", chr = "chr1")
#' epibed.tab.nome <- tabulateEpibed(epibed.nome.gr)
#' plotEpibed(epibed.tab.nome$gc_table)
#'
plotEpibed <- function(mat,
                       plot_meth_avg = FALSE,
                       show_readnames = TRUE,
                       show_positions = TRUE,
                       show_all_points = FALSE,
                       show_legend = TRUE,
                       meth_color = "#FF2400",
                       unmeth_color = "#6495ED",
                       na_color = "grey",
                       size = 6,
                       n_reads = 20,
                       n_loci = 10,
                       force = FALSE) {
    # check if input is a matrix
    if (is.list(mat) & is(mat[[1]], "matrix")) {
        message("Input given is likely the output of tabulateEpibed().")
        stop("Please select $cg_table, $gc_table, $vr_table, or $tab_cgvr to plot.")
    }
    if (!is.list(mat) & !is(mat, "matrix")) {
        stop("Input needs to be a matrix generated by tabulateEpibed().")
    }
    if (all(is.na(mat))) {
        stop("All data are NA. Try a different matrix.")
    }

    # Reduce number of reads plotted
    if ((nrow(mat) > n_reads) && (isFALSE(force))) {
        message("Removing ", nrow(mat)-n_reads, " reads. Keeping first ", n_reads, " reads.")
        message("Rerun with increased `n_reads` option to plot more reads.")

        mat <- mat[1:n_reads,]
    }

    # Reduce number of loci plotted
    if ((ncol(mat) > n_loci) && (isFALSE(force))) {
        message("Removing ", ncol(mat)-n_loci, " loci. Keeping first ", n_loci, " loci.")
        message("Rerun with increased `n_loci` option to plot more loci.")

        mat <- mat[, 1:n_loci]
    }

    mat_melt <- .makePlotData(mat, show_all_points)
    ql_theme <- .setQlTheme(show_readnames, show_positions)

    plt <- .methByReadPlot(mat_melt, ql_theme, show_legend, meth_color, unmeth_color, na_color, size)

    if (plot_meth_avg) {
        avg <- .plotMethAvg(mat, ql_theme, show_legend, meth_color, unmeth_color, size)
    } else {
        avg <- NA
    }

    return(
        list(
            epi = plt,
            avg = avg
        )
    )
}

# helper to make the melted dataset for plotting
.makePlotData <- function(mat, show_all_points) {
    cg <- c("M", "U", "N")
    gc <- c("O", "S")
    vr <- c("A", "C", "G", "T", "R", "Y")
    possibles <- c(
        cg, gc, vr,
        apply(expand.grid(vr, cg), 1, paste, collapse=""),
        apply(expand.grid(cg, vr), 1, paste, collapse=""),
        apply(expand.grid(vr, cg, vr), 1, paste, collapse="")
    )

    # break if no known values exist in matrix
    if (!any(mat %in% possibles)) {
        message("Unsure what to do with input.")
        stop("Please run tabulateEpibed() first to produce input to this function.")
    }

    # cast to a 'melted' data frame
    mat_melt <- reshape2::melt(mat, id.vars = rownames(mat))
    if (!show_all_points) {
        mat_melt <- subset(mat_melt, !is.na(value))
    }

    mat_melt
}

# helper to set the plot theme
.setQlTheme <- function(show_readnames, show_positions) {
    ql_theme <- theme_bw(12) +
        theme(
            axis.title = element_blank(),
            legend.position = "none",
            legend.title = element_blank(),
            panel.grid.major.x = element_blank(),
            panel.grid.major.y = element_line(color = "black")
        )

    if (!show_readnames) {
        ql_theme <- ql_theme +
            theme(
                axis.text.y = element_blank(),
                axis.ticks.y = element_blank()
            )
    }

    if (!show_positions) {
        ql_theme <- ql_theme +
            theme(
                axis.text.x = element_blank(),
                axis.ticks.x = element_blank()
            )
    } else {
        ql_theme <- ql_theme +
            theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
    }

    ql_theme
}

# helper to plot epibed by reads/fragments
.methByReadPlot <- function(pd, ql_theme, show_legend, meth_color, unmeth_color, na_color, size) {
    cg <- c("M", "U", "N")
    gc <- c("O", "S")
    vr <- c("A", "C", "G", "T", "R", "Y")
    cg_vr_r <- apply(expand.grid(vr, cg), 1, paste, collapse="")
    cg_vr_l <- apply(expand.grid(cg, vr), 1, paste, collapse="")
    cg_vr_m <- apply(expand.grid(vr, cg, vr), 1, paste, collapse="")

    # CGs with SNP at C position
    cvr_cg <- subset(pd, value %in% cg_vr_r)
    cvr_vr <- subset(pd, value %in% cg_vr_r)
    cvr_cg$value <- gsub(paste(c("[", vr, "]"), collapse=""), "", cvr_cg$value)
    cvr_vr$value <- gsub(paste(c("[", cg, "]"), collapse=""), "", cvr_vr$value)

    # CGs with SNP at G position
    cvl_cg <- subset(pd, value %in% cg_vr_l)
    cvl_vr <- subset(pd, value %in% cg_vr_l)
    cvl_cg$value <- gsub(paste(c("[", vr, "]"), collapse=""), "", cvl_cg$value)
    cvl_vr$value <- gsub(paste(c("[", cg, "]"), collapse=""), "", cvl_vr$value)

    # CGs with SNPs at C and G positions
    cvm_cg <- subset(pd, value %in% cg_vr_m)
    cvm_vr <- subset(pd, value %in% cg_vr_m)
    cvm_cg$value <- gsub(paste(c("[", vr, "]"), collapse=""), "", cvm_cg$value)
    cvm_vr$value <- gsub(paste(c("[", cg, "]"), collapse=""), ",", cvm_vr$value)

    # CG/GC/SNP that don't collide with one another
    plt <- ggplot(pd, aes(x = Var2, y = Var1)) +
        geom_point(data = subset(pd, is.na(value)), aes(fill = value), size=size, pch=21, color="black") +
        geom_point(data = subset(pd, value %in% cg), aes(fill = value), size=size, pch=21, color="black") +
        geom_point(data = subset(pd, value %in% gc), aes(fill = value), size=size, pch=21, color="black") +
        geom_point(data = subset(pd, value %in% vr), fill = "white", size=size+1, pch=21, color="black") +
        geom_text(data = subset(pd, value %in% vr), aes(label = value), size=size-1, color="black") +
        guides(color = "legend") +
        ql_theme

    # SNP in C of CG
    plt <- plt +
        geom_point(data = cvr_cg, aes(fill = value), size=size+1, pch=22, color="black") +
        geom_text(data = cvr_vr, aes(label = value), size=size-1, color="black")

    # SNP in G of CG
    plt <- plt +
        geom_point(data = cvl_cg, aes(fill = value), size=size+1, pch=23, color="black") +
        geom_text(data = cvl_vr, aes(label = value), size=size-1, color="black")

    # SNP in C and G of CG
    plt <- plt +
        geom_point(data = cvm_cg, aes(fill = value), size=size+1, pch=24, color="black") +
        geom_text(data = cvm_vr, aes(label = value), size=size-1, color="black")

    # Color methylation/accessibility
    if (any(pd$value %in% gc)) {
        plt <- plt +
            scale_fill_manual(
                values = c(O = meth_color, S = unmeth_color),
                labels = c(O = "Open", S = "Closed"),
                na.value = na_color,
                name = "Accessibility"
            )
    } else {
        plt <- plt +
            scale_fill_manual(
                values = c(M = meth_color, U = unmeth_color, N = na_color),
                labels = c(M = "Methylated", U = "Unmethylated", N = "Unknown"),
                na.value = na_color,
                name = "Methylation"
            )
    }

    # Add legend if desired
    if (show_legend) {
        # Check which values are found in data
        extra_vals <- c()
        if (any(pd$value %in% c("N"))) { extra_vals <- c(extra_vals, "N") }
        if (length(cvr_cg$value) > 0)  { extra_vals <- c(extra_vals, "C") }
        if (length(cvl_cg$value) > 0)  { extra_vals <- c(extra_vals, "G") }
        if (length(cvm_cg$value) > 0)  { extra_vals <- c(extra_vals, "CG") }
        leg <- .createLegend(extra_vals, size, meth_color, unmeth_color, na_color)
        out <- plot_grid(plt, leg, rel_widths = c(4, 1))
    } else {
        out <- plt
    }

    out
}

# helper to create legend for epiBED plot
# it was easier to manually create the figure than to deal with ggplot forming the legend auto-magically
.createLegend <- function(extra_vals, size, meth_color, unmeth_color, na_color) {
    # Set up what points will be shown
    # M, U, and Lone CpG/SNP will always be shown
    xs <- rep(1,3)
    ys <- c(3, 2, -1)
    ds <- c("M", "U", "A")
    ls <- c("Methylated", "Unmethylated", "Lone CpG/SNP")

    # Add extra values if they exist
    if ("N" %in% extra_vals) { # unknown
        xs <- c(xs, 1)
        ys <- c(ys, 1)
        ds <- c(ds, "N")
        ls <- c(ls, "Unknown")
    }
    if ("C" %in% extra_vals) { # CpG, SNP at C
        xs <- c(xs, 1)
        ys <- c(ys, -2)
        ds <- c(ds, "C")
        ls <- c(ls, "CpG, SNP at C")
    }
    if ("G" %in% extra_vals) { # CpG, SNP at G
        y <- -3
        if (!("C" %in% extra_vals)) { y <- y + 1 }
        xs <- c(xs, 1)
        ys <- c(ys, y)
        ds <- c(ds, "G")
        ls <- c(ls, "CpG, SNP at G")
    }
    if ("CG" %in% extra_vals) { # CpG, SNP at both
        y <- -4
        if (!("C" %in% extra_vals)) { y <- y + 1 }
        if (!("G" %in% extra_vals)) { y <- y + 1 }
        xs <- c(xs, 1)
        ys <- c(ys, y)
        ds <- c(ds, "CG")
        ls <- c(ls, "CpG, SNP at both")
    }

    points <- data.frame(x=xs, y=ys, data=ds, labels=ls)

    plt <- ggplot(points, aes(x=x, y=y)) +
        geom_point(aes(fill=data, shape=data), size=size, color="black") +
        geom_text(aes(label=labels), hjust=0, nudge_x=1, size=size-1, color="black") +
        xlim(0, 10) +
        ylim(-10, 10) +
        scale_fill_manual(
            values = c(
                A = "black",
                C = "black",
                G = "black",
                CG = "black",
                M = meth_color,
                U = unmeth_color,
                N = na_color
            )
        ) +
        scale_shape_manual(values = c(A = 21, C = 22, G = 23, CG = 24, M = 21, N = 21, U = 21))

    plt <- plt +
        theme_bw() +
        theme(
            panel.border = element_blank(),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            axis.line = element_blank(),
            axis.title = element_blank(),
            axis.text = element_blank(),
            axis.ticks = element_blank(),
            legend.position = "none"
        )

    plt
}

# helper to calculate avg methylation
.plotMethAvg <- function(mat, ql_theme, show_legend, meth_color, unmeth_color, size) {
    vr <- c("A", "C", "G", "T", "R", "Y")
    meth <- c("M", "O")
    unme <- c("U", "S")

    # check if input is a matrix
    if (!is(mat, "matrix")) {
        stop("To plot average methylation, a matrix is needed as input.")
    }

    # Remove any SNPs and replace missing/empty values with NA
    mat <- gsub(paste(c("[", vr, "]"), collapse=""), "", mat)
    mat <- gsub("N", "", mat)
    mat <- replace(mat, mat == "", NA)

    mat[mat %in% meth] <- 1
    mat[mat %in% unme] <- 0

    mat <- apply(mat, 2, as.numeric)
    avg <- data.frame(avg_meth = colMeans(mat, na.rm=TRUE)) # SNP only columns end up as NaN

    # Replace SNP only columns with NA so they have a spot retained in the figure but don't receive a circle
    avg$avg_meth <- ifelse(is.nan(avg$avg_meth), NA, avg$avg_meth)

    avg$position <- rownames(avg)
    avg$y <- ifelse(is.na(avg$avg_meth), "SNP status", "Average methylation")

    plt <- ggplot(avg, aes(x=position, y=y)) +
        geom_point(aes(fill = avg_meth), size=size, pch=21, color="black", na.rm = TRUE) +
        scale_fill_gradient(low=unmeth_color, high=meth_color, limits=c(0, 1)) +
        scale_x_discrete(name="", limits=rownames(avg)) +
        scale_y_discrete(limits=c("Average methylation")) +
        guides(color="legend") +
        ql_theme

    if (show_legend) {
        plt <- plt + theme(legend.position = "right")
    }

    plt
}

#' Calculate and plot a hierarchical clustering of the epibeds
#'
#' WARNING: This is a beta function and should not be used for publication level analyses!!!!
#'
#' @param mat Input matrix that comes out of tabulateEpibed()
#' @param stringdist_method stringdist::stringdist algorithm to use (default: "hamming")
#' @param hclust_method Clustering algorithm to use (default: "ward.D2")
#' @param plot Whether to plot the clustered epibeds (default: TRUE)
#'
#' @return A hierarchical cluster (hclust) object
#'
#' @import ggtree
#' @importFrom stringdist stringdist
#' @importFrom cowplot plot_grid
#'
#' @examples
#'
#' #epibed.nome <- system.file("extdata", "hct116.nome.epibed.gz",
#' #                           package="biscuiteer")
#' #epibed.nome.gr <- readEpibed(epibed = epibed.nome, is.nome = TRUE,
#' #                             genome = "hg19", chr = "chr1")
#' #epibed.tab.nome <- tabulateEpibed(epibed.nome.gr)
#' #epistateCaller(epibed.tab.nome)
#'
#epistateCaller <- function(mat,
#                           stringdist_method="hamming",
#                           hclust_method="ward.D2",
#                           plot = TRUE) {
#    # check for both cg and gc tables
#    if (is.list(mat)) {
#        stopifnot(exists("cg_table", where = mat) & exists("gc_table", where = mat))
#
#        mat.merge <- merge(mat$cg_table, mat$gc_table, by = 0)
#        row.names(mat.merge) <- mat.merge[, 1]
#        mat.merge <- mat.merge[-1]
#    } else {
#        mat.merge <- mat
#    }
#
#    mat.merge[is.na(mat.merge) |
#              mat.merge == "A" | mat.merge == "T" |
#              mat.merge == "G" | mat.merge == "C" |
#              mat.merge == "U" | mat.merge == "S"] <- 0
#    mat.merge[mat.merge == "M" | mat.merge == "O"] <- 1
#
#    epistates <- apply(mat.merge, 1, paste0, collapse = "")
#    hamming <- outer(epistates, epistates, stringdist, method = stringdist_method)
#
#    hcluster <- hclust(as.dist(hamming), method = hclust_method)
#    if (!plot) {
#        return(hcluster)
#    }
#
#    if (!is.list(mat)) {
#        matplotdata <- list(.makePlotData(mat, show_all_points = FALSE))
#    } else {
#        matplotdata <- lapply(
#            mat, function(meth_table) { .makePlotData(meth_table, show_all_points = FALSE) }
#        )
#    }
#
#    tree <- ggtree(as.dendrogram(hcluster), branch.length = "none")
#
#    ql_theme <- .set_ql_theme(TRUE, TRUE)
#
#    plots <- lapply(
#        matplotdata,
#        function(m) {
#            m$Var1 <- factor(m$Var1, levels = rev(get_taxa_name(tree)))
#            plt <- .epiClustPlot(m, ql_theme)
#
#            clust_plot <- plot_grid(
#                tree, NULL, plt,
#                rel_widths = c(1, -0.05, 2), nrow = 1, align = 'h'
#            )
#            return(clust_plot)
#        }
#    )
#
#    return(plots)
#}
huishenlab/bisplotti documentation built on Sept. 20, 2023, 10:13 p.m.