R/genomic.R

Defines functions normalize_genomic_signals_to_bins bin_genome

Documented in bin_genome normalize_genomic_signals_to_bins

GHEATMAP_ENV = new.env()

# == title
# Bin the genome
#
# == param
# -species Abbreviation of the genome, pass to `circlize::read.chromInfo`.
# -bins Number of bins. The final number of bins is approximately equal to it.
# -bin_size Size of the bins. If ``bin_size`` is set, ``bins`` is ignored.
# -... All pass to `circlize::read.chromInfo`. E.g. you can set a subset of chromosomes there.
#
# == value
# A `GenomicRanges::GRanges` object of the genomic bins.
#
bin_genome = function(species = "hg19", bins = 2000, bin_size = NULL, ...) {

    lt = circlize::read.chromInfo(species = species, ...)

    chr_df = lt$df
    chr_gr = GenomicRanges::GRanges(seqnames = chr_df[, 1], ranges = IRanges::IRanges(chr_df[, 2] + 1, chr_df[, 3]))

    if(is.null(bin_size)) {
        bin_size = round(sum(lt$chr.len)/bins)
        bin_size = max(bin_size, 1)
    }
    chr_window = EnrichedHeatmap::makeWindows(chr_gr, w = bin_size)
    GenomicRanges::mcols(chr_window) = NULL

    GHEATMAP_ENV$chr_window = chr_window
    invisible(chr_window)
}

# == title
# Overlap genomic signals to the genomic bins
#
# == param
# -gr A `GenomicRanges::GRanges` object.
# -value The corresponding signals corresponding to ``gr``.
# -value_column If ``value`` is not set and the values are in the meta-columns in ``gr``, you can specify the column
#     indices for these value columns, better to use name indices.
# -method One of "weighted", "w0" and "absolute". For the three different methods, please refer to https://bioconductor.org/packages/release/bioc/vignettes/EnrichedHeatmap/inst/doc/EnrichedHeatmap.html#toc_7 .
# -empty_value The value for the bins where no signal is overlapped.
# -window The genomic bins generated from `bin_genome`.
#
# == details
# The genomic bins should be generated by `bin_genome` in advance. The genomic bins are saved internally, so that multiple
# uses of `bin_genome` ensure they all return the matrices with the same rows.
#
# It supports following values.
#
# - When neither ``value`` nor ``value_column`` is set, it simply overlap ``gr`` to the genomic bins and returns a one-column logical
#   matrix which represents whether the current genomic bin overlaps to any signal.
# - When the signals are numeric, ``value`` can be a numeric vector or a matrix, or ``value_column`` can contain multiple columns.
#   The function returns a numeric matrix where the values are properly averaged depending on what ``method`` was used.
# - When the signals are character, ``value`` can only be a vector or ``value_column`` can only contain one single column.
#   The function returns a one-column character matrix.
#
# == value
# A matrix with the same row as the genomic bins.
#
# == examples
# \dontrun{
# require(circlize)
# require(GenomicRanges)
#
# chr_window = bin_genome("hg19")
#
# #### the first is a numeric matrix #######
# bed1 = generateRandomBed(nr = 1000, nc = 10)
# gr1 = GRanges(seqnames = bed1[, 1], ranges = IRanges(bed1[, 2], bed1[, 3]))
#
# num_mat = normalize_genomic_signals_to_bins(gr1, bed1[, -(1:3)])
#
# #### the second is a character matrix ######
# bed_list = lapply(1:10, function(i) {
#     generateRandomBed(nr = 1000, nc = 1, 
#         fun = function(n) sample(c("gain", "loss"), n, replace = TRUE))
# })
# char_mat = NULL
# for(i in 1:10) {
#     bed = bed_list[[i]]
#     bed = bed[sample(nrow(bed), 20), , drop = FALSE]
#     gr_cnv = GRanges(seqnames = bed[, 1], ranges = IRanges(bed[, 2], bed[, 3]))
#
#     char_mat = cbind(char_mat, normalize_genomic_signals_to_bins(gr_cnv, bed[, 4]))
# }
#
# #### two numeric columns ##########
# bed2 = generateRandomBed(nr = 100, nc = 2)
# gr2 = GRanges(seqnames = bed2[, 1], ranges = IRanges(bed2[, 2], bed2[, 3]))
#
# v = normalize_genomic_signals_to_bins(gr2, bed2[, 4:5])
#
# ##### a list of genes need to be highlighted
# bed3 = generateRandomBed(nr = 40, nc = 0)
# gr3 = GRanges(seqnames = bed3[, 1], ranges = IRanges(bed3[, 2], bed3[, 2]))
# gr3$gene = paste0("gene_", 1:length(gr3))
#
# mtch = as.matrix(findOverlaps(chr_window, gr3))
# at = mtch[, 1]
# labels = mcols(gr3)[mtch[, 2], 1]
#
# ##### order of the chromosomes ########
# chr = as.vector(seqnames(chr_window))
# chr_level = paste0("chr", c(1:22, "X", "Y"))
# chr = factor(chr, levels = chr_level)
#
# #### make the heatmap #######
# subgroup = rep(c("A", "B"), each = 5)
#
# ht_opt$TITLE_PADDING = unit(c(4, 4), "points")
# ht_list = Heatmap(num_mat, name = "mat", col = colorRamp2(c(-1, 0, 1), c("green", "white", "red")),
#     row_split = chr, cluster_rows = FALSE, show_column_dend = FALSE,
#     column_split = subgroup, cluster_column_slices = FALSE,
#     column_title = "numeric matrix",
#     top_annotation = HeatmapAnnotation(subgroup = subgroup, annotation_name_side = "left"),
#     row_title_rot = 0, row_title_gp = gpar(fontsize = 10), border = TRUE,
#     row_gap = unit(0, "points")) +
# Heatmap(char_mat, name = "CNV", col = c("gain" = "red", "loss" = "blue"),
#     border = TRUE, column_title = "character matrix") +
# rowAnnotation(label = anno_mark(at = at, labels = labels)) +
# rowAnnotation(pt = anno_points(v, gp = gpar(col = 4:5), pch = c(1, 16)), 
#     width = unit(2, "cm")) +
# rowAnnotation(bar = anno_barplot(v[, 1], gp = gpar(col = ifelse(v[ ,1] > 0, 2, 3))), 
#     width = unit(2, "cm"))
# draw(ht_list, merge_legend = TRUE)
#
# ##### or horizontally ###
# ht_list = Heatmap(t(num_mat), name = "mat", col = colorRamp2(c(-1, 0, 1), c("green", "white", "red")),
#     column_split = chr, cluster_columns = FALSE, show_row_dend = FALSE,
#     row_split = subgroup, cluster_row_slices = FALSE,
#     row_title = "numeric matrix",
#     left_annotation = rowAnnotation(subgroup = subgroup, show_annotation_name = FALSE,
#         annotation_legend_param = list(
#             subgroup = list(direction = "horizontal", title_position = "lefttop", nrow = 1))),
#     column_title_gp = gpar(fontsize = 10), border = TRUE,
#     column_gap = unit(0, "points"),
#     column_title = ifelse(seq_along(chr_level) \%\% 2 == 0, paste0("\n", chr_level), paste0(chr_level, "\n")),
#     heatmap_legend_param = list(direction = "horizontal", title_position = "lefttop")) \%v\%
# Heatmap(t(char_mat), name = "CNV", col = c("gain" = "red", "loss" = "blue"),
#     border = TRUE, row_title = "character matrix",
#     heatmap_legend_param = list(direction = "horizontal", title_position = "lefttop", nrow = 1)) \%v\%
# HeatmapAnnotation(label = anno_mark(at = at, labels = labels, side = "bottom")) \%v\%
# HeatmapAnnotation(pt = anno_points(v, gp = gpar(col = 4:5), pch = c(1, 16)),
#     annotation_name_side = "left", height = unit(2, "cm")) \%v\%
# HeatmapAnnotation(bar = anno_barplot(v[, 1], gp = gpar(col = ifelse(v[ ,1] > 0, 2, 3))), 
#     annotation_name_side = "left", height = unit(2, "cm"))
# draw(ht_list, heatmap_legend_side = "bottom", merge_legend = TRUE)
# }
normalize_genomic_signals_to_bins = function(gr, value, value_column = NULL, method = "weighted", 
    empty_value = NA, window = GHEATMAP_ENV$chr_window) {
    
    if(is.null(window)) {
        stop_wrap("`bin_genome()` should be executed first.")
    }

    nm = paste0(as.vector(GenomicRanges::seqnames(window)), ":", GenomicRanges::start(window), "-", GenomicRanges::end(window))


    if(!inherits(gr, "GRanges")) {
        if(is.data.frame(gr)) {
            oe = try({
                gr <- GenomicRanges::GRanges(GenomicRanges::seqnames(gr[, 1]), ranges = IRanges::IRanges(gr[, 2], gr[, 3]))
                if(ncol(gr) > 3) {
                    GenomicRanges::mcols(gr) = gr[, -(1:3), drop = FALSE]
                }
            })
            if(inherits(oe, "try-error")) {
                stop_wrap("Failed to convert `gr` to a `GRanges` object. Please provide `gr` as a `GRanges` object.")
            }
        } else {
            stop_wrap("`gr` must be a `GRanges` object.")
        }
    }

    if(missing(value) && is.null(value_column)) {
        mtch = as.matrix(GenomicRanges::findOverlaps(window, gr))
        u = matrix(FALSE, nrow = length(window), ncol = 1)
        rownames(u) = nm
        u[mtch[, 1], 1] = TRUE
        return(u)
    }

    if(is.null(value) && is.null(value_column)) {
        mtch = as.matrix(GenomicRanges::findOverlaps(window, gr))
        u = matrix(FALSE, nrow = length(window), ncol = 1)
        rownames(u) = nm
        u[mtch[, 1], 1] = TRUE
        return(u)
    }

    if(!is.null(value_column)) {
        value = GenomicRanges::mcols(gr)[, value_column]
        value = as.matrix(as.data.frame(value))
    }

    if(is.atomic(value) && is.vector(value)) value = cbind(value)

    value = as.matrix(value)
    if(is.character(value) && ncol(value) > 1) {
        stop("For character signals, `value` can only be a single character vector or `value_column` can only contain one column.")
    }

    if(length(empty_value) == 1) {
        empty_value = rep(empty_value, ncol(value))
    }

    u = matrix(rep(empty_value, each = length(window)), nrow = length(window), ncol = ncol(value))
    rownames(u) = nm

    mtch = as.matrix(GenomicRanges::findOverlaps(window, gr))
    intersect = GenomicRanges::pintersect(window[mtch[,1]], gr[mtch[,2]])
    w = GenomicRanges::width(intersect)
    value = value[mtch[,2], , drop = FALSE]
    n = nrow(value)

    ind_list = split(seq_len(n), mtch[, 1])
    window_index = as.numeric(names(ind_list))
    window_w = GenomicRanges::width(window)

    if(is.character(value)) {
        for(i in seq_along(ind_list)) {
            ind = ind_list[[i]]
            if(is.function(method)) {
                u[window_index[i], ] = method(value[ind], w[ind], window_w[i])
            } else {
                tb = tapply(w[ind], value[ind], sum)
                u[window_index[i], ] = names(tb[which.max(tb)])
            }
        }
    } else {
        if(method == "w0") {
            gr2 = GenomicRanges::reduce(gr, min.gapwidth = 0)
            mtch2 = as.matrix(GenomicRanges::findOverlaps(window, gr2))
            intersect2 = GenomicRanges::pintersect(window[mtch2[, 1]], gr2[mtch2[, 2]])

            width_intersect = tapply(GenomicRanges::width(intersect2), mtch2[, 1], sum)
            ind = unique(mtch2[, 1])
            width_setdiff = GenomicRanges::width(window[ind]) - width_intersect

            w2 = GenomicRanges::width(window[ind])

            for(i in seq_along(ind_list)) {
                ind = ind_list[[i]]
                x = colSums(value[ind, , drop = FALSE]*w[ind])/sum(w[ind])
                u[window_index[i], ] = (x*width_intersect[i] + empty_value*width_setdiff[i])/w2[i]
            }

        } else if(method == "absolute") {
            for(i in seq_along(ind_list)) {
                u[window_index[i], ] = colMeans(value[ind_list[[i]], , drop = FALSE])
            }
            
        } else if(method == "weighted") {
            for(i in seq_along(ind_list)) {
                ind = ind_list[[i]]
                u[window_index[i], ] = colSums(value[ind, , drop = FALSE]*w[ind])/sum(w[ind])
            }
        } else {
            if(is.function(method)) {
                for(i in seq_along(ind_list)) {
                    ind = ind_list[[i]]
                    u[window_index[i], ] = method(value[ind], w[ind], window_w[i])
                }
            } else {
                stop_wrap("Wrong method.")
            }
        }
    }

    return(u)
}
jokergoo/ComplexHeatmap documentation built on Nov. 17, 2023, 11:27 a.m.