R/intervals.R

Defines functions grandom_genome gintervals.distance gintervals.normalize gintervals.expand gintervals.centers convert_misha_intervals_to_10x_peak_names convert_10x_peak_names_to_misha_intervals get_promoters gextract.left_join gintervals.filter gintervals.neighbors1

Documented in convert_10x_peak_names_to_misha_intervals convert_misha_intervals_to_10x_peak_names get_promoters gextract.left_join gintervals.centers gintervals.distance gintervals.expand gintervals.filter gintervals.neighbors1 gintervals.normalize grandom_genome

#' Finds neighbors between two sets of intervals (and does not return conflicting column names)
#'
#' @param fields select only these fields from \code{intervals2}. Note that when there are conflicting
#' names - the repaired name should be used, see \code{tibble::repair_names}.
#'
#' @inheritParams misha::gintervals.neighbors
#'
#' @return a data frame containing the pairs of intervals from 'intervals1', intervals from 'intervals2' (with a suffix of '1', i.e. chrom1, start1 and end1), and an additional column named 'dist' ('dist1' and 'dist2' for 2D intervals) representing the distance between the corresponding intervals.
#'
#' @export
#' @seealso \link[misha]{gintervals.neighbors}
gintervals.neighbors1 <- function(intervals1 = NULL,
                                  intervals2 = NULL,
                                  maxneighbors = 1,
                                  mindist = -1e+09,
                                  maxdist = 1e+09,
                                  na.if.notfound = TRUE,
                                  fields = NULL) {
    res <-
        gintervals.neighbors(
            intervals1 = intervals1,
            intervals2 = intervals2,
            maxneighbors = maxneighbors,
            mindist = mindist,
            maxdist = maxdist,
            na.if.notfound = na.if.notfound
        ) %>%
        tibble::repair_names()

    if (!is.null(fields)) {
        res <- res %>%
            select(any_of(colnames(intervals1)), fields, starts_with("dist"))
    }

    return(res %>% as_tibble())
}


#' Filter intervals set by distance to another.
#' A wrapper around gintervals.neighbours and filter(dist <= max_distance)
#'
#' @param intervals1 intervals set
#' @param intervals2 intervals set by which to filter intervals1
#' @param max_distance maximal distance of every interval in intervals1 from intervals2 (defualt 0)
#' @param abs_dist take the absolute distance
#' @param bind_intervals2 cbind add intervals2 to result
#' @param ... additional parameters to gintervals.neighbours1
#'
#' @return \code{intervals1} filtered to only intervals that are closer than \code{max_distance} to intervals2.
#' @export
gintervals.filter <- function(intervals1, intervals2, max_distance = 0, abs_dist = TRUE, bind_intervals2 = FALSE, ...) {
    intervals1_cols <- colnames(intervals1)
    res <- intervals1 %>% gintervals.neighbors1(intervals2, ...)
    if (abs_dist) {
        res$dist <- abs(res$dist)
    }
    res <- res %>% filter(dist <= max_distance)
    if (!bind_intervals2) {
        res <- res %>% select(one_of(intervals1_cols))
    }
    return(res)
}

#' Returns the result of track expressions evaluation for each of the
#' iterator intervals, and cbinds the intervals (instead of intervalID)
#'
#' @inheritParams misha::gextract
#' @param expr track expression
#' @param suffix suffix for conflicting column names
#'
#' @return The result of 'gextract' with additional columns from \code{intervals}
#'
#' @export
#'
#' @seealso \link[misha]{gextract}
gextract.left_join <- function(expr, intervals = NULL, colnames = NULL, iterator = NULL, band = NULL, file = NULL, intervals.set.out = NULL, suffix = "1") {
    if ("character" %in% class(intervals)) {
        intervals <- gintervals.load(intervals)
    }
    d <- gextract(expr, intervals = intervals, colnames = colnames, iterator = iterator, band = band, file = file, intervals.set.out = intervals.set.out)
    conflict_names <- which(colnames(intervals) %in% colnames(d))
    colnames(intervals)[conflict_names] <- paste0(colnames(intervals)[conflict_names], suffix)
    intervals$intervalID <- 1:nrow(intervals)
    d <- d %>%
        arrange(intervalID) %>%
        left_join(intervals, by = "intervalID") %>%
        select(-intervalID)
    return(d)
}


#' Define promoter regions
#' @param tss_intervals name of the intervals set with the TSS
#' @param upstream bp upstream to TSS
#' @param downstream bp downstread from tss
#'
#' @export
get_promoters <- function(upstream = 500, downstream = 50, tss_intervals = "intervs.global.tss") {
    gintervals.load(tss_intervals) %>%
        mutate(start = ifelse(strand == 1, start - upstream, start - downstream), end = ifelse(strand == 1, end + downstream, end + upstream)) %>%
        gintervals.force_range()
}

#' Convert 10X-format peak names to misha intervals
#' @param pn peak names as generated by 10X ATAC pipeline ("{chr}-{start}-{end}" format)
#' @return intervals - the peak names converted to misha-compatible intervals
#' @examples
#' \dontrun{
#' pn <- rownames(my_scatac_mat)
#' my_intervals <- convert_10x_peak_names_to_misha_intervals(pn)
#' }
#' @export
convert_10x_peak_names_to_misha_intervals <- function(pn) {
    intervals <- as.data.frame(do.call("rbind", stringr::str_split(pn, "-|:")))
    intervals[, 2:3] <- apply(intervals[, 2:3], 2, as.numeric)
    colnames(intervals) <- c("chrom", "start", "end")
    intervals$peak_name <- pn
    intervals <- intervals[with(intervals, order(chrom, start)), ]
    return(intervals)
}

#' Convert misha intervals to 10X-format peak names
#' @param ints misha intervals - dataframe with columns "chrom" (character), "start" (numeric), "end" (numeric)
#' @return pn - peak names, collapsed with hyphens
#' @examples
#' \dontrun{
#' ints <- my_mcatac_object@peaks
#' ints_pn <- convert_misha_intervals_to_10x_peak_names(ints)
#' }
#' @export
convert_misha_intervals_to_10x_peak_names <- function(ints) {
    if (!all(c("chrom", "start", "end") %in% colnames(ints))) {
        stop('intervals are not standard misha intervals,
            should be data frame with columns "chrom", "start", "end")')
    }
    pn <- stringr::str_c(ints$chrom, ":", ints$start, "-", ints$end)
    return(pn)
}


#' Calculate the centers of intervals set
#'
#' @param inv intervals set
#'
#' @return a single basepair intervals set with the center of each interval
#'
#' @export
gintervals.centers <- function(inv) {
    inv %>%
        mutate(center = floor((start + end) / 2), start = center, end = center + 1) %>%
        select(-center)
}

#' Exapnd an intervals set
#'
#' @param inv intervals set
#' @param expansion number of bp to add from each side
#'
#' @return intervals set expanded by \code{expansion} bp, where intervals outside of the
#' chromosome boundries are truncated
#'
#' @export
gintervals.expand <- function(inv, expansion = 100) {
    if (is.character(inv)) {
        inv <- gintervals.load(inv)
    }
    inv %>%
        mutate(start = start - expansion, end = end + expansion) %>%
        as.data.frame() %>%
        gintervals.force_range()
}

#' Normalize an intervals set to be of the same length
#'
#' @param inv intervals set
#' @param size desired length of the intervals
#'
#' @return The normalized intervals set
#'
#' @description The function adds \code{floor(size / 2)} bp for each side from each
#' interval's center
#'
#'
#' @export
gintervals.normalize <- function(inv, size) {
    centers <- gintervals.centers(inv) %>%
        mutate(end = end - 1)
    return(gintervals.expand(centers, floor(size / 2)))
}

#' Calculate pairwise distance between interval sets
#'
#' @param intervals1 first intervals set
#' @param intervals2 first intervals set (should be the same length as \code{intervals1})
#'
#' @return a vector containing for each i..N the distance between each interval1\\[i\\] and intervals2\\[i\\], where N is the number
#' of intervals
#'
#' @export
gintervals.distance <- function(intervals1, intervals2) {
    if (nrow(intervals1) != nrow(intervals2)) {
        stop("intervals1 and intervals2 should have the same length")
    }
    return(pmax(pmax(intervals1$start, intervals2$start) - pmin(intervals1$end, intervals2$end), 0))
}

#' Generate random genome intervals
#'
#' Generate a random genome intervals with a specified number of regions of a specified size.
#'
#' @param size The size of the intervals to generate
#' @param n The number of intervals to generate
#' @param dist_from_edge The minimum distance from the edge of the chromosome for a region to start or end(default: 3e6)
#' @param chromosomes The chromosomes to sample from (default: all chromosomes)
#'
#' @return A data.frame with columns chrom, start, and end
#'
#' @examples
#' \dontrun{
#' library(misha)
#' gdb.init_examples()
#' intervals <- grandom_genome(100, 1000, dist_from_edge = 100)
#' head(intervals)
#' }
#'
#' @export
grandom_genome <- function(size, n, dist_from_edge = 3e6, chromosomes = NULL) {
    all_genome <- misha::gintervals.all()

    if (!is.null(chromosomes)) {
        all_genome <- all_genome %>%
            filter(chrom %in% chromosomes)
        if (nrow(all_genome) == 0) {
            cli_abort("No chromosomes named {.val {chromosomes}} found in the genome.")
        }
    } else {
        all_genome <- all_genome %>%
            mutate(l = end - start) %>%
            filter(l >= dist_from_edge)
    }

    all_genome <- all_genome %>%
        mutate(l = end - start, frac = l / sum(l))

    if (any(all_genome$l < size)) {
        cli_abort("Some chromosomes are too short to generate intervals of size {.val {size}}.")
    }

    if (any(all_genome$l < dist_from_edge)) {
        cli_abort("Some chromosomes are too short to generate intervals with a minimum distance from the edge of {.val {dist_from_edge}}.")
    }

    chroms <- sample(all_genome$chrom, n, replace = TRUE, prob = all_genome$frac)

    res <- purrr::map_dfr(chroms, ~ {
        chrom_len <- all_genome %>%
            filter(chrom == !!.x) %>%
            pull(l)
        start <- sample.int(chrom_len - dist_from_edge - size, 1, replace = FALSE)
        start[start < dist_from_edge] <- dist_from_edge
        end <- start + size
        data.frame(chrom = .x, start = start, end = end)
    })

    res <- misha::gintervals.force_range(res)

    return(res)
}
tanaylab/misha.ext documentation built on Sept. 18, 2024, 2:53 a.m.