#' Finds neighbors between two sets of intervals (and does not return conflicting column names)
#'
#' @param fields select only these fields from \code{intervals2}. Note that when there are conflicting
#' names - the repaired name should be used, see \code{tibble::repair_names}.
#'
#' @inheritParams misha::gintervals.neighbors
#'
#' @return a data frame containing the pairs of intervals from 'intervals1', intervals from 'intervals2' (with a suffix of '1', i.e. chrom1, start1 and end1), and an additional column named 'dist' ('dist1' and 'dist2' for 2D intervals) representing the distance between the corresponding intervals.
#'
#' @export
#' @seealso \link[misha]{gintervals.neighbors}
gintervals.neighbors1 <- function(intervals1 = NULL,
intervals2 = NULL,
maxneighbors = 1,
mindist = -1e+09,
maxdist = 1e+09,
na.if.notfound = TRUE,
fields = NULL) {
res <-
gintervals.neighbors(
intervals1 = intervals1,
intervals2 = intervals2,
maxneighbors = maxneighbors,
mindist = mindist,
maxdist = maxdist,
na.if.notfound = na.if.notfound
) %>%
tibble::repair_names()
if (!is.null(fields)) {
res <- res %>%
select(any_of(colnames(intervals1)), fields, starts_with("dist"))
}
return(res %>% as_tibble())
}
#' Filter intervals set by distance to another.
#' A wrapper around gintervals.neighbours and filter(dist <= max_distance)
#'
#' @param intervals1 intervals set
#' @param intervals2 intervals set by which to filter intervals1
#' @param max_distance maximal distance of every interval in intervals1 from intervals2 (defualt 0)
#' @param abs_dist take the absolute distance
#' @param bind_intervals2 cbind add intervals2 to result
#' @param ... additional parameters to gintervals.neighbours1
#'
#' @return \code{intervals1} filtered to only intervals that are closer than \code{max_distance} to intervals2.
#' @export
gintervals.filter <- function(intervals1, intervals2, max_distance = 0, abs_dist = TRUE, bind_intervals2 = FALSE, ...) {
intervals1_cols <- colnames(intervals1)
res <- intervals1 %>% gintervals.neighbors1(intervals2, ...)
if (abs_dist) {
res$dist <- abs(res$dist)
}
res <- res %>% filter(dist <= max_distance)
if (!bind_intervals2) {
res <- res %>% select(one_of(intervals1_cols))
}
return(res)
}
#' Returns the result of track expressions evaluation for each of the
#' iterator intervals, and cbinds the intervals (instead of intervalID)
#'
#' @inheritParams misha::gextract
#' @param expr track expression
#' @param suffix suffix for conflicting column names
#'
#' @return The result of 'gextract' with additional columns from \code{intervals}
#'
#' @export
#'
#' @seealso \link[misha]{gextract}
gextract.left_join <- function(expr, intervals = NULL, colnames = NULL, iterator = NULL, band = NULL, file = NULL, intervals.set.out = NULL, suffix = "1") {
if ("character" %in% class(intervals)) {
intervals <- gintervals.load(intervals)
}
d <- gextract(expr, intervals = intervals, colnames = colnames, iterator = iterator, band = band, file = file, intervals.set.out = intervals.set.out)
conflict_names <- which(colnames(intervals) %in% colnames(d))
colnames(intervals)[conflict_names] <- paste0(colnames(intervals)[conflict_names], suffix)
intervals$intervalID <- 1:nrow(intervals)
d <- d %>%
arrange(intervalID) %>%
left_join(intervals, by = "intervalID") %>%
select(-intervalID)
return(d)
}
#' Define promoter regions
#' @param tss_intervals name of the intervals set with the TSS
#' @param upstream bp upstream to TSS
#' @param downstream bp downstread from tss
#'
#' @export
get_promoters <- function(upstream = 500, downstream = 50, tss_intervals = "intervs.global.tss") {
gintervals.load(tss_intervals) %>%
mutate(start = ifelse(strand == 1, start - upstream, start - downstream), end = ifelse(strand == 1, end + downstream, end + upstream)) %>%
gintervals.force_range()
}
#' Convert 10X-format peak names to misha intervals
#' @param pn peak names as generated by 10X ATAC pipeline ("{chr}-{start}-{end}" format)
#' @return intervals - the peak names converted to misha-compatible intervals
#' @examples
#' \dontrun{
#' pn <- rownames(my_scatac_mat)
#' my_intervals <- convert_10x_peak_names_to_misha_intervals(pn)
#' }
#' @export
convert_10x_peak_names_to_misha_intervals <- function(pn) {
intervals <- as.data.frame(do.call("rbind", stringr::str_split(pn, "-|:")))
intervals[, 2:3] <- apply(intervals[, 2:3], 2, as.numeric)
colnames(intervals) <- c("chrom", "start", "end")
intervals$peak_name <- pn
intervals <- intervals[with(intervals, order(chrom, start)), ]
return(intervals)
}
#' Convert misha intervals to 10X-format peak names
#' @param ints misha intervals - dataframe with columns "chrom" (character), "start" (numeric), "end" (numeric)
#' @return pn - peak names, collapsed with hyphens
#' @examples
#' \dontrun{
#' ints <- my_mcatac_object@peaks
#' ints_pn <- convert_misha_intervals_to_10x_peak_names(ints)
#' }
#' @export
convert_misha_intervals_to_10x_peak_names <- function(ints) {
if (!all(c("chrom", "start", "end") %in% colnames(ints))) {
stop('intervals are not standard misha intervals,
should be data frame with columns "chrom", "start", "end")')
}
pn <- stringr::str_c(ints$chrom, ":", ints$start, "-", ints$end)
return(pn)
}
#' Calculate the centers of intervals set
#'
#' @param inv intervals set
#'
#' @return a single basepair intervals set with the center of each interval
#'
#' @export
gintervals.centers <- function(inv) {
inv %>%
mutate(center = floor((start + end) / 2), start = center, end = center + 1) %>%
select(-center)
}
#' Exapnd an intervals set
#'
#' @param inv intervals set
#' @param expansion number of bp to add from each side
#'
#' @return intervals set expanded by \code{expansion} bp, where intervals outside of the
#' chromosome boundries are truncated
#'
#' @export
gintervals.expand <- function(inv, expansion = 100) {
if (is.character(inv)) {
inv <- gintervals.load(inv)
}
inv %>%
mutate(start = start - expansion, end = end + expansion) %>%
as.data.frame() %>%
gintervals.force_range()
}
#' Normalize an intervals set to be of the same length
#'
#' @param inv intervals set
#' @param size desired length of the intervals
#'
#' @return The normalized intervals set
#'
#' @description The function adds \code{floor(size / 2)} bp for each side from each
#' interval's center
#'
#'
#' @export
gintervals.normalize <- function(inv, size) {
centers <- gintervals.centers(inv) %>%
mutate(end = end - 1)
return(gintervals.expand(centers, floor(size / 2)))
}
#' Calculate pairwise distance between interval sets
#'
#' @param intervals1 first intervals set
#' @param intervals2 first intervals set (should be the same length as \code{intervals1})
#'
#' @return a vector containing for each i..N the distance between each interval1\\[i\\] and intervals2\\[i\\], where N is the number
#' of intervals
#'
#' @export
gintervals.distance <- function(intervals1, intervals2) {
if (nrow(intervals1) != nrow(intervals2)) {
stop("intervals1 and intervals2 should have the same length")
}
return(pmax(pmax(intervals1$start, intervals2$start) - pmin(intervals1$end, intervals2$end), 0))
}
#' Generate random genome intervals
#'
#' Generate a random genome intervals with a specified number of regions of a specified size.
#'
#' @param size The size of the intervals to generate
#' @param n The number of intervals to generate
#' @param dist_from_edge The minimum distance from the edge of the chromosome for a region to start or end(default: 3e6)
#' @param chromosomes The chromosomes to sample from (default: all chromosomes)
#'
#' @return A data.frame with columns chrom, start, and end
#'
#' @examples
#' \dontrun{
#' library(misha)
#' gdb.init_examples()
#' intervals <- grandom_genome(100, 1000, dist_from_edge = 100)
#' head(intervals)
#' }
#'
#' @export
grandom_genome <- function(size, n, dist_from_edge = 3e6, chromosomes = NULL) {
all_genome <- misha::gintervals.all()
if (!is.null(chromosomes)) {
all_genome <- all_genome %>%
filter(chrom %in% chromosomes)
if (nrow(all_genome) == 0) {
cli_abort("No chromosomes named {.val {chromosomes}} found in the genome.")
}
} else {
all_genome <- all_genome %>%
mutate(l = end - start) %>%
filter(l >= dist_from_edge)
}
all_genome <- all_genome %>%
mutate(l = end - start, frac = l / sum(l))
if (any(all_genome$l < size)) {
cli_abort("Some chromosomes are too short to generate intervals of size {.val {size}}.")
}
if (any(all_genome$l < dist_from_edge)) {
cli_abort("Some chromosomes are too short to generate intervals with a minimum distance from the edge of {.val {dist_from_edge}}.")
}
chroms <- sample(all_genome$chrom, n, replace = TRUE, prob = all_genome$frac)
res <- purrr::map_dfr(chroms, ~ {
chrom_len <- all_genome %>%
filter(chrom == !!.x) %>%
pull(l)
start <- sample.int(chrom_len - dist_from_edge - size, 1, replace = FALSE)
start[start < dist_from_edge] <- dist_from_edge
end <- start + size
data.frame(chrom = .x, start = start, end = end)
})
res <- misha::gintervals.force_range(res)
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.