#' Subtract one data frame from another based on chromosome, start and end.
#'
#' @param x A dataframe.
#' @param y A dataframe.
#' @param by A character vector with 3 entries which are used to match the chromosome, start and end column.
#' For example: \code{by=c("Chromosome"="chr", "Start"="start", "End"="end")}
#' @return The subtracted dataframe of \code{x} and \code{y} with the new boundaries.
#' @examples
#'
#' library(dplyr)
#'
#' x1 <- data.frame(id = 1:4, bla=letters[1:4],
#' chromosome = c("chr1", "chr1", "chr2", "chr1"),
#' start = c(100, 200, 300, 400),
#' end = c(150, 250, 350, 450))
#'
#' x2 <- data.frame(id = 1:4, BLA=LETTERS[1:4],
#' chromosome = c("chr1", "chr2", "chr1", "chr1"),
#' start = c(120, 210, 300, 400),
#' end = c(125, 240, 320, 415))
#'
#' j <- genome_subtract(x1, x2, by=c("chromosome", "start", "end"))
#' print(j)
#'
#'
#' @export
genome_subtract <- function(x, y, by=NULL){
# Much of this code is copied from https://github.com/dgrtwo/fuzzyjoin
x_groups <- dplyr::groups(x)
x <- dplyr::ungroup(x)
regroup <- function(d) {
if (is.null(x_groups)) {
return(d)
}
g <- purrr::map_chr(x_groups, as.character)
missing <- !(g %in% colnames(d))
g[missing] <- paste0(g[missing], ".x")
dplyr::group_by_(d, .dots = g)
}
by <- dplyr::common_by(by, x, y)
if (length(by$x) != 3) {
stop("genome_join must join on exactly three columns")
}
f <- function(x,y){
# nest around the chromosome column
x$..index <- seq_len(nrow(x))
y$..index <- seq_len(nrow(y))
nested_x <- dplyr::group_by_at(x, 1) %>% tidyr::nest()
nested_y <- dplyr::group_by_at(y, 1) %>% tidyr::nest()
by <- c(colnames(nested_y)[1])
names(by) <- colnames(nested_x)[1]
joined <- dplyr::inner_join(nested_x, nested_y, by = by)
# find matching ranges in each
find_subtractions <- function(xd, yd) {
r1 <- IRanges::IRanges(xd[[1]], xd[[2]])
r2 <- IRanges::IRanges(yd[[1]], yd[[2]])
subtraction <- IRanges::setdiff(r1, r2)
o <- as.data.frame(IRanges::findOverlaps(subtraction, r1))
data.frame(x = xd$..index[o$subjectHits],
..start=pmax(IRanges::start(subtraction)[o$queryHits], IRanges::start(r1)[o$subjectHits]),
..end=pmin(IRanges::end(subtraction)[o$queryHits], IRanges::end(r1)[o$subjectHits]))
}
ret <- purrr::map2_df(joined$data.x, joined$data.y, find_subtractions)
ret
}
d1 <- x[, by$x, drop = FALSE]
d2 <- y[, by$y, drop = FALSE]
matches <- f(d1, d2)
ret <- x %>%
dplyr::select(- dplyr::one_of(by$x[-1])) %>%
dplyr::mutate(..id=seq_len(n())) %>%
dplyr::inner_join(matches[, c("x", "..start", "..end")], by=c("..id"="x")) %>%
dplyr::rename(!! by$x[2] := `..start`, !! by$x[3] := `..end`) %>%
dplyr::select(- `..id`) %>%
regroup()
return(ret)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.