#' Method to determine the distance between two genomic regions in "ucsc.pos"
#' format.
#'
#' @return an integer, or Inf
#' @author Mark Cowley, 18/4/07
#' @examples
#' ucsc.pos.diff("chr9:1000-2000", "chr9:1500-2100", "min")
#' # [1] 0
#' ucsc.pos.diff("chr9:1000-2000", "chr9:1500-2100", "start")
#' # [1] 500
#' ucsc.pos.diff("chr9:1000-2000", "chr9:1500-2100", "stop")
#' # [1] 100
#' ucsc.pos.diff("chr9:1000-2000", "chr9:1500-2100", "middle")
#' # [1] 300
#' ucsc.pos.diff("chr9:1000-2000", "chr9:2100-2200", "min")
#' # [1] 100
#' ucsc.pos.diff("chr9:1000-2000", "chr9:2100-2200", "start")
#' # [1] 1100
#' ucsc.pos.diff("chr9:1000-2000", "chr9:2100-2200", "stop")
#' # [1] 200
#' ucsc.pos.diff("chr9:1000-2000", "chrX:2100-2200", "stop")
#' # [1] Inf
#' @export
ucsc.pos.diff <- function(from, to, method=c("min", "start", "stop", "middle")) {
method <- method[1]
from <- ucsc.pos.split(from)
to <- ucsc.pos.split(to)
res <- rep(Inf, nrow(from))
na.idx <- which( rowSums(is.na(from)) > 0 | rowSums(is.na(to)) > 0 )
if( length(na.idx) > 0 )
res[na.idx] <- NA
samechr.idx <- which(from[, 1] == to[, 1])
if (length(samechr.idx) == 0)
return(res)
for (i in samechr.idx) {
if (method == "start")
res[i] <- to[i, 2] - from[i, 2]
else if (method == "stop")
res[i] <- to[i, 3] - from[i, 3]
else if (method == "middle")
res[i] <- median(c(to[i, 2], to[i, 3])) - median(c(from[i,
2], from[i, 3]))
else if (method == "min") {
# to ends before from starts; thus negative distance
if( to[i,3] < from[i,2] ) {
res[i] <- to[i,3] - from[i,2]
}
# to starts after from ends; thus positive distance
else if ( to[i,2] > from[i,3] ) {
res[i] <- to[i,2] - from[i,3]
}
# at-least a partial overlap
else {
res[i] <- 0
}
}
# .ucsc.pos.partial.overlap <- function(from, to) { # had to fix this. if to was within from then FALSE was returned.
# return(
# in.range(from[1, 2], to[1, c(2, 3)]) |
# in.range(from[1, 3], to[1, c(2, 3)]) |
# in.range(to[1, 2], from[1, c(2, 3)]) |
# in.range(to[1, 3], from[1, c(2, 3)])
# )
# }
# if (.ucsc.pos.partial.overlap(from[i,], to[i,])) {
# res[i] <- 0
# }
# else {
# tmp <- c(to[i, 2] - from[i, 3], to[i, 3] - from[i,
# 2])
# res[i] <- tmp[which.min(abs(tmp))]
# }
# }
# # depreciated method:
# else if (method == "max") {
# .ucsc.pos.complete.overlap <- function(from, to) {
# widths <- c(from[1, 3] - from[1, 2] + 1, to[1, 3] - to[1, 2] + 1)
# if (widths[1] > widths[2]) {
# swap <- from
# from <- to
# to <- swap
# }
# res <- ((from[1, 2] >= to[1, 2]) & (from[1, 2] <= to[1, 3])) ||
# ((from[1, 3] >= to[1, 2]) & (from[1, 3] <= to[1, 3]))
# return(res)
# }
# if (.ucsc.pos.complete.overlap(from[i, ], to[i, ])) {
# res[i] <- 0
# }
# else {
# tmp <- c(to[i, 2] - from[i, 3], to[i, 3] - from[i, 3], to[i, 2] - from[i, 2], to[i, 3] - from[i, 2])
# res[i] <- tmp[which.max(abs(tmp))]
# }
# }
else {
stop("You have specified an undefined method\n")
}
}
return(res)
}
#' ucsc.pos.split
#'
#' split UCSC-style locations to \code{data.frame} style
#'
#' @param x character vector of UCSC-style locations
#' @return a 3 column \code{data.frame} 'chr', 'start', 'stop'
#' @author Mark Cowley, 2013-05-30
#' @export
ucsc.pos.split <- function(x) {
x <- gsub(",","",x)
tmp <- strsplit(x, ":")
chr <- sapply(tmp, "[", 1)
tmp <- sapply(tmp, "[", 2)
tmp <- strsplit(tmp, "-")
start <- as.numeric(sapply(tmp, "[", 1))
stop <- as.numeric(sapply(tmp, "[", 2))
res <- as.data.frame(cbind(chr = chr, start = start, stop = stop))
colclasses(res) <- c("character", "numeric", "numeric")
return(res)
}
#' ucsc.pos.add
#'
#' Shift a set of UCSC-style intervals a certain number of bases along the
#' same chromosome. This can be + or -, but no error checking is done, only
#' warnings made if start locations become negative.
#'
#' @author Mark Cowley, 2009-01-08
#' @export
ucsc.pos.add <- function(x, bases) {
x.split <- ucsc.pos.split(x)
x.split$start <- x.split$start + bases
x.split$stop <- x.split$stop + bases
if( any(x.split$start) < 0 )
warning("Some start locations are now < 0.")
res <- ucsc.pos(x.split)
res
}
#' ucsc.pos.subset
#'
#' For a set of UCSC style intervals, produce a new set of intervals that are
#' subsets of those intervals. Strand must be provided, because intervals are
#' strand-less, but the logic to subset an interval depends very much on the
#' strand.
#'
#' eg: subset chr1:1000-1100 (+) to the interval that starts at 2-7 ->
#' chr1:1001-1006
#' eg: subset chr1:1000-1100 (-) to the interval that starts at 2-7 ->
#' chr1:1094-1099
#'
#' @param intervals vector of UCSC style intevals, eg "chr1:100-300"
#' @param start vector of numerics defining the starting base within each
#' interval (1-based)
#' @param stop vector of numerics defining the ending base within each
#' interval (1-based). NB stop can be longer than the original interval.
#' @param strand either a charcter vector of "+"/"-", or {0,1}, or {-1,1}
#' @return a vector of UCSC style intervals, adjusted as detailed above
#' @author Mark Cowley, 2009-01-08
#' @examples
#' ucsc.pos.subset("chr1:1000-1100", 1,11, "+")
#' # [1] "chr1:1000-1010"
#' ucsc.pos.subset("chr1:1000-1100", 1,11, "-")
#' # [1] "chr1:1090-1100"
#' # (you can use > 1 entry too.)
#' @export
ucsc.pos.subset <- function(intervals, start, stop, strand) {
if( is.numeric(strand) ) {
if( all(strand) %in% c(-1,1) )
strand <- ifelse(strand==-1, "-", "+")
else if( all(strand) %in% c(0,1) )
strand <- ifelse(strand==0, "-", "+")
else
stop("Can't work out what type of numeric Strand you've provided. Please try '+' or '-'.\n")
}
stopifnot( all(strand %in% c("+", "-")) )
x.split <- ucsc.pos.split(intervals)
res <- x.split
# consider the sense and anti-sense strand independently
SS <- which(strand == "+") # SS for "same strand"
if( length(SS) > 0 ) {
res$start[SS] <- x.split$start[SS] + start[SS] - 1
res$stop[SS] <- x.split$start[SS] + stop[SS] - 1
}
SS <- which(strand == "-")
if( length(SS) > 0 ) {
res$start[SS] <- x.split$stop[SS] - stop[SS] + 1
res$stop[SS] <- x.split$stop[SS] - start[SS] + 1
}
res <- ucsc.pos(res)
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.