#' Convert an alignment to a ucsc location.
#'
#' alignment looks like:
#' name chrom chromStart chromEnd strand
#' 1 NM_011547 chr13 40727709 40745894 -
#' 2 NM_010791 chr11 101693600 101710444 -
#' 3 NM_013684 chr17 15204872 15222390 +
#'
#' @param x an alignment object (see Details)
#' @return a ucsc string, eg \dQuote{chr13:40727709-40745894}
#' @author Mark Cowley
#' @export
#' @importFrom mjcbase alleq
alignment2ucsc <- function(x) {
#
# find the column indices in x that contain the
# chromosome, start, stop.
#
# NB: chromosome is harder to find since most of the other columns
# also contain the word "chrom", so find the other colnames that
# relate to start, stop, then exclude these from the search
# for the column that contains just "chrom", "chr", or "Chrom", "Chr".
#
if( !is.null(colnames(x)) ) {
start.col <- grep("start", colnames(x), ignore.case=TRUE)
stop.col <- grep("stop|end", colnames(x), ignore.case=TRUE)
chr.col <- setdiff(grep("chr", colnames(x), ignore.case=TRUE), c(start.col, stop.col))
}
else {
chr.col <- 1
start.col <- 2
stop.col <- 3
}
stopifnot( alleq( c(1, length(start.col), length(stop.col), length(chr.col)) ) )
res <- apply(x[,c(chr.col, start.col, stop.col)], 1, make.ucsc.string)
return( res )
}
#' Convert an alignment to a ucsc location.
#'
#' alignment looks like:
#' name chrom chromStart chromEnd strand
#' 1 NM_011547 chr13 40727709 40745894 -
#' 2 NM_010791 chr11 101693600 101710444 -
#' 3 NM_013684 chr17 15204872 15222390 +
#'
#' @param chr the chromosome names
#' @param start the start locations
#' @param stop the stop/end locations
#' @return a ucsc string, eg \dQuote{chr13:40727709-40745894}
#' @author Mark Cowley
#' @seealso \code{\link{make.ucsc.string}}
#' @export
ucsc.pos <- function(chr, start=NULL, stop=NULL) {
make.ucsc.string(chr=chr, start=start, stop=stop)
}
# # updated 14/4/07: fixed when there are NA's in the input.
#' Convert an alignment to a ucsc location.
#'
#' alignment looks like:
#' name chrom chromStart chromEnd strand
#' 1 NM_011547 chr13 40727709 40745894 -
#' 2 NM_010791 chr11 101693600 101710444 -
#' 3 NM_013684 chr17 15204872 15222390 +
#'
#' @param chr the chromosome names
#' @param start the start locations
#' @param stop the stop/end locations
#' @return a ucsc string, eg \dQuote{chr13:40727709-40745894}
#' @author Mark Cowley
#' @export
#' @importFrom mjcbase is.matrix.like o2n n2o
make.ucsc.string <- function(chr, start=NULL, stop=NULL) {
# opd <- options()$digits
# on.exit(options(digits=opd))
# options(digits=11) # avoid wrapping long integers into scientific: 1.38e09
if( is.matrix.like(chr) ) {
stopifnot( ncol(chr) %in% c(3,4) )
apply(chr, 1, make.ucsc.string)
}
else {
if( is.null(start) && length(chr) %in% c(3,4) ) {
start <- min(as.numeric(c(chr[2], chr[3])))
stop <- max(as.numeric(c(chr[2], chr[3])))
chr <- chr[1]
}
if( any(is.na(chr), is.na(start), is.na(stop)) ) {
# remove the NA's, search for remaining "good" results,
# then return results with NA's inserted where there was an NA
# in original data.
na.idx <- which(is.na(chr) | is.na(start) | is.na(stop))
ok.idx <- setdiff(1:length(chr), na.idx)
res <- rep(NA, length(chr))
if( length(ok.idx) > 0 )
res[ok.idx] <- make.ucsc.string(chr[ok.idx], start[ok.idx], stop[ok.idx])
return( res )
}
else {
if( any(grepl("///", chr)) ) {
chr <- o2n( chr )[[1]]
start <- o2n( start )[[1]]
stop <- o2n( stop )[[1]]
res <- make.ucsc.string(cbind(chr=chr, start=start, stop=stop))
return( n2o(list(res)) )
# res <- character( length=length(chr) )
# for(i in 1:length(chr))
# res[i] <- make.ucsc.string(chr[i], start[i], stop[i])
# return( n2o(list(res)) )
}
else {
chr <- sub("^chr", "", chr)
pos <- sprintf("chr%s:%d-%d", chr, start, stop)
return( pos )
# #return( paste(chr, ":", format(start, scientific=FALSE), "-", format(stop, scientific=FALSE), sep="") )
# chr <- paste(sep="", "chr", chr)
# return( paste(chr, ":", int2char(start), "-", int2char(stop), sep="") )
}
}
}
}
#' is a UCSC string?
#'
#' @param x a vector of UCSC strings
#' @return logical vector if the elements look like valid UCSC locations
#' @author Mark Cowley
#' @seealso \code{\link{make.ucsc.string}}
#' @export
is.ucsc.string <- function(x) {
tmp <- strsplit(x, ":")
chr <- sapply(tmp, "[", 1)
mod <- sapply(tmp, "[", 2)
tmp <- strsplit(mod, "-")
start <- sapply(tmp, "[", 1)
stop <- sapply(tmp, "[", 2)
res <- is.na(x) | (grepl("^chr[0-9XYM]", chr) & grepl("^[0-9]+$", start) & grepl("^[0-9]+$", stop))
res[is.na(x)] <- NA
return( res )
}
#' Convert integers to characters
#'
#' Really big integers like genomic locations can become scientific when
#' converted to strings eg "1e09" instead of "1000000000"
#' @param x an integer vector
#' @return a character vector
#' @author Mark Cowley, 10/7/07
#' @export
int2char <- function(x) {
.trim <- function(x) {
sub("^[ \t]*([^ \t]+.*[^ \t]+)[ \t]+$", "\\1", x, perl=TRUE)
}
.trim( format(x, scientific=FALSE) )
}
#' interval sizes
#'
#' Calculate the size of each genomic interval listed in a ucsc-style string
#'
#' @param x a vector of ucsc-style location strings
#' @return an integer vector of interval sizes
#' @author Mark Cowley, 2011-08-02
#' @export
ucsc.size <- function(x) {
pos <- ucsc.pos.split(x)
return( pos[,3] - pos[,2] )
}
#' union of 2 genomic intervals
#'
#' @param x a vector of ucsc-style location strings
#' @param y a vector of ucsc-style location strings
#' @return the union of the 2 intervals
#' @author Mark Cowley, 2011-08-02
#' @export
ucsc.union <- function(x,y) {
x.pos <- ucsc.pos.split(x)
y.pos <- ucsc.pos.split(y)
if(x.pos[1,1] == y.pos[1,1] ) {
return(
make.ucsc.string(x.pos[1,1],
min(x.pos[1,2], y.pos[1,2]),
max(x.pos[1,3], y.pos[1,3])
)
)
}
else {
warning( paste(x, "and", y, "aren't on the same chromosome\n") )#'
return( c(x, y) )
}
}
#' union of multiple genomic intervals
#'
#' @param \dots at least 2 vectors of ucsc-style location strings
#' @return the union of the genomic intervals
#' @author Mark Cowley, 2011-08-02
#' @export
ucsc.unionN <- function(...) {
args <- as.list(...)
if( length(args) == 1 ) {
if( is.character(args[[1]]) )
return( args[[1]] )
else if( is.list(args[[1]]) && is.character(args[[1]][[1]]) )
args <- args[[1]]
}
#
# If any markers are on different chromosomes, this function will
# NOT handle it!
#
warn <- options()$warn
on.exit(options(warn=warn))
options(warn=2)
res <- ucsc.union(args[[1]], args[[2]])
if( length(args) > 2 ) {
for(i in 3:length(args)) {
res <- ucsc.union(res, args[[i]])
}
}
return( res )
}
#' chromosome position to genome position
#' Useful for Manhattan style plots where markers on chr 2 should be to the
#' right of those on chr1
#'
#' @param x a vector of ucsc-style genomic locations
#' @param marker.info a table of marker locations, with at least these columns: Mb, GenoMb, chr
#' @return vector of genomic locations
#' @author Mark Cowley, 2011-08-02
#' @export
ucsc.pos2genoMb.pos <- function(x, marker.info) {
chrs <- unique(marker.info$chr)
chr.geno.start <- rep(0, length(chrs))
names(chr.geno.start) <- as.character( chrs )
for(chr in chrs) {
chr.geno.start[as.character(chr)] <- (marker.info$GenoMb - marker.info$Mb)[match(chr, marker.info$chr)]
}
xpos <- ucsc.pos.split(x)
xpos[,2:3] <- xpos[,2:3] / 1e06
for(chr in chrs) {
idx <- which((xpos[,1] == chr) | (xpos[,1] == paste(sep="", "chr",chr)))
if( length(idx) > 0 )
xpos[idx,2:3] <- xpos[idx,2:3] + chr.geno.start[as.character(chr)]
}
xpos <- xpos[,2:3]
colnames(xpos) <- c("Start", "Stop")
return( xpos )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.