# chromsizes.R
# chromosome sizes, in various genome builds and with different flavours of chromosome names
#' @export
.selfnames <- function(x, ...) {
setNames(x, x)
}
#' Shortcut to chromosome lengths for genome build mm10.
#' @param ... passed through to \code{chromsizes()}
#' @return either a named vector of chromosome lengths or a \code{Seqinfo} object; see \code{?chromsizes}
#' @details Chromosome names are in the "chr1", ..., "chrM" (UCSC) style. If you prefer Ensembl ("1", ..., "MT) style,
#' use \code{chromsizes("38")}.
#' @seealso \code{\link{chromsizes}}
#' @export
chromsizes_mm10 <- function(...) {
chromsizes("mm10", ...)
}
#' Shortcut to chromosome lengths for genome build mm9.
#' @param ... passed through to \code{chromsizes()}
#' @return either a named vector of chromosome lengths or a \code{Seqinfo} object; see \code{?chromsizes}
#' @details Chromosome names are in the "chr1", ..., "chrM" (UCSC) style. If you prefer Ensembl ("1", ..., "MT) style,
#' use \code{chromsizes("37")}.
#' PS: Should you really still be using mm9? Consider updating to the post-2011 genome assembly.
#' @seealso \code{\link{chromsizes}}
#' @export
chromsizes_mm9 <- function(...) {
chromsizes("mm10", ...)
}
#' Get chromosome lengths for a specific genome build.
#' @param build name of a genome build: one of \code{"mm9", "mm10", "37", "38"}
#' @param as.seqinfo logical; if \code{TRUE}, return a \code{Seqinfo} object
#' @param ... ignored
#' @return either a named vector of chromosome lengths or a \code{Seqinfo} object for use with Bioconductor stack
#' @details Chromosome names for \code{"mm9"} and \code{"mm10"} are in the "chr1", ..., "chrM" (UCSC) style. If you prefer Ensembl
#' ("1", ..., "MT) style, use \code{"37"} or \code{"38"}, respectively.
#' If a \code{Seqinfo} object is requested but the appropriate package is not installed, the function will default
#' to a named vector but issue a warning.
#' @seealso \code{\link[GenomeInfoDb]{Seqinfo}}
#' @export
chromsizes <- function(build = c("mm9","mm10","37","38","NCBIm37","GRCm38","cf3","hg19"), as.seqinfo = FALSE, correct_offset = FALSE, ...) {
seqnames <- chromnames("mouse")
is.circ <- c(rep(FALSE, 21),TRUE)
.return.sizes <- function(nm, sz, circ, genome) {
if (requireNamespace("GenomeInfoDb", quietly = TRUE)) {
GenomeInfoDb::Seqinfo(nm, sz, circ, genome)
}
else {
warning("Package 'GenomeInfoDb' is required to return a Seqinfo object, but it couldn't be loaded.")
return( setNames(sz, nm) )
}
}
.build <- match.arg(build)
if (.build %in% c("mm10","38","GRCm38")) {
seqlengths <- c(195471971,182113224,160039680,156508116,151834684,
149736546,145441459,129401213,124595110,130694993,
122082543,120129022,120421639,124902244,104043685,
98207768,94987271,90702639,61431566,
171031299,91744698,16299)
if (.build %in% c("38","GRCm38")) {
seqnames <- gsub("^chr","", seqnames)
seqnames[22] <- "MT"
genome <- "GRCm38"
}
else {
genome <- "mm10"
}
if (correct_offset) {
seqlengths <- seqlengths - c(rep(3e6, 20), 0, 0)
}
if (as.seqinfo)
.return.sizes(seqnames, seqlengths, is.circ, genome)
else
return( setNames(seqlengths, seqnames) )
}
else if (.build %in% c("mm9","37","NCBIm37")) {
seqlengths <- c(197195432, 181748087, 159599783, 155630120, 152537259,
149517037, 152524553, 131738871, 124076172, 129993255,
121843856, 121257530, 120284312, 125194864, 103494974,
98319150, 95272651, 90772031, 61342430,
166650296, 91744698, 16299)
if (.build %in% c("37","NCBIm37")) {
seqnames <- gsub("^chr","", seqnames)
seqnames[22] <- "MT"
genome <- "NCBIm37"
}
else {
genome <- "mm9"
}
if (correct_offset) {
seqlengths <- seqlengths - c(rep(3e6, 20), 0, 0)
}
if (as.seqinfo)
.return.sizes(seqnames, seqlengths, is.circ, genome)
else
return( setNames(seqlengths, seqnames) )
}
else if (.build == "cf3") {
# cf3 lives in sysdata.rda
genome <- "cf3"
is.circ <- grepl("chrM", names(cf3))
if (as.seqinfo)
.return.sizes(names(cf3), cf3, is.circ, genome)
else
return(cf3)
}
else if (.build == "hg19") {
# hg19 lives in sysdata.rda
genome <- "hg19"
is.circ <- grepl("chrM", names(hg19))
if (as.seqinfo)
.return.sizes(names(hg19), hg19, is.circ, genome)
else
return(hg19)
}
else {
stop("Genome build not recognized.")
}
}
#' Chromosome lengths in cM.
#' @param ... ignored
#' @return vector of chromosome lenghs in cM
#' @details Lengths are from Collaborative Cross G2:F1 recombination map.
#' @export
chromsizes_cM <- function(...) {
structure(c(98.5452, 103.9064, 82.6975, 88.6454, 90.2329, 79.0238,
89.0678, 76.2476, 75.0692, 77.9816, 88.0028, 63.9015, 67.2537,
66.4371, 59.0785, 57.8389, 61.2641, 59.4257, 56.923, 79.4368),
.Dim = 20L,
.Dimnames = list(
c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7",
"chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14",
"chr15", "chr16", "chr17", "chr18", "chr19", "chrX")))
}
#' Just return chromosome names.
#' @param species return chromosome names for this species (default is mouse)
#' @param ... ignored
#' @return character vector of chromosome names in the UCSC style: chr1, ..., chrX, chrY, chrM
#' @export
chromnames <- function(species = c("mouse","dog","human"), ...) {
.species <- match.arg(species)
if (.species == "mouse")
paste0("chr", c(1:19, "X","Y","M"))
else if (.species == "dog")
paste0("chr", c(1:38, "X","M")) #NB: dog Y chromosome not assembled yet
else if (.species == "human")
paste0("chr", c(1:22, "X","Y","M"))
else
stop("Species not recognized (yet).")
}
#' Create a factor whose levels are chromosome names.
#' @param x a vector of values
#' @param species return chromosome names for this species (default is mouse)
#' @param ... ignored
#' @return factor whose levels are chromosome names
#' @export
factor_chrom <- function(x, species = c("mouse","dog","human"), ...) {
species <- match.arg(species)
ll <- chromnames(species)
if (!is.factor(x) && is.numeric(x)) {
# assume chromosomes encoded numerically, as 1-20
if (max(x) > 22 && species == "mouse") {
# fix PLINK-style chromosome numbers
x[ x == 23 ] <- 20 # X
x[ x == 24 ] <- 21 # Y
x[ x == 25 ] <- 20 # PAR; on X in mouse
x[ x == 26 ] <- 22 # M
}
rez <- factor( ll[x], ll )
}
else {
# assume chromosomes are character encoded
if (!any(grepl("^chr", x))) {
x[ grepl("^MT", x) ] <- "M"
x <- paste0("chr", x)
}
rez <- factor( as.character(x), ll )
}
attr(rez,"is_chrom") <- TRUE
attr(rez, "species") <- species
return(rez)
}
#' Test if a chromosome identifier does or does not represent an autosome
#' @export
is_autosome <- function(x, ...) {
if (is.null(attr(x, "is_chrom")))
x <- factor_chrom(x, ...)
return( !grepl("[XYMWZ]", x) )
}
#' Classify chromosmes as X or autosome
#' @param x a vector of values
#' @param include.Y logical; if \code{TRUE}, treat Y chromosome as separate class
#' @param ... ignored
#' @return factor whose levels are "X" (X chromosome) "A" (autosome)
#' @details Chromsomes which are not X or autosome will be set to NA.
#' @export
factor_chromtype <- function(x, include_Y = FALSE, include_M = FALSE, ...) {
groups <- c("A","X")
if (include_Y)
groups <- c(groups, "Y")
if (include_M)
groups <- c(groups, "M")
xx <- vector("character", length(x))
xx[ grepl("X", x) ] <- "X"
xx[ !grepl("[YM]", x) & !grepl("X", x) ] <- "A"
if (include_Y)
xx[ grepl("Y", x) ] <- "Y"
if (include_M)
xx[ grepl("MT*$", x) ] <- "M"
xx[ grepl("JH", x) | grepl("GL", x) | grepl("PATCH", x) | grepl("MG", x) ] <- NA
xx[ is.na(x) ] <- NA
factor(xx, levels = groups)
}
#' Classify sex in systematic way
#' @param x a vector of sexes, encoded in any of the acceptable ways (see Details)
#' @param ... ignored
#' @return a factor with one level for each sex
#' @export
factor_sex <- function(x, as_karyotype = FALSE, with_XO = FALSE, format = NULL, ...) {
x <- as.character(x)
if (all(c("F","M") %in% x)) {
rez <- factor(x, c("F","M"))
}
else if (all(c("f","m") %in% x)) {
rez <- factor(x, c("f","m"))
}
else if (all(c("female","male") %in% tolower(x))) {
x <- tolower(x)
rez <- factor(x, c("female","male"))
}
else if (all(c("XX","XY") %in% x)) {
rez <- factor(x, c("XX","XY"))
if (all(c("XX","XY","XO") %in% x)) {
rez <- factor(x, c("XX","XY","XO"))
}
}
else if (all(c("1","2") %in% x)) {
if (!as_karyotype) {
rez <- factor(x, levels = c("2","1","0"), labels = c("female","male","unknown"))
}
else {
if (with_XO)
rez <- factor(x, levels = c("2","1","0"), labels = c("XX","XY","XO"))
else
rez <- factor(x, levels = c("2","1","0"), labels = c("XX","XY","unknown"))
}
}
else {
warning("Can't understand sex encoding; values returned unchanged.")
return(x)
}
if (!is.null(format)) {
if (format == "MF" || format == "FM") {
newl <- c("F","M","U")
}
else if (format == "XY") {
newl <- c("XX","XY","XO")
}
else if (format == "mf" || format == "fm") {
newl <- c("female","male","unknown")
}
levels(rez) <- newl[ 1:nlevels(rez) ]
}
return(rez)
}
#' Return position of pseudoautosomal boundary on X chromosome
#' @param build genome build: see \code{?chromsizes} for options
#' @param ... ignored
#' @return a vector of length two, with the canonical boundary of pseudoautosomal region (PAR) as well as its shifted boundary
# in the strain CAST/EiJ
#' @references
#' White MW, Ikeda A, Payseur BA (2011) A pronounced evolutionary shift of the pseudoautosomal region boundary in house mice. Mammalian Genome 23: 455-466.
#' doi:10.1007/s00335-012-9403-5
#' @export
pseudoautosomal_boundary <- function(build = c("mm9","mm10","37","38","NCBIm37","GRCm38"), ...) {
.build <- match.arg(build)
if (.build %in% c("mm9","37","NCBIm37"))
return( c(166407691, 165980013) )
else if (.build %in% c("mm10","38","GRCm38"))
return( c(169969759, 169542082) )
}
#' @export
#' @rdname pseudoautosomal_boundary
PAR_mm9 <- function(...) pseudoautosomal_boundary("mm9")
#' @export
#' @rdname pseudoautosomal_boundary
PAR_mm10 <- function(...) pseudoautosomal_boundary("mm10")
#' @export
#' @rdname pseudoautosomal_boundary
scale_x_Mb <- function(name = "position (Mb)", ...) {
ggplot2::scale_x_continuous(name = name, labels = function(x) x/1e6, ...)
}
#' @export
#' @rdname pseudoautosomal_boundary
scale_y_Mb <- function(name = "position (Mb)", ...) {
ggplot2::scale_y_continuous(name = name, labels = function(x) x/1e6, ...)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.