### =========================================================================
### VcfStack and RangedVcfStack class
### =========================================================================
.validVcfStack = function(object)
{
msg <- NULL
if (!all(rownames(object) %in% seqlevels(object)))
msg <- c(msg, "all rownames(object) must be in seqlevels(object)")
if (is.null(msg)) TRUE else msg
}
setClass("VcfStack",
representation(
files="VcfFileList",
seqinfo="Seqinfo",
colData="DataFrame"
),
validity=.validVcfStack
)
.validRangedVcfStack = function(object)
{
msg <- NULL
if (!identical(seqinfo(rowRanges(object)), seqinfo(object)))
msg <- c(msg,
"seqinfo() on rowRanges() differs from seqinfo() on object")
if (!all(seqnames(rowRanges(object)) %in% rownames(object)))
msg <- c(msg, "not all 'GRanges' seqnames are in VcfStack")
if (is.null(msg)) TRUE else msg
}
setClass("RangedVcfStack",
contains="VcfStack",
representation(
rowRanges="GRanges"
),
validity=.validRangedVcfStack
)
# check for sample consistency separate function to make optional for
# slow internet connections
.validSamples <- function(files, colData){
msg = NULL
if (length(files)) {
smps = samples(scanVcfHeader(files[[1]]))
if (!all(rownames(colData) %in% smps))
msg <- c(msg,
"all colnames(object) must be sample names in VCF 'files'")
samplesOk <- sapply(files, function(file) {
setequal(samples(scanVcfHeader(file)), smps)
})
if (!all(samplesOk))
msg <- c(msg, "sample names are not consistent between VCF 'files'")
}
if (is.null(msg)) TRUE else msg
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Constructors
##
VcfStack <-
function(files=NULL, seqinfo=NULL, colData=NULL, index=TRUE, check=TRUE)
{
stopifnot(is.logical(index), length(index) == 1L, !is.na(index))
if (is.null(files)) {
files <- VcfFileList()
header <- NULL
} else {
if (!is(files, "VcfFileList"))
files = VcfFileList(files)
if (index)
files = indexVcf(files)
header <- scanVcfHeader(files[[1]])
}
if (is.null(seqinfo)) {
seqinfo <- if (length(files)) {
seqinfo(files)
} else Seqinfo()
}
if (is.null(colData) && length(files)) {
colData <- DataFrame(row.names=samples(header))
} else {
colData <- as(colData, "DataFrame")
}
if (is.null(rownames(colData)) && length(files))
stop("specify rownames in 'colData'")
if (check) {
res <- .validSamples(files, colData)
if (!isTRUE(res))
stop(res)
}
new("VcfStack", files=files, colData=colData, seqinfo=seqinfo)
}
RangedVcfStack <- function(vs=NULL, rowRanges=NULL)
{
if (is.null(vs) && is.null(rowRanges)) {
vs <- VcfStack()
rowRanges <- GRanges()
} else {
stopifnot(is(vs, "VcfStack"))
if (is.null(rowRanges)){
rowRanges <- GRanges(seqinfo(vs))
if (any(!seqnames(rowRanges) %in% rownames(vs)))
rowRanges <- rowRanges[seqnames(rowRanges) %in% rownames(vs)]
}
new2old <- match(seqlevels(vs), seqlevels(rowRanges))
seqinfo(rowRanges, new2old=new2old) <- seqinfo(vs)
}
new("RangedVcfStack", vs, rowRanges=rowRanges)
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Getters and setters
###
setMethod("dimnames", "VcfStack", function(x){
list(names(files(x)), rownames(colData(x)))
})
setMethod("dim", "VcfStack", function(x) {
c(length(files(x)), nrow(colData(x)))
})
setMethod("files", "VcfStack",
function(x, ...) x@files
)
setReplaceMethod("files", c("VcfStack", "character"),
function(x, ..., check=TRUE, value)
{
files(x) <- VcfFileList(value)
if (check) {
res <- .validSamples(files(x), colData(x))
if (!isTRUE(res))
stop(res)
}
x
})
setReplaceMethod("files", c("VcfStack", "VcfFile"),
function(x, ..., check=TRUE, value)
{
files(x) <- VcfFileList(value)
if (check) {
res <- .validSamples(files(x), colData(x))
if (!isTRUE(res))
stop(res)
}
x
})
setReplaceMethod("files", c("VcfStack", "VcfFileList"),
function(x, ..., check=TRUE, value)
{
value <- indexVcf(value)
if (check) {
res <- .validSamples(value, colData(x))
if (!isTRUE(res))
stop(res)
}
initialize(x, files=value)
})
## seqinfo (also seqlevels, genome, seqlevels<-, genome<-)
setMethod(seqinfo, "VcfStack",
function(x) x@seqinfo
)
setReplaceMethod("seqinfo", "VcfStack",
function (x, new2old = NULL, pruning.mode = c("error", "coarse", "fine", "tidy"), value)
{
initialize(x, seqinfo=value)
})
## H.P. 2017-04-29: I renamed 'force' -> 'pruning.mode'. Surprisingly this
## argument is ignored. That doesn't seem right.
setReplaceMethod("seqinfo", "RangedVcfStack",
function (x, new2old = NULL, pruning.mode = c("error", "coarse", "fine", "tidy"), value)
{
if (!is(value, "Seqinfo"))
stop("the supplied 'seqinfo' must be a Seqinfo object")
if (is.null(new2old))
new2old <- match(seqnames(value), seqlevels(rowRanges(x)))
rowRanges <- rowRanges(x)
seqinfo(rowRanges, new2old=new2old) <- value
initialize(x, seqinfo=value, rowRanges=rowRanges)
})
setReplaceMethod("seqlevelsStyle", "VcfStack",
function(x, value)
{
newSeqInfo <- seqinfo(x)
seqlevelsStyle(newSeqInfo) <- value
newFiles <- files(x)
nms = names(newFiles)
seqlevelsStyle(nms) <- value
names(newFiles) <- nms
initialize(x, seqinfo=newSeqInfo, files=newFiles)
})
setReplaceMethod("seqlevelsStyle", "RangedVcfStack",
function(x, value)
{
newSeqInfo <- seqinfo(x)
seqlevelsStyle(newSeqInfo) <- value
newFiles <- files(x)
nms = names(newFiles)
seqlevelsStyle(nms) <- value
names(newFiles) <- nms
newRange <- rowRanges(x)
seqlevelsStyle(newRange) <- value
initialize(x, seqinfo=newSeqInfo, files=newFiles, rowRanges=newRange)
})
setMethod(colData, "VcfStack",
function(x) x@colData
)
setReplaceMethod("colData", c("VcfStack", "DataFrame"),
function(x, ..., value)
{
initialize(x, colData=value)
})
setMethod("rowRanges", "RangedVcfStack",
function(x, ...) x@rowRanges
)
setReplaceMethod("rowRanges", c("RangedVcfStack", "GRanges"),
function(x, ..., value)
{
new2old <- match(seqlevels(x), seqlevels(value))
seqinfo(value, new2old=new2old) <- seqinfo(x)
initialize(x, rowRanges=value)
})
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Other methods
###
setMethod("vcfFields", "VcfStack", function(x, ...)
{
vcfFields(files(x))
})
setMethod("assay", c("VcfStack", "ANY"),
function(x, i, ..., BPPARAM=bpparam())
{
if (is(i, "GRanges")) {
files <- files(x)[as.character(seqnames(i))]
} else {
files <- if (missing(i)) files(x) else files(x)[i]
i <- GRanges(seqinfo(x))[names(files)]
}
i <- splitAsList(i, seq_along(i))
genotypes <- bpmapply(function(file, grange, genome) {
## FIXME: readGeno or other more efficient input?
vcf <- readVcf(file, genome, grange)
t(as(genotypeToSnpMatrix(vcf)$genotypes, "numeric"))
}, files, i, MoreArgs=list(genome=genome(x)), BPPARAM=BPPARAM)
do.call(rbind, genotypes)
})
setMethod("assay", c("RangedVcfStack", "ANY"),
function(x, i, ...)
{
if (!missing(i))
message(paste(strwrap(
"RangedVcfStack uses rowRanges to subset; ignoring 'i'",
exdent=4), collapse="\n"))
i <- rowRanges(x)
callNextMethod(x=x, i=i)
})
readVcfStack <- function(x, i, j=colnames(x), param=ScanVcfParam())
{
stopifnot(is(x, "VcfStack"))
if ((!missing(i) || !missing(j)) && !missing(param))
stop("'i' and 'j' cannot be used with 'param'")
gr <- NULL
if (missing(param) && missing(i) && is(x, "RangedVcfStack")) {
gr <- rowRanges(x)
i = intersect(names(files(x)), as.character(seqnames(gr)))
} else if (missing(param) && missing(i)) {
gr <- GRanges()
i = names(files(x))
} else if (missing(param) && is(i, "GRanges")) {
gr <- i
i = unique(seqnames(i))
} else if (missing(param)) {
if (is.numeric(i))
i = names(files(x))[i]
gr <- GRanges()
} else { # use param
gr <- GRanges(vcfWhich(param))
i = intersect(names(files(x)), as.character(seqnames(gr)))
}
x = x[i]
if (is.numeric(j)) {
j <- colnames(x)[j]
} else if (!missing(param)) {
j <- vcfSamples(param)
}
genome <- genome(x)
vcfSamples(param) <- j
vcfWhich(param) <- gr
vcf <- lapply(names(files(x)), function(i, files, genome, param) {
file <- files[[i]]
if (length(vcfWhich(param)))
vcfWhich(param) <- vcfWhich(param)[i]
readVcf(file, genome, param)
}, files(x), genome, param)
do.call(rbind, vcf)
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Subsetting
###
setMethod("[", c("VcfStack", "ANY", "ANY"),
function(x, i, j, ..., drop=TRUE){
if (1L != length(drop) || (!missing(drop) && drop))
warning("'drop' ignored '[,VcfStack,ANY,ANY-method'")
if (missing(i) && missing(j)) {
x
} else if (missing(j)) {
if (is(i, "GRanges")) {
i <- as.character(seqnames(i))
}
initialize(x, files=files(x)[i])
} else if (missing(i)) {
colData = colData(x)[j,,drop=FALSE]
if (any(is.na(rownames(colData))))
stop("invalid 'j' value; sample not found")
initialize(x, colData=colData)
} else {
if (is(i, "GRanges")) {
i <- as.character(seqnames(i))
}
colData = colData(x)[j,,drop=FALSE]
if (any(is.na(rownames(colData))))
stop("invalid 'j' value; sample not found")
initialize(x, files=files(x)[i], colData=colData)
}
})
setMethod("[", c("RangedVcfStack", "ANY", "ANY"),
function(x, i, j, ..., drop=TRUE) {
if (1L != length(drop) || (!missing(drop) && drop))
warning("'drop' ignored '[,RangedVcfStack,ANY,ANY-method'")
if (missing(i)) {
i <- rownames(x)
} else if (is(i, "GenomicRanges")) {
stopifnot(all(seqnames(i) %in% rownames(x)))
rowRanges(x) <- intersect(rowRanges(x), i)
} else if (is(i, "character")) {
stopifnot(all(i %in% rownames(x)))
value <- rowRanges(x)
rowRanges(x) <- value[seqnames(value) %in% i]
} else {
stopifnot(is(i, "numeric") || is(i, "logical"))
value <- rowRanges(x)
rowRanges(x) <- value[seqnames(value) %in% rownames(x)[i]]
}
if (missing(j))
j <- colnames(x)
callNextMethod(x=x, i=i, j=j)
})
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### show()
###
setMethod("show", "VcfStack", function(object) {
cat("VcfStack object with ", nrow(object), " files and ",
ncol(object), " samples",
"\n", sep="")
if (is(object, "RangedVcfStack")) {
cat(summary(rowRanges(object)), "\n")
}
cat("Seqinfo object with", summary(seqinfo(object)), "\n")
cat("use 'readVcfStack()' to extract VariantAnnotation VCF.\n")
})
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Helpers
###
getVCFPath <- function(vs, chrtok) {
.Deprecated("files(vs)[chrtok]")
files(vs)[chrtok]
}
paths1kg <- function(chrtoks) sapply(chrtoks, .path1kg, USE.NAMES=FALSE)
.path1kg <- function (chrtok)
{
stopifnot(length(chrtok)==1 && is.atomic(chrtok))
if (is.numeric(chrtok))
chrtok = as.integer(chrtok)
if (is(chrtok, "integer"))
chrtok = paste0("chr", chrtok)
if (length(grep("chr", chrtok)) < 1)
warning("probably need 'chr' in input string")
tmplate = "http://1000genomes.s3.amazonaws.com/release/20130502/ALL.%%N%%.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
if (length(grep("X", chrtok)) > 0)
tmplate = "http://1000genomes.s3.amazonaws.com/release/20130502/ALL.%%N%%.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz"
if (length(grep("Y", chrtok)) > 0)
tmplate = "http://1000genomes.s3.amazonaws.com/release/20130502/ALL.%%N%%.phase3_integrated_v1b.20130502.genotypes.vcf.gz"
ans = as.character(gsub("%%N%%", chrtok, tmplate))
names(ans) = chrtok
ans
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.