### =========================================================================
### Input/output of XStringSet objects
### -------------------------------------------------------------------------
### 'filexp_list' must be a list of "file external pointers" returned by
### XVector::open_input_files() or XVector:::open_output_file().
.close_filexp_list <- function(filexp_list)
{
for (filexp in filexp_list) XVector:::close_filexp(filexp)
}
.normarg_nrec <- function(nrec)
{
if (!isSingleNumber(nrec))
stop(wmsg("'nrec' must be a single integer value"))
if (!is.integer(nrec))
nrec <- as.integer(nrec)
nrec
}
.normarg_skip <- function(skip)
{
if (!isSingleNumber(skip))
stop(wmsg("'skip' must be a single integer value"))
if (!is.integer(skip))
skip <- as.integer(skip)
if (skip < 0L)
stop(wmsg("'skip' cannot be negative"))
skip
}
.is_filexp_list <- function(filepath)
{
## We only check the first list element.
is.list(filepath) && length(filepath) != 0L &&
is(filepath[[1L]], "externalptr")
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### FASTA
###
.read_fasta_files <- function(filexp_list, nrec, skip, seek.first.rec,
use.names, elementType, lkup)
{
nrec <- .normarg_nrec(nrec)
skip <- .normarg_skip(skip)
if (!isTRUEorFALSE(seek.first.rec))
stop(wmsg("'seek.first.rec' must be TRUE or FALSE"))
.Call2("read_fasta_files", filexp_list, nrec, skip, seek.first.rec,
use.names, elementType, lkup,
PACKAGE="Biostrings")
}
fasta.index <- function(filepath, nrec=-1L, skip=0L, seek.first.rec=FALSE,
seqtype="B")
{
filexp_list <- open_input_files(filepath)
on.exit(.close_filexp_list(filexp_list))
nrec <- .normarg_nrec(nrec)
skip <- .normarg_skip(skip)
if (!isTRUEorFALSE(seek.first.rec))
stop(wmsg("'seek.first.rec' must be TRUE or FALSE"))
seqtype <- match.arg(seqtype, c("B", "DNA", "RNA", "AA"))
lkup <- get_seqtype_conversion_lookup("B", seqtype)
ans <- .Call2("fasta_index",
filexp_list, nrec, skip, seek.first.rec, lkup,
PACKAGE="Biostrings")
## 'expath' will usually be the same as 'filepath', except when 'filepath'
## contains URLs which will be replaced by the path to the downloaded file.
expath <- vapply(filexp_list, attr, character(1), "expath", USE.NAMES=FALSE)
ans$filepath <- expath[ans[ , "fileno"]]
ans
}
fasta.seqlengths <- function(filepath, nrec=-1L, skip=0L, seek.first.rec=FALSE,
seqtype="B", use.names=TRUE)
{
if (!isTRUEorFALSE(use.names))
stop(wmsg("'use.names' must be TRUE or FALSE"))
fai <- fasta.index(filepath, nrec=nrec, skip=skip,
seek.first.rec=seek.first.rec,
seqtype=seqtype)
ans <- fai[ , "seqlength"]
if (use.names)
names(ans) <- fai[ , "desc"]
ans
}
.check_fasta_index <- function(fai)
{
.REQUIRED_COLS <- c("recno", "fileno", "offset",
"desc", "seqlength", "filepath")
if (!all(.REQUIRED_COLS %in% colnames(fai)))
stop(wmsg("invalid FASTA index: a FASTA index must be a data frame ",
"with columns: ", paste0(.REQUIRED_COLS, collapse=", ")))
recno <- fai[ , "recno"]
if (!is.integer(recno)
|| S4Vectors:::anyMissingOrOutside(recno, lower=1L))
stop(wmsg("invalid FASTA index: the \"recno\" column must be ",
"an integer vector with no NAs and with positive values"))
fileno <- fai[ , "fileno"]
if (!is.integer(fileno)
|| S4Vectors:::anyMissingOrOutside(fileno, lower=1L))
stop(wmsg("invalid FASTA index: the \"fileno\" column must be ",
"an integer vector with no NAs and with positive values"))
offset <- fai[ , "offset"]
if (!is.numeric(offset) || any(is.na(offset)) || any(offset < 0))
stop(wmsg("invalid FASTA index: the \"offset\" column must be ",
"a numeric vector with no NAs and no negative values"))
desc <- fai[ , "desc"]
if (!is.character(desc) || any(is.na(desc)))
stop(wmsg("invalid FASTA index: the \"desc\" column must be ",
"a character vector with no NAs"))
seqlength <- fai[ , "seqlength"]
if (!is.integer(seqlength)
|| S4Vectors:::anyMissingOrOutside(seqlength, lower=0L))
stop(wmsg("invalid FASTA index: the \"seqlength\" column must be ",
"an integer vector with no NAs and no negative values"))
filepath <- fai[ , "filepath"]
if (!is.character(filepath) || any(is.na(filepath)))
stop(wmsg("invalid FASTA index: the \"filepath\" column must be ",
"a character vector with no NAs"))
}
### "FASTA blocks" are groups of consecutive FASTA records.
### Fasta index 'ssorted_fai' must be strictly sorted by "recno". This is NOT
### checked!
.compute_sorted_fasta_blocks_from_ssorted_fasta_index <- function(ssorted_fai)
{
recno <- ssorted_fai[ , "recno"]
fileno <- ssorted_fai[ , "fileno"]
offset <- ssorted_fai[ , "offset"]
blockid <- recno - seq_along(recno) # this block id is unique only within
# a given file
is_first_in_block <- !duplicatedIntegerPairs(blockid, fileno)
first_in_block_idx <- which(is_first_in_block)
data.frame(fileno=fileno[first_in_block_idx],
nrec=diff(c(first_in_block_idx, nrow(ssorted_fai) + 1L)),
offset=offset[first_in_block_idx])
}
### Fasta index 'ssorted_fai' must be strictly sorted by "recno". This is NOT
### checked!
.read_XStringSet_from_ssorted_fasta_index <- function(ssorted_fai,
elementType, lkup)
{
## Prepare 'nrec_list' and 'offset_list'.
fasta_blocks <-
.compute_sorted_fasta_blocks_from_ssorted_fasta_index(ssorted_fai)
nrec_list <- split(fasta_blocks[ , "nrec"], fasta_blocks[ , "fileno"],
drop=TRUE)
offset_list <- split(fasta_blocks[ , "offset"], fasta_blocks[ , "fileno"],
drop=TRUE)
## Prepare 'filexp_list'.
filepath <- ssorted_fai[ , "filepath"]
fileno <- ssorted_fai[ , "fileno"]
used_fileno <- as.integer(names(nrec_list))
used_filepath <- filepath[match(used_fileno, fileno)]
filexp_list <- open_input_files(used_filepath)
on.exit(.close_filexp_list(filexp_list))
## Prepare 'seqlengths'.
seqlengths <- ssorted_fai[ , "seqlength"]
.Call2("read_fasta_blocks",
seqlengths, filexp_list, nrec_list, offset_list,
elementType, lkup,
PACKAGE="Biostrings")
}
.read_XStringSet_from_fasta_index <- function(fai, use.names, elementType, lkup)
{
.check_fasta_index(fai)
## Create a "strictly sorted" version of 'fai' by removing duplicated rows
## and sorting the remaining rows by ascending "recno".
recno <- fai[ , "recno"]
ssorted_recno <- sort(unique(recno))
ssorted_fai <- fai[match(ssorted_recno, recno), , drop=FALSE]
C_ans <- .read_XStringSet_from_ssorted_fasta_index(ssorted_fai,
elementType, lkup)
## Re-order XStringSet object to make it parallel to 'recno'.
ans <- C_ans[match(recno, ssorted_recno)]
## Set names on XStringSet object.
if (use.names)
names(ans) <- fai[ , "desc"]
ans
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### FASTQ
###
.read_fastq_files <- function(filexp_list, nrec, skip, seek.first.rec,
use.names, elementType, lkup, with.qualities)
{
nrec <- .normarg_nrec(nrec)
skip <- .normarg_skip(skip)
if (!isTRUEorFALSE(seek.first.rec))
stop(wmsg("'seek.first.rec' must be TRUE or FALSE"))
if (!isTRUEorFALSE(with.qualities))
stop(wmsg("'with.qualities' must be TRUE or FALSE"))
C_ans <- .Call2("read_fastq_files",
filexp_list, nrec, skip, seek.first.rec,
use.names, elementType, lkup, with.qualities,
PACKAGE="Biostrings")
if (!with.qualities)
return(C_ans)
ans <- C_ans[[1L]]
mcols(ans)$qualities <- C_ans[[2L]]
ans
}
fastq.seqlengths <- function(filepath, nrec=-1L, skip=0L, seek.first.rec=FALSE)
{
filexp_list <- open_input_files(filepath)
on.exit(.close_filexp_list(filexp_list))
nrec <- .normarg_nrec(nrec)
skip <- .normarg_skip(skip)
if (!isTRUEorFALSE(seek.first.rec))
stop(wmsg("'seek.first.rec' must be TRUE or FALSE"))
.Call2("fastq_seqlengths",
filexp_list, nrec, skip, seek.first.rec,
PACKAGE="Biostrings")
}
fastq.geometry <- function(filepath, nrec=-1L, skip=0L, seek.first.rec=FALSE)
{
seqlengths <- fastq.seqlengths(filepath, nrec, skip, seek.first.rec)
common_seqlength <- runValue(Rle(seqlengths))
if (length(common_seqlength) != 1L)
common_seqlength <- NA_integer_
c(length(seqlengths), common_seqlength)
}
.read_XStringSet_from_fastq <- function(filepath, nrec, skip, seek.first.rec,
use.names, elementType, lkup,
with.qualities)
{
filexp_list <- open_input_files(filepath)
on.exit(.close_filexp_list(filexp_list))
nrec <- .normarg_nrec(nrec)
skip <- .normarg_skip(skip)
if (!isTRUEorFALSE(seek.first.rec))
stop(wmsg("'seek.first.rec' must be TRUE or FALSE"))
if (!isTRUEorFALSE(with.qualities))
stop(wmsg("'with.qualities' must be TRUE or FALSE"))
C_ans <- .Call2("read_fastq_files",
filexp_list, nrec, skip, seek.first.rec,
use.names, elementType, lkup, with.qualities,
PACKAGE="Biostrings")
if (!with.qualities)
return(C_ans)
ans <- C_ans[[1L]]
mcols(ans)$qualities <- C_ans[[2L]]
ans
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### The readBStringSet(), readDNAStringSet(), readRNAStringSet(), and
### readAAStringSet() functions.
###
.read_XStringSet <- function(filepath, format,
nrec=-1L, skip=0L, seek.first.rec=FALSE,
use.names=TRUE, seqtype="B",
with.qualities=FALSE)
{
if (!isSingleString(format))
stop(wmsg("'format' must be a single string"))
format <- match.arg(tolower(format), c("fasta", "fastq"))
if (!isTRUEorFALSE(use.names))
stop(wmsg("'use.names' must be TRUE or FALSE"))
elementType <- paste(seqtype, "String", sep="")
lkup <- get_seqtype_conversion_lookup("B", seqtype)
## Read FASTQ.
if (format == "fastq") {
if (!.is_filexp_list(filepath)) {
filepath <- open_input_files(filepath)
on.exit(.close_filexp_list(filepath))
}
ans <- .read_fastq_files(filepath,
nrec, skip, seek.first.rec,
use.names, elementType, lkup,
with.qualities)
return(ans)
}
## Read FASTA.
if (!identical(with.qualities, FALSE))
stop(wmsg("The 'with.qualities' argument is only supported ",
"when reading a FASTQ file."))
if (.is_filexp_list(filepath)) {
ans <- .read_fasta_files(filepath,
nrec, skip, seek.first.rec,
use.names, elementType, lkup)
return(ans)
}
if (is.data.frame(filepath)) {
if (!(identical(nrec, -1L) &&
identical(skip, 0L) &&
identical(seek.first.rec, FALSE)))
warning(wmsg("'nrec', 'skip', and 'seek.first.rec' are ",
"ignored when 'filepath' is a data frame"))
fai <- filepath
} else {
fai <- fasta.index(filepath, nrec=nrec, skip=skip,
seek.first.rec=seek.first.rec,
seqtype=seqtype)
}
.read_XStringSet_from_fasta_index(fai, use.names, elementType, lkup)
}
readBStringSet <- function(filepath, format="fasta",
nrec=-1L, skip=0L, seek.first.rec=FALSE,
use.names=TRUE, with.qualities=FALSE)
.read_XStringSet(filepath, format, nrec, skip, seek.first.rec,
use.names, "B", with.qualities)
readDNAStringSet <- function(filepath, format="fasta",
nrec=-1L, skip=0L, seek.first.rec=FALSE,
use.names=TRUE, with.qualities=FALSE)
.read_XStringSet(filepath, format, nrec, skip, seek.first.rec,
use.names, "DNA", with.qualities)
readRNAStringSet <- function(filepath, format="fasta",
nrec=-1L, skip=0L, seek.first.rec=FALSE,
use.names=TRUE, with.qualities=FALSE)
.read_XStringSet(filepath, format, nrec, skip, seek.first.rec,
use.names, "RNA", with.qualities)
readAAStringSet <- function(filepath, format="fasta",
nrec=-1L, skip=0L, seek.first.rec=FALSE,
use.names=TRUE, with.qualities=FALSE)
.read_XStringSet(filepath, format, nrec, skip, seek.first.rec,
use.names, "AA", with.qualities)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### writeXStringSet()
###
.write_XStringSet_to_fasta <- function(x, filexp_list, width=80L)
{
if (!isSingleNumber(width))
stop(wmsg("'width' must be a single integer"))
if (!is.integer(width))
width <- as.integer(width)
if (width < 1L)
stop(wmsg("'width' must be an integer >= 1"))
lkup <- get_seqtype_conversion_lookup(seqtype(x), "B")
.Call2("write_XStringSet_to_fasta",
x, filexp_list, width, lkup,
PACKAGE="Biostrings")
}
.write_XStringSet_to_fastq <- function(x, filexp_list, qualities=NULL)
{
if (is.null(qualities)){
if (is(x, "QualityScaledXStringSet"))
qualities <- quality(x)
else
qualities <- mcols(x)$qualities
}
if (!is.null(qualities)) {
if (!is(qualities, "BStringSet"))
stop(wmsg("'qualities' must be NULL or a BStringSet object"))
if (length(qualities) != length(x))
stop(wmsg("'x' and 'qualities' must have the same length"))
}
lkup <- get_seqtype_conversion_lookup(seqtype(x), "B")
.Call2("write_XStringSet_to_fastq",
x, filexp_list, qualities, lkup,
PACKAGE="Biostrings")
}
writeXStringSet <- function(x, filepath, append=FALSE,
compress=FALSE, compression_level=NA,
format="fasta", ...)
{
if (!is(x, "XStringSet"))
stop(wmsg("'x' must be an XStringSet object"))
if (!isSingleString(format))
stop(wmsg("'format' must be a single string"))
format <- match.arg(tolower(format), c("fasta", "fastq"))
filexp_list <- XVector:::open_output_file(filepath, append,
compress, compression_level)
on.exit(.close_filexp_list(filexp_list))
res <- try(switch(format,
"fasta"=.write_XStringSet_to_fasta(x, filexp_list, ...),
"fastq"=.write_XStringSet_to_fastq(x, filexp_list, ...)
),
silent=FALSE)
if (is(res, "try-error") && !append) {
## Get the expanded path and remove the file.
expath <- attr(filexp_list[[1L]], "expath")
if (!file.remove(expath))
warning(wmsg("cannot remove file '", expath, "'"))
}
invisible(NULL)
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Serialization of XStringSet objects.
###
saveXStringSet <- function(x, objname, dirpath=".",
save.dups=FALSE, verbose=TRUE)
{
if (!is(x, "XStringSet"))
stop(wmsg("'x' must be an XStringSet object"))
if (!isSingleString(objname))
stop(wmsg("'objname' must be a single string"))
if (!isSingleString(dirpath))
stop(wmsg("'dirpath' must be a single string"))
if (!isTRUEorFALSE(save.dups))
stop(wmsg("'save.dups' must be TRUE or FALSE"))
if (!isTRUEorFALSE(verbose))
stop(wmsg("'verbose' must be TRUE or FALSE"))
x_dups <- NULL
## Don't use 'is(x, "DNAStringSet")' here since we only want to use the
## "pre-compression trick" on a DNAStringSet *instance*. There is no
## guarantee that using this trick on an object deriving from the
## DNAStringSet class won't corrupt the data stored in the extra slots!
if (class(x) == "DNAStringSet") {
## The "pre-compression trick" is based on the following property.
## If 'x_dup2unq' is an integer vector (of the same length as 'x')
## that maps from duplicated to unique elements in 'x', then
## 'x' and 'x[x_dup2unq]' contain exactly the same *values* i.e.
## 'all(x == x[x_dup2unq])' is TRUE. Note that other metadata
## attached to 'x' like the names etc could differ though!
## So by replacing 'x' with 'x[x_dup2unq]' below , we actually don't
## modify the sequences in 'x', but the internal representation of 'x'
## has changed. What has changed is that duplicated elements are not
## duplicated in memory anymore (i.e. they all point to the same place
## in memory) so calling compact() on 'x' will be much more efficient.
## Note that, for efficiency reasons, we use PDict() to extract the
## 'x_dup2unq' mapping. This means that the "pre-compression trick"
## works only if 'x' is a rectangular DNAStringSet instance with no
## IUPAC ambiguity codes.
pdict <- try(PDict(x), silent=TRUE)
if (!is(pdict, "try-error") && !is.null(pdict@dups0)) {
x_dups <- pdict@dups0
x_names <- names(x)
x_dup2unq <- togroup(x_dups)
x <- x[x_dup2unq]
names(x) <- x_names
}
}
x <- compact(x)
assign(objname, x)
objfile <- paste(objname, ".rda", sep="")
filepath <- file.path(dirpath, objfile)
if (save.dups) {
if (is.null(x_dups))
stop(wmsg("could not determine 'x_dups'"))
objname2 <- paste(objname, "_dups", sep="")
assign(objname2, x_dups)
objname <- c(objname, objname2)
}
if (verbose)
cat("Saving ", filepath, " ... ", sep="")
save(list=objname, file=filepath)
if (verbose)
cat("OK\n")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.