Nothing
#' @title Read and write vcf format files
#' @rdname io_vcfR
#' @name VCF input and output
#'
#' @export
#'
#' @description
#' Read and files in the *.vcf structured text format, as well as the compressed *.vcf.gz format.
#' Write objects of class vcfR to *.vcf.gz.
#'
#' @param file A filename for a variant call format (vcf) file.
#' @param limit amount of memory (in bytes) not to exceed when reading in a file.
#' @param nrows integer specifying the maximum number of rows (variants) to read in.
#' @param skip integer specifying the number of rows (variants) to skip before beginning to read data.
#' @param cols vector of column numbers to extract from file.
#' @param x An object of class vcfR or chromR.
# @param vfile an output filename.
#' @param mask logical vector indicating rows to use.
#' @param APPEND logical indicating whether to append to existing vcf file or write a new file.
#' @param convertNA logical specifying to convert VCF missing data to NA.
#' @param checkFile test if the first line follows the VCF specification.
#' @param check_keys logical determining if \code{check_keys()} is called to test if INFO and FORMAT keys are unique.
#'
#' @param verbose report verbose progress.
#'
#' @details
#' The function \strong{read.vcfR} reads in files in *.vcf (text) and *.vcf.gz (gzipped text) format and returns an object of class vcfR.
#' The parameter 'limit' is an attempt to keep the user from trying to read in a file which contains more data than there is memory to hold.
#' Based on the dimensions of the data matrix, an estimate of how much memory needed is made.
#' If this estimate exceeds the value of 'limit' an error is thrown and execution stops.
#' The user may increase this limit to any value, but is encourages to compare that value to the amout of available physical memory.
#'
#'
#' It is possible to input part of a VCF file by using the parameters nrows, skip and cols.
#' The first eight columns (the fix region) are part of the definition and will always be included.
#' Any columns beyond eight are optional (the gt region).
#' You can specify which of these columns you would like to input by setting the cols parameter.
#' If you want a usable vcfR object you will want to always include nine (the FORMAT column).
#' If you do not include column nine you may experience reduced functionality.
#'
#'
#' According to the VCF specification \strong{missing data} are encoded by a period (".").
#' Within the R language, missing data can be encoded as NA.
#' The parameter `convertNA` allows the user to either retain the VCF representation or the R representation of missing data.
#' Note that the conversion only takes place when the entire value can be determined to be missing.
#' For example, ".|.:48:8:51,51" would be retained because the missing genotype is accompanied by other delimited information.
#' In contrast, ".|." should be converted to NA when \code{convertNA = TRUE}.
#'
#'
#' If file begins with http://, https://, ftp://, or ftps:// it is interpreted as a link.
#' When this happens, file is split on the delimiter '/' and the last element is used as the filename.
#' A check is performed to determine if this file exists in the working directory.
#' If a local file is found it is used.
#' If a local file is not found the remote file is downloaded to the working directory and read in.
#'
#' The function \strong{write.vcf} takes an object of either class vcfR or chromR and writes the vcf data to a vcf.gz file (gzipped text).
#' If the parameter 'mask' is set to FALSE, the entire object is written to file.
#' If the parameter 'mask' is set to TRUE and the object is of class chromR (which has a mask slot), this mask is used to subset the data.
#' If an index is supplied as 'mask', then this index is used, and recycled as necessary, to subset the data.
#'
#' Because vcfR provides the opportunity to manipulate VCF data, it also provides the opportunity for the user to create invalid VCF files.
#' If there is a question regarding the validity of a file you have created one option is the \href{https://vcftools.github.io/perl_module.html#vcf-validator}{VCF validator} from VCF tools.
#'
#'
#' @return read.vcfR returns an object of class \code{\link{vcfR-class}}.
#' See the \strong{vignette:} \code{vignette('vcf_data')}.
#' The function write.vcf creates a gzipped VCF file.
#'
#' @seealso
# \code{\link[PopGenome]{readVCF}}
# \code{\link[pegas]{read.vcf}}
# \link[pegas]{read.vcf}
#'
#' CRAN:
#' \href{https://cran.r-project.org/package=pegas}{pegas}::read.vcf,
#' \href{https://cran.r-project.org/package=PopGenome}{PopGenome}::readVCF,
#' \href{https://cran.r-project.org/package=data.table}{data.table}::fread
#'
#' Bioconductor:
#' \href{https://www.bioconductor.org/packages/release/bioc/html/VariantAnnotation.html}{VariantAnnotation}::readVcf
#'
#' Use: browseVignettes('vcfR') to find examples.
#'
#'
#' @examples
#' data(vcfR_test)
#' vcfR_test
#' head(vcfR_test)
#' # CRAN requires developers to us a tempdir when writing to the filesystem.
#' # You may want to implement this example elsewhere.
#' orig_dir <- getwd()
#' temp_dir <- tempdir()
#' setwd( temp_dir )
#' write.vcf( vcfR_test, file = "vcfR_test.vcf.gz" )
#' vcf <- read.vcfR( file = "vcfR_test.vcf.gz", verbose = FALSE )
#' vcf
#' setwd( orig_dir )
#'
#'
# ' @rdname io_vcfR
#' @aliases read.vcfR
#' @export
#'
read.vcfR <- function(file,
limit=1e7,
nrows = -1,
skip = 0,
cols = NULL,
convertNA = TRUE,
checkFile = TRUE,
check_keys = TRUE,
verbose = TRUE){
# require(memuse)
if( !is.character(file) ){
stop('The parameter file is expected to be a character.')
}
if( grepl('^http://|^https://|^ftp://|^ftps://', file) ){
# We have a link instead os a file.
file_name <- unlist(strsplit(file, split = "/"))
file_name <- file_name[[length(file_name)]]
if(file.exists(file_name)){
message(paste("Local file", file_name, "found."))
message('Using this local copy instead of retrieving a remote copy.')
} else {
message(paste("Downloading remote file", file))
utils::download.file(url = file, destfile = file_name, quiet = FALSE)
message("File downloaded.")
message("It will probably be faster to use this local file in the future instead of re-downloading it.")
}
file <- file_name
}
# gzopen does not appear to deal well with tilde expansion.
if( grepl("^~", file) ){
file <- path.expand(file)
}
# Test that this is a VCF file.
if(checkFile == TRUE){
vcf <- scan(file=file, what = character(), nmax=1, sep="\n", quiet = TRUE, comment.char = "")
if(substr(vcf,start=1, stop=17) != "##fileformat=VCFv"){
msg <- paste("File:", file, "does not appear to be a VCF file.\n")
msg <- paste(msg, " First line of file:\n", file)
msg <- paste(msg, "\n")
msg <- paste(msg, " Should begin with:\n##fileformat=VCFv")
msg <- paste(msg, "\n")
stop(msg)
}
}
vcf <- new(Class="vcfR")
stats <- .vcf_stats_gz(file, nrows=nrows, skip = skip, verbose = as.integer(verbose) )
# stats should be a named vector containing "meta", "header_line", "variants", "columns", and "last_line".
# They should have been initialize to zero.
if( stats['columns'] > 0 & stats['last_line'] > 0 & stats['columns'] != stats['last_line']){
msg <- paste("Your file appears to have", stats['columns'], "header elements")
msg <- paste(msg, "and", stats['last_line'], "columns in the body.\n")
msg <- paste(msg, "This should never happen!")
stop(msg)
}
if( stats['columns'] == 0 & stats['last_line'] > 0 ){
stats['columns'] <- stats['last_line']
}
if(verbose == TRUE){
cat("File attributes:")
cat("\n")
cat( paste(" meta lines:", stats['meta']) )
cat("\n")
cat( paste(" header_line:", stats['header_line']) )
cat("\n")
cat( paste(" variant count:", stats['variants']) )
cat("\n")
cat( paste(" column count:", stats['columns']) )
cat("\n")
}
utils::flush.console()
if( stats['meta'] < 0 ){
stop( paste("stats['meta'] less than zero:", stats['meta'], ", this should never happen.") )
}
if( stats['header_line'] < 0 ){
stop( paste("stats['header_line'] less than zero:", stats['header_line'], ", this should never happen.") )
}
if( stats['variants'] < 0 ){
stop( paste("stats['variants'] less than zero:", stats['variants'], ", this should never happen.") )
}
if( stats['columns'] < 0 ){
stop( paste("stats['columns'] less than zero:", stats['columns'], ", this should never happen.") )
}
if( is.null(cols) ){
cols <- 1:stats['columns']
}
# Make sure we include the first nine columns.
cols <- sort( unique( c(1:8, cols) ) )
# ram_est <- stats['variants'] * stats['columns'] * 8 + 248
ram_est <- memuse::howbig(stats['variants'], stats['columns'])
if(ram_est@size > limit){
message(paste("The number of variants in your file is:", prettyNum(stats['variants'], big.mark=",")))
message(paste("The number of samples in your file is:", prettyNum(stats['columns'] - 1, big.mark=",")))
message(paste("This will result in an object of approximately:", ram_est, "in size"))
stop("Object size limit exceeded")
}
# Read meta
vcf@meta <- .read_meta_gz(file, stats, as.numeric(verbose))
# Read body
body <- .read_body_gz(file, stats = stats,
nrows = nrows, skip = skip, cols = cols,
convertNA = as.numeric(convertNA), verbose = as.numeric(verbose))
vcf@fix <- body[ ,1:8, drop=FALSE ]
if( ncol(body) > 8 ){
vcf@gt <- body[ , -c(1:8), drop=FALSE ]
} else {
vcf@gt <- matrix("a", nrow=0, ncol=0)
}
# Check if keys in meta section are unique.
if( check_keys == TRUE ){
check_keys(vcf)
}
return(vcf)
}
#' @rdname io_vcfR
#' @export
#' @aliases write.vcf
#'
write.vcf <- function(x, file = "", mask = FALSE, APPEND = FALSE){
#if(class(x) == "chromR"){
if( inherits(x, "chromR") ){
if( mask == TRUE ){
is.na( x@vcf@fix[,'FILTER'] ) <- TRUE
x@vcf@fix[,'FILTER'][ x@var.info[,'mask'] ] <- 'PASS'
}
x <- x@vcf
}
# if(class(x) != "vcfR"){
if( !inherits(x, "vcfR") ){
stop("Unexpected class! Expecting an object of class vcfR or chromR.")
}
# gzopen does not appear to deal well with tilde expansion.
file <- path.expand(file)
if(APPEND == FALSE){
gz <- gzfile(file, "w")
if( length(x@meta) > 0 ){
write(x@meta, gz)
}
header <- c(colnames(x@fix), colnames(x@gt))
header[1] <- "#CHROM"
header <- paste(header, collapse="\t")
write(header, gz)
close(gz)
}
if(mask == FALSE){
test <- .write_vcf_body(fix = x@fix, gt = x@gt, filename = file, mask = 0)
} else if (mask == TRUE){
test <- .write_vcf_body(fix = x@fix, gt = x@gt, filename = file, mask = 1)
}
}
##### ##### ##### ##### #####
# EOF
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.