Nothing
#' @useDynLib BinaryDosage, .registration = TRUE
#' @importFrom Rcpp sourceCpp
#' @importFrom digest digest
#' @importFrom prodlim row.match
#' @importFrom utils read.table
NULL
validatebdinput <- function(bdfiles,
format,
subformat,
snpidformat,
bdoptions) {
if (is.character(bdfiles) == FALSE)
stop("bdfiles must be a vector of characters")
if (is.numeric(format) == FALSE && is.integer(format) == FALSE)
stop("format must be an integer value")
if (length(format) != 1)
stop("format must be an integer vector of length 1")
if (is.numeric(format) == TRUE) {
if (floor(format) != format)
stop("format must be an integer")
format <- floor(format)
}
if (format < 1 || format > 4)
stop("format must be an integer value from 1 to 4")
if (is.numeric(subformat) == FALSE && is.integer(subformat) == FALSE)
stop("subformat must be an integer value")
if (length(subformat) != 1)
stop("subformat must be an integer vector of length 1")
if (is.numeric(subformat) == TRUE) {
if (floor(subformat) != subformat)
stop("subformat must be an integer")
subformat <- floor(subformat)
}
if (subformat < 0 || subformat > 4)
stop("subformat must be an integer value from 0 to 4")
if (format < 3 && subformat > 2)
stop("subformat must be an integer value from 0 to 2 for formats 1 and 2")
if (format == 4 & length(bdfiles) != 1)
stop("Only one output file name is needed when using format 4")
if (format < 4 & length(bdfiles) != 3)
stop("Three output file names are required when using formats 1, 2, and 3")
if (is.na(match("", bdfiles)) == FALSE)
stop("Output file names cannot be blank")
if (is.numeric(snpidformat) == FALSE && is.integer(snpidformat) == FALSE)
stop("snpidformat must be an integer value")
if (length(snpidformat) != 1)
stop("snpidformat must be an integer vector of length 1")
if (is.numeric(snpidformat) == TRUE) {
if (floor(snpidformat) != snpidformat)
stop("snpidformat must be an integer")
snpidformat = floor(snpidformat)
}
if (snpidformat < -1 | snpidformat > 3)
stop("snpidformat must be and integer from -1 to 3")
if (is.character(bdoptions) == FALSE)
stop("bdoptions must be a character array")
if (length(bdoptions) > 0 & format != 4)
stop("bdoptions can only be used with format 4")
if (length(bdoptions) > 0) {
if (any(is.na(match(bdoptions, c("aaf", "maf", "rsq")))) == TRUE)
stop("Only valid bdoptions are aaf, maf, and rsq")
}
return(list(format = format,
subformat = subformat))
}
###########################################################
# VCF to Binary Dosage #
###########################################################
#' Convert a VCF file to a binary dosage file
#'
#' Routine to read information from a VCF file and create
#' a binary dosage file. The function is designed to use
#' files return from the Michigan Imputation Server but will
#' run on other VCF files if they contain dosage and genetic
#' probabilities. Note: This routine can take a long time to
#' run if the VCF file is large.
#'
#' @param vcffiles A vector of file names.
#' The first is the name of the vcf file. The
#' second is name of the file that contains information
#' about the imputation of the SNPs. This file is produced
#' by minimac 3 and 4.
#' @param gz Indicator if VCF file is compressed using gzip.
#' Default value is FALSE.
#' @param bdfiles Vector of names of the output files.
#' The binary dosage file name is first. The family and
#' map files follow. For format 4, no family and map file
#' names are needed.
#' @param format The format of the output binary dosage file.
#' Allowed values are 1, 2, 3, and 4. The default value is 4.
#' Using the default value is recommended.
#' @param subformat The subformat of the format of the output
#' binary dosage file. A value of 1 or 3 indicates that only the
#' dosage value is saved. A value of 2 or 4 indicates
#' the dosage and genetic probabilities will be output. Values
#' of 3 or 4 are only allowed with formats 3 and 4. If a value
#' of zero if provided, and genetic probabilities are in the vcf
#' file, subformat 2 will be used for formats 1 and 2, and
#' subformat 4 will be used for formats 3 and 4. If the vcf file
#' does not contain genetic probabilities, subformat 1 will be
#' used for formats 1 and 2, and subformat 3 will be used for
#' formats 3 and 4. The default value is 0.
#' @param snpidformat The format that the SNP ID will be saved as.
#' -1 SNP ID not written
#' 0 - same as in the VCF file
#' 1 - chromosome:location
#' 2 - chromosome:location:reference_allele:alternate_allele
#' If snpidformat is 1 and the VCF file uses format 2, an error is
#' generated. Default value is 0.
#' @param bdoptions Character array containing any of the following
#' value, "aaf", "maf", "rsq". The presence of any of these
#' values indicates that the specified values should be
#' calculates and stored in the binary dosage file. These values only
#' apply to format 4.
#'
#' @return
#' None
#' @export
#'
#' @examples
#' # Find the vcf file names
#' vcf1afile <- system.file("extdata", "set1a.vcf", package = "BinaryDosage")
#' vcf1ainfo <- system.file("extdata", "set1a.info", package = "BinaryDosage")
#' bdfiles <- tempfile()
#' # Convert the file
#' vcftobd(vcffiles = c(vcf1afile, vcf1ainfo), bdfiles = bdfiles)
#' # Verify the file was written correctly
#' bdinfo <- getbdinfo(bdfiles)
vcftobd <- function(vcffiles,
gz = FALSE,
bdfiles,
format = 4L,
subformat = 0L,
snpidformat = 0,
bdoptions = character(0)) {
if (missing(vcffiles) == TRUE)
stop("No VCF file specified")
if (missing(bdfiles) == TRUE)
stop("No output files specified")
validation <- validatebdinput(bdfiles = bdfiles,
format = format,
subformat = subformat,
snpidformat = snpidformat,
bdoptions = bdoptions)
format <- validation$format
subformat <- validation$subformat
if (snpidformat == -1)
readsnpformat = 0
else
readsnpformat = snpidformat
vcfinfo <- getvcfinfo(vcffiles = vcffiles,
gz = gz,
index = FALSE,
snpidformat = readsnpformat)
if (snpidformat == -1)
vcfinfo$snpidformat = 1
else
vcfinfo$snpidformat = 0
if (subformat == 0) {
if (anyNA(vcfinfo$additionalinfo$datacolumns$genotypeprob) == TRUE)
subformat <- 1
else
subformat <- 2
}
WriteBinaryDosageHeader(format = format,
subformat = subformat,
filename = bdfiles,
genefileinfo = vcfinfo,
bdoptions = bdoptions)
headerinfo <- ReadBinaryDosageHeader(filename = bdfiles)
bdwriteinfo <- AllocateBinaryDosageWriteMemory(headerinfo = headerinfo)
vcfapply(vcfinfo = vcfinfo,
func = WriteBinaryDosageData,
writeinfo = bdwriteinfo)
WriteBinaryDosageIndices(writeinfo = bdwriteinfo)
bdinfo <- getbdinfo(bdfiles = bdfiles)
if (is.na(match("aaf", bdoptions)) == FALSE)
updateaaf(bdinfo)
if (is.na(match("maf", bdoptions)) == FALSE)
updatemaf(bdinfo)
if (is.na(match("rsq", bdoptions)) == FALSE)
updatersq(bdinfo)
##return (0)
}
###########################################################
# Gen to Binary Dosage #
###########################################################
#' Convert a gen file to a binary dosage file
#'
#' Routine to read information from a gen file and create
#' a binary dosage file. Note: This routine can take a long
#' time to run if the gen file is large.
#'
#' @param genfiles A vector of file names.
#' The first is the name of the gen file. The
#' second is name of the sample file that contains
#' the subject information.
#' @param snpcolumns Column numbers containing chromosome,
#' snpid, location, reference allele, alternate allele,
#' respectively. This must be an integer vector. All
#' values must be positive except for the chromosome.
#' The value for the chromosome may be -1 or -0.
#' -1 indicates that the chromosome value is passed to
#' the routine using the chromosome parameter.
#' 0 indicates that the chromosome value is in the snpid
#' and that the snpid has the format chromosome:other_data.
#' Default value is c(1L, 2L, 3L, 4L, 5L).
#' @param startcolumn Column number of first column with
#' genetic probabilities or dosages. Must
#' be an integer value. Default value is 6L.
#' @param impformat Number of genetic data values per
#' subject. 1 indicates dosage only, 2 indicates P(g=0)
#' and P(g=1) only, 3 indicates P(g=0), P(g=1), and
#' P(g=2). Default value is 3L.
#' @param chromosome Chromosome value to use if the
#' first value of the snpcolumns is equal to 0.
#' Default value is character().
#' @param header Indicators if the gen and sample files
#' have headers. If the gen file does not have a
#' header. A sample file must be included.
#' Default value is c(FALSE, TRUE).
#' @param gz Indicator if file is compressed using gzip.
#' Default value is FALSE.
#' @param sep Separator used in the gen file. Default
#' value is `"\t"`
#' @param bdfiles Vector of names of the output files.
#' The binary dosage file name is first. The family and
#' map files follow. For format 4, no family and map file
#' names are needed.
#' @param format The format of the output binary dosage file.
#' Allowed values are 1, 2, 3, and 4. The default value is 4.
#' Using the default value is recommended.
#' @param subformat The subformat of the format of the output
#' binary dosage file. A value of 1 or 3 indicates that only the
#' dosage value is saved. A value of 2 or 4 indicates
#' the dosage and genetic probabilities will be output. Values
#' of 3 or 4 are only allowed with formats 3 and 4. If a value
#' of zero if provided, and genetic probabilities are in the vcf
#' file, subformat 2 will be used for formats 1 and 2, and
#' subformat 4 will be used for formats 3 and 4. If the vcf file
#' does not contain genetic probabilities, subformat 1 will be
#' used for formats 1 and 2, and subformat 3 will be used for
#' formats 3 and 4. The default value is 0.
#' @param snpidformat The format that the SNP ID will be saved as.
#' -1 - SNP ID not written.
#' 0 - same as in the VCF file.
#' 1 - chromosome:location.
#' 2 - chromosome:location:reference_allele:alternate_allele.
#' If snpidformat is 1 and the VCF file uses format 2, an error is
#' generated. Default value is 0.
#' @param bdoptions Character array containing any of the following
#' value, "aaf", "maf", "rsq". The presence of any of these
#' values indicates that the specified values should be
#' calculates and stored in the binary dosage file. These values only
#' apply to format 4.
#'
#' @return
#' None
#' @export
#'
#' @examples
#' # Find the gen file names
#' gen3afile <- system.file("extdata", "set3a.imp", package = "BinaryDosage")
#' gen3asample <- system.file("extdata", "set3a.sample", package = "BinaryDosage")
#' # Get temporary output file name
#' bdfiles <- tempfile()
#' # Convert the file
#' gentobd(genfiles = c(gen3afile, gen3asample),
#' snpcolumns = c(0L, 2L:5L),
#' bdfiles = bdfiles)
#' # Verify the file was written correctly
#' bdinfo <- getbdinfo(bdfiles = bdfiles)
gentobd <- function(genfiles,
snpcolumns = 1L:5L,
startcolumn = 6L,
impformat = 3L,
chromosome = character(),
header = c(FALSE, TRUE),
gz = FALSE,
sep = "\t",
bdfiles,
format = 4L,
subformat = 0L,
snpidformat = 0L,
bdoptions = character(0)) {
if (missing(genfiles) == TRUE)
stop("No gen file specified")
if (missing(bdfiles) == TRUE)
stop("No output files specified")
validation <- validatebdinput(bdfiles = bdfiles,
format = format,
subformat = subformat,
snpidformat = snpidformat,
bdoptions = bdoptions)
format <- validation$format
subformat <- validation$subformat
if (snpidformat == -1)
readsnpformat = 0L
else
readsnpformat = as.integer(snpidformat)
geninfo <- getgeninfo(genfiles = genfiles,
snpcolumns = snpcolumns,
startcolumn = startcolumn,
impformat = impformat,
chromosome = chromosome,
header = header,
gz = gz,
index = FALSE,
snpidformat = readsnpformat,
sep = sep)
if (snpidformat == -1)
geninfo$snpidformat = 1
else
geninfo$snpidformat = 0
if (subformat == 0) {
if (geninfo$additionalinfo$format == 1L)
subformat <- 1L
else
subformat <- 2L
}
WriteBinaryDosageHeader(format = format,
subformat = subformat,
filename = bdfiles,
genefileinfo = geninfo,
bdoptions = bdoptions)
headerinfo <- ReadBinaryDosageHeader(filename = bdfiles)
bdwriteinfo <- AllocateBinaryDosageWriteMemory(headerinfo = headerinfo)
genapply(geninfo = geninfo,
func = WriteBinaryDosageData,
writeinfo = bdwriteinfo)
WriteBinaryDosageIndices(writeinfo = bdwriteinfo)
bdinfo <- getbdinfo(bdfiles = bdfiles)
if (is.na(match("aaf", bdoptions)) == FALSE)
updateaaf(bdinfo)
if (is.na(match("maf", bdoptions)) == FALSE)
updatemaf(bdinfo)
if (is.na(match("rsq", bdoptions)) == FALSE)
updatersq(bdinfo)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.