Nothing
#######################################################################
#
# Package Name: SeqArray
#
# Description:
# Data Management of Large-scale Whole-Genome Sequence Variant Calls
#
#######################################################################
# Convert a SeqArray GDS file to a VCF file
#
seqGDS2VCF <- function(gdsfile, vcf.fn, info.var=NULL, fmt.var=NULL,
chr_prefix="", use_Rsamtools=TRUE, verbose=TRUE)
{
# check
stopifnot(is.character(gdsfile) | inherits(gdsfile, "SeqVarGDSClass"))
if (is.character(gdsfile))
stopifnot(length(gdsfile)==1L)
if (!inherits(vcf.fn, "connection"))
stopifnot(is.character(vcf.fn), length(vcf.fn)==1L)
stopifnot(is.null(info.var) | is.character(info.var))
stopifnot(is.null(fmt.var) | is.character(fmt.var))
stopifnot(is.character(chr_prefix), length(chr_prefix)==1L)
stopifnot(is.logical(use_Rsamtools), length(use_Rsamtools)==1L)
stopifnot(is.logical(verbose), length(verbose)==1L)
if (is.character(gdsfile))
{
gdsfile <- seqOpen(gdsfile)
on.exit(seqClose(gdsfile))
}
# get a summary
z <- seqSummary(gdsfile, check="none", verbose=FALSE)
# the INFO field
if (!is.null(info.var))
{
s <- z$info$ID
if (is.null(s)) s <- character()
if (length(setdiff(info.var, s)) > 0L)
stop(paste("Not exist:", paste(setdiff(info.var, s), collapse=",")))
if (length(info.var) > 0L)
z$info <- z$info[match(info.var, z$info$ID), ]
else
z$info <- list()
}
# the FORMAT field
if (!is.null(fmt.var))
{
s <- z$format$ID[-1L]
if (is.null(s)) s <- character()
if (length(setdiff(fmt.var, s)) > 0L)
stop(paste("Not exist:", paste(setdiff(fmt.var, s), collapse=",")))
if (length(fmt.var) > 0L)
z$format <- z$format[match(fmt.var, z$format$ID), ]
else
z$format <- list()
} else {
z$format <- z$format[-1L, ]
}
## double quote text if needed
dq <- function(s, text=FALSE)
{
.Call(SEQ_Quote, s, text)
}
######################################################
# create an output text file
bgzf <- FALSE
if (!inherits(vcf.fn, "connection"))
{
ext <- substring(vcf.fn, nchar(vcf.fn)-2L)
if (ext == ".gz")
{
if (.Platform$OS.type == "windows")
use_Rsamtools <- FALSE
if (isTRUE(use_Rsamtools) && requireNamespace("Rsamtools"))
{
ofile <- .Call(SEQ_bgzip_create, vcf.fn)
bgzf <- TRUE
} else {
if (verbose)
{
if (.Platform$OS.type != "windows")
message("Hint: install Rsamtools to enable the bgzf output.")
}
ofile <- gzfile(vcf.fn, "wb")
}
} else if (ext == ".bz")
{
ofile <- bzfile(vcf.fn, "wb")
} else if (ext == ".xz")
{
ofile <- xzfile(vcf.fn, "wb")
} else {
ofile <- file(vcf.fn, open="wb")
}
on.exit(close(ofile), add=TRUE)
} else {
ofile <- vcf.fn
}
op <- options("useFancyQuotes")
options(useFancyQuotes = FALSE)
on.exit(options(op), add=TRUE)
if (verbose)
{
cat(date(), "\n", sep="")
cat("VCF Export: ", basename(vcf.fn), "\n", sep="")
s <- .seldim(gdsfile)
cat(" ", .pretty(s[2L]), " sample", .plural(s[2L]), ", ",
.pretty(s[3L]), " variant", .plural(s[3L]), "\n", sep="")
s <- paste(z$info$ID, collapse=", ")
cat(" INFO Field: ", ifelse(s!="", s, "<none>"), "\n", sep="")
s <- paste(z$format$ID, collapse=", ")
cat(" FORMAT Field: ", ifelse(s!="", s, "<none>"), "\n", sep="")
cat(ifelse(bgzf,
" output to BGZF format\n", " output to a general gzip file\n"))
}
######################################################
# write the header
txt <- character()
a <- get.attr.gdsn(index.gdsn(gdsfile, "description"))
# fileformat
if (is.null(a$vcf.fileformat))
a$vcf.fileformat <- "VCFv4.2"
txt <- c(txt, paste0("##fileformat=", a$vcf.fileformat))
# fileDate
txt <- c(txt, paste0("##fileDate=", format(Sys.time(), "%Y%m%d")))
# program, source
aa <- get.attr.gdsn(gdsfile$root)
if (is.null(aa$FileVersion))
aa$FileVersion <- "v1.0"
txt <- c(txt, paste0("##source=SeqArray_Format_", aa$FileVersion))
# reference
if (length(z$reference) > 0L)
txt <- c(txt, paste0("##reference=", z$reference[1L]))
# assembly
if (!is.null(a$vcf.assembly))
txt <- c(txt, paste0("##assembly=", dq(a$vcf.assembly)))
# ALT=<ID=type,Description=description>
for (i in seq_len(nrow(z$allele)))
{
s <- sprintf("##ALT=<ID=%s,Description=%s>",
as.character(z$allele[i,1L]), dq(z$allele[i,2L]))
txt <- c(txt, s)
}
# contig=<ID=ctg1,URL=ftp://somewhere.org/assembly.fa,...>
n <- index.gdsn(gdsfile, "description/vcf.contig", silent=TRUE)
if (!is.null(n))
{
dat <- read.gdsn(n)
nm <- names(dat)
for (i in seq_len(nrow(dat)))
{
s <- NULL
for (j in seq_len(ncol(dat)))
s[j] <- paste(nm[j], "=", dq(dat[i,j]), sep="")
s <- paste(s, collapse=",")
txt <- c(txt, paste0("##contig=<", s, ">"))
}
}
# INFO field
for (nm in z$info$ID)
{
a <- get.attr.gdsn(index.gdsn(gdsfile,
paste("annotation/info/", nm, sep="")))
s <- sprintf("ID=%s,Number=%s,Type=%s,Description=%s",
nm, dq(a$Number), dq(a$Type), dq(a$Description, TRUE))
if (!is.null(a$Source))
s <- paste(s, ",Source=", dq(a$Source, TRUE), sep="")
if (!is.null(a$Version))
s <- paste(s, ",Version=", dq(a$Version, TRUE), sep="")
txt <- c(txt, paste0("##INFO=<", s, ">"))
}
# FILTER field
n <- index.gdsn(gdsfile, "annotation/filter", silent=TRUE)
if (!is.null(n))
{
at <- get.attr.gdsn(n)
id <- at$R.levels; dp <- at$Description
for (i in seq_along(id))
{
txt <- c(txt, sprintf("##FILTER=<ID=%s,Description=%s>",
dq(id[i]), dq(dp[i], TRUE)))
}
}
# FORMAT field
a <- get.attr.gdsn(index.gdsn(gdsfile, "genotype"))
txt <- c(txt, sprintf(
"##FORMAT=<ID=%s,Number=1,Type=String,Description=%s>",
a$VariableName, dq(a$Description, TRUE)))
for (nm in z$format$ID)
{
a <- get.attr.gdsn(index.gdsn(gdsfile,
paste("annotation/format/", nm, sep="")))
txt <- c(txt, sprintf(
"##FORMAT=<ID=%s,Number=%s,Type=%s,Description=%s>",
nm, dq(a$Number), dq(a$Type), dq(a$Description, TRUE)))
}
# others
n <- index.gdsn(gdsfile, "description/vcf.header", silent=TRUE)
if (!is.null(n))
{
dat <- read.gdsn(n)
for (i in seq_len(nrow(dat)))
{
s <- dat[i,1L]
if (!(s %in% c("fileDate", "source")))
txt <- c(txt, paste0("##", s, "=", dq(dat[i,2L])))
}
}
######################################################
# write the header -- samples
txt <- c(txt, paste(
c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO",
"FORMAT", seqGetData(gdsfile, "sample.id")), collapse="\t"))
writeLines(txt, ofile)
######################################################
# write the contents
# the INFO field
nm.info <- character()
if (!is.null(z$info$ID))
{
s <- z$info$ID
if (length(s) > 0L)
nm.info <- paste("annotation/info/", s, sep="")
}
# the FORMAT field
nm.format <- character()
if (!is.null(z$format$ID))
{
s <- z$format$ID
if (length(s) > 0L)
nm.format <- paste("annotation/format/", s, sep="")
}
# initialize the variable length of INFO
len.info <- NULL
for (n in nm.info)
{
a <- get.attr.gdsn(index.gdsn(gdsfile, n))
a$Number <- if (is.null(a$Number)) "." else a$Number[1L]
len.info <- c(len.info, a$Number)
}
len.info <- suppressWarnings(as.integer(len.info))
# initialize the variable length of FORMAT
len.fmt <- NULL
for (n in nm.format)
{
a <- get.attr.gdsn(index.gdsn(gdsfile, n))
a$Number <- if (is.null(a$Number)) "." else a$Number[1L]
len.fmt <- c(len.fmt, a$Number)
}
len.fmt <- suppressWarnings(as.integer(len.fmt))
# initialize
dm <- .seldim(gdsfile)
.Call(SEQ_ToVCF_Init, dm, chr_prefix, len.info, len.fmt, ofile, verbose)
on.exit({ .Call(SEQ_ToVCF_Done) }, add=TRUE)
# variable names
nm <- c("chromosome", "position", "annotation/id", "allele",
"annotation/qual", "annotation/filter", "genotype", "phase")
if (length(nm.info) > 0L) nm <- c(nm, nm.info)
if (length(nm.format) > 0L) nm <- c(nm, nm.format)
s <- c("chr", "pos", "id", "allele", "qual", "filter", "geno", "phase")
# the INFO field
if (length(nm.info) > 0L)
s <- c(s, paste("info.", z$info$ID, sep=""))
# the FORMAT field
if (length(nm.format) > 0L)
s <- c(s, paste("fmt.", z$format$ID, sep=""))
names(nm) <- s
# C function name
if (is.null(index.gdsn(gdsfile, "genotype/data", silent=TRUE)))
{
nm <- nm[!(nm %in% c("genotype", "phase"))]
cfn <- "SEQ_ToVCF_NoGeno"
} else if (is.null(index.gdsn(gdsfile, "phase/data", silent=TRUE)))
{
nm <- nm[nm != "phase"]
cfn <- "SEQ_ToVCF_Haploid"
} else {
if (length(nm.format)>0L | dm[1L]!=2L)
cfn <- "SEQ_ToVCF"
else
cfn <- "SEQ_ToVCF_Di_WrtFmt"
}
# output lines by variant
seqApply(gdsfile, nm, margin="by.variant", as.is="none",
FUN=.cfunction(cfn), .useraw=NA, .progress=verbose)
# finalize
.Call(SEQ_ToVCF_Done)
on.exit({
if (verbose)
cat(date(), " Done.\n", sep="")
}, add=TRUE)
# output
if (!inherits(vcf.fn, "connection"))
invisible(normalizePath(vcf.fn))
else
invisible()
}
#######################################################################
# Convert a SeqArray GDS file to a SNP GDS file
#
seqGDS2SNP <- function(gdsfile, out.gdsfn, dosage=FALSE,
compress.geno="LZMA_RA", compress.annotation="LZMA_RA",
ds.type=c("packedreal16", "float", "double"), optimize=TRUE, verbose=TRUE)
{
# check
stopifnot(is.character(gdsfile) | inherits(gdsfile, "SeqVarGDSClass"))
stopifnot(is.character(out.gdsfn), length(out.gdsfn)==1L)
stopifnot(is.logical(dosage) | is.character(dosage), length(dosage)==1L)
stopifnot(is.character(compress.geno), length(compress.geno)==1L)
stopifnot(is.character(compress.annotation), length(compress.annotation)==1L)
stopifnot(is.logical(optimize), length(optimize)==1L)
stopifnot(is.logical(verbose), length(verbose)==1L)
ds.type <- match.arg(ds.type)
if (verbose)
{
cat(date(), "\n", sep="")
if (isTRUE(dosage) | is.character(dosage))
cat("SeqArray GDS to SNP GDS dosage format:\n")
else
cat("SeqArray GDS to SNP GDS format:\n")
}
# if it is a file name
if (is.character(gdsfile))
{
gdsfile <- seqOpen(gdsfile)
on.exit({ seqClose(gdsfile) })
}
# dosage variable name
if (isTRUE(dosage) | is.character(dosage))
{
if (isTRUE(dosage))
dosage <- "annotation/format/DS"
else if (substr(dosage, 1L, 18L) != "annotation/format/")
dosage <- paste0("annotation/format/", dosage)
}
if (verbose)
{
dm <- .seldim(gdsfile)
cat(" # of samples: ", .pretty(dm[2L]), "\n", sep="")
cat(" # of variants: ", .pretty(dm[3L]), "\n", sep="")
if (is.character(dosage))
{
cat(" dosage compression: ", compress.geno, ", from [",
dosage, "]\n", sep="")
} else {
cat(" genotype compression: ", compress.geno, "\n", sep="")
}
cat(" annotation compression: ", compress.annotation, "\n", sep="")
}
# create GDS file
gfile <- createfn.gds(out.gdsfn)
# close the file at the end
on.exit({
if (!is.null(gfile)) closefn.gds(gfile)
}, add=TRUE)
# add a flag
if (!is.character(dosage))
{
put.attr.gdsn(gfile$root, "FileFormat", "SNP_ARRAY")
} else {
put.attr.gdsn(gfile$root, "FileFormat", "IMPUTED_DOSAGE")
}
# add "sample.id"
sampid <- seqGetData(gdsfile, "sample.id")
add.gdsn(gfile, "sample.id", sampid,
compress=compress.annotation, closezip=TRUE)
# add "snp.id"
add.gdsn(gfile, "snp.id", seqGetData(gdsfile, "variant.id"),
compress=compress.annotation, closezip=TRUE)
# add "snp.rs.id"
if (!is.null(index.gdsn(gdsfile, "annotation/id", silent=TRUE)))
{
add.gdsn(gfile, "snp.rs.id", seqGetData(gdsfile, "annotation/id"),
compress=compress.annotation, closezip=TRUE)
}
# add "snp.position"
add.gdsn(gfile, "snp.position", seqGetData(gdsfile, "position"),
compress=compress.annotation, closezip=TRUE)
# add "snp.chromosome"
add.gdsn(gfile, "snp.chromosome", seqGetData(gdsfile, "chromosome"),
compress=compress.annotation, closezip=TRUE)
# add "snp.allele"
add.gdsn(gfile, "snp.allele",
.cfunction("FC_AlleleStr")(seqGetData(gdsfile, "allele")),
compress=compress.annotation, closezip=TRUE)
# add "genotype"
if (!is.character(dosage))
{
gGeno <- add.gdsn(gfile, "genotype", storage="bit2",
valdim=c(length(sampid), 0L), compress=compress.geno)
put.attr.gdsn(gGeno, "sample.order")
seqApply(gdsfile, "$dosage", as.is=gGeno, .useraw=TRUE, .progress=verbose,
FUN = .cfunction("FC_GDS2SNP"))
} else {
.cfunction("FC_SetNumSamp")(length(sampid))
gGeno <- add.gdsn(gfile, "genotype", storage=ds.type,
valdim=c(length(sampid), 0L), compress=compress.geno)
put.attr.gdsn(gGeno, "sample.order")
seqApply(gdsfile, dosage, as.is=gGeno, .progress=verbose,
FUN = .cfunction("FC_GDS2Dosage"))
}
readmode.gdsn(gGeno)
closefn.gds(gfile)
gfile <- NULL
if (verbose)
cat("Done.\n", date(), "\n", sep="")
if (optimize)
{
if (verbose) cat("Optimize the access efficiency ...\n")
cleanup.gds(out.gdsfn, verbose=verbose)
if (verbose) cat(date(), "\n", sep="")
}
# output
invisible(normalizePath(out.gdsfn))
}
#######################################################################
# Convert a SNP GDS file to a SeqArray GDS file
#
seqSNP2GDS <- function(gds.fn, out.fn, storage.option="LZMA_RA", major.ref=TRUE,
ds.type=c("packedreal16", "float", "double"), optimize=TRUE, digest=TRUE,
verbose=TRUE)
{
# check
stopifnot(is.character(gds.fn), length(gds.fn)==1L)
stopifnot(is.character(out.fn), length(out.fn)==1L)
ds.type <- match.arg(ds.type)
stopifnot(is.logical(major.ref), length(major.ref)==1L)
stopifnot(is.logical(optimize), length(optimize)==1L)
stopifnot(is.logical(digest) | is.character(digest), length(digest)==1L)
stopifnot(is.logical(verbose), length(verbose)==1L)
if (is.character(storage.option))
storage.option <- seqStorageOption(storage.option)
stopifnot(inherits(storage.option, "SeqGDSStorageClass"))
if (verbose)
{
cat(date(), "\n", sep="")
cat("SNP GDS to SeqArray GDS Format:\n")
}
# open the SNP GDS
srcfile <- openfn.gds(gds.fn)
on.exit({ closefn.gds(srcfile) })
nSamp <- prod(objdesp.gdsn(index.gdsn(srcfile, "sample.id"))$dim)
nSNP <- prod(objdesp.gdsn(index.gdsn(srcfile, "snp.id"))$dim)
# check genotype type
n <- index.gdsn(srcfile, "genotype")
dm <- objdesp.gdsn(n)$dim
if (length(dm) != 2L)
stop("'genotype' of SNP GDS should be a matrix.")
geno_type <- as.character(objdesp.gdsn(n)$type)
if (!(geno_type %in% c("Integer", "Real")))
stop("'genotype' should be integers or real numbers.")
# determine how genotypes are stored
snpfirstdim <- TRUE
rd <- names(get.attr.gdsn(n))
if ("snp.order" %in% rd) snpfirstdim <- TRUE
if ("sample.order" %in% rd) snpfirstdim <- FALSE
if (snpfirstdim)
{
if ((dm[1L]!=nSNP) || (dm[2L]!=nSamp))
stop("Invalid dimension of 'genotype'.")
} else {
if ((dm[1L]!=nSamp) || (dm[2L]!=nSNP))
stop("Invalid dimension of 'genotype'.")
}
# create GDS file
dstfile <- createfn.gds(out.fn)
# close the file at the end
on.exit({ closefn.gds(dstfile) }, add=TRUE)
put.attr.gdsn(dstfile$root, "FileFormat", "SEQ_ARRAY")
put.attr.gdsn(dstfile$root, "FileVersion", "v1.0")
n <- addfolder.gdsn(dstfile, "description")
put.attr.gdsn(n, "source.format", "SNPRelate GDS Format")
# add sample.id
if (verbose) cat(" sample.id")
s <- read.gdsn(index.gdsn(srcfile, "sample.id"))
n <- .AddVar(storage.option, dstfile, "sample.id", s, closezip=TRUE)
.DigestCode(n, digest, verbose)
# add variant.id
if (verbose) cat(" variant.id")
s <- read.gdsn(index.gdsn(srcfile, "snp.id"))
n <- .AddVar(storage.option, dstfile, "variant.id", s, closezip=TRUE)
.DigestCode(n, digest, verbose)
# add position
if (verbose) cat(" position")
s <- read.gdsn(index.gdsn(srcfile, "snp.position"))
n <- .AddVar(storage.option, dstfile, "position", s, closezip=TRUE)
.DigestCode(n, digest, verbose)
# add chromosome
if (verbose) cat(" chromosome")
s <- read.gdsn(index.gdsn(srcfile, "snp.chromosome"))
s <- as.character(s)
n <- .AddVar(storage.option, dstfile, "chromosome", s, closezip=TRUE)
.DigestCode(n, digest, verbose)
# add allele
nd_allele <- .AddVar(storage.option, dstfile, "allele", storage="character")
# add a folder for genotypes
if (verbose) cat(" genotype")
varGeno <- addfolder.gdsn(dstfile, "genotype")
put.attr.gdsn(varGeno, "VariableName", "GT")
put.attr.gdsn(varGeno, "Description", "Genotype")
# major reference
.cfunction("FC_SNP2GDS_Ref")(major.ref)
nd_geno <- .AddVar(storage.option, varGeno, "data", storage="bit2",
valdim=c(2L, nSamp, 0L))
nd_geno_idx <- .AddVar(storage.option, varGeno, "@data",
storage="uint8", visible=FALSE)
n <- .AddVar(storage.option, varGeno, "extra.index", storage="int32",
valdim=c(3L,0L), closezip=TRUE)
put.attr.gdsn(n, "R.colnames",
c("sample.index", "variant.index", "length"))
.AddVar(storage.option, varGeno, "extra", storage="int16", closezip=TRUE)
if (geno_type == "Integer")
{
# add genotypes to genotype/data
apply.gdsn(list(index.gdsn(srcfile, "genotype"),
index.gdsn(srcfile, "snp.allele")),
c(ifelse(snpfirstdim, 1L, 2L), 1L),
FUN=.cfunction("FC_SNP2GDS"), as.is="gdsnode",
target.node=list(nd_geno, nd_allele))
readmode.gdsn(nd_geno)
.DigestCode(nd_geno, digest, verbose)
if (verbose) cat(" allele")
readmode.gdsn(nd_allele)
.DigestCode(nd_allele, digest, verbose)
.append_rep_gds(nd_geno_idx, as.raw(1L), nSNP)
readmode.gdsn(nd_geno_idx)
.DigestCode(nd_geno_idx, digest, FALSE)
sync.gds(dstfile)
} else {
# add dosages to annotation/format/DS
readmode.gdsn(nd_geno)
readmode.gdsn(nd_geno_idx)
}
# add a folder for phase information
if (verbose) cat(" phase")
varPhase <- addfolder.gdsn(dstfile, "phase")
n <- .AddVar(storage.option, varPhase, "data", storage="bit1",
valdim=c(nSamp, 0L))
if (geno_type == "Integer")
.append_rep_gds(n, as.raw(0L), as.double(nSNP)*nSamp)
readmode.gdsn(n)
.DigestCode(n, digest, verbose)
n <- .AddVar(storage.option, varPhase, "extra.index", storage="int32",
valdim=c(3L,0L), closezip=TRUE)
put.attr.gdsn(n, "R.colnames",
c("sample.index", "variant.index", "length"))
.AddVar(storage.option, varPhase, "extra", storage="bit1", closezip=TRUE)
sync.gds(dstfile)
# add annotation folder
varAnnot <- addfolder.gdsn(dstfile, "annotation")
# add annotation/id
if (verbose) cat(" annotation/id")
n <- .AddVar(storage.option, varAnnot, "id", storage="string")
if (is.null(index.gdsn(srcfile, "snp.rs.id", silent=TRUE)))
assign.gdsn(n, index.gdsn(srcfile, "snp.id"))
else
assign.gdsn(n, index.gdsn(srcfile, "snp.rs.id"))
readmode.gdsn(n)
.DigestCode(n, digest, verbose)
# add annotation/qual
n <- .AddVar(storage.option, varAnnot, "qual", storage="float")
.append_rep_gds(n, 100.0, nSNP)
readmode.gdsn(n)
.DigestCode(n, digest, FALSE)
# add filter
n <- .AddVar(storage.option, varAnnot, "filter", storage="int32")
.append_rep_gds(n, as.raw(1L), nSNP)
readmode.gdsn(n)
put.attr.gdsn(n, "R.class", "factor")
put.attr.gdsn(n, "R.levels", c("PASS"))
put.attr.gdsn(n, "Description", c("All filters passed"))
.DigestCode(n, digest, FALSE)
# add the INFO field
n1 <- addfolder.gdsn(varAnnot, "info")
n2 <- index.gdsn(srcfile, "snp.annot", silent=TRUE)
if (!is.null(n2))
{
for (i in ls.gdsn(n2))
{
if (verbose) cat(" annotation/info/", i, sep="")
copyto.gdsn(n1, index.gdsn(n2, i))
.DigestCode(index.gdsn(n1, i), digest, verbose)
}
}
# add the FORMAT field
n <- addfolder.gdsn(varAnnot, "format")
# add dosages to annotation/format/DS
if (geno_type == "Real")
{
if (verbose) cat(" annotation/format/DS")
varGeno <- addfolder.gdsn(n, "DS")
put.attr.gdsn(varGeno, "Number", "1")
put.attr.gdsn(varGeno, "Type", "Float")
put.attr.gdsn(varGeno, "Description", "Estimated alternate allele dosage")
nd_geno <- .AddVar(storage.option, varGeno, "data", storage=ds.type,
valdim=c(nSamp, 0L))
# add genotypes to genotype/data
apply.gdsn(list(index.gdsn(srcfile, "genotype"),
index.gdsn(srcfile, "snp.allele")),
c(ifelse(snpfirstdim, 1L, 2L), 1L),
FUN=.cfunction("FC_Dosage2GDS"), as.is="gdsnode",
target.node=list(nd_geno, nd_allele))
readmode.gdsn(nd_geno)
.DigestCode(nd_geno, digest, verbose)
if (verbose) cat(" allele")
readmode.gdsn(nd_allele)
.DigestCode(nd_allele, digest, verbose)
nd_geno_idx <- .AddVar(storage.option, varGeno, "@data",
storage="uint8", visible=FALSE)
.append_rep_gds(nd_geno_idx, as.raw(1L), nSNP)
readmode.gdsn(nd_geno_idx)
.DigestCode(nd_geno_idx, digest, FALSE)
sync.gds(dstfile)
}
# add sample annotation
if (verbose) cat(" sample.annotation\n")
n <- addfolder.gdsn(dstfile, "sample.annotation")
n1 <- index.gdsn(srcfile, "sample.annot", silent=TRUE)
if (!is.null(n1))
{
for (i in ls.gdsn(n1))
{
if (verbose) cat(" sample.annotation/", i, sep="")
copyto.gdsn(n, index.gdsn(n1, i))
.DigestCode(index.gdsn(n, i), digest, verbose)
}
}
on.exit()
closefn.gds(srcfile)
closefn.gds(dstfile)
##################################################
# optimize access efficiency
if (verbose)
{
cat("Done.\n")
cat(date(), "\n", sep="")
}
if (optimize)
{
if (verbose)
cat("Optimize the access efficiency ...\n")
cleanup.gds(out.fn, verbose=verbose)
if (verbose) cat(date(), "\n", sep="")
}
# output
invisible(normalizePath(out.fn))
}
#######################################################################
# Convert a PLINK BED file to a SeqArray GDS file
#
seqBED2GDS <- function(bed.fn, fam.fn, bim.fn, out.gdsfn,
compress.geno="LZMA_RA", compress.annotation="LZMA_RA", chr.conv=TRUE,
optimize=TRUE, digest=TRUE, parallel=FALSE, verbose=TRUE)
{
# check
stopifnot(is.character(bed.fn), length(bed.fn)==1L)
if (missing(fam.fn) && missing(bim.fn))
{
bed.fn <- gsub("\\.bed$", "", bed.fn, ignore.case=TRUE)
fam.fn <- paste0(bed.fn, ".fam")
bim.fn <- paste0(bed.fn, ".bim")
bed.fn <- paste0(bed.fn, ".bed")
}
stopifnot(is.character(fam.fn), length(fam.fn)==1L)
stopifnot(is.character(bim.fn), length(bim.fn)==1L)
stopifnot(is.character(out.gdsfn), length(out.gdsfn)==1L)
stopifnot(is.character(compress.geno), length(compress.geno)==1L)
stopifnot(is.character(compress.annotation), length(compress.annotation)==1L)
stopifnot(is.logical(chr.conv), length(chr.conv)==1L)
stopifnot(is.logical(optimize), length(optimize)==1L)
stopifnot(is.logical(digest) | is.character(digest), length(digest)==1L)
stopifnot(is.logical(verbose), length(verbose)==1L)
pnum <- .NumParallel(parallel)
if (verbose)
{
cat(date(), "\n", sep="")
cat("PLINK BED to SeqArray GDS:\n")
}
## open and detect bed.fn ##
bedfile <- .open_bin(bed.fn)
on.exit({ .close_conn(bedfile) })
b <- as.integer(readBin(bedfile$con, raw(), 3L))
if ((b[1L] != 0x6C) || (b[2L] != 0x1B))
stop(sprintf("Invalid magic number (0x%02X,0x%02X).", b[1L], b[2L]))
bed_flag <- b[3L] == 0L
if (verbose)
{
cat(" bed file: ", sQuote(bed.fn), "\n ",
.pretty_size(file.size(bed.fn)), ", ",
ifelse(bed_flag, "sample-major mode: [SNP, sample]",
"SNP-major mode: [sample, SNP]"), "\n", sep="")
}
## read fam.fn ##
f <- .open_text(fam.fn, TRUE)
famD <- read.table(f$con, header=FALSE, comment.char="",
stringsAsFactors=FALSE)
.close_conn(f)
names(famD) <- c("FamilyID", "InvID", "PatID", "MatID", "Sex", "Pheno")
if (anyDuplicated(famD$InvID) == 0L)
{
sample.id <- famD$InvID
} else {
sample.id <- paste(famD$FamilyID, famD$InvID, sep="-")
if (length(unique(sample.id)) != dim(famD)[1])
stop("IDs in PLINK BED are not unique!")
}
if (verbose)
{
n <- nrow(famD)
cat(" fam file: ", sQuote(fam.fn), "\n ",
.pretty_size(file.size(fam.fn)), ", ",
.pretty(n), " sample", .plural(n), "\n", sep="")
}
## read bim.fn ##
f <- .open_text(bim.fn, TRUE)
bimD <- read.table(f$con, header=FALSE, comment.char="",
stringsAsFactors=FALSE)
.close_conn(f)
names(bimD) <- c("chr", "snp.id", "map", "pos", "allele1", "allele2")
if (verbose)
{
n <- nrow(bimD)
cat(" bim file: ", sQuote(bim.fn), "\n ",
.pretty_size(file.size(bim.fn)), ", ",
.pretty(n), " variant", .plural(n), "\n", sep="")
}
if (chr.conv)
{
x <- bimD$chr
x23 <- sum(x==23L, na.rm=TRUE)
x24 <- sum(x==24L, na.rm=TRUE)
x25 <- sum(x==25L, na.rm=TRUE)
x26 <- sum(x==26L, na.rm=TRUE)
if (x23 && verbose)
cat(" chromosome code 23 => X (", .pretty(x23), ")\n", sep="")
if (x23) x[x == 23L] <- "X"
if (x24 && verbose)
cat(" chromosome code 24 => Y (", .pretty(x24), ")\n", sep="")
if (x24) x[x == 24L] <- "Y"
if (x25 && verbose)
cat(" chromosome code 25 => XY (", .pretty(x25), ")\n", sep="")
if (x25) x[x == 25L] <- "XY"
if (x26 && verbose)
cat(" chromosome code 26 => MT (", .pretty(x26), ")\n", sep="")
if (x26) x[x == 26L] <- "MT"
bimD$chr <- x
}
## create GDS file ##
dstfile <- createfn.gds(out.gdsfn)
# close the file at the end
on.exit({ closefn.gds(dstfile) }, add=TRUE)
put.attr.gdsn(dstfile$root, "FileFormat", "SEQ_ARRAY")
put.attr.gdsn(dstfile$root, "FileVersion", "v1.0")
n <- addfolder.gdsn(dstfile, "description")
put.attr.gdsn(n, "source.format", "PLINK BED Format")
# add sample.id
if (verbose) cat(" sample.id")
n <- add.gdsn(dstfile, "sample.id", sample.id, compress=compress.annotation,
closezip=TRUE)
.DigestCode(n, digest, verbose)
# add variant.id
if (verbose) cat(" variant.id")
n <- add.gdsn(dstfile, "variant.id", seq_len(nrow(bimD)),
compress=compress.annotation, closezip=TRUE)
.DigestCode(n, digest, verbose)
# add position
if (verbose) cat(" position")
n <- add.gdsn(dstfile, "position", bimD$pos, storage="int32",
compress=compress.annotation, closezip=TRUE)
.DigestCode(n, digest, verbose)
# add chromosome
if (verbose) cat(" chromosome")
n <- add.gdsn(dstfile, "chromosome", bimD$chr, storage="string",
compress=compress.annotation, closezip=TRUE)
.DigestCode(n, digest, verbose)
# RLE-coded chromosome
.optim_chrom(dstfile)
# add allele
if (verbose) cat(" allele")
n <- add.gdsn(dstfile, "allele", paste(bimD$allele2, bimD$allele1, sep=","),
storage="string", compress=compress.annotation, closezip=TRUE)
.DigestCode(n, digest, verbose)
# add a folder for genotypes
if (verbose)
cat(" genotype", ifelse(verbose, ":\n", ""), sep="")
n <- addfolder.gdsn(dstfile, "genotype")
put.attr.gdsn(n, "VariableName", "GT")
put.attr.gdsn(n, "Description", "Genotype")
# sync file
sync.gds(dstfile)
# add genotypes to genotype/data
num4 <- ifelse(bed_flag, nrow(bimD), nrow(famD))
cnt4 <- ifelse(bed_flag, nrow(famD), nrow(bimD))
vg <- add.gdsn(n, "data", storage="bit2", valdim=c(2L, num4, 0L),
compress=ifelse(bed_flag, "", compress.geno))
# num of cores/processes
if (pnum <= 1L)
{
# convert
.Call(SEQ_ConvBED2GDS, vg, cnt4, bedfile$con, readBin, new.env(),
if (verbose) stdout() else NULL)
readmode.gdsn(vg)
} else {
# close bed file
on.exit()
.close_conn(bedfile)
bedfile <- NULL
# split to multiple files
psplit <- .file_split(cnt4, pnum)
# need unique temporary file names
ptmpfn <- .get_temp_fn(pnum,
sub("^([^.]*).*", "\\1", basename(out.gdsfn)), dirname(out.gdsfn))
if (verbose)
{
cat(sprintf(" >>> writing to %d files: <<<\n", pnum))
cat(sprintf(" %s\t[%s .. %s]\n", basename(ptmpfn),
.pretty(psplit[[1L]]),
.pretty(psplit[[1L]] + psplit[[2L]] - 1L)), sep="")
flush.console()
}
# working flags
.packageEnv$work_idx <- 1L
.packageEnv$work_flag <- rep(FALSE, pnum)
# conversion in parallel
seqParallel(parallel, NULL, FUN = function(bed.fn, tmp.fn, num4, psplit, cp)
{
# the process id, starting from one
i <- process_index
# open the bed file
bedfile <- .open_bin(bed.fn)
on.exit({ .close_conn(bedfile) })
# create gds file
f <- createfn.gds(tmp.fn[i])
on.exit({ closefn.gds(f) }, add=TRUE)
# progress file
progfile <- file(paste0(tmp.fn[i], ".progress"), "wt")
cat(tmp.fn[i], ":\n", file=progfile, sep="")
on.exit({ close(progfile) }, add=TRUE)
# new a gds node
vg <- add.gdsn(f, "data", storage="bit2", valdim=c(2L, num4, 0L),
compress=cp)
# re-position the file
cnt <- psplit[[2L]][i]
if (cnt > 0L)
{
n4 <- (num4 %/% 4L) + (num4 %% 4L > 0L)
n4 <- 3L + n4 * (psplit[[1L]][i] - 1L)
if (bedfile$fmt == "")
{
seek(bedfile$con, n4)
} else {
# gz or xz
while (n4 > 0L)
{
m <- if (n4 <= 65536L) n4 else 65536L
readBin(bedfile$con, raw(), m)
n4 <- n4 - m
}
}
# convert
.Call(SEQ_ConvBED2GDS, vg, cnt, bedfile$con, readBin, new.env(),
progfile)
}
readmode.gdsn(vg)
# output
process_index
}, split="none", bed.fn=bed.fn, tmp.fn=ptmpfn, num4=num4,
psplit=psplit, cp=ifelse(bed_flag, "", compress.geno),
# combine the files as many as possible
.combine = function(fn_idx)
{
# set TRUE to indicate the file completed
.packageEnv$work_flag[fn_idx] <- TRUE
if (verbose && fn_idx==1L)
cat(" >>> merging the files: <<<\n")
# check whether merging the file or not
if (.packageEnv$work_idx == fn_idx)
{
while (isTRUE(.packageEnv$work_flag[fn_idx]))
{
if (verbose)
cat(" ", basename(ptmpfn[fn_idx]))
if (psplit[[2L]][fn_idx] > 0L)
{
f <- openfn.gds(ptmpfn[fn_idx])
append.gdsn(vg, index.gdsn(f, "data"))
closefn.gds(f)
}
if (verbose) cat("\t[Done]\n")
fn_idx <- fn_idx + 1L
}
.packageEnv$work_idx <- fn_idx
}
})
# delete temporary files
unlink(c(ptmpfn, paste0(ptmpfn, ".progress")), force=TRUE)
.packageEnv$work_idx <- NULL
.packageEnv$work_flag <- NULL
if (verbose && !isFALSE(digest)) cat(" ")
}
n1 <- add.gdsn(n, "@data", storage="uint8", compress=compress.annotation,
visible=FALSE)
.append_rep_gds(n1, as.raw(1L), nrow(bimD))
readmode.gdsn(n1)
.DigestCode(n1, digest, FALSE)
n1 <- add.gdsn(n, "extra.index", storage="int32", valdim=c(3L,0L),
compress=compress.geno, closezip=TRUE)
put.attr.gdsn(n1, "R.colnames",
c("sample.index", "variant.index", "length"))
add.gdsn(n, "extra", storage="int16", compress=compress.geno, closezip=TRUE)
# sync file
sync.gds(dstfile)
# close the BED file
on.exit({ closefn.gds(dstfile) })
if (!is.null(bedfile)) .close_conn(bedfile)
if (bed_flag)
{
cat(" (transposed)")
permdim.gdsn(vg, c(2L,1L))
}
if (verbose) cat(" ")
.DigestCode(vg, digest, verbose)
# add a folder for phase information
if (verbose) cat(" phase")
n <- addfolder.gdsn(dstfile, "phase")
# add phase data
n1 <- add.gdsn(n, "data", storage="bit1", valdim=c(nrow(famD), 0L),
compress=compress.annotation)
if (pnum <= 1L)
{
.append_rep_gds(n1, as.raw(0L), as.double(nrow(bimD))*nrow(famD))
} else {
# split to multiple files
psplit <- .file_split(nrow(bimD), pnum)
if (verbose)
cat(sprintf(":\n >>> writing to %d files: <<<\n", pnum))
# conversion in parallel
seqParallel(parallel, NULL, FUN = function(tmp.fn, nsamp, psplit, cp)
{
# the process id, starting from one
i <- process_index
# create gds file
f <- createfn.gds(tmp.fn[i])
on.exit(closefn.gds(f))
# new a gds node
vg <- add.gdsn(f, "data", storage="bit1", valdim=c(nsamp, 0L), compress=cp)
# re-position the file
cnt <- psplit[[2L]][i]
if (cnt > 0L)
.append_rep_gds(vg, as.raw(0L), as.double(cnt)*nsamp)
readmode.gdsn(vg)
invisible()
}, split="none", tmp.fn=ptmpfn, nsamp=nrow(famD), psplit=psplit,
cp=compress.annotation)
if (verbose) cat(" merging files ...")
lapply(ptmpfn, function(fn) {
f <- openfn.gds(fn)
on.exit({ closefn.gds(f); unlink(fn, force=TRUE) })
append.gdsn(n1, index.gdsn(f, "data"))
})
if (verbose) cat(" [Done]\n ")
}
readmode.gdsn(n1)
.DigestCode(n1, digest, verbose)
n1 <- add.gdsn(n, "extra.index", storage="int32", valdim=c(3L,0L),
compress=compress.annotation, closezip=TRUE)
put.attr.gdsn(n1, "R.colnames",
c("sample.index", "variant.index", "length"))
add.gdsn(n, "extra", storage="bit1", compress=compress.annotation,
closezip=TRUE)
# add annotation folder
n <- addfolder.gdsn(dstfile, "annotation")
# add annotation/id
n1 <- add.gdsn(n, "id", bimD$snp.id, storage="string",
compress=compress.annotation, closezip=TRUE)
if (verbose) cat(" annotation/id")
.DigestCode(n1, digest, verbose)
# add annotation/qual
n1 <- add.gdsn(n, "qual", storage="float", compress=compress.annotation)
.append_rep_gds(n1, 100.0, nrow(bimD))
readmode.gdsn(n1)
if (verbose) cat(" annotation/qual")
.DigestCode(n1, digest, verbose)
# add filter
n1 <- add.gdsn(n, "filter", storage="int32", compress=compress.annotation)
.append_rep_gds(n1, as.raw(1L), nrow(bimD))
readmode.gdsn(n1)
put.attr.gdsn(n1, "R.class", "factor")
put.attr.gdsn(n1, "R.levels", c("PASS"))
put.attr.gdsn(n1, "Description", c("All filters passed"))
if (verbose) cat(" annotation/filter")
.DigestCode(n1, digest, verbose)
# add the INFO field
addfolder.gdsn(n, "info")
# add the FORMAT field
addfolder.gdsn(n, "format")
# add sample annotation
if (verbose) cat(" sample.annotation\n")
n <- addfolder.gdsn(dstfile, "sample.annotation")
n1 <- add.gdsn(n, "family", famD$FamilyID, compress=compress.annotation,
closezip=TRUE)
.DigestCode(n1, digest, FALSE)
n1 <- add.gdsn(n, "father", famD$PatID, compress=compress.annotation,
closezip=TRUE)
.DigestCode(n1, digest, FALSE)
n1 <- add.gdsn(n, "mother", famD$MatID, compress=compress.annotation,
closezip=TRUE)
.DigestCode(n1, digest, FALSE)
sex <- rep("", length(sample.id))
sex[famD$Sex==1L] <- "M"; sex[famD$Sex==2L] <- "F"
n1 <- add.gdsn(n, "sex", sex, compress=compress.annotation, closezip=TRUE)
.DigestCode(n1, digest, FALSE)
n1 <- add.gdsn(n, "phenotype", famD$Pheno, compress=compress.annotation,
closezip=TRUE)
.DigestCode(n1, digest, FALSE)
##################################################
# optimize access efficiency
if (verbose)
cat("Done.\n", date(), "\n", sep="")
on.exit()
closefn.gds(dstfile)
if (optimize)
{
if (verbose)
cat("Optimize the access efficiency ...\n")
cleanup.gds(out.gdsfn, verbose=verbose)
if (verbose) cat(date(), "\n", sep="")
}
# output
invisible(normalizePath(out.gdsfn))
}
#######################################################################
# Convert a SeqArray GDS file to a PLINK BED file
#
seqGDS2BED <- function(gdsfile, out.fn, multi.row=FALSE, verbose=TRUE)
{
# check
stopifnot(is.character(gdsfile) | inherits(gdsfile, "SeqVarGDSClass"))
if (is.character(gdsfile))
stopifnot(length(gdsfile)==1L)
stopifnot(is.character(out.fn), length(out.fn)==1L, !is.na(out.fn))
stopifnot(is.logical(multi.row), length(multi.row)==1L)
stopifnot(is.logical(verbose), length(verbose)==1L)
if (multi.row) stop("'multi.row=TRUE' is not implemented yet.")
if (verbose)
{
cat(date(), "\n", sep="")
cat("SeqArray GDS to PLINK BED:\n")
}
if (is.character(gdsfile))
{
if (verbose) cat(" open ", sQuote(gdsfile), "\n", sep="")
gdsfile <- seqOpen(gdsfile)
on.exit(seqClose(gdsfile))
}
if (verbose)
{
dm <- .seldim(gdsfile)
cat(" # of samples: ", .pretty(dm[2L]), "\n", sep="")
cat(" # of variants: ", .pretty(dm[3L]), "\n", sep="")
}
# fam file
s <- seqGetData(gdsfile, "sample.id")
n <- length(s)
fam <- data.frame(FID=rep(0L, n), IID=s, FAT=rep(0L, n), MOT=rep(0L, n),
sex=rep(0L, n), pheno=rep(-9L, n), stringsAsFactors=FALSE)
nm <- "sample.annotation/sex"
if (!is.null(index.gdsn(gdsfile, nm, silent=TRUE)))
fam$sex <- read.gdsn(index.gdsn(gdsfile, nm))
nm <- "sample.annotation/phenotype"
if (!is.null(index.gdsn(gdsfile, nm, silent=TRUE)))
fam$pheno <- read.gdsn(index.gdsn(gdsfile, nm))
famfn <- paste0(out.fn, ".fam")
if (verbose) cat(" fam file: ", sQuote(famfn), "\n", sep="")
write.table(fam, file=famfn, quote=FALSE, sep="\t", row.names=FALSE,
col.names=FALSE)
remove(fam)
# bim file
n <- .seldim(gdsfile)[3L]
s <- seqGetData(gdsfile, "$alt")
s[s == ""] <- "0"
bim <- data.frame(
chr = seqGetData(gdsfile, "chromosome"),
var = seqGetData(gdsfile, "annotation/id"),
mrg = rep(0L, n),
pos = seqGetData(gdsfile, "position"),
alt1 = gsub(",", "/", s, fixed=TRUE),
alt2 = seqGetData(gdsfile, "$ref"),
stringsAsFactors=FALSE)
bimfn <- paste0(out.fn, ".bim")
if (verbose) cat(" bim file: ", sQuote(bimfn), "\n", sep="")
write.table(bim, file=bimfn, quote=FALSE, sep="\t", row.names=FALSE,
col.names=FALSE)
remove(bim)
# bed file
bedfn <- paste0(out.fn, ".bed")
if (verbose) cat(" bed file: ", sQuote(bedfn), "\n", sep="")
outf <- file(bedfn, "w+b")
on.exit(close(outf), add=TRUE)
writeBin(as.raw(c(0x6C, 0x1B, 0x01)), outf)
seqApply(gdsfile, "$dosage", .cfunction("FC_GDS2BED"), as.is=outf,
.useraw=TRUE, .progress=verbose)
if (verbose)
cat("Done.\n", date(), "\n", sep="")
# output
invisible(normalizePath(c(famfn, bimfn, bedfn)))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.