SeqArray: Data management of large-scale whole-genome sequence variant calls using GDS files

Documented in seqBED2GDS seqEmptyFile seqGDS2BED seqGDS2SNP seqGDS2VCF seqSNP2GDS

#######################################################################
#
# Package Name: SeqArray
#
# Description:
#     Data Management of Large-scale Whole-Genome Sequence Variant Calls
#


# Create a bgzf file
.bgzf_create <- function(fn) .Call(SEQ_bgzip_create, fn)


#######################################################################
# Convert a SeqArray GDS file to a VCF file
#

seqGDS2VCF <- function(gdsfile, vcf.fn, info.var=NULL, fmt.var=NULL,
    chr_prefix="", use_Rsamtools=TRUE, verbose=TRUE)
{
    # check
    stopifnot(is.character(gdsfile) | inherits(gdsfile, "SeqVarGDSClass"))
    if (is.character(gdsfile))
        stopifnot(length(gdsfile)==1L)
    if (!inherits(vcf.fn, "connection"))
        stopifnot(is.character(vcf.fn), length(vcf.fn)==1L)
    stopifnot(is.null(info.var) | is.character(info.var))
    stopifnot(is.null(fmt.var) | is.character(fmt.var))
    stopifnot(is.character(chr_prefix), length(chr_prefix)==1L)
    stopifnot(is.logical(use_Rsamtools), length(use_Rsamtools)==1L)
    stopifnot(is.logical(verbose), length(verbose)==1L)

    if (is.character(gdsfile))
    {
        gdsfile <- seqOpen(gdsfile, allow.duplicate=TRUE)
        on.exit(seqClose(gdsfile))
    }

    # get a summary
    z <- seqSummary(gdsfile, check="none", verbose=FALSE)

    # the INFO field
    if (!is.null(info.var))
    {
        s <- z$info$ID
        if (is.null(s)) s <- character()
        if (length(setdiff(info.var, s)) > 0L)
            stop(paste("Not exist:", paste(setdiff(info.var, s), collapse=",")))
        if (length(info.var) > 0L)
            z$info <- z$info[match(info.var, z$info$ID), ]
        else
            z$info <- list()
    }

    # the FORMAT field
    gt <- exist.gdsn(gdsfile, "genotype/data")
    if (!is.null(fmt.var))
    {
        s <- z$format$ID
        if (gt) s <- s[-1L]  # the first is GT
        if (is.null(s)) s <- character()
        if (length(setdiff(fmt.var, s)) > 0L)
            stop(paste("Not exist:", paste(setdiff(fmt.var, s), collapse=",")))
        if (length(fmt.var) > 0L)
            z$format <- z$format[match(fmt.var, z$format$ID), ]
        else
            z$format <- list()
    } else {
        if (gt) z$format <- z$format[-1L,]
    }


    ## double quote text if needed
    dq <- function(s, text=FALSE) .Call(SEQ_Quote, s, text)


    ######################################################
    # create an output text file

    outfmt <- 1L
    if (!inherits(vcf.fn, "connection"))
    {
        # get the file extension
        pos <- regexpr("\\.([[:alnum:]]+)$", vcf.fn)
        ext <- tolower(ifelse(pos > -1L, substring(vcf.fn, pos+1L), ""))
        if (ext %in% c("gz", "bgz"))
        {
            if (.Platform$OS.type == "windows")
            {
                if (isTRUE(use_Rsamtools))
                {
                    warning("Rsamtools is not used on Windows.",
                        immediate.=TRUE)
                }
                use_Rsamtools <- FALSE
            }
            if (isTRUE(use_Rsamtools) && requireNamespace("Rsamtools"))
            {
                ofile <- .bgzf_create(vcf.fn)
                outfmt <- 2L
            } else {
                if (verbose)
                {
                    if (.Platform$OS.type != "windows")
                        message("Hint: install Rsamtools to enable the BGZF-format output.")
                }
                ofile <- gzfile(vcf.fn, "wb")
                outfmt <- 3L
            }
        } else if (ext == "bz")
        {
            ofile <- bzfile(vcf.fn, "wb")
            outfmt <- 4L
        } else if (ext == "xz")
        {
            ofile <- xzfile(vcf.fn, "wb")
            outfmt <- 5L
        } else {
            ofile <- file(vcf.fn, open="wb")
        }
        on.exit(close(ofile), add=TRUE)
    } else {
        ofile <- vcf.fn
    }

    op <- options("useFancyQuotes")
    options(useFancyQuotes = FALSE)
    on.exit(options(op), add=TRUE)

    if (verbose)
    {
        .cat(date())
        .cat("VCF Export: ", basename(vcf.fn))
        s <- .seldim(gdsfile)
        .cat("    ", .pretty(s[2L]), " sample", .plural(s[2L]), ", ",
            .pretty(s[3L]), " variant", .plural(s[3L]))
        s <- paste(z$info$ID, collapse=", ")
        .cat("    INFO Field: ", ifelse(s!="", s, "<none>"))
        s <- paste(z$format$ID, collapse=", ")
        .cat("    FORMAT Field: ", ifelse(s!="", s, "<none>"))
        .cat("    output to ",
            c("a VCF text file", "a BGZF-format gzip file",
            "a general gzip file", "a bz file", "a xz file")[outfmt])
    }


    ######################################################
    # write the header

    txt <- character()
    a <- get.attr.gdsn(index.gdsn(gdsfile, "description"))

    # fileformat
    if (is.null(a$vcf.fileformat))
        a$vcf.fileformat <- "VCFv4.2"
    txt <- c(txt, paste0("##fileformat=", a$vcf.fileformat))

    # fileDate
    txt <- c(txt, paste0("##fileDate=", format(Sys.time(), "%Y%m%d")))

    # program, source
    aa <- get.attr.gdsn(gdsfile$root)
    if (is.null(aa$FileVersion))
        aa$FileVersion <- "v1.0"
    txt <- c(txt, paste0("##source=SeqArray_Format_", aa$FileVersion))

    # reference
    if (length(z$reference) > 0L)
        txt <- c(txt, paste0("##reference=", z$reference[1L]))

    # assembly
    if (!is.null(a$vcf.assembly))
        txt <- c(txt, paste0("##assembly=", dq(a$vcf.assembly)))

    # ALT=<ID=type,Description=description>
    for (i in seq_len(nrow(z$allele)))
    {
        s <- sprintf("##ALT=<ID=%s,Description=%s>",
            as.character(z$allele[i,1L]), dq(z$allele[i,2L]))
        txt <- c(txt, s)
    }

    # contig=<ID=ctg1,URL=ftp://somewhere.org/assembly.fa,...>
    n <- index.gdsn(gdsfile, "description/vcf.contig", silent=TRUE)
    if (!is.null(n))
    {
        dat <- read.gdsn(n)
        nm <- names(dat)
        for (i in seq_len(nrow(dat)))
        {
            s <- NULL
            for (j in seq_len(ncol(dat)))
                s[j] <- paste(nm[j], "=", dq(dat[i,j]), sep="")
            s <- paste(s, collapse=",")
            txt <- c(txt, paste0("##contig=<", s, ">"))
        }
    }

    # INFO field
    for (nm in z$info$ID)
    {
        a <- get.attr.gdsn(index.gdsn(gdsfile,
            paste("annotation/info/", nm, sep="")))
        s <- sprintf("ID=%s,Number=%s,Type=%s,Description=%s",
            nm, dq(a$Number), dq(a$Type), dq(a$Description, TRUE))
        if (!is.null(a$Source))
            s <- paste(s, ",Source=", dq(a$Source, TRUE), sep="")
        if (!is.null(a$Version))
            s <- paste(s, ",Version=", dq(a$Version, TRUE), sep="")
        txt <- c(txt, paste0("##INFO=<", s, ">"))
    }

    # FILTER field
    n <- index.gdsn(gdsfile, "annotation/filter", silent=TRUE)
    if (!is.null(n))
    {
        at <- get.attr.gdsn(n)
        id <- at$R.levels; dp <- at$Description
        for (i in seq_along(id))
        {
            txt <- c(txt, sprintf("##FILTER=<ID=%s,Description=%s>",
                dq(id[i]), dq(dp[i], TRUE)))
        }
    }

    # FORMAT field
    a <- get.attr.gdsn(index.gdsn(gdsfile, "genotype"))
    txt <- c(txt, sprintf(
        "##FORMAT=<ID=%s,Number=1,Type=String,Description=%s>",
        a$VariableName, dq(a$Description, TRUE)))
    for (nm in z$format$ID)
    {
        a <- get.attr.gdsn(index.gdsn(gdsfile,
            paste("annotation/format/", nm, sep="")))
        txt <- c(txt, sprintf(
            "##FORMAT=<ID=%s,Number=%s,Type=%s,Description=%s>",
            nm, dq(a$Number), dq(a$Type), dq(a$Description, TRUE)))
    }

    # others
    n <- index.gdsn(gdsfile, "description/vcf.header", silent=TRUE)
    if (!is.null(n))
    {
        dat <- read.gdsn(n)
        for (i in seq_len(nrow(dat)))
        {
            s <- dat[i,1L]
            if (!(s %in% c("fileDate", "source")))
                txt <- c(txt, paste0("##", s, "=", dq(dat[i,2L])))
        }
    }


    ######################################################
    # write the header -- samples

    a <- c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")
    s <- seqGetData(gdsfile, "sample.id")
    if (length(s) > 0L) a <- c(a, "FORMAT", s)
    txt <- c(txt, paste(a, collapse="\t"))
    writeLines(txt, ofile)


    ######################################################
    # write the contents

    # the INFO field
    nm.info <- character()
    if (!is.null(z$info$ID))
    {
        s <- z$info$ID
        if (length(s) > 0L)
            nm.info <- paste("annotation/info/", s, sep="")
    }

    # the FORMAT field
    nm.format <- character()
    if (!is.null(z$format$ID))
    {
        s <- z$format$ID
        if (length(s) > 0L)
            nm.format <- paste("annotation/format/", s, sep="")
    }

    # initialize the variable length of INFO
    len.info <- NULL
    for (n in nm.info)
    {
        a <- get.attr.gdsn(index.gdsn(gdsfile, n))
        a$Number <- if (is.null(a$Number)) "." else a$Number[1L]
        len.info <- c(len.info, a$Number)
    }
    len.info <- suppressWarnings(as.integer(len.info))

    # initialize the variable length of FORMAT
    len.fmt <- NULL
    for (n in nm.format)
    {
        a <- get.attr.gdsn(index.gdsn(gdsfile, n))
        a$Number <- if (is.null(a$Number)) "." else a$Number[1L]
        len.fmt <- c(len.fmt, a$Number)
    }
    len.fmt <- suppressWarnings(as.integer(len.fmt))

    # initialize
    dm <- .seldim(gdsfile)
    .Call(SEQ_ToVCF_Init, dm, chr_prefix, len.info, len.fmt, ofile, verbose)
    on.exit({ .Call(SEQ_ToVCF_Done) }, add=TRUE)

    # variable names
    nm <- c("chromosome", "position", "annotation/id", "allele",
        "annotation/qual", "annotation/filter", "genotype", "phase")
    if (length(nm.info) > 0L) nm <- c(nm, nm.info)
    if (length(nm.format) > 0L) nm <- c(nm, nm.format)

    s <- c("chr", "pos", "id", "allele", "qual", "filter", "geno", "phase")
    # the INFO field
    if (length(nm.info) > 0L)
        s <- c(s, paste("info.", z$info$ID, sep=""))
    # the FORMAT field
    if (length(nm.format) > 0L)
        s <- c(s, paste("fmt.", z$format$ID, sep=""))
    names(nm) <- s

    # C function name
    if (!exist.gdsn(gdsfile, "genotype/data"))
    {
        nm <- nm[!(nm %in% c("genotype", "phase"))]
        cfn <- "SEQ_ToVCF_NoGeno"
    } else if (!exist.gdsn(gdsfile, "phase/data"))
    {
        nm <- nm[nm != "phase"]
        cfn <- "SEQ_ToVCF_Haploid"
    } else {
        if (length(nm.format)>0L || dm[1L]!=2L)
            cfn <- "SEQ_ToVCF"
        else
            cfn <- "SEQ_ToVCF_Di_WrtFmt"  # for a faster version
    }

    # output lines by variant
    seqApply(gdsfile, nm, margin="by.variant", as.is="none",
        FUN=.cfunction(cfn), .useraw=NA, .progress=verbose)

    # finalize
    .Call(SEQ_ToVCF_Done)
    on.exit({
        if (verbose)
            cat(date(), "    Done.\n", sep="")
    }, add=TRUE)

    # output
    if (!inherits(vcf.fn, "connection"))
        invisible(normalizePath(vcf.fn))
    else
        invisible()
}



#######################################################################
# Convert a SeqArray GDS file to a SNP GDS file
#

seqGDS2SNP <- function(gdsfile, out.gdsfn, dosage=FALSE,
    compress.geno="LZMA_RA", compress.annotation="LZMA_RA",
    ds.type=c("packedreal16", "float", "double"), optimize=TRUE, verbose=TRUE)
{
    # check
    stopifnot(is.character(gdsfile) | inherits(gdsfile, "SeqVarGDSClass"))
    stopifnot(is.character(out.gdsfn), length(out.gdsfn)==1L)
    stopifnot(is.logical(dosage) | is.character(dosage), length(dosage)==1L)
    stopifnot(is.character(compress.geno), length(compress.geno)==1L)
    stopifnot(is.character(compress.annotation), length(compress.annotation)==1L)
    stopifnot(is.logical(optimize), length(optimize)==1L)
    stopifnot(is.logical(verbose), length(verbose)==1L)
    ds.type <- match.arg(ds.type)

    if (verbose)
    {
        cat(date(), "\n", sep="")
        if (isTRUE(dosage) | is.character(dosage))
            cat("SeqArray GDS to SNP GDS dosage format:\n")
        else
            cat("SeqArray GDS to SNP GDS format:\n")
    }

    # if it is a file name
    if (is.character(gdsfile))
    {
        gdsfile <- seqOpen(gdsfile, allow.duplicate=TRUE)
        on.exit({ seqClose(gdsfile) })
    }

    # dosage variable name
    if (isTRUE(dosage) | is.character(dosage))
    {
        if (isTRUE(dosage))
            dosage <- "annotation/format/DS"
        else if (substr(dosage, 1L, 18L) != "annotation/format/")
            dosage <- paste0("annotation/format/", dosage)
    }

    if (verbose)
    {
        dm <- .seldim(gdsfile)
        cat("    # of samples: ", .pretty(dm[2L]), "\n", sep="")
        cat("    # of variants: ", .pretty(dm[3L]), "\n", sep="")
        if (is.character(dosage))
        {
            cat("    dosage compression: ", compress.geno, ", from [",
                dosage, "]\n", sep="")
        } else {
            cat("    genotype compression: ", compress.geno, "\n", sep="")
        }
        cat("    annotation compression: ", compress.annotation, "\n", sep="")
    }

    # create GDS file
    gfile <- createfn.gds(out.gdsfn)
    # close the file at the end
    on.exit({
        if (!is.null(gfile)) closefn.gds(gfile)
    }, add=TRUE)

    # add a flag
    if (!is.character(dosage))
    {
        put.attr.gdsn(gfile$root, "FileFormat", "SNP_ARRAY")
    } else {
        put.attr.gdsn(gfile$root, "FileFormat", "IMPUTED_DOSAGE")
    }

    # add "sample.id"
    sampid <- seqGetData(gdsfile, "sample.id")
    add.gdsn(gfile, "sample.id", sampid,
        compress=compress.annotation, closezip=TRUE)

    # add "snp.id"
    add.gdsn(gfile, "snp.id", seqGetData(gdsfile, "variant.id"),
        compress=compress.annotation, closezip=TRUE)

    # add "snp.rs.id"
    if (!is.null(index.gdsn(gdsfile, "annotation/id", silent=TRUE)))
    {
        add.gdsn(gfile, "snp.rs.id", seqGetData(gdsfile, "annotation/id"),
            compress=compress.annotation, closezip=TRUE)
    }

    # add "snp.position"
    add.gdsn(gfile, "snp.position", seqGetData(gdsfile, "position"),
        compress=compress.annotation, closezip=TRUE)

    # add "snp.chromosome"
    add.gdsn(gfile, "snp.chromosome", seqGetData(gdsfile, "chromosome"),
        compress=compress.annotation, closezip=TRUE)

    # add "snp.allele"
    add.gdsn(gfile, "snp.allele",
        .cfunction("FC_AlleleStr")(seqGetData(gdsfile, "allele")),
        compress=compress.annotation, closezip=TRUE)

    # add "genotype"
    if (!is.character(dosage))
    {
        gGeno <- add.gdsn(gfile, "genotype", storage="bit2",
            valdim=c(length(sampid), 0L), compress=compress.geno)
        put.attr.gdsn(gGeno, "sample.order")
        seqApply(gdsfile, "$dosage", as.is=gGeno, .useraw=TRUE, .progress=verbose,
            FUN = .cfunction("FC_GDS2SNP"))
    } else {
        .cfunction("FC_SetNumSamp")(length(sampid))
        gGeno <- add.gdsn(gfile, "genotype", storage=ds.type,
            valdim=c(length(sampid), 0L), compress=compress.geno)
        put.attr.gdsn(gGeno, "sample.order")
        seqApply(gdsfile, dosage, as.is=gGeno, .progress=verbose,
            FUN = .cfunction("FC_GDS2Dosage"))
    }

    readmode.gdsn(gGeno)
    closefn.gds(gfile)
    gfile <- NULL
    if (verbose)
        cat("Done.\n", date(), "\n", sep="")

    if (optimize)
    {
        if (verbose) cat("Optimize the access efficiency ...\n")
        cleanup.gds(out.gdsfn, verbose=verbose)
        if (verbose) cat(date(), "\n", sep="")
    }

    # output
    invisible(normalizePath(out.gdsfn))
}



#######################################################################
# Convert a SNP GDS file to a SeqArray GDS file
#

seqSNP2GDS <- function(gds.fn, out.fn, storage.option="LZMA_RA", major.ref=TRUE,
    ds.type=c("packedreal16", "float", "double"), optimize=TRUE, digest=TRUE,
    verbose=TRUE)
{
    # check
    stopifnot(is.character(gds.fn), length(gds.fn)==1L)
    stopifnot(is.character(out.fn), length(out.fn)==1L)
    ds.type <- match.arg(ds.type)
    stopifnot(is.logical(major.ref), length(major.ref)==1L)
    stopifnot(is.logical(optimize), length(optimize)==1L)
    stopifnot(is.logical(digest) | is.character(digest), length(digest)==1L)
    stopifnot(is.logical(verbose), length(verbose)==1L)

    if (is.character(storage.option))
        storage.option <- seqStorageOption(storage.option)
    stopifnot(inherits(storage.option, "SeqGDSStorageClass"))

    if (verbose)
    {
        cat(date(), "\n", sep="")
        cat("SNP GDS to SeqArray GDS Format:\n")
    }

    # open the SNP GDS
    srcfile <- openfn.gds(gds.fn)
    on.exit({ closefn.gds(srcfile) })

    nSamp <- prod(objdesp.gdsn(index.gdsn(srcfile, "sample.id"))$dim)
    nSNP <- prod(objdesp.gdsn(index.gdsn(srcfile, "snp.id"))$dim)

    # check genotype type
    n <- index.gdsn(srcfile, "genotype")
    dm <- objdesp.gdsn(n)$dim
    if (length(dm) != 2L)
        stop("'genotype' of SNP GDS should be a matrix.")
    geno_type <- as.character(objdesp.gdsn(n)$type)
    if (!(geno_type %in% c("Integer", "Real")))
        stop("'genotype' should be integers or real numbers.")

    # determine how genotypes are stored
    snpfirstdim <- TRUE
    rd <- names(get.attr.gdsn(n))
    if ("snp.order" %in% rd) snpfirstdim <- TRUE
    if ("sample.order" %in% rd) snpfirstdim <- FALSE
    if (snpfirstdim)
    {
        if ((dm[1L]!=nSNP) || (dm[2L]!=nSamp))
            stop("Invalid dimension of 'genotype'.")
    } else {
        if ((dm[1L]!=nSamp) || (dm[2L]!=nSNP))
            stop("Invalid dimension of 'genotype'.")
    }


    # create GDS file
    dstfile <- createfn.gds(out.fn)
    # close the file at the end
    on.exit({ closefn.gds(dstfile) }, add=TRUE)

    put.attr.gdsn(dstfile$root, "FileFormat", "SEQ_ARRAY")
    put.attr.gdsn(dstfile$root, "FileVersion", "v1.0")

    n <- addfolder.gdsn(dstfile, "description")
    put.attr.gdsn(n, "source.format", "SNPRelate GDS Format")

    # add sample.id
    if (verbose) cat("    sample.id")
    s <- read.gdsn(index.gdsn(srcfile, "sample.id"))
    n <- .AddVar(storage.option, dstfile, "sample.id", s, closezip=TRUE)
    .DigestCode(n, digest, verbose)

    # add variant.id
    if (verbose) cat("    variant.id")
    s <- read.gdsn(index.gdsn(srcfile, "snp.id"))
    n <- .AddVar(storage.option, dstfile, "variant.id", s, closezip=TRUE)
    .DigestCode(n, digest, verbose)

    # add position
    if (verbose) cat("    position")
    s <- read.gdsn(index.gdsn(srcfile, "snp.position"))
    n <- .AddVar(storage.option, dstfile, "position", s, closezip=TRUE)
    .DigestCode(n, digest, verbose)

    # add chromosome
    if (verbose) cat("    chromosome")
    s <- read.gdsn(index.gdsn(srcfile, "snp.chromosome"))
    s <- as.character(s)
    n <- .AddVar(storage.option, dstfile, "chromosome", s, closezip=TRUE)
    .DigestCode(n, digest, verbose)

    # add allele
    nd_allele <- .AddVar(storage.option, dstfile, "allele", storage="character")
    # add a folder for genotypes
    if (verbose) cat("    genotype")
    varGeno <- addfolder.gdsn(dstfile, "genotype")
    put.attr.gdsn(varGeno, "VariableName", "GT")
    put.attr.gdsn(varGeno, "Description", "Genotype")

    # major reference
    .cfunction("FC_SNP2GDS_Ref")(major.ref)

    if (geno_type == "Integer")
    {
        nd_geno <- .AddVar(storage.option, varGeno, "data", storage="bit2",
            valdim=c(2L, nSamp, 0L))
        nd_geno_idx <- .AddVar(storage.option, varGeno, "@data",
            storage="uint8", visible=FALSE)
        n <- .AddVar(storage.option, varGeno, "extra.index", storage="int32",
            valdim=c(3L,0L), closezip=TRUE)
        put.attr.gdsn(n, "R.colnames",
            c("sample.index", "variant.index", "length"))
        .AddVar(storage.option, varGeno, "extra", storage="int16",
            closezip=TRUE)

        # add genotypes to genotype/data
        apply.gdsn(list(index.gdsn(srcfile, "genotype"),
                index.gdsn(srcfile, "snp.allele")),
            c(ifelse(snpfirstdim, 1L, 2L), 1L),
            FUN=.cfunction("FC_SNP2GDS"), as.is="gdsnode",
            target.node=list(nd_geno, nd_allele))
        readmode.gdsn(nd_geno)
        .DigestCode(nd_geno, digest, verbose)

        if (verbose) cat("    allele")
        readmode.gdsn(nd_allele)
        .DigestCode(nd_allele, digest, verbose)

        .append_rep_gds(nd_geno_idx, as.raw(1L), nSNP)
        readmode.gdsn(nd_geno_idx)
        .DigestCode(nd_geno_idx, digest, FALSE)

        sync.gds(dstfile)
    }

    # add a folder for phase information
    if (verbose) cat("    phase")
    varPhase <- addfolder.gdsn(dstfile, "phase")
    if (geno_type == "Integer")
    {
        n <- .AddVar(storage.option, varPhase, "data", storage="bit1",
            valdim=c(nSamp, 0L))
        if (geno_type == "Integer")
            .append_rep_gds(n, as.raw(0L), as.double(nSNP)*nSamp)
        readmode.gdsn(n)
        .DigestCode(n, digest, verbose)
        n <- .AddVar(storage.option, varPhase, "extra.index", storage="int32",
            valdim=c(3L,0L), closezip=TRUE)
        put.attr.gdsn(n, "R.colnames",
            c("sample.index", "variant.index", "length"))
        .AddVar(storage.option, varPhase, "extra", storage="bit1",
            closezip=TRUE)
        sync.gds(dstfile)
    }

    # add annotation folder
    varAnnot <- addfolder.gdsn(dstfile, "annotation")

    # add annotation/id
    if (verbose) cat("    annotation/id")
    n <- .AddVar(storage.option, varAnnot, "id", storage="string")
    if (is.null(index.gdsn(srcfile, "snp.rs.id", silent=TRUE)))
        assign.gdsn(n, index.gdsn(srcfile, "snp.id"))
    else
        assign.gdsn(n, index.gdsn(srcfile, "snp.rs.id"))
    readmode.gdsn(n)
    .DigestCode(n, digest, verbose)

    # add annotation/qual
    n <- .AddVar(storage.option, varAnnot, "qual", storage="float")
    .append_rep_gds(n, 100.0, nSNP)
    readmode.gdsn(n)
    .DigestCode(n, digest, FALSE)

    # add filter
    n <- .AddVar(storage.option, varAnnot, "filter", storage="int32")
    .append_rep_gds(n, as.raw(1L), nSNP)
    readmode.gdsn(n)
    put.attr.gdsn(n, "R.class", "factor")
    put.attr.gdsn(n, "R.levels", c("PASS"))
    put.attr.gdsn(n, "Description", c("All filters passed"))
    .DigestCode(n, digest, FALSE)

    # add the INFO field
    n1 <- addfolder.gdsn(varAnnot, "info")
    n2 <- index.gdsn(srcfile, "snp.annot", silent=TRUE)
    if (!is.null(n2))
    {
        for (i in ls.gdsn(n2))
        {
            if (verbose) cat("    annotation/info/", i, sep="")
            copyto.gdsn(n1, index.gdsn(n2, i))
           .DigestCode(index.gdsn(n1, i), digest, verbose)
        }
    }

    # add the FORMAT field
    n <- addfolder.gdsn(varAnnot, "format")

    # add dosages to annotation/format/DS
    if (geno_type == "Real")
    {
        if (verbose) cat("    annotation/format/DS")
        varGeno <- addfolder.gdsn(n, "DS")
        put.attr.gdsn(varGeno, "Number", "1")
        put.attr.gdsn(varGeno, "Type", "Float")
        put.attr.gdsn(varGeno, "Description", "Estimated alternate allele dosage")

        nd_geno <- .AddVar(storage.option, varGeno, "data", storage=ds.type,
            valdim=c(nSamp, 0L))
        # add genotypes to genotype/data
        apply.gdsn(list(index.gdsn(srcfile, "genotype"),
                index.gdsn(srcfile, "snp.allele")),
            c(ifelse(snpfirstdim, 1L, 2L), 1L),
            FUN=.cfunction("FC_Dosage2GDS"), as.is="gdsnode",
            target.node=list(nd_geno, nd_allele))

        readmode.gdsn(nd_geno)
        .DigestCode(nd_geno, digest, verbose)

        if (verbose) cat("    allele")
        readmode.gdsn(nd_allele)
        .DigestCode(nd_allele, digest, verbose)

        nd_geno_idx <- .AddVar(storage.option, varGeno, "@data",
            storage="uint8", visible=FALSE)
        .append_rep_gds(nd_geno_idx, as.raw(1L), nSNP)
        readmode.gdsn(nd_geno_idx)
        .DigestCode(nd_geno_idx, digest, FALSE)

        sync.gds(dstfile)
    }

    # add sample annotation
    if (verbose) cat("    sample.annotation\n")
    n <- addfolder.gdsn(dstfile, "sample.annotation")
    n1 <- index.gdsn(srcfile, "sample.annot", silent=TRUE)
    if (!is.null(n1))
    {
        for (i in ls.gdsn(n1))
        {
            if (verbose) cat("    sample.annotation/", i, sep="")
            copyto.gdsn(n, index.gdsn(n1, i))
           .DigestCode(index.gdsn(n, i), digest, verbose)
        }
    }

    on.exit()
    closefn.gds(srcfile)
    closefn.gds(dstfile)

    ##################################################
    # optimize access efficiency

    if (verbose)
    {
        cat("Done.\n")
        cat(date(), "\n", sep="")
    }
    if (optimize)
    {
        if (verbose)
            cat("Optimize the access efficiency ...\n")
        cleanup.gds(out.fn, verbose=verbose)
        if (verbose) cat(date(), "\n", sep="")
    }

    # output
    invisible(normalizePath(out.fn))
}



#######################################################################
# Convert a PLINK BED file to a SeqArray GDS file
#

seqBED2GDS <- function(bed.fn, fam.fn, bim.fn, out.gdsfn,
    compress.geno="LZMA_RA", compress.annotation="LZMA_RA", chr.conv=TRUE,
    include.pheno=TRUE, optimize=TRUE, digest=TRUE, parallel=FALSE,
    verbose=TRUE)
{
    # check
    stopifnot(is.character(bed.fn), length(bed.fn)==1L)
    if (missing(fam.fn) && missing(bim.fn))
    {
        fn <- gsub("\\.bed$", "", bed.fn, ignore.case=TRUE)
        fam.fn <- paste0(fn, ".fam")
        bim.fn <- paste0(fn, ".bim")
        if (!grepl("\\.bed$", bed.fn, ignore.case=TRUE))
            bed.fn <- paste0(fn, ".bed")
    }
    stopifnot(is.character(fam.fn), length(fam.fn)==1L)
    stopifnot(is.character(bim.fn), length(bim.fn)==1L)
    stopifnot(is.character(out.gdsfn), length(out.gdsfn)==1L)
    stopifnot(is.character(compress.geno), length(compress.geno)==1L)
    stopifnot(is.character(compress.annotation), length(compress.annotation)==1L)
    stopifnot(is.logical(chr.conv), length(chr.conv)==1L)
    stopifnot(is.logical(optimize), length(optimize)==1L)
    stopifnot(is.logical(include.pheno) | is.character(include.pheno))
    nm_pheno <- c("family", "father", "mother", "sex", "phenotype")
    if (is.logical(include.pheno))
    {
        if (!isTRUE(include.pheno) && !isFALSE(include.pheno))
            stop("'include.pheno' should be TRUE, FALSE or a character vector.")
        include.pheno <- if (include.pheno) nm_pheno else character()
    }
    if (is.character(include.pheno))
    {
        if (!all(include.pheno %in% nm_pheno))
        {
            stop("'include.pheno' should be one of ",
                paste(nm_pheno, collapse=", "), ".")
        }
    }
    stopifnot(is.logical(digest) | is.character(digest), length(digest)==1L)
    stopifnot(is.logical(verbose), length(verbose)==1L)
    pnum <- .NumParallel(parallel)
    parallel <- .McoreParallel(parallel)

    if (verbose)
    {
        cat(date(), "\n", sep="")
        cat("PLINK BED to SeqArray GDS:\n")
    }

    ##  open and detect bed.fn  ##
    bedfile <- .open_bin(bed.fn)
    on.exit({ .close_conn(bedfile) })
    b <- as.integer(readBin(bedfile$con, raw(), 3L))
    if ((b[1L] != 0x6C) || (b[2L] != 0x1B))
        stop(sprintf("Invalid magic number (0x%02X,0x%02X).", b[1L], b[2L]))
    bed_flag <- b[3L] == 0L
    if (verbose)
    {
        cat("    BED file: ", sQuote(bed.fn), "\n        ",
            .pretty_size(file.size(bed.fn)), ", ",
            ifelse(bed_flag, "sample-major mode: [SNP, sample]",
            "SNP-major mode: [sample, SNP]"), "\n", sep="")
    }

    ##  read fam.fn  ##
    if (verbose)
        cat("    FAM file: ", sQuote(fam.fn), "\n", sep="")
    f <- .open_text(fam.fn, TRUE)
    famD <- read.table(f$con, header=FALSE, comment.char="",
        stringsAsFactors=FALSE)
    .close_conn(f)
    names(famD) <- c("FamilyID", "InvID", "PatID", "MatID", "Sex", "Pheno")
    if (anyDuplicated(famD$InvID) == 0L)
    {
        sample.id <- famD$InvID
    } else {
        sample.id <- paste(famD$FamilyID, famD$InvID, sep="-")
        if (length(unique(sample.id)) != dim(famD)[1])
            stop("Sample IDs in PLINK BED are not unique!")
    }
    if (verbose)
    {
        n <- nrow(famD)
        cat("        ",
            .pretty_size(file.size(fam.fn)), ", ",
            .pretty(n), " sample", .plural(n), "\n", sep="")
    }

    ##  read bim.fn  ##
    if (verbose)
        cat("    BIM file: ", sQuote(bim.fn), "\n", sep="")
    f <- .open_text(bim.fn, TRUE)
    bimD <- read.table(f$con, header=FALSE, comment.char="",
        stringsAsFactors=FALSE)
    .close_conn(f)
    names(bimD) <- c("chr", "snp.id", "map", "pos", "allele1", "allele2")
    if (verbose)
    {
        n <- nrow(bimD)
        cat("        ",
            .pretty_size(file.size(bim.fn)), ", ",
            .pretty(n), " variant", .plural(n), "\n", sep="")
    }
    if (chr.conv)
    {
        x <- bimD$chr
        x23 <- sum(x==23L, na.rm=TRUE)
        x24 <- sum(x==24L, na.rm=TRUE)
        x25 <- sum(x==25L, na.rm=TRUE)
        x26 <- sum(x==26L, na.rm=TRUE)
        if (x23 && verbose)
            cat("        chromosome code 23 => X (", .pretty(x23), ")\n", sep="")
        if (x23) x[x == 23L] <- "X"
        if (x24 && verbose)
            cat("        chromosome code 24 => Y (", .pretty(x24), ")\n", sep="")
        if (x24) x[x == 24L] <- "Y"
        if (x25 && verbose)
            cat("        chromosome code 25 => XY (", .pretty(x25), ")\n", sep="")
        if (x25) x[x == 25L] <- "XY"
        if (x26 && verbose)
            cat("        chromosome code 26 => MT (", .pretty(x26), ")\n", sep="")
        if (x26) x[x == 26L] <- "MT"
        bimD$chr <- x
    }

    ##  create GDS file  ##

    dstfile <- createfn.gds(out.gdsfn)
    # close the file at the end
    on.exit({ closefn.gds(dstfile) }, add=TRUE)
    if (verbose)
        cat("    Output:\n        ", out.gdsfn, "\n", sep="")

    put.attr.gdsn(dstfile$root, "FileFormat", "SEQ_ARRAY")
    put.attr.gdsn(dstfile$root, "FileVersion", "v1.0")

    n <- addfolder.gdsn(dstfile, "description")
    put.attr.gdsn(n, "source.format", "PLINK BED Format")

    # add sample.id
    if (verbose) cat("    sample.id  ")
    n <- add.gdsn(dstfile, "sample.id", sample.id, compress=compress.annotation,
        closezip=TRUE)
    .DigestCode(n, digest, verbose, FALSE)

    # add variant.id
    if (verbose) cat("    variant.id  ")
    n <- add.gdsn(dstfile, "variant.id", seq_len(nrow(bimD)),
        compress=compress.annotation, closezip=TRUE)
    .DigestCode(n, digest, verbose, FALSE)

    # add position
    if (verbose) cat("    position  ")
    n <- add.gdsn(dstfile, "position", bimD$pos, storage="int32",
        compress=compress.annotation, closezip=TRUE)
    .DigestCode(n, digest, verbose, FALSE)

    # add chromosome
    if (verbose) cat("    chromosome  ")
    n <- add.gdsn(dstfile, "chromosome", bimD$chr, storage="string",
        compress=compress.annotation, closezip=TRUE)
    .DigestCode(n, digest, verbose, FALSE)
    # RLE-coded chromosome
    .optim_chrom(dstfile)

    # add allele
    if (verbose) cat("    allele  ")
    n <- add.gdsn(dstfile, "allele", paste(bimD$allele2, bimD$allele1, sep=","),
        storage="string", compress=compress.annotation, closezip=TRUE)
    .DigestCode(n, digest, verbose, FALSE)

    # add a folder for genotypes
    if (verbose)
        cat("    genotype", ifelse(verbose, ":\n", ""), sep="")
    n <- addfolder.gdsn(dstfile, "genotype")
    put.attr.gdsn(n, "VariableName", "GT")
    put.attr.gdsn(n, "Description", "Genotype")

    # sync file
    sync.gds(dstfile)

    # add genotypes to genotype/data
    num4 <- ifelse(bed_flag, nrow(bimD), nrow(famD))
    cnt4 <- ifelse(bed_flag, nrow(famD), nrow(bimD))
    vg <- add.gdsn(n, "data", storage="bit2", valdim=c(2L, num4, 0L),
        compress=ifelse(bed_flag, "", compress.geno))

    # num of cores/processes
    if (pnum <= 1L)
    {
        # convert
        .Call(SEQ_ConvBED2GDS, vg, cnt4, bedfile$con, readBin, new.env(),
            if (verbose) stdout() else NULL)
        readmode.gdsn(vg)

    } else {
        # close bed file
        on.exit()
        .close_conn(bedfile)
        bedfile <- NULL
        # split to multiple files
        psplit <- .file_split(cnt4, pnum)
        # need unique temporary file names
        ptmpfn <- .get_temp_fn(pnum,
            sub("^([^.]*).*", "\\1", basename(out.gdsfn)), dirname(out.gdsfn))
        if (verbose)
        {
            cat(sprintf("    >>> writing to %d files: <<<\n", pnum))
            cat(sprintf("        %s\t[%s .. %s]\n", basename(ptmpfn),
                .pretty(psplit[[1L]]),
                .pretty(psplit[[1L]] + psplit[[2L]] - 1L)), sep="")
            flush.console()
        }
        # working flags
        .packageEnv$work_idx <- 1L
        .packageEnv$work_flag <- rep(FALSE, pnum)

        # conversion in parallel
        seqParallel(parallel, NULL, FUN = function(bed.fn, tmp.fn, num4, psplit, cp)
        {
            # the process id, starting from one
            i <- process_index
            # open the bed file
            bedfile <- .open_bin(bed.fn)
            on.exit({ .close_conn(bedfile) })
            # create gds file
            f <- createfn.gds(tmp.fn[i])
            on.exit({ closefn.gds(f) }, add=TRUE)
            # progress file
            progfile <- file(paste0(tmp.fn[i], ".progress"), "wt")
            cat(tmp.fn[i], ":\n", file=progfile, sep="")
            on.exit({ close(progfile) }, add=TRUE)
            # new a gds node
            vg <- add.gdsn(f, "data", storage="bit2", valdim=c(2L, num4, 0L),
                compress=cp)
            # re-position the file
            cnt <- psplit[[2L]][i]
            if (cnt > 0L)
            {
                n4 <- (num4 %/% 4L) + (num4 %% 4L > 0L)
                n4 <- 3L + n4 * (psplit[[1L]][i] - 1L)
                if (bedfile$fmt == "")
                {
                    seek(bedfile$con, n4)
                } else {
                    # gz or xz
                    while (n4 > 0L)
                    {
                        m <- if (n4 <= 65536L) n4 else 65536L
                        readBin(bedfile$con, raw(), m)
                        n4 <- n4 - m
                    }
                }
                # convert
                .Call(SEQ_ConvBED2GDS, vg, cnt, bedfile$con, readBin, new.env(),
                    progfile)
            }
            readmode.gdsn(vg)
            # output
            process_index

        }, split="none", bed.fn=bed.fn, tmp.fn=ptmpfn, num4=num4,
            psplit=psplit, cp=ifelse(bed_flag, "", compress.geno),
            # combine the files as many as possible
            .combine = function(fn_idx)
            {
                # set TRUE to indicate the file completed
                .packageEnv$work_flag[fn_idx] <- TRUE
                if (verbose && fn_idx==1L)
                    cat("    >>> merging the files: <<<\n")
                # check whether merging the file or not
                if (.packageEnv$work_idx == fn_idx)
                {
                    while (isTRUE(.packageEnv$work_flag[fn_idx]))
                    {
                        if (verbose)
                            cat("       ", basename(ptmpfn[fn_idx]))
                        if (psplit[[2L]][fn_idx] > 0L)
                        {
                            f <- openfn.gds(ptmpfn[fn_idx])
                            append.gdsn(vg, index.gdsn(f, "data"))
                            closefn.gds(f)
                        }
                        if (verbose) cat("\t[Done]\n")
                        fn_idx <- fn_idx + 1L
                    }
                    .packageEnv$work_idx <- fn_idx
                }
            })

        # delete temporary files
        unlink(c(ptmpfn, paste0(ptmpfn, ".progress")), force=TRUE)
        .packageEnv$work_idx <- NULL
        .packageEnv$work_flag <- NULL
        if (verbose && !isFALSE(digest)) cat("    ")
    }

    n1 <- add.gdsn(n, "@data", storage="uint8", compress=compress.annotation,
        visible=FALSE)
    .append_rep_gds(n1, as.raw(1L), nrow(bimD))
    readmode.gdsn(n1)
    .DigestCode(n1, digest, FALSE)

    n1 <- add.gdsn(n, "extra.index", storage="int32", valdim=c(3L,0L),
        compress=compress.geno, closezip=TRUE)
    put.attr.gdsn(n1, "R.colnames",
        c("sample.index", "variant.index", "length"))
    add.gdsn(n, "extra", storage="int16", compress=compress.geno, closezip=TRUE)

    # sync file
    sync.gds(dstfile)

    # close the BED file
    on.exit({ closefn.gds(dstfile) })
    if (!is.null(bedfile)) .close_conn(bedfile)

    if (bed_flag)
    {
        cat(" (transposed)")
        permdim.gdsn(vg, c(2L,1L))
    }
    if (verbose) cat("  ")
    .DigestCode(vg, digest, verbose)

    # add a folder for phase information
    if (verbose) cat("    phase")
    n <- addfolder.gdsn(dstfile, "phase")

    # add phase data
    n1 <- add.gdsn(n, "data", storage="bit1", valdim=c(nrow(famD), 0L),
        compress=compress.annotation)
    if (pnum <= 1L)
    {
        .append_rep_gds(n1, as.raw(0L), as.double(nrow(bimD))*nrow(famD))
    } else {
        # split to multiple files
        psplit <- .file_split(nrow(bimD), pnum)
        if (verbose)
            cat(sprintf(":\n    >>> writing to %d files: <<<\n", pnum))
        # conversion in parallel
        seqParallel(parallel, NULL, FUN = function(tmp.fn, nsamp, psplit, cp)
        {
            # the process id, starting from one
            i <- process_index
            # create gds file
            f <- createfn.gds(tmp.fn[i])
            on.exit(closefn.gds(f))
            # new a gds node
            vg <- add.gdsn(f, "data", storage="bit1", valdim=c(nsamp, 0L), compress=cp)
            # re-position the file
            cnt <- psplit[[2L]][i]
            if (cnt > 0L)
                .append_rep_gds(vg, as.raw(0L), as.double(cnt)*nsamp)
            readmode.gdsn(vg)
            invisible()
        }, split="none", tmp.fn=ptmpfn, nsamp=nrow(famD), psplit=psplit,
            cp=compress.annotation)

        if (verbose) cat("        merging files ... [")
        lapply(seq_along(ptmpfn), function(i)
        {
            cat(ifelse(i>1L, ",", ""), i, sep="")
            fn <- ptmpfn[i]
            f <- openfn.gds(fn)
            on.exit({ closefn.gds(f); unlink(fn, force=TRUE) })
            append.gdsn(n1, index.gdsn(f, "data"))
        })
        if (verbose) cat(" Done]\n      ")
    }
    readmode.gdsn(n1)
    .DigestCode(n1, digest, verbose)

    n1 <- add.gdsn(n, "extra.index", storage="int32", valdim=c(3L,0L),
        compress=compress.annotation, closezip=TRUE)
    put.attr.gdsn(n1, "R.colnames",
        c("sample.index", "variant.index", "length"))
    add.gdsn(n, "extra", storage="bit1", compress=compress.annotation,
        closezip=TRUE)


    # add annotation folder
    n <- addfolder.gdsn(dstfile, "annotation")

    # add annotation/id
    n1 <- add.gdsn(n, "id", bimD$snp.id, storage="string",
        compress=compress.annotation, closezip=TRUE)
    if (verbose) cat("    annotation/id")
    .DigestCode(n1, digest, verbose)

    # add annotation/qual
    n1 <- add.gdsn(n, "qual", storage="float", compress=compress.annotation)
    .append_rep_gds(n1, 100.0, nrow(bimD))
    readmode.gdsn(n1)
    if (verbose) cat("    annotation/qual")
    .DigestCode(n1, digest, verbose)

    # add filter
    n1 <- add.gdsn(n, "filter", storage="int32", compress=compress.annotation)
    .append_rep_gds(n1, as.raw(1L), nrow(bimD))
    readmode.gdsn(n1)
    put.attr.gdsn(n1, "R.class", "factor")
    put.attr.gdsn(n1, "R.levels", c("PASS"))
    put.attr.gdsn(n1, "Description", c("All filters passed"))
    if (verbose) cat("    annotation/filter")
    .DigestCode(n1, digest, verbose)

    # add the INFO field
    addfolder.gdsn(n, "info")
    # add the FORMAT field
    addfolder.gdsn(n, "format")

    # add sample annotation
    if (verbose) cat("    sample.annotation\n")
    n <- addfolder.gdsn(dstfile, "sample.annotation")

    if (length(include.pheno) && verbose) cat("       ")
    if ("family" %in% include.pheno)
    {
        if (verbose) cat(" family")
        n1 <- add.gdsn(n, "family", famD$FamilyID, compress=compress.annotation,
            closezip=TRUE)
        .DigestCode(n1, digest, FALSE)
    }
    if ("father" %in% include.pheno)
    {
        if (verbose) cat(" father")
        n1 <- add.gdsn(n, "father", famD$PatID, compress=compress.annotation,
            closezip=TRUE)
        .DigestCode(n1, digest, FALSE)
    }
    if ("mother" %in% include.pheno)
    {
        if (verbose) cat(" mother")
        n1 <- add.gdsn(n, "mother", famD$MatID, compress=compress.annotation,
            closezip=TRUE)
        .DigestCode(n1, digest, FALSE)
    }
    if ("sex" %in% include.pheno)
    {
        if (verbose) cat(" sex")
        sex <- rep("", length(sample.id))
        sex[famD$Sex==1L] <- "M"; sex[famD$Sex==2L] <- "F"
        n1 <- add.gdsn(n, "sex", sex, compress=compress.annotation,
            closezip=TRUE)
        .DigestCode(n1, digest, FALSE)
    }
    if ("phenotype" %in% include.pheno)
    {
        if (verbose) cat(" phenotype")
        n1 <- add.gdsn(n, "phenotype", famD$Pheno, compress=compress.annotation,
            closezip=TRUE)
        .DigestCode(n1, digest, FALSE)
    }
    if (length(include.pheno) && verbose) cat("\n")

    ##################################################
    # optimize access efficiency

    if (verbose)
        cat("Done.\n", date(), "\n", sep="")
    on.exit()
    closefn.gds(dstfile)
    if (optimize)
    {
        if (verbose)
            cat("Optimize the access efficiency ...\n")
        cleanup.gds(out.gdsfn, verbose=verbose)
        if (verbose) cat(date(), "\n", sep="")
    }

    # output
    invisible(normalizePath(out.gdsfn))
}



#######################################################################
# Convert a SeqArray GDS file to a PLINK BED file
#

seqGDS2BED <- function(gdsfile, out.fn,
    write.rsid=c("auto", "annot_id", "chr_pos_ref_alt"), multi.row=FALSE,
    verbose=TRUE)
{
    # check
    stopifnot(is.character(gdsfile) | inherits(gdsfile, "SeqVarGDSClass"))
    stopifnot(is.character(out.fn), length(out.fn)==1L, !is.na(out.fn))
    stopifnot(is.logical(multi.row), length(multi.row)==1L)
    stopifnot(is.logical(verbose), length(verbose)==1L)
    write.rsid <- match.arg(write.rsid)
    if (multi.row)
        stop("'multi.row=TRUE' is reserved for future implementation.")

    if (verbose)
    {
        .cat(date())
        .cat("SeqArray GDS to PLINK BED:")
    }
    if (is.character(gdsfile))
    {
        if (verbose)
            .cat("    open ", sQuote(gdsfile))
        gdsfile <- seqOpen(gdsfile, allow.duplicate=TRUE)
        on.exit(seqClose(gdsfile))
    }
    if (verbose)
    {
        dm <- .seldim(gdsfile)
        .cat("    # of samples: ", .pretty(dm[2L]))
        .cat("    # of variants: ", .pretty(dm[3L]))
    }

    # fam file
    s <- seqGetData(gdsfile, "sample.id")
    n <- length(s)
    fam <- data.frame(FID=rep(0L, n), IID=s, FAT=rep(0L, n), MOT=rep(0L, n),
        sex=rep(0L, n), pheno=rep(-9L, n), stringsAsFactors=FALSE)
    nm <- "sample.annotation/sex"
    if (exist.gdsn(gdsfile, nm))
        fam$sex <- seqGetData(gdsfile, nm)
    nm <- "sample.annotation/phenotype"
    if (exist.gdsn(gdsfile, nm))
        fam$pheno <- seqGetData(gdsfile, nm)
    famfn <- paste0(out.fn, ".fam")
    if (verbose)
        .cat("    fam file: ", sQuote(famfn))
    write.table(fam, file=famfn, quote=FALSE, sep="\t", row.names=FALSE,
        col.names=FALSE)
    remove(fam)

    # bim file
    n <- .seldim(gdsfile)[3L]
    # rs id
    if (write.rsid == "annot_id")
    {
        rsid <- seqGetData(gdsfile, "annotation/id")
        rsid[is.na(rsid)] <- ""
    } else if (write.rsid == "chr_pos_ref_alt")
    {
        rsid <- gsub(":|,", "_", seqGetData(gdsfile, "$chrom_pos_allele"))
        rsid[is.na(rsid)] <- ""
    } else {
        rsid <- seqGetData(gdsfile, "annotation/id")
        rsid[is.na(rsid)] <- ""
        x <- rsid %in% c("", ".")
        if (any(x))
        {
            rsid[x] <- gsub(":|,", "_",
                seqGetData(gdsfile, "$chrom_pos_allele")[x])
        }
    }
    # ref, alt
    s <- seqGetData(gdsfile, "$alt"); s[s == ""] <- "0"
    bim <- data.frame(
        chr = seqGetData(gdsfile, "chromosome"),
        var = rsid,
        mrg = rep(0L, n),
        pos = seqGetData(gdsfile, "position"),
        alt1 = gsub(",", "/", s, fixed=TRUE),
        alt2 = seqGetData(gdsfile, "$ref"),
        stringsAsFactors=FALSE)
    bimfn <- paste0(out.fn, ".bim")
    if (verbose)
        .cat("    bim file: ", sQuote(bimfn))
    write.table(bim, file=bimfn, quote=FALSE, sep="\t", row.names=FALSE,
        col.names=FALSE)
    remove(bim)
    
    # bed file
    bedfn <- paste0(out.fn, ".bed")
    if (verbose)
        .cat("    bed file: ", sQuote(bedfn))
    outf <- file(bedfn, "w+b")
    on.exit(close(outf), add=TRUE)
    writeBin(as.raw(c(0x6C, 0x1B, 0x01)), outf)

    # using genotypes or dosages
    gv <- .has_geno(gdsfile)
    if (gv)
    {
        nm <- "$dosage_alt"
        ploidy <- .dim(gdsfile)[1L]
    } else {
        nm <- .has_dosage(gdsfile)
        if (verbose)
            .cat("    using ", sQuote(nm))
        ploidy <- getOption("seqarray.ploidy", 2L)[1L]
    }
    if (is.na(ploidy))
        stop("'ploidy' is not known.")
    else if (ploidy != 2L)
        stop("'ploidy' should be 2 for diploidy.")
    # call C function
    seqApply(gdsfile, nm, .cfunction("FC_GDS2BED"), as.is=outf,
        .useraw=TRUE, .progress=verbose)

    if (verbose) .cat("Done.\n", date())

    # output
    invisible(normalizePath(c(famfn, bimfn, bedfn)))
}



#######################################################################
# Create a SeqArray GDS file
#

seqEmptyFile <- function(outfn, sample.id=character(), numvariant=1L,
    verbose=TRUE)
{
    #check
    stopifnot(is.character(outfn), length(outfn)==1L)
    stopifnot(is.vector(sample.id))
    stopifnot(is.numeric(numvariant), length(numvariant)==1L, numvariant>=0L)
    stopifnot(is.logical(verbose), length(verbose)==1L)

    # create a new GDS file
    f <- createfn.gds(outfn)
    on.exit({ if (!is.null(f)) closefn.gds(f) }, add=TRUE)
    if (verbose)
        .cat("Output: ", sQuote(outfn))
    put.attr.gdsn(f$root, "FileFormat", "SEQ_ARRAY")
    put.attr.gdsn(f$root, "FileVersion", "v1.0")
    addfolder.gdsn(f, "description")

    # add sample.id
    add.gdsn(f, "sample.id", sample.id, closezip=TRUE,
        compress=ifelse(length(sample.id), "LZMA_ra", ""))

    # add basic site information
    add.gdsn(f, "variant.id", seq_len(numvariant))
    add.gdsn(f, "position", integer(numvariant))
    add.gdsn(f, "chromosome", character(numvariant))
    add.gdsn(f, "allele", character(numvariant))

    # add folders
    n <- addfolder.gdsn(f, "genotype")
    put.attr.gdsn(n, "VariableName", "GT")
    put.attr.gdsn(n, "Description", "Genotype")
    addfolder.gdsn(f, "phase")
    nd <- addfolder.gdsn(f, "annotation")
    add.gdsn(nd, "id", character(numvariant))
    add.gdsn(nd, "qual", double(numvariant), storage="float")
    n <- add.gdsn(nd, "filter", rep(1L, numvariant), storage="int32")
    put.attr.gdsn(n, "R.class", "factor")
    put.attr.gdsn(n, "R.levels", c("PASS"))
    put.attr.gdsn(n, "Description", c("All filters passed"))
    addfolder.gdsn(nd, "info")
    addfolder.gdsn(nd, "format")
    addfolder.gdsn(f, "sample.annotation")

    invisible()
}
zhengxwen/SeqArray documentation built on April 14, 2025, 2:19 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
zhengxwen/SeqArray
Data management of large-scale whole-genome sequence variant calls using GDS files

R/Conversion.R
In zhengxwen/SeqArray: Data management of large-scale whole-genome sequence variant calls using GDS files

Defines functions seqEmptyFile seqGDS2BED seqBED2GDS seqSNP2GDS seqGDS2SNP seqGDS2VCF .bgzf_create

Documented in seqBED2GDS seqEmptyFile seqGDS2BED seqGDS2SNP seqGDS2VCF seqSNP2GDS

R Package Documentation

Browse R Packages

We want your feedback!

zhengxwen/SeqArray Data management of large-scale whole-genome sequence variant calls using GDS files

R/Conversion.R In zhengxwen/SeqArray: Data management of large-scale whole-genome sequence variant calls using GDS files

Defines functions seqEmptyFile seqGDS2BED seqBED2GDS seqSNP2GDS seqGDS2SNP seqGDS2VCF .bgzf_create

Documented in seqBED2GDS seqEmptyFile seqGDS2BED seqGDS2SNP seqGDS2VCF seqSNP2GDS

R Package Documentation

Browse R Packages

We want your feedback!

zhengxwen/SeqArray
Data management of large-scale whole-genome sequence variant calls using GDS files

R/Conversion.R
In zhengxwen/SeqArray: Data management of large-scale whole-genome sequence variant calls using GDS files