R/makeVariantExperimentFromSNPGDS.R

Defines functions makeVariantExperimentFromSNPGDS .showAvailable_snparray .showAvailable_check_dims .showAvailable_check .colData_snpgds .empty_colData_DF .colDataColumns_check_msg .colDataColumns_check .sampnode_snpgds_ondisk .sampnodes_snpgds .rowRanges_snpgds .empty_rowData_DF .rowDataColumns_check_msg .rowDataColumns_check .varnode_snpgds_inmem .granges_snpgds

Documented in makeVariantExperimentFromSNPGDS

## In "makeVariantExperimentFromSNPGDS()", we have customized the "ID"
## (from "snp.rs.id" node if available, "ALLELE1" and "ALLELE2" (from
## "snp.allele" node if available) columns as part of the
## mcols(rowRanges(ve)). "snp.id" was used as default feature id, and
## "sample.id" was used as default sample id. These 2 nodes are
## required by "SNPGDSFileClass" in SNPRelate. 

#' @import GDSArray
#' @importFrom GenomicRanges GRanges ranges seqnames 
#' @importFrom IRanges IRanges
#' @import gdsfmt
#' @import SNPRelate

## .granges_gdsdata
.granges_snpgds <- function(snpgdsfile, ...){
    f <- acquireGDS(snpgdsfile, type = "snpgds")
    vid <- read.gdsn(index.gdsn(f, "snp.id"))
    chr <- read.gdsn(index.gdsn(f, "snp.chromosome"))
    pos <- read.gdsn(index.gdsn(f, "snp.position"))
    ## reflen <- nchar(seqGetData(x, "$ref"))  
    ## reflen[reflen < 1L] <- 1L
    reflen <- 1L  ## all snps, length == 1L
    gr <- GenomicRanges::GRanges(seqnames=chr,
                                 ranges=IRanges(start=pos, end=pos+reflen-1L),
                                 ...)
    names(gr) <- as.integer(vid)
    gr
}

## return a DataFrame
.varnode_snpgds_inmem <- function(snpgdsfile, node){
    f <- acquireGDS(snpgdsfile)
    res <- read.gdsn(index.gdsn(f, node))
    resDF <- setNames(DataFrame(res), node)
    if(node == "snp.allele"){
        a <- strsplit(res, split="/")
        ## genotype data, possible for >2 alt? if yes, use sub() here
        ## and use DNAStringSetList class for "allele2"
        a1 <- Biostrings::DNAStringSet(unlist(a)[c(TRUE, FALSE)])
        a2 <- Biostrings::DNAStringSet(unlist(a)[c(FALSE, TRUE)]) 
        resDF <- setNames(DataFrame(a1, a2), paste0("snp.allele", seq_len(2)))
    }
    resDF  ## returns a DataFrame with names.
}

## #' @importFrom methods new

.rowDataColumns_check <- function(gdsfile, ftnode, rowDataColumns) {
    if (is.null(rowDataColumns)) {
        rowDataColumns <- showAvailable(gdsfile, args = "rowDataColumns", ftnode = ftnode)[[1]]
    } else {
        idx.within <- rowDataColumns %in%
            showAvailable(gdsfile, args = "rowDataColumns", ftnode = ftnode)[[1]]
        if(any(!idx.within)) {
            misrdcs <- paste(rowDataColumns[!idx.within], collapse = ", ")
            warning(.rowDataColumns_check_msg(misrdcs))
            rowDataColumns <- rowDataColumns[idx.within]
            ## if none, return character(0)
        }
    }
    tolower(rowDataColumns)
}

.rowDataColumns_check_msg <- function(missingRDCs) {
    paste('The feature (snp/variant) annotation of "',
          missingRDCs, 
          '" does not exist!', "\n",
          'Please use showAvailable(file, "rowDataColumns") ',
          'to get the available columns for "rowData()."', "\n")
}

## "snp.id", "variant.id"
.empty_rowData_DF <- function(gdsfile, ftnode, rowDataOnDisk) {
    f <- acquireGDS(gdsfile)
    ft.id <- read.gdsn(index.gdsn(f, ftnode))
    if (rowDataOnDisk) {
        DelayedDataFrame(row.names=ft.id)
    } else {
        DataFrame(row.names=ft.id)
    }
}

#' @importFrom Biostrings DNAStringSet
#' @import SNPRelate
#' @importMethodsFrom DelayedArray sub

## .rowRanges_gdsdata
.rowRanges_snpgds <- function(snpgdsfile, ftnode = "snp.id", rowDataColumns, rowDataOnDisk){
    
    rr <- .granges_snpgds(snpgdsfile)
    ## following code generates the mcols(SummarizedExperiment::rowRanges(se))
    rowDataColumns <- .rowDataColumns_check(snpgdsfile, ftnode, rowDataColumns)

    ## if no available rowDataColumns are selected, i.e.,
    ## rowDataColumns = character(0), return an empty (Delayed)DataFrame
    ## for mcols()
    if (is.character(rowDataColumns) && length(rowDataColumns) == 0) {
        resDF <- .empty_rowData_DF(snpgdsfile, ftnode, rowDataOnDisk)
        mcols(rr) <- resDF
        return(rr)
    }
    ## if there are valid rowDataColumns
    if(rowDataOnDisk){
        res <- setNames(
            lapply(rowDataColumns, function(x) GDSArray(snpgdsfile, x)), 
            ## .varnode_snpgds_ondisk(snpgdsfile, name=x)),
            rowDataColumns)
        resDF <- DelayedDataFrame(lapply(res, I))
        if("snp.allele" %in% names(resDF)){ ## for snpgds
            resDF$snp.allele1 <- sub("/.$", "", resDF$snp.allele)
            resDF$snp.allele2 <- sub("[TCGA]*/", "", resDF$snp.allele)
            resDF[["snp.allele"]] <- NULL 
        }
    }else{ ## rowDataOnDisk = FALSE...
        resDF <- DataFrame(lapply(rowDataColumns, function(x)
            .varnode_snpgds_inmem(snpgdsfile, x)))
    }
    mcols(rr) <- resDF
    rr
}

.sampnodes_snpgds <- function(snpgdsfile) {
    f <- acquireGDS(snpgdsfile)
    ls.gdsn(index.gdsn(f, "sample.annot"))
    
}

## returns Delayed/GDSArray, not DF.
.sampnode_snpgds_ondisk <- function(snpgdsfile, name) {
    sampnodes <- .sampnodes_snpgds(snpgdsfile)
    if(name %in% sampnodes){
        node <- paste0("sample.annot/", name)
    }else{
        node <- name
    }
    GDSArray(snpgdsfile, node)
}

###
## colData for samples
###

.colDataColumns_check <- function(gdsfile, colDataColumns, smpnode) {
    allcols <- showAvailable(gdsfile, args = "colDataColumns", smpnode = smpnode)[[1]]
    if (is.null(colDataColumns)) {
        colDataColumns <- allcols
    } else {
        idx.within <- colDataColumns %in% allcols
        if(any(!idx.within)) {
            misrdcs <- paste(colDataColumns[!idx.within], collapse = ", ")
            warning(.colDataColumns_check_msg(misrdcs))
            colDataColumns <- colDataColumns[idx.within]
            ## if none, return character(0)
        }
    }
    colDataColumns
}

.colDataColumns_check_msg <- function(missingCDCs) {
    paste("\n", 'The sample annotation of "',
          missingCDCs,
          '" does not exist!', "\n",
          'Please use showAvailable(file, "colDataColumns") ',
          'to get the available columns for "colData()."', "\n")
}

.empty_colData_DF <- function(gdsfile, smpnode, colDataOnDisk) {
    f <- acquireGDS(gdsfile)
    sample.id <- read.gdsn(index.gdsn(f, smpnode))
    if (colDataOnDisk) {
        DelayedDataFrame(row.names=sample.id)
    } else {
        DataFrame(row.names=sample.id)
    }
}

.colData_snpgds <- function(snpgdsfile, smpnode, colDataColumns, colDataOnDisk) {
    colDataColumns <- .colDataColumns_check(snpgdsfile, colDataColumns, "sample.id")

    ## if no available colDataColumns are selected, i.e.,
    ## colDataColumns = character(0), return an empty (Delayed)DataFrame    
    if (is.character(colDataColumns) && length(colDataColumns) == 0) { ## character(0)
        .empty_colData_DF(snpgdsfile, "sample.id", colDataOnDisk)
    } else { ## if there are valid rowDataColumns
        if (colDataOnDisk) {
            sample.id <- .sampnode_snpgds_ondisk(snpgdsfile, smpnode)
            annot <- setNames(
                lapply(colDataColumns, function(x)
                    .sampnode_snpgds_ondisk(snpgdsfile, x)),
                colDataColumns)
            DelayedDataFrame(lapply(annot, I),
                             row.names=as.character(sample.id))
        } else {
            f <- acquireGDS(snpgdsfile)
            sample.id <- read.gdsn(index.gdsn(f, smpnode))
            node <- paste0("sample.annot/", colDataColumns)
            annot <- lapply(node, function(x) read.gdsn(index.gdsn(f, x)))
            names(annot) <- colDataColumns
            DataFrame(annot, row.names=sample.id)
        }
    }
}

.showAvailable_check <- function(file, args = c("colDataColumns", "infoColumns"), ftnode, smpnode) {
    ff <- .get_gds_fileFormat(file)
    smpannonode <- switch(ff, SEQ_ARRAY = "sample.annotation", SNP_ARRAY = "sample.annot")
    infonode <- "annotation/info"  ## only applies to 'SEQ_ARRAY'
    res <- switch(args,
                  colDataColumns = .showAvailable_check_dims(file, smpannonode, smpnode),
                  infoColumns = .showAvailable_check_dims(file, infonode, ftnode),
                  character(0))
    return(res)
}

.showAvailable_check_dims <- function(file, nodename, idxnode) {
    allnodes <- gdsnodes(file)
    if (any(grepl(nodename, allnodes))) {
        nds <- gdsnodes(file, nodename)
        ndlens <- vapply(nds, function(x) .get_gdsnode_desp(file, x, "dim"), integer(1))
        nds <- nds[ndlens == .get_gdsnode_desp(file, idxnode, "dim")]
            res <- gsub(".*\\/", "", nds)
        } else {
            res <- character(0)
        }
    res
}    

## http://bioconductor.org/packages/release/bioc/vignettes/SNPRelate/inst/doc/SNPRelate.html
## SNPRelate requires 5 variables: sample.id, snp.id, snp.position, snp.chromosome, genotype
## 2 optional variables: snp.rs.id, snp.allele (when merging gentypes from diff platforms)
.showAvailable_snparray <- function(file,
                                    args = c("assayNames",
                                             "rowDataColumns",
                                             "colDataColumns"),
                                    ftnode = "snp.id",
                                    smpnode = "sample.id") {
    res <- CharacterList()
    allnodes <- gdsnodes(file)
    if("assayNames" %in% args)
        res$assayNames <- "genotype"
    if("rowDataColumns" %in% args) {
        ## "snp.rs.id" and "snp.allele" are options gds nodes for "SNP_ARRAY"
        rowcols <- allnodes[allnodes %in% c("snp.rs.id", "snp.allele")]
        if (length(rowcols) > 0) {
            rowlens <- vapply(rowcols, function(x) .get_gdsnode_desp(file, x, "dim"), integer(1))
            rowcols <- rowcols[rowlens == .get_gdsnode_desp(file, ftnode, "dim")]
        }
        res$rowDataColumns <- rowcols
    }
    if ("colDataColumns" %in% args) {
        res$colDataColumns <- .showAvailable_check(file, "colDataColumns", ftnode, smpnode)
    }
    res
}

#' @rdname makeVariantExperimentFromGDS
#' @export
#' 
makeVariantExperimentFromSNPGDS <- function(file,
                                            ftnode = "snp.id",
                                            smpnode = "sample.id", 
                                            assayNames=NULL,
                                            rowDataColumns = NULL,
                                            colDataColumns = NULL,
                                            rowDataOnDisk = TRUE,
                                            colDataOnDisk = TRUE)
{
    ## checkings
    if (!isSingleString(file))
        stop(wmsg("'file' must be a single string specifying the path to ",
                  "the gds file where the dataset is located."))
    file <- tools::file_path_as_absolute(file)
    stopifnot(is.character(assayNames) | is.null(assayNames))
    ## if (!isSingleStringOrNA(assayNames))
    ##     stop("'assayNames' must be a single string or NA")
    if(!isTRUEorFALSE(colDataOnDisk))
        stop("`colDataOnDisk` must be logical.")
    if(!isTRUEorFALSE(rowDataOnDisk))
        stop("`rowDataOnDisk` must be logical.")

    allnames <- showAvailable(file)$assayNames
    if (is.null(assayNames)) {
        assayNames <- allnames
    } else {
        assayNames <- match.arg(assayNames, assayNames)
    }

    ## colData 
    colData <- .colData_snpgds(file, smpnode, colDataColumns, colDataOnDisk)

    ## rowRange with info data if available for seqVarGDSClass
    rowRange <- .rowRanges_snpgds(file, ftnode, rowDataColumns, rowDataOnDisk)

    ## assays: make sure all assays are in correct dimensions (feature*sample*else) 
    assays <- setNames(lapply(assayNames, function(x) GDSArray(file, x)), assayNames)
    ans_nrow <- length(rowRange)
    ans_ncol <- nrow(colData)
    assays <- lapply(assays, .permdim, dim1 = ans_nrow, dim2 = ans_ncol)
    
    se <- VariantExperiment(
        assays = assays,
        colData = colData,
        rowRanges = rowRange)
}
Bioconductor/VariantExperiment documentation built on April 20, 2024, 3:02 p.m.