R/read.bismark.R

Defines functions read.bismark .constructCounts .constructCountsFromSingleFile .readBismarkAsDT .constructFWGRangesFromBismarkFiles .readBismarkAsFWGRanges .guessBismarkFileType

Documented in read.bismark

# Internal functions -----------------------------------------------------------

# TODO: Test against some of Bismark's other output files. May need a stricter
#       and more accurate heuristic to avoid 'passing' bad files.
# TODO: Test for 'bismark_methylation_extractor' and 'bedGraph' formats
#       (https://github.com/FelixKrueger/Bismark/tree/master/Docs#output-1)
#       and error out (they're not supported by .readBismarkAsDT()).
# Guess the Bismark output format of each file by counting the tab-delimited
# fields in its first 'n_max' rows: 6 fields => a '.cov' file,
# 7 fields => a 'cytosine report' file, anything else => NA (unknown).
# Returns a character vector with one element per input file.
.guessBismarkFileType <- function(files, n_max = 10L) {
    guess_one <- function(file) {
        preview <- read.delim(
            file = file,
            header = FALSE,
            nrows = n_max,
            stringsAsFactors = FALSE)
        n_fields <- ncol(preview)
        if (isTRUE(all(n_fields == 6L))) {
            "cov"
        } else if (isTRUE(all(n_fields == 7L))) {
            "cytosineReport"
        } else {
            # Couldn't guess the file type.
            NA_character_
        }
    }
    vapply(files, guess_one, character(1L))
}


# TODO: Document that the default 'sort = TRUE' applies sort(sortSeqlevels())
#       to the output. This is the default behaviour because it results in
#       the smallest returned object (albeit at the small cost of a sort).
# Read a single Bismark file and return its loci as a FWGRanges object.
#
# file:           Path to a Bismark '.cov' or 'cytosine report' file.
# rmZeroCov:      If TRUE, retain only loci with non-zero coverage
#                 (M + U > 0); this requires parsing the count columns.
# strandCollapse: If TRUE (and the file records strand), shift loci on the
#                 negative strand 1 bp to the left, merge with their positive
#                 strand counterpart, and drop strand.
# sort:           If TRUE (default), apply sort(sortSeqlevels()) to the
#                 output; this yields the smallest returned object (albeit at
#                 the small cost of a sort).
# nThread:        Number of threads used by data.table::fread().
# verbose:        Print progress messages?
.readBismarkAsFWGRanges <- function(file, rmZeroCov = FALSE,
                                    strandCollapse = FALSE, sort = TRUE,
                                    nThread = 1L, verbose = FALSE) {
    # Argument checks ----------------------------------------------------------

    stopifnot(isTRUEorFALSE(rmZeroCov))
    stopifnot(isTRUEorFALSE(strandCollapse))
    stopifnot(isTRUEorFALSE(sort))

    # Quieten R CMD check about 'no visible binding for global variable'
    M <- U <- NULL

    # Read file to construct data.table of valid loci --------------------------
    if (rmZeroCov) {
        # NOTE: Need the counts ('BSseq' col_spec) to identify zero-coverage
        #       loci. Fix: 'nThread' is now forwarded (previously this branch
        #       silently ignored it and fread() ran single-threaded).
        dt <- .readBismarkAsDT(
            file = file,
            col_spec = "BSseq",
            check = TRUE,
            nThread = nThread,
            verbose = verbose)
        if (strandCollapse && !is.null(dt[["strand"]]) &&
            !dt[, all(strand == "*")]) {
            # Shift loci on negative strand by 1 to the left and then remove
            # strand since no longer valid.
            dt[strand == "-", start := start - 1L][, strand := NULL]
            # Aggregate counts at loci with the same 'seqnames' and 'start'.
            dt <- dt[,
                     list(M = sum(M), U = sum(U)), by = c("seqnames", "start")]
        }
        # Identify loci with non-zero coverage then drop 'M' and 'U' as no
        # longer required.
        dt <- dt[(M + U) > 0][, c("M", "U") := list(NULL, NULL)]
    } else {
        dt <- .readBismarkAsDT(
            file = file,
            col_spec = "GRanges",
            check = FALSE,
            nThread = nThread,
            verbose = verbose)
        if (strandCollapse && !is.null(dt[["strand"]]) &&
            !dt[, all(strand == "*")]) {
            # Shift loci on negative strand by 1 to the left and then remove
            # strand since no longer valid.
            dt[strand == "-", start := start - 1L][, strand := NULL]
            # Deduplicate collapsed loci.
            # NOTE: Uses the exported unique() method (dedupes on all columns
            #       of an unkeyed data.table) rather than the unexported
            #       data.table:::funique().
            dt <- unique(dt)
        }
    }

    # Construct FWGRanges from 'dt' --------------------------------------------

    # NOTE: Sorting results in a smaller FWGRanges object because the
    #       'seqnames' and 'strand' slots are more compressible in their Rle
    #       representation.
    if (sort) {
        if (is.null(dt[["strand"]])) {
            setkey(dt, seqnames, start)
        } else {
            setkey(dt, seqnames, strand, start)
        }
    }
    # NOTE: Columns are dropped from 'dt' as they are consumed to minimise
    #       peak memory usage.
    seqnames <- Rle(dt[["seqnames"]])
    dt[, seqnames := NULL]
    seqinfo <- Seqinfo(seqnames = levels(seqnames))
    ranges <- .FWIRanges(start = dt[["start"]], width = 1L)
    dt[, start := NULL]
    mcols <- make_zero_col_DFrame(length(ranges))
    if (is.null(dt[["strand"]])) {
        strand <- strand(Rle("*", length(seqnames)))
    } else {
        strand <- Rle(dt[["strand"]])
        dt[, strand := NULL]
    }
    fwgranges <- .FWGRanges(
        seqnames = seqnames,
        ranges = ranges,
        strand = strand,
        seqinfo = seqinfo,
        elementMetadata = mcols)
    # NOTE: Final sort is to re-order with respect to sorted seqlevels.
    if (sort) {
        fwgranges <- sort(sortSeqlevels(fwgranges))
    }
    fwgranges
}

# TODO: Document that this applies sort(sortSeqlevels()) to the output. It is
#       deliberate that there is no option to override this behaviour.
# Construct the union of loci found across all Bismark files.
# TODO: Document that this applies sort(sortSeqlevels()) to the output. It is
#       deliberate that there is no option to override this behaviour.
.constructFWGRangesFromBismarkFiles <- function(files,
                                                rmZeroCov,
                                                strandCollapse,
                                                verbose,
                                                nThread,
                                                BPPARAM) {
    subverbose <- max(as.integer(verbose) - 1L, 0L)

    # TODO: Instead of using the 'largest' file, use the largest
    #       'cytosine report' file, which will have all loci in the
    #       reference genome; provided all samples were aligned to the same
    #       reference genome, this means it contains all loci.
    # TODO: Initialise using the 'largest' file (i.e. largest number of lines)?
    #       Would like to do this without reading the data into memory.
    #       Some benchmarks can be found at
    #       https://gist.github.com/peterhurford/0d62f49fd43b6cf078168c043412f70a
    #       My initial tests using /users/phickey/GTExScripts/FlowSortingProject/hdf5/extdata/methylation/nonCG/5248_BA9_neg_CHG_report.txt (32 GB) give:
    #       wc -l:                                       77.000s
    #       R.utils::readLines():                      1165.299s
    #       nrow(fread(..., select = 1, nThread = 1)):  582.721s (359s re-run)
    #       nrow(fread(..., select = 1, nThread = 10)):  82.029s
    #       nrow(fread(..., select = 1, nThread = 40)):  81.408s
    #       file.size():                                  0.000s
    #       Of course, fread() only works directly with non-[b]gzipped files.
    #       And subsequent runs of fread() benefit from some caching effect
    #       that I don't fully understand except to know that subsequent runs
    #       are 'artificially' faster.
    #       And using file.size() will be inaccurate if files are a mix of
    #       compressed and uncompressed files.
    # Initialise the candidate loci using the first file.
    if (verbose) {
        message("[.constructFWGRangesFromBismarkFiles] Extracting loci from ",
                "'", files[1L], "'")
    }
    first_file_loci <- .readBismarkAsFWGRanges(
        file = files[[1L]],
        rmZeroCov = rmZeroCov,
        strandCollapse = strandCollapse,
        nThread = nThread,
        verbose = subverbose)

    # For each remaining file, identify loci absent from the first file.
    # TODO: Pre-process loci as a GNCList?
    # Set number of tasks to ensure the progress bar gives frequent updates.
    # NOTE: The progress bar increments once per task
    #       (https://github.com/Bioconductor/BiocParallel/issues/54).
    #       Although it is somewhat of a bad idea to override a user-specified
    #       bptasks(BPPARAM), the value of bptasks(BPPARAM) doesn't affect
    #       performance in this instance, and so we opt for a useful progress
    #       bar. Only SnowParam (and MulticoreParam by inheritance) have a
    #       bptasks<-() method.
    if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) {
        bptasks(BPPARAM) <- length(files) - 1L
    }
    extra_loci_per_file <- bplapply(
        files[-1L],
        function(file, first_file_loci) {
            # Read this file and keep only loci not already seen.
            this_file_loci <- .readBismarkAsFWGRanges(
                file = file,
                rmZeroCov = rmZeroCov,
                strandCollapse = strandCollapse,
                verbose = subverbose)
            subsetByOverlaps(
                x = this_file_loci,
                ranges = first_file_loci,
                type = "equal",
                invert = TRUE)
        },
        first_file_loci = first_file_loci,
        BPPARAM = BPPARAM)
    # Deduplicate the newly discovered loci and combine with the first file's.
    extra_loci <- unique(do.call(c, extra_loci_per_file))

    # Sort the loci (with respect to sorted seqlevels).
    sort(sortSeqlevels(c(first_file_loci, extra_loci)))
}


# NOTE: In brief benchmarking, readr::read_csv() is ~1.3-1.6x faster than
#       utils::read.delim() when reading a gzipped file, albeit it with ~1.6-2x
#       more total memory allocated. Therefore, there may be times users prefer
#       to trade off faster speed for lower memory usage.
# TODO: (long term) Formalise benchmarks of utils::read.table(),
#       data.table::fread(), and readr::read_tsv() as a document in the bsseq
#       package so that we can readily re-visit these as needed.
# Read a Bismark file into a data.table via data.table::fread().
#
# file:         Path to a Bismark '.cov' or 'cytosine report' file.
# col_spec:     Which columns to retain: "all" (every column), "BSseq"
#               (loci plus 'M'/'U' counts), or "GRanges" (loci only).
# check:        If TRUE, error if any 'M' or 'U' count is negative.
# showProgress: Passed to data.table::fread().
# nThread:      Passed to data.table::fread().
# verbose:      Print progress messages?
.readBismarkAsDT <- function(file,
                             col_spec = c("all", "BSseq", "GRanges"),
                             check = FALSE,
                             showProgress = FALSE,
                             nThread = 1L,
                             verbose = FALSE) {
    # Argument checks ----------------------------------------------------------

    file <- file_path_as_absolute(file)
    file_type <- .guessBismarkFileType(file)
    # NOTE: Fail early with a clear error; previously an unrecognised file
    #       type fell through to the column-spec construction and triggered a
    #       cryptic "object 'header' not found" error from fread().
    if (is.na(file_type)) {
        stop("[.readBismarkAsDT] Could not guess Bismark file type for '",
             file, "'")
    }
    col_spec <- match.arg(col_spec)
    stopifnot(isTRUEorFALSE(check))
    stopifnot(isTRUEorFALSE(showProgress))
    # NOTE: 'verbose' controls the verbosity of .readBismarkAsDT() and
    #       'fread_verbose' is derived from that to control the verbosity of
    #       data.table::fread().
    fread_verbose <- as.logical(max(verbose - 1L, 0L))

    # Construct the column spec ------------------------------------------------

    if (file_type == "cov") {
        header <- FALSE
        if (col_spec == "BSseq") {
            colClasses <- c("factor", "integer", "NULL", "NULL", "integer",
                            "integer")
            drop <- c(3L, 4L)
            col.names <- c("seqnames", "start", "M", "U")
        } else if (col_spec == "GRanges") {
            colClasses <- c("factor", "integer", "NULL", "NULL", "NULL", "NULL")
            drop <- c(3L, 4L, 5L, 6L)
            col.names <- c("seqnames", "start")
        } else if (col_spec == "all") {
            colClasses <- c("factor", "integer", "integer", "numeric",
                            "integer", "integer")
            drop <- integer(0L)
            col.names <- c("seqnames", "start", "end", "beta", "M", "U")
        }
    } else if (file_type == "cytosineReport") {
        header <- FALSE
        if (col_spec == "BSseq") {
            colClasses <- c("factor", "integer", "factor", "integer",
                            "integer", "NULL", "NULL")
            drop <- c(6L, 7L)
            col.names <- c("seqnames", "start", "strand", "M", "U")
        } else if (col_spec == "GRanges") {
            colClasses <- c("factor", "integer", "factor", "NULL", "NULL",
                            "NULL", "NULL")
            drop <- c(4L, 5L, 6L, 7L)
            col.names <- c("seqnames", "start", "strand")
        } else if (col_spec == "all") {
            colClasses <- c("factor", "integer", "factor", "integer",
                            "integer", "factor", "factor")
            drop <- integer(0L)
            col.names <- c("seqnames", "start", "strand", "M", "U",
                           "dinucleotide_context", "trinucleotide_context")
        }
    }

    # Read the file ------------------------------------------------------------

    if (verbose) {
        message("[.readBismarkAsDT] Parsing '", file, "'")
    }
    ptime1 <- proc.time()
    if (verbose) {
        message("[.readBismarkAsDT] Reading file ...")
    }
    x <- fread(
        # TODO: This should be file = file, but need to wait for data.table
        #       v.1.11.9 for fix.
        input = file,
        sep = "\t",
        header = header,
        verbose = fread_verbose,
        drop = drop,
        colClasses = colClasses,
        col.names = col.names,
        quote = "",
        showProgress = showProgress,
        nThread = nThread)

    # Construct the result -----------------------------------------------------

    # Convert the strand column (if any) to a proper strand Rle-compatible
    # factor via BiocGenerics::strand().
    if (!is.null(x[["strand"]])) {
        x[, strand := strand(strand)]
    }
    # NOTE: Quieten R CMD check about 'no visible binding for global variable'.
    M <- U <- NULL
    if (check && all(c("M", "U") %in% colnames(x))) {
        if (verbose) {
            message("[.readBismarkAsDT] Checking validity of counts in file.")
        }
        valid <- x[, isTRUE(all(M >= 0L & U >= 0L))]
        if (!valid) {
            stop("[.readBismarkAsDT] Invalid counts detected.\n",
                 "'M' and 'U' columns should be non-negative integers.")
        } else {
            if (verbose) {
                message("[.readBismarkAsDT] All counts in file are valid.")
            }
        }
    }
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if (verbose) {
        message("Done in ", round(stime, 1), " secs")
    }
    x
}

# Extract 'M' and 'Cov' counts for the b-th file at the given loci.
#
# b:              Index into 'files' (and into 'grid') for this sample.
# files:          Character vector of Bismark file paths.
# loci:           FWGRanges of the loci at which counts are tallied.
# strandCollapse: Collapse counts by strand (see .readBismarkAsFWGRanges()).
# grid:           ArrayGrid with one column-viewport per sample.
# M_sink, Cov_sink: RealizationSink objects (or NULL to return in-memory
#                 matrices).
# sink_lock:      IPC lock guarding writes to the sinks.
# nThread:        Passed to .readBismarkAsDT().
# verbose:        Print progress messages?
#
# Returns list(M = , Cov = ) of one-column matrices when 'M_sink' is NULL;
# otherwise writes blocks to the sinks and returns NULL.
.constructCountsFromSingleFile <- function(b, files, loci, strandCollapse,
                                           grid, M_sink, Cov_sink, sink_lock,
                                           nThread, verbose) {

    # Argument checks ----------------------------------------------------------

    subverbose <- max(as.integer(verbose) - 1L, 0L)
    # NOTE: Quieten R CMD check about 'no visible binding for global variable'.
    M <- U <- NULL

    # Read b-th file to construct data.table of valid loci and their counts ----

    file <- files[b]
    if (verbose) {
        # Fix: message previously reported a stale function name
        # ('.constructCountsFromBismarkFileAndFWGRanges').
        message("[.constructCountsFromSingleFile] Extracting ",
                "counts from ", "'", file, "'")
    }
    dt <- .readBismarkAsDT(
        file = file,
        col_spec = "BSseq",
        check = TRUE,
        nThread = nThread,
        verbose = subverbose)
    # NOTE: Can only collapse by strand if the data are stranded!
    if (strandCollapse && !is.null(dt[["strand"]])) {
        # Shift loci on negative strand by 1 to the left and then remove
        # strand since no longer valid.
        dt[strand == "-", start := start - 1L][, strand := NULL]
        # Aggregate counts at loci with the same 'seqnames' and 'start'.
        dt <- dt[, list(M = sum(M), U = sum(U)), by = c("seqnames", "start")]
    }

    # Construct FWGRanges ------------------------------------------------------

    seqnames <- Rle(dt[["seqnames"]])
    dt[, seqnames := NULL]
    seqinfo <- Seqinfo(seqnames = levels(seqnames))
    ranges <- .FWIRanges(start = dt[["start"]], width = 1L)
    dt[, start := NULL]
    mcols <- make_zero_col_DFrame(length(ranges))
    if (is.null(dt[["strand"]])) {
        strand <- strand(Rle("*", length(seqnames)))
    } else {
        strand <- Rle(dt[["strand"]])
        dt[, strand := NULL]
    }
    loci_from_this_sample <- .FWGRanges(
        seqnames = seqnames,
        ranges = ranges,
        strand = strand,
        seqinfo = seqinfo,
        elementMetadata = mcols)

    # Construct 'M' and 'Cov' matrices -----------------------------------------

    # Loci absent from this sample keep zero counts.
    ol <- findOverlaps(loci_from_this_sample, loci, type = "equal")
    M <- matrix(0L, nrow = length(loci), ncol = 1L)
    Cov <- matrix(0L, nrow = length(loci), ncol = 1L)
    M[subjectHits(ol)] <- dt[queryHits(ol), ][["M"]]
    Cov[subjectHits(ol)] <- dt[queryHits(ol), list(Cov = (M + U))][["Cov"]]

    # Return 'M' and 'Cov' or write them to the RealizationSink objects --------

    if (is.null(M_sink)) {
        return(list(M = M, Cov = Cov))
    }
    # Write to M_sink and Cov_sink while respecting the IPC lock.
    viewport <- grid[[b]]
    ipclock(sink_lock)
    write_block(M_sink, viewport = viewport, block = M)
    write_block(Cov_sink, viewport = viewport, block = Cov)
    ipcunlock(sink_lock)
    NULL
}

# Construct the 'M' and 'Cov' count matrices for all files at the given loci.
#
# files:          Character vector of Bismark file paths.
# loci:           FWGRanges of the loci at which counts are tallied.
# strandCollapse: Collapse counts by strand (see .readBismarkAsFWGRanges()).
# BPPARAM:        BiocParallel backend.
# BACKEND:        NULL (in-memory matrices), "HDF5Array", or another
#                 DelayedArray realization backend.
# dir, chunkdim, level: HDF5Array sink parameters (used when BACKEND is
#                 "HDF5Array").
# nThread:        Passed to .readBismarkAsDT().
# verbose:        Print progress messages?
#
# Returns list(M = , Cov = ) of matrix or DelayedMatrix objects, one column
# per file.
.constructCounts <- function(files, loci, strandCollapse, BPPARAM,
                             BACKEND, dir, chunkdim, level, nThread,
                             verbose = FALSE) {

    # Argument checks ----------------------------------------------------------

    subverbose <- max(as.integer(verbose) - 1L, 0L)

    # Construct ArrayGrid ------------------------------------------------------

    # NOTE: Each block contains data for a single sample.
    ans_nrow <- length(loci)
    ans_ncol <- length(files)
    ans_dim <- c(ans_nrow, ans_ncol)
    grid <- RegularArrayGrid(
        refdim = ans_dim,
        spacings = c(ans_nrow, 1L))

    # Construct RealizationSink objects ----------------------------------------

    if (is.null(BACKEND)) {
        M_sink <- NULL
        Cov_sink <- NULL
        sink_lock <- NULL
    } else if (BACKEND == "HDF5Array") {
        h5_path <- file.path(dir, "assays.h5")
        M_sink <- HDF5RealizationSink(
            dim = ans_dim,
            dimnames = NULL,
            type = "integer",
            filepath = h5_path,
            name = "M",
            chunkdim = chunkdim,
            level = level)
        on.exit(close(M_sink), add = TRUE)
        Cov_sink <- HDF5RealizationSink(
            dim = ans_dim,
            dimnames = NULL,
            type = "integer",
            filepath = h5_path,
            name = "Cov",
            chunkdim = chunkdim,
            level = level)
        on.exit(close(Cov_sink), add = TRUE)
        sink_lock <- ipcid()
        on.exit(ipcremove(sink_lock), add = TRUE)
    } else {
        # TODO: This branch should probably never be entered because we
        #       (implicitly) only support in-memory or HDF5Array backends.
        #       However, we retain it for now (e.g., fstArray backend would use
        #       this until a dedicated branch was implemented).
        M_sink <- DelayedArray::AutoRealizationSink(
            dim = ans_dim,
            type = "integer")
        on.exit(close(M_sink), add = TRUE)
        Cov_sink <- DelayedArray::AutoRealizationSink(
            dim = ans_dim,
            type = "integer")
        on.exit(close(Cov_sink), add = TRUE)
        sink_lock <- ipcid()
        on.exit(ipcremove(sink_lock), add = TRUE)
    }

    # Apply .constructCountsFromSingleFile() to each file ----------------------

    # Set number of tasks to ensure the progress bar gives frequent updates.
    # NOTE: The progress bar increments once per task
    #       (https://github.com/Bioconductor/BiocParallel/issues/54).
    #       Although it is somewhat of a bad idea to override a user-specified
    #       bptasks(BPPARAM), the value of bptasks(BPPARAM) doesn't affect
    #       performance in this instance, and so we opt for a useful progress
    #       bar. Only SnowParam (and MulticoreParam by inheritance) have a
    #       bptasks<-() method.
    if (is(BPPARAM, "SnowParam") && bpprogressbar(BPPARAM)) {
        bptasks(BPPARAM) <- length(grid)
    }
    counts <- bptry(bplapply(
        X = seq_along(grid),
        FUN = .constructCountsFromSingleFile,
        files = files,
        loci = loci,
        strandCollapse = strandCollapse,
        grid = grid,
        M_sink = M_sink,
        Cov_sink = Cov_sink,
        sink_lock = sink_lock,
        verbose = subverbose,
        nThread = nThread,
        BPPARAM = BPPARAM))
    if (!all(bpok(counts))) {
        # Fix: was 'files[!bpok]', which subset by the function object 'bpok'
        # rather than its result, so the error path itself errored.
        stop(".constructCounts() encountered errors for these files:\n  ",
             paste(files[!bpok(counts)], collapse = "\n  "))
    }

    # Construct 'M' and 'Cov' --------------------------------------------------

    if (is.null(BACKEND)) {
        # Returning matrix objects.
        M <- do.call(c, lapply(counts, "[[", "M"))
        attr(M, "dim") <- ans_dim
        Cov <- do.call(c, lapply(counts, "[[", "Cov"))
        attr(Cov, "dim") <- ans_dim
    } else {
        # Returning DelayedMatrix objects.
        M <- as(M_sink, "DelayedArray")
        Cov <- as(Cov_sink, "DelayedArray")
    }

    return(list(M = M, Cov = Cov))
}

# Exported functions -----------------------------------------------------------

# TODO: If you have N cores available, are you better off using
#       bpworkers() = N in the BPPARAM or nThread = N and use
#       data.table::fread()? Or something in between?
# TODO: Allow user to determine `nThread`?
# TODO: Document that you can't use strandCollapse with files that lack strand
#       (e.g., .cov files).
# TODO: (long term) Report a run-time warning/message if strandCollapse = TRUE
#       is used in conjunction with files without strand information.
# TODO: Try a bpiterate()-based approach to getting the set of loci from files.
# Parse Bismark '.cov' and/or 'cytosine report' files into a BSseq object.
#
# files:          Character vector of Bismark file paths (no duplicates).
# loci:           Optional GenomicRanges (all widths == 1) of candidate loci;
#                 if NULL, loci are discovered from 'files'.
# colData:        Optional DataFrame of sample metadata; defaults to a
#                 zero-column DataFrame with 'files' as row names.
# rmZeroCov:      Remove loci with zero coverage in all samples?
# strandCollapse: Collapse loci and counts by strand?
# BPPARAM:        BiocParallel backend.
# BACKEND:        NULL (in-memory) or "HDF5Array" realization backend.
# dir, replace, chunkdim, level: HDF5Array output options.
# nThread:        Threads used by data.table::fread().
# verbose:        Print progress messages?
read.bismark <- function(files,
                         loci = NULL,
                         colData = NULL,
                         rmZeroCov = FALSE,
                         strandCollapse = TRUE,
                         BPPARAM = bpparam(),
                         BACKEND = NULL,
                         dir = tempfile("BSseq"),
                         replace = FALSE,
                         chunkdim = NULL,
                         level = NULL,
                         nThread = 1L,
                         verbose = getOption("verbose")) {
    # Argument checks ----------------------------------------------------------

    # Check 'files' is valid.
    if (anyDuplicated(files)) {
        stop("'files' cannot have duplicate entries.")
    }
    file_exists <- file.exists(files)
    if (!isTRUE(all(file_exists))) {
        stop("These files cannot be found:\n  ",
             paste(files[!file_exists], collapse = "\n  "))
    }
    guessed_file_types <- .guessBismarkFileType(files)
    if (anyNA(guessed_file_types)) {
        # Fix: previously listed 'files[!is.na(guessed_file_types)]', i.e. the
        # files whose type WAS guessed, rather than the problematic ones.
        stop("Could not guess Bismark file type for these files:\n  ",
             paste(files[is.na(guessed_file_types)], collapse = "\n  "))
    }
    # Check 'loci' is valid.
    if (!is.null(loci)) {
        if (!is(loci, "GenomicRanges")) {
            stop("'loci' must be a GenomicRanges instance if not NULL.")
        }
        if (any(width(loci) != 1L)) {
            stop("All elements of 'loci' must have width equal to 1.")
        }
    }
    # Check 'colData' is valid.
    if (is.null(colData)) {
        colData <- DataFrame(row.names = files)
    }
    if (nrow(colData) != length(files)) {
        stop("Supplied 'colData' must have nrow(colData) == length(files).")
    }
    # Check 'rmZeroCov' and 'strandCollapse' are valid.
    stopifnot(isTRUEorFALSE(rmZeroCov))
    stopifnot(isTRUEorFALSE(strandCollapse))
    # Register 'BACKEND' and return to current value on exit.
    # TODO: Is this strictly necessary?
    current_BACKEND <- getAutoRealizationBackend()
    on.exit(setAutoRealizationBackend(current_BACKEND), add = TRUE)
    setAutoRealizationBackend(BACKEND)
    # Check compatibility of 'BPPARAM' with 'BACKEND'.
    if (!.areBackendsInMemory(BACKEND)) {
        if (!.isSingleMachineBackend(BPPARAM)) {
            stop("The parallelisation strategy must use a single machine ",
                 "when using an on-disk realization backend.\n",
                 "See help(\"read.bismark\") for details.",
                 call. = FALSE)
        }
    } else {
        if (!is.null(BACKEND)) {
            # NOTE: Currently do not support any in-memory realization
            #       backends. If the realization backend is NULL then an
            #       ordinary matrix is returned rather than a matrix-backed
            #       DelayedMatrix.
            stop("The '", BACKEND, "' realization backend is not supported.",
                 "\n  See help(\"read.bismark\") for details.",
                 call. = FALSE)
        }
    }
    # If using HDF5Array as BACKEND, check remaining options are sensible.
    if (identical(BACKEND, "HDF5Array")) {
        # NOTE: Most of this copied from
        #       HDF5Array::saveHDF5SummarizedExperiment().
        if (!isSingleString(dir)) {
            stop(wmsg("'dir' must be a single string specifying the path to ",
                      "the directory where to save the BSseq object (the ",
                      "directory will be created)."))
        }
        if (!isTRUEorFALSE(replace)) {
            stop("'replace' must be TRUE or FALSE")
        }
        if (!dir.exists(dir)) {
            HDF5Array::create_dir(dir)
        } else {
            HDF5Array::replace_dir(dir, replace)
        }
    }
    # Set verbosity used by internal functions.
    subverbose <- as.logical(max(verbose - 1L, 0L))

    # Construct FWGRanges with all valid loci ----------------------------------

    # NOTE: "Valid" loci are those that remain after collapsing by strand (if
    #       strandCollapse == TRUE) and then removing loci with zero coverage
    #       in all samples (if rmZeroCov == TRUE).
    if (is.null(loci)) {
        ptime1 <- proc.time()
        if (verbose) {
            message(
                "[read.bismark] Parsing files and constructing valid loci ...")
        }
        loci <- .constructFWGRangesFromBismarkFiles(
            files = files,
            rmZeroCov = rmZeroCov,
            strandCollapse = strandCollapse,
            verbose = subverbose,
            nThread = nThread,
            BPPARAM = BPPARAM)
        ptime2 <- proc.time()
        stime <- (ptime2 - ptime1)[3]
        if (verbose) {
            message("Done in ", round(stime, 1), " secs")
        }
    } else {
        if (verbose) {
            message("[read.bismark] Using 'loci' as candidate loci.")
        }
        if (strandCollapse) {
            if (verbose) {
                message("[read.bismark] Collapsing strand of 'loci' ...")
            }
            ptime1 <- proc.time()
            loci <- .strandCollapse(loci)
            ptime2 <- proc.time()
            stime <- (ptime2 - ptime1)[3]
            if (verbose) {
                message("Done in ", round(stime, 1), " secs")
            }
        }
        if (rmZeroCov) {
            if (verbose) {
                message("[read.bismark] Parsing files to identify elements of ",
                        "'loci' with non-zero coverage ...")
            }
            ptime1 <- proc.time()
            # Construct loci with non-zero coverage in files.
            loci_from_files <- .constructFWGRangesFromBismarkFiles(
                files = files,
                rmZeroCov = rmZeroCov,
                strandCollapse = strandCollapse,
                verbose = subverbose,
                nThread = nThread,
                BPPARAM = BPPARAM)
            # Retain those elements of 'loci' with non-zero coverage.
            loci <- subsetByOverlaps(loci, loci_from_files, type = "equal")
            ptime2 <- proc.time()
            stime <- (ptime2 - ptime1)[3]
            if (verbose) {
                message("Done in ", round(stime, 1), " secs")
            }
        }
    }

    # Construct 'M' and 'Cov' matrices -----------------------------------------
    ptime1 <- proc.time()
    if (verbose) {
        message("[read.bismark] Parsing files and constructing 'M' and 'Cov' ",
                "matrices ...")
    }
    counts <- .constructCounts(
        files = files,
        loci = loci,
        strandCollapse = strandCollapse,
        BPPARAM = BPPARAM,
        BACKEND = BACKEND,
        dir = dir,
        chunkdim = chunkdim,
        level = level,
        nThread = nThread,
        verbose = subverbose)
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if (verbose) {
        message("Done in ", round(stime, 1), " secs")
    }

    # Construct BSseq object, saving it if it is HDF5-backed -------------------

    if (verbose) {
        message("[read.bismark] Constructing BSseq object ... ")
    }
    se <- SummarizedExperiment(
        assays = counts,
        # NOTE: For now, have to use GRanges instance as rowRanges but
        #       ultimately would like to use a FWGRanges instance.
        rowRanges = as(loci, "GRanges"),
        colData = colData)
    # TODO: Is there a way to use the internal constructor with `check = FALSE`?
    #       Don't need to check M and Cov because this has already happened
    #       when files were parsed.
    # .BSseq(se, trans = function(x) NULL, parameters = list())
    bsseq <- new2("BSseq", se, check = FALSE)
    if (identical(BACKEND, "HDF5Array")) {
        # NOTE: Save BSseq object; mimicking
        #       HDF5Array::saveHDF5SummarizedExperiment().
        x <- bsseq
        x@assays <- HDF5Array::shorten_assay2h5_links(x@assays)
        base::saveRDS(x, file = file.path(dir, "se.rds"))
    }
    bsseq
}

# TODOs ------------------------------------------------------------------------

# TODO: Document internal functions for my own sanity. Also, some may be useful
#       to users of bsseq (although I still won't export these for the time
#       being).
# TODO: (long term) Current implementation requires that user can load at least
#       one sample's worth of data into memory per worker. Could instead read
#       chunks of data, write to sink, load next chunk, etc.
# TODO: Document that if 'loci' is NULL and any 'files' (especially the first
#       file) are .cov files, then any loci present in the .cov files will have
#       their strand set to *. If you are mixing and matching .cov and
#       .cytosineReport files and don't want this behaviour (i.e. you want to
#       retain strand) then you'll need to construct your own 'gr' and pass
#       this to the function. Add unit tests for this behaviour.
hansenlab/bsseq documentation built on June 12, 2025, 7:42 p.m.