R/importBUStools.R

Defines functions importBUStools .importBUStools .constructSCEFromBUStoolsOutputs

Documented in importBUStools

# dir <- "genecount"
.constructSCEFromBUStoolsOutputs <- function(dir,
    sample,
    matrixFileName,
    featuresFileName,
    barcodesFileName,
    gzipped,
    class,
    delayedArray) {

    cb <- .readBarcodes(file.path(dir, barcodesFileName))
    fe <- .readFeatures(file.path(dir, featuresFileName))
    ma <- .readMatrixMM(file.path(dir, matrixFileName),
        gzipped = gzipped,
        class = class,
        delayedArray = delayedArray)
    ma <- t(ma)

    coln <- paste(sample, cb[[1]], sep = "_")
    rownames(ma) <- fe[[1]]

    sce <- SingleCellExperiment::SingleCellExperiment(
        assays = list(counts = ma))
    SummarizedExperiment::rowData(sce) <- fe
    SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(cb,
        column_name = coln,
        sample = sample,
        row.names = coln)
    
    return(sce)
}


# main function
.importBUStools <- function(
    BUStoolsDirs,
    samples,
    matrixFileNames,
    featuresFileNames,
    barcodesFileNames,
    gzipped,
    class,
    delayedArray,
    rowNamesDedup) {

    if (length(BUStoolsDirs) != length(samples)) {
        stop("'BUStoolsDirs' and 'samples' have unequal lengths!")
    }

    res <- vector("list", length = length(samples))

    matrixFileNames <- .getVectorized(matrixFileNames, length(samples))
    featuresFileNames <- .getVectorized(featuresFileNames, length(samples))
    barcodesFileNames <- .getVectorized(barcodesFileNames, length(samples))
    gzipped <- .getVectorized(gzipped, length(samples))

    for (i in seq_along(samples)) {
        dir <- file.path(BUStoolsDirs[i])
        scei <- .constructSCEFromBUStoolsOutputs(dir,
            sample = samples[i],
            matrixFileName = matrixFileNames[i],
            featuresFileName = featuresFileNames[i],
            barcodesFileName = barcodesFileNames[i],
            gzipped = gzipped[i],
            class = class,
            delayedArray = delayedArray)
        res[[i]] <- scei
    }

    sce <- do.call(SingleCellExperiment::cbind, res)
    
    if (isTRUE(rowNamesDedup)) {
        if (any(duplicated(rownames(sce)))) {
            message("Duplicated gene names found, adding '-1', '-2', ",
                    "... suffix to them.")
        }
        sce <- dedupRowNames(sce)
    }
    
    return(sce)
}


#' @name importBUStools
#' @rdname importBUStools
#' @title Construct SCE object from BUStools output
#' @description Read the barcodes, features (genes), and matrix from BUStools
#'  output. Import them
#'  as one \link[SingleCellExperiment]{SingleCellExperiment} object. Note the
#'  cells in the output files for BUStools 0.39.4 are not filtered.
#' @param BUStoolsDirs A vector of paths to BUStools output files. Each sample
#'  should have its own path. For example: \code{./genecount}.
#'  Must have the same length as \code{samples}.
#' @param samples A vector of user-defined sample names for the samples to be
#'  imported. Must have the same length as \code{BUStoolsDirs}.
#' @param matrixFileNames Filenames for the Market Exchange Format (MEX) sparse
#'  matrix files (.mtx files). Must have length 1 or the same
#'  length as \code{samples}.
#' @param featuresFileNames Filenames for the feature annotation files.
#'  Must have length 1 or the same length as \code{samples}.
#' @param barcodesFileNames Filenames for the cell barcode list file.
#'  Must have length 1 or the same length as \code{samples}.
#' @param gzipped Boolean. \code{TRUE} if the BUStools output files
#'  (barcodes.txt, genes.txt, and genes.mtx) were
#'  gzip compressed. \code{FALSE} otherwise. This is \code{FALSE} in BUStools
#'  0.39.4. Default \code{"auto"} which automatically detects if the
#'  files are gzip compressed. Must have length 1 or the same length as
#'  \code{samples}.
#' @param class Character. The class of the expression matrix stored in the SCE
#'  object. Can be one of "Matrix" (as returned by
#'  \link{readMM} function), or "matrix" (as returned by
#'  \link[base]{matrix} function). Default "Matrix".
#' @param delayedArray Boolean. Whether to read the expression matrix as
#'  \link[DelayedArray]{DelayedArray-class} object or not. Default \code{FALSE}.
#' @param rowNamesDedup Boolean. Whether to deduplicate rownames. Default 
#'  \code{TRUE}.
#' @return A \code{SingleCellExperiment} object containing the count
#'  matrix, the gene annotation, and the cell annotation.
#' @examples
#' # Example #1
#' # FASTQ files were downloaded from
#' # https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0
#' # /pbmc_1k_v3
#' # They were concatenated as follows:
#' # cat pbmc_1k_v3_S1_L001_R1_001.fastq.gz pbmc_1k_v3_S1_L002_R1_001.fastq.gz >
#' # pbmc_1k_v3_R1.fastq.gz
#' # cat pbmc_1k_v3_S1_L001_R2_001.fastq.gz pbmc_1k_v3_S1_L002_R2_001.fastq.gz >
#' # pbmc_1k_v3_R2.fastq.gz
#' # The following BUStools command generates the gene, cell, and
#' # matrix files
#'
#' # bustools correct -w ./3M-february-2018.txt -p output.bus | \
#' #   bustools sort -T tmp/ -t 4 -p - | \
#' #   bustools count -o genecount/genes \
#' #     -g ./transcripts_to_genes.txt \
#' #     -e matrix.ec \
#' #     -t transcripts.txt \
#' #     --genecounts -
#'
#' # The top 20 genes and the first 20 cells are included in this example.
#' sce <- importBUStools(
#'   BUStoolsDirs = system.file("extdata/BUStools_PBMC_1k_v3_20x20/genecount/",
#'     package = "singleCellTK"),
#'   samples = "PBMC_1k_v3_20x20")
#' @export
importBUStools <- function(
    BUStoolsDirs,
    samples,
    matrixFileNames = "genes.mtx",
    featuresFileNames = "genes.genes.txt",
    barcodesFileNames = "genes.barcodes.txt",
    gzipped = "auto",
    class = c("Matrix", "matrix"),
    delayedArray = FALSE,
    rowNamesDedup = TRUE) {

    class <- match.arg(class)

    .importBUStools(
        BUStoolsDirs = BUStoolsDirs,
        samples = samples,
        matrixFileNames = matrixFileNames,
        featuresFileNames = featuresFileNames,
        barcodesFileNames = barcodesFileNames,
        gzipped = gzipped,
        class = class,
        delayedArray = delayedArray,
        rowNamesDedup = rowNamesDedup)
}
compbiomed/singleCellTK documentation built on May 8, 2024, 6:58 p.m.