R/demultiplex.R

Defines functions demultiplex errmessages extract_reads

Documented in demultiplex extract_reads

#' Helper function for demultiplexing
#'
#' Helper function for demultiplexing sequencing reads, designed in a way to
#' allow for parallelization across barcodes (parallel extraction of reads by
#' barcode). This function takes a specific barcode (numeric index) from lists
#' of sample names/barcodes, a \code{Biostrings::DNAStringSet} of barcodes by
#' sequence header, and a \code{Biostrings::QualityScaledXStringSet} of reads
#' corresponding to the barcodes. Based on the barcode index given, it extracts
#' all reads for the indexed barcode and writes all the reads from that barcode
#' to a separate .fastq file.
#'
#' @param barcodeIndex Which barcode (integer number or index) in the barcodes
#'   or sample name to use for read extraction.
#' @param barcodes A list of all barcodes in the sequencing dataset. Correlates
#'   and in same order as \code{sampleNames}.
#' @param sampleNames A list of sample names or identifiers associated with each
#'   barcode in the barcodes list.
#' @param index A \code{Biostrings::DNAStringSet} that contains the read headers
#'   and barcode sequence for each header in the sequence slot.
#' @param reads A \code{Biostrings::QualityScaledXStringSet} that has the same
#'   headers and order as the index file, but contains the read sequences and
#'   their quality scores.
#' @param location A directory location to store the demultiplexed read files.
#'   Defaults to generate a new subdirectory at './demultiplex_fastq'
#' @param rcBarcodes Should the barcode indices in the barcodes list be reverse
#'   complemented to match the sequences in the index DNAStringSet? Defaults to
#'   \code{TRUE}.
#' @param hDist Uses a Hamming Distance or number of base differences to allow
#'   for inexact matches for the barcodes/indexes. Defaults to 0. Warning: if
#'   the Hamming Distance is >=1 and this leads to inexact index matches to more
#'   than one barcode, that read will be written to more than one demultiplexed
#'   read files.
#' @param quiet Turns off most messages. Default is \code{TRUE}.
#'
#' @return Writes a single .fastq file that contains all reads whose index
#'   matches the barcode specified. This file will be written to the location
#'   directory, and will be named based on the specified sampleName and barcode,
#'   e.g. './demultiplex_fastq/SampleName1_GGAATTATCGGT.fastq.gz'
#' @export
#' @examples
#'
#' ## Create temporary directory
#' ref_temp <- tempfile()
#' dir.create(ref_temp)
#'
#' ## Load example barcode, index, and read data into R session
#' barcodePath <- system.file("extdata", "barcodes.txt", package = "MetaScope")
#' bcFile <- read.table(barcodePath, sep = "\t", header = TRUE)
#'
#' indexPath <- system.file("extdata", "virus_example_index.fastq",
#' package = "MetaScope")
#' inds <- Biostrings::readDNAStringSet(indexPath, format = "fastq")
#'
#' readPath <- system.file("extdata", "virus_example.fastq",
#'                         package = "MetaScope")
#' reads <- Biostrings::readQualityScaledDNAStringSet(readPath)
#'
#' ## Extract reads from the first barcode
#' results <- extract_reads(1, bcFile[, 2], bcFile[, 1], inds, reads,
#'                         rcBarcodes = FALSE, location = ref_temp)
#'
#' ## Extract reads from multiple barcodes
#' more_results <- lapply(1:6, extract_reads, bcFile[, 2], bcFile[, 1], inds,
#'                        reads, rcBarcodes = FALSE, location = ref_temp)
#'
#' ## Remove temporary directory
#' unlink(ref_temp, recursive = TRUE)
#'

extract_reads <- function(barcodeIndex, barcodes, sampleNames, index, reads,
                         location = "./demultiplex_fastq", rcBarcodes = TRUE,
                         hDist = 0, quiet = TRUE) {
    barcode <- barcodes[barcodeIndex]
    sampleName <- sampleNames[barcodeIndex]
    if (!quiet) message("Finding reads for barcode: ", barcode)
    if (rcBarcodes) {
        rci <- as.character(Biostrings::reverseComplement(
            Biostrings::DNAString(barcode)))
    } else {
        rci <- barcode
    }
    ind_match <- utils::adist(as.character(index), rci) <= hDist
    numReads <- sum(ind_match)
    outFileName <- paste(location, "/", sampleName, "_", barcode,
                         ".fastq.gz", sep = "")
    if (numReads == 0 && !quiet) {
        message("\tFound 0 reads for this barcode; no file will be written.")
    } else {
        if (!quiet) message("\tFound ", sum(ind_match),
                            " reads, writing reads to: ", outFileName)
        Biostrings::writeQualityScaledXStringSet(
          reads[c(ind_match)], outFileName, compress = TRUE)
    }
    return(list(output_file = outFileName, numberOfReads = numReads,
                matchedIndexes = ind_match))
}

errmessages <- function(barcodes, samNames, numReads) {
  warning("Did not find any reads for the following barcodes: ",
          paste(barcodes[numReads == 0], collapse = " "))
  warning("Did not find any reads for the following samples: ",
          paste(samNames[numReads == 0], collapse = " "))
  write(paste("Did not find any reads for the following barcodes:",
              paste(barcodes[numReads == 0], collapse = " "), "\n",
              "Did not find any reads for the following samples: ",
              paste(samNames[numReads == 0], collapse = " ")),
        file = "demultiplex_fastq/unmapped_barcodes_samples.txt")
}

#' Demultiplexing sequencing reads
#'
#' Function for demultiplexing sequencing reads arranged in a common format
#' provided by sequencers (such as Illumina) generally for 16S data. This
#' function takes a matrix of sample names/barcodes, a .fastq file of barcodes
#' by sequence header, and a .fastq file of reads corresponding to the barcodes.
#' Based on the barcodes given, the function extracts all reads for the indexed
#' barcode and writes all the reads from that barcode to separate .fastq files.
#' @param barcodeFile File name for a file containing a .tsv matrix with a
#'   header row, and then sample names (column 1) and barcodes (column 2).
#' @param indexFile Location to a .fastq file that contains the barcodes for
#'   each read. The headers should be the same (and in the same order) as
#'   \code{readFile}, and the sequence in the \code{indexFile} should be the
#'   corresponding barcode for each read. Quality scores are not considered.
#' @param readFile Location to the sequencing read .fastq file that corresponds
#'   to the \code{indexFile}.
#' @param rcBarcodes Should the barcode indexes in the barcodeFile be reverse
#'   complemented to match the sequences in the \code{indexFile}? Defaults to
#'   \code{TRUE}.
#' @param location A directory location to store the demultiplexed read files.
#'   Defaults to generate a new temporary directory.
#' @param threads The number of threads to use for parallelization
#'   (BiocParallel). This function will parallelize over the barcodes and
#'   extract reads for each barcode separately and write them to separate
#'   demultiplexed files.
#' @param hammingDist Uses a Hamming Distance or number of base differences to
#'   allow for inexact matches for the barcodes/indexes. Defaults to \code{0}.
#'   Warning: if the Hamming Distance is \code{>=1} and this leads to inexact
#'   index matches to more than one barcode, that read will be written to more
#'   than one demultiplexed read files.
#' @param quiet Turns off most messages. Default is \code{TRUE}.
#'
#' @return Returns multiple .fastq files that contain all reads whose index
#'   matches the barcodes given. These files will be written to the location
#'   directory, and will be named based on the given sampleNames and barcodes,
#'   e.g. './demultiplex_fastq/SampleName1_GGAATTATCGGT.fastq.gz'
#'
#' @export
#'
#' @examples
#'
#' ## Get barcode, index, and read data locations
#' barcodePath <- system.file("extdata", "barcodes.txt", package = "MetaScope")
#' indexPath <- system.file("extdata", "virus_example_index.fastq",
#'                          package = "MetaScope")
#' readPath <- system.file("extdata", "virus_example.fastq",
#'                          package = "MetaScope")
#'
#' ## Demultiplex
#' demult <- demultiplex(barcodePath, indexPath, readPath, rcBarcodes = FALSE,
#'                       hammingDist = 2)
#' demult
#'

demultiplex <- function(barcodeFile, indexFile, readFile, rcBarcodes = TRUE,
                        location = NULL, threads = 1,
                        hammingDist = 0, quiet = TRUE) {
    if (is.null(location)) {
      location <- tempfile()
      dir.create(location)
    }
    if (!quiet) message("Reading Sample Names and Barcodes from: ",
                        barcodeFile)
    bcFile <- utils::read.table(barcodeFile, sep = "\t", header = TRUE)
    barcodes <- bcFile[, 2]
    samNames <- bcFile[, 1]
    if (!quiet) message("\tFound information for ", length(barcodes),
                        " samples/barcodes")
    if (!quiet) message("Reading Index File: ", indexFile)
    inds <- Biostrings::readDNAStringSet(indexFile, format = "fastq")
    if (!quiet) message("\tFound indexes for ", length(inds), " reads")
    if (!quiet) message("Reading Sequence File: ", readFile)
    reads <- Biostrings::readQualityScaledDNAStringSet(readFile)
    if (!quiet) message("\tFound ", length(reads), " reads")
    ## make output directory if necessary
    if (!dir.exists(location)) dir.create(location)
    # Loop over barcodes
    numReads <- NULL
    ind_no_match <- numeric(length(reads))
    for (i in seq_along(barcodes)) {
        extracted <- extract_reads(
          i, barcodes, samNames, inds, reads, rcBarcodes = rcBarcodes,
          location = location, hDist = hammingDist, quiet = quiet)
        numReads <- c(numReads, extracted$numberOfReads)
        ind_no_match <- ind_no_match + extracted$matchedIndexes
    }
    if (!quiet) message(sum(ind_no_match > 1))
    ind_no_match <- (ind_no_match == 0)
    # number of reads for each barcode
    if (any(numReads == 0)) {
        errmessages(barcodes, samNames, numReads)
    }
    # Track reads without matches, and write them to an 'orphan' file
    if (!quiet) message("Found ", sum(ind_no_match),
                        " reads without a matching barcode (",
            100 * round(mean(ind_no_match), 4), "%), writing reads to: ",
            location, "/orphans.fastq.gz")
    Biostrings::writeQualityScaledXStringSet(
        reads[c(ind_no_match)], paste(location, "/orphans.fastq.gz",
                                      sep = ""), compress = TRUE)
    summaryMat <- cbind(bcFile[seq_along(barcodes), ],
                        NumberOfReads = numReads)
    utils::write.table(summaryMat,
                       file = paste(location, "/summary.txt", sep = ""),
                col.names = FALSE, row.names = TRUE, quote = FALSE)
    return(summaryMat)
}
compbiomed/MetaScope documentation built on April 1, 2024, 5:35 p.m.