Nothing
#' Parse Barcoded Reads
#'
#' @param bam.file File path of BAM file. `.bai` BAM index file must be in the same location (can be generated using [Rsamtools::indexBam()]).
#' @param barcode.lookup `data.frame` where the first column is the barcode identifier/name and the second column is the DNA sequence. Headers are ignored.
#' @param cell.barcode.tag Character of length 2, indicates cell barcode field in BAM, specified by Tapestri pipeline (currently "RG"). Default "RG".
#' @param contig Character, contig or chromosome name to search for barcodes in. Can be a vector of more than one contig to expand search space.
#' @param max.mismatch Numeric, the maximum and minimum number of mismatching letters allowed. Default 2.
#' @param with.indels If `TRUE`, then indels are allowed. Default `FALSE`.
#'
#' @return A data.frame of read counts for each specified barcode.
#' @export
#'
#' @rdname countBarcodedReads
#' @order 2
#'
#' @seealso [Biostrings::matchPattern()]
#'
#' @examples
#' \dontrun{
#' counts <- countBarcodedReadsFromContig(bam.file, barcode.lookup, "virus_ref2")
#' }
countBarcodedReadsFromContig <- function(bam.file, barcode.lookup, contig, cell.barcode.tag = "RG", max.mismatch = 2, with.indels = FALSE) {
cli::cli_progress_step("Reading .bam file...")
# set bam parameters
which <- GenomicRanges::GRanges(contig, IRanges::IRanges(1, 536870912))
what <- c("qname", "rname", "isize", "seq")
param <- Rsamtools::ScanBamParam(which = which, what = what, tag = cell.barcode.tag)
# check for bam index and throw error if not found
tryCatch(
expr = bam <- Rsamtools::scanBam(file = bam.file, param = param),
error = function(e) {
error.message.match <- grep(pattern = "failed to load BAM index", x = e$message)
if (!S4Vectors::isEmpty(error.message.match)) {
e$message <- paste0(
e$message, "\nBAM index .bai file needs to be in the same directory as .bam file.",
"\nRun `Rsamtools::indexBam(bam.filename)` to generate BAM index file if it cannot be found."
)
}
cli::cli_abort(e$message)
}
)
cli::cli_progress_done()
# filter bam by cell barcode tag
bam.filter <- data.frame(qname = bam[[1]]$qname, rname = bam[[1]]$rname, tlen = bam[[1]]$isize, seq = bam[[1]]$seq, tag = bam[[1]]$tag[[cell.barcode.tag]])
# parse barcode lookup table by column index
barcode.lut.vector <- barcode.lookup[, 2]
names(barcode.lut.vector) <- barcode.lookup[, 1]
barcode.set <- Biostrings::DNAStringSet(x = barcode.lut.vector, use.names = TRUE)
# match input barcodes to reads and convert to logical
# progress bar
sequence.matches <- purrr::map(barcode.set, function(x) {
as.logical(Biostrings::vcountPattern(pattern = x, subject = bam.filter$seq, max.mismatch = max.mismatch, with.indels = with.indels))
}, .progress = list(name = "Matching barcodes...", show_after = 0))
cli::cli_alert_success("Matching barcodes...")
# match barcoded reads to cell barcodes and count. Returns NULL if no matches.
sequence.match.ids <- names(sequence.matches)
names(sequence.match.ids) <- names(sequence.matches)
sequence.match.counts <- lapply(sequence.match.ids, function(x) {
bam.matches <- bam.filter$tag[sequence.matches[[x]]]
if (S4Vectors::isEmpty(bam.matches)) {
cli::cli_alert_info("No read matches found for ID {.q {x}}.")
return(NULL)
}
counts <- as.data.frame(table(bam.matches))
counts$bam.matches <- as.character(counts$bam.matches)
colnames(counts) <- c("cell.barcode", x)
cli::cli_bullets(c(">" = "{formatC(sum(counts[,2]), big.mark = ',', format = 'f', digits = 0)} read matches found for ID {x}."))
return(counts)
})
# stop if no matches
if (all(vapply(sequence.match.counts, is.null, FUN.VALUE = logical(1)))) {
cli::cli_abort("No matches found for any barcode IDs.")
}
# remove NULL items from sequence.match.counts, removes barcode IDs with no matches
sequence.match.counts <- sequence.match.counts[!vapply(sequence.match.counts, is.null, logical(1))]
# combine count tables
sequence.match.counts <- Reduce(function(x, y) merge(x, y, by = "cell.barcode", all = TRUE), sequence.match.counts)
rownames(sequence.match.counts) <- sequence.match.counts$cell.barcode
sequence.match.counts[is.na(sequence.match.counts)] <- 0
return(sequence.match.counts)
}
#' @name countBarcodedReads
#'
#' @title Get read counts from barcoded reads
#'
#' @description `countBarcodedReads()` and `countBarcodedReadsFromContig()` match exogenous DNA barcode sequences to their associated
#' cell barcodes and saves them to the `colData` (cell barcode metadata) of `TapestriExperiment`.
#' `countBarcodedReads()` is a shortcut for `countBarcodedReadsFromContig()`, allowing the user to specify 'gRNA' or 'barcode'
#' to use the `grnaCounts` or `barcodeCounts` `altExp` slots.
#' The entries in the `barcode.lookup` table do not have to be present in the sample,
#' allowing users to keep one master table/file of available barcode sequences for use in all experiments.
#' The `Rsamtools` and `Biostrings` packages must be installed to use these functions.
#'
#' @param bam.file File path of BAM file. `.bai` BAM index file must be in the same location (can be generated using [Rsamtools::indexBam()]).
#' @param barcode.lookup `data.frame`, first column is the barcode identifier/name and the second column is the DNA sequence. Headers are ignored.
#' @param probe Character, either "gRNA" or "barcode" to parse counts from `grnaCounts` or `barcodeCounts` `altExp` slots, respectively.
#' @param ... Arguments to pass on to `countBarcodedReadsFromContig()`.
#' @param TapestriExperiment `TapestriExperiment` object
#' @param return.table Logical, if `TRUE`, returns table of read counts per barcode. If `FALSE`, returns `TapestriExperiment.` Default `FALSE`.
#' @param max.mismatch Numeric, the maximum and minimum number of mismatching letters allowed. Default 2.
#' @param with.indels If `TRUE`, then indels are allowed. Default `FALSE`.
#'
#' @return `TapestriExperiment` with barcoded read counts added to `colData`.
#' @export
#'
#' @concept barcoded reads
#'
#' @rdname countBarcodedReads
#' @order 1
#'
#' @seealso [Rsamtools::indexBam()]
#'
#' @examples
#' \dontrun{
#' counts <- countBarcodedReads(
#' TapestriExperiment,
#' bam.file, barcode.lookup, "gRNA"
#' )
#' }
countBarcodedReads <- function(TapestriExperiment, bam.file, barcode.lookup, probe, return.table = FALSE, max.mismatch = 2, with.indels = FALSE, ...) {
probe <- tolower(probe)
rlang::check_installed("Biostrings", reason = "must be installed to run `countBarcodedReads()`.")
rlang::check_installed("Rsamtools", reason = "must be installed to run `countBarcodedReads()`.")
# get contig
if (probe == "grna") {
contig <- as.character(rowData(altExp(TapestriExperiment, "grnaCounts"))[grnaProbe(TapestriExperiment), "chr"])
} else if (probe == "barcode") {
contig <- as.character(rowData(altExp(TapestriExperiment, "barcodeCounts"))[barcodeProbe(TapestriExperiment), "chr"])
} else {
cli::cli_abort("{.var probe} {.q {probe}} not recognized. Try {.var probe} = {.q grna} or {.q barcode.}")
}
result <- countBarcodedReadsFromContig(bam.file, barcode.lookup, contig = contig, max.mismatch = max.mismatch, with.indels = with.indels, ...)
if (return.table) {
return(result)
} else {
# get existing colData
existing.cell.data <- as.data.frame(SingleCellExperiment::colData(TapestriExperiment))
# drop columns if already exist to allow overwriting
existing.cell.data <- existing.cell.data[, !colnames(existing.cell.data) %in% setdiff(colnames(result), "cell.barcode")]
# merge result and existing colData
updated.cell.data <- merge(existing.cell.data, result, by = "cell.barcode", all.x = TRUE, sort = FALSE)
# set NAs to 0
ids <- setdiff(colnames(result), "cell.barcode")
updated.cell.data[, ids][is.na(updated.cell.data[, ids])] <- 0
# reorder to match colData
rownames(updated.cell.data) <- updated.cell.data$cell.barcode
updated.cell.data <- updated.cell.data[rownames(SingleCellExperiment::colData(TapestriExperiment)), ]
# update TapestriExperiment
SummarizedExperiment::colData(TapestriExperiment) <- S4Vectors::DataFrame(updated.cell.data)
return(TapestriExperiment)
}
}
#' Call sample labels based on feature counts
#'
#' `callSampleLables()` assigns labels (stored as `colData` column) to cells using feature count data in `colData`.
#' This is most useful for assigning barcode labels based on barcoded reads (see [countBarcodedReads]).
#' For `method = max`, labels are dictated by whichever `input.features` column has the highest number of counts.
#' By default, ties are broken by choosing whichever label has the lowest index position (`ties.method = "first"`).
#' Samples with 0 counts for all `input.features` columns are labeled according to `neg.label`.
#' If only one feature column is used, labels are assigned to cells with counts > `min.count.threshold`, and `neg.label` otherwise.
#'
#' @param TapestriExperiment A `TapestriExperiment` object.
#' @param input.features Character vector, column names in `colData` to evaluate.
#' @param method Character, call method. Only "max" currently supported, calls based on whichever `input.features` column has the most counts.
#' @param ties.method Character, passed to `max.col()` indicating how to break ties. Default "first".
#' @param neg.label Character, label for samples with no counts. Default `NA`.
#' @param output.feature Character, column name to use for the call output. Default "sample.call".
#' @param return.table Logical, if `TRUE`, returns a data.frame of the sample.calls. If `FALSE` (default), returns updated `TapestriExperiment` object.
#' @param min.count.threshold Numeric, minimum number of counts per cell to use for call. Default 1.
#'
#' @return A `TapestriExperiment` object with sample calls added to `colData` column `sample.name`. If `return.table == TRUE`, a `data.frame` of sample calls.
#' @export
#'
#' @concept barcoded reads
#'
#' @examples
#' tap.object <- newTapestriExperimentExample() # example TapestriExperiment object
#' colData(tap.object)$gRNA1 <- 2 # example barcode counts
#' colData(tap.object)$gRNA2 <- 10 # example barcode counts
#' tap.object <- callSampleLables(tap.object,
#' input.features = c("gRNA1", "gRNA2"),
#' output.feature = "sample.grna"
#' )
callSampleLables <- function(TapestriExperiment,
input.features,
output.feature = "sample.call",
return.table = FALSE,
neg.label = NA,
method = "max",
ties.method = "first",
min.count.threshold = 1) {
if (method != "max") {
cli::cli_abort("Method not recognized. Only 'max' currently supported.")
} else {
# check for missing colData labels
if (any(!input.features %in% colnames(SingleCellExperiment::colData(TapestriExperiment)))) {
cli::cli_abort("{.q {input.features[!input.features %in% colnames(SingleCellExperiment::colData(TapestriExperiment))]}} not found in {.var colData}.")
}
# subset colData
existing.cell.data <- as.data.frame(SingleCellExperiment::colData(TapestriExperiment))
coldata.subset <- existing.cell.data[, input.features, drop = FALSE]
# check if numeric
if (any(!apply(coldata.subset, 2, is.numeric))) {
cli::cli_abort("Selected input.features are not numeric.")
}
# apply min threshold, set counts to 0
if (min.count.threshold < 1) {
cli::cli_abort("min.count.threshold must be positive.")
} else {
coldata.subset[coldata.subset < min.count.threshold] <- 0
}
# make calls
sample.calls <- input.features[max.col(coldata.subset,
ties.method = ties.method
)]
sample.calls <- data.frame(
cell.barcode = rownames(coldata.subset),
sample.call = sample.calls
)
# set label if no call is made
sample.calls[rowSums(coldata.subset) == 0, "sample.call"] <- neg.label
sample.calls$sample.call <- as.factor(sample.calls$sample.call)
rownames(sample.calls) <- sample.calls$cell.barcode
colnames(sample.calls)[2] <- output.feature
if (return.table) {
return(sample.calls)
} else {
# drop from existing data if already exist to allow overwriting
existing.cell.data <- existing.cell.data[, !colnames(existing.cell.data) %in% setdiff(colnames(sample.calls), "cell.barcode")]
# merge result and existing colData
updated.cell.data <- merge(existing.cell.data,
sample.calls,
by = "cell.barcode",
all.x = TRUE,
sort = FALSE
)
# reorder to match colData
rownames(updated.cell.data) <- updated.cell.data$cell.barcode
updated.cell.data <- updated.cell.data[rownames(SingleCellExperiment::colData(TapestriExperiment)), ]
# update TapestriExperiment
SummarizedExperiment::colData(TapestriExperiment) <- S4Vectors::DataFrame(updated.cell.data)
return(TapestriExperiment)
}
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.