Nothing
#' Build kmer database
#'
#' Build kmer database for classifying 16S rRNA and other gene sequences to
#' a genus when a kmer size is provided.
#'
#' @param sequences A vector of reference sequences for which we have
#' genus-level taxonomic information in the same order as the
#' value for genera.
#' @param genera A character vector of genus-level taxonomic information for
#' reference sequences in the same order as the value for
#' sequences. Ideally, taxonomic information will be provided
#' back to the domain level with each level separated by
#' semicolons and no spaces.
#' @param kmer_size An integer indicating the length of the nucleotide word to
#' base our classification on (default = 8)
#'
#' @return A list object containing the genus level conditional probability
#' (`conditional_prob`) of seeing each kmer in a given genus as well as
#' the genus names (`genera`)
#'
#' @references
#' Wang Q, Garrity GM, Tiedje JM, Cole JR. Naive Bayesian classifier for rapid
#' assignment of rRNA sequences into the new bacterial taxonomy. Appl Environ
#' Microbiol. 2007 Aug;73(16):5261-7.
#' doi:[10.1128/AEM.00062-07](https://pubmed.ncbi.nlm.nih.gov/17586664/)
#' PMID: 17586664; PMCID: PMC1950982.
#'
#' @examples
#' kmer_size <- 3
#' sequences <- c("ATGCGCTA", "ATGCGCTC", "ATGCGCTC")
#' genera <- c("A", "B", "B")
#'
#' build_kmer_database(sequences, genera, kmer_size)
#'
#' @export
build_kmer_database <- function(sequences, genera, kmer_size = 8) {
genera_indices <- genera_str_to_index(genera)
detected_kmers <- detect_kmers_across_sequences(sequences,
kmer_size = kmer_size
)
priors <- calc_word_specific_priors(detected_kmers, kmer_size)
cond_prob <- calc_genus_conditional_prob(
detected_kmers, genera_indices,
priors
)
genera_names <- get_unique_genera(genera)
return(list(
conditional_prob = cond_prob,
genera = genera_names
))
}
#' Classify 16S rRNA gene sequence fragment
#'
#' The `classify_sequence()` function implements the Wang et al. naive Bayesian
#' classification algorithm for 16S rRNA gene sequences.
#'
#' @param unknown_sequence A character object representing a DNA sequence that
#' needs to be classified
#' @param database A kmer database generated using [`build_kmer_database`]
#' @param kmer_size An integer value (default of 8) indicating the size of kmers
#' to use for classifying sequences. Higher values use more
#' RAM with potentially more specificity Lower values use
#' less RAM with potentially less specificity. Benchmarking
#' has found that the default of 8 provides the best
#' specificity with the lowest possible memory requirement and
#' fastest execution time.
#' @param num_bootstraps An integer value (default of 100). The value of
#' `num_bootstraps` is the number of randomizations to perform
#' where `1/kmer_size` of all kmers are sampled (without
#' replacement) from `unknown_sequence`. Higher values will
#' provide greater precision on the confidence score.
#'
#' @returns A list object of two vectors. One vector (`taxonomy`) is the
#' taxonomic assignment for each level. The second vector
#' (`confidence`) is the percentage of `num_bootstraps` that the
#' classifier gave the same classification at that level
#' @inherit build_kmer_database references
#' @export
#'
#' @examples
#' kmer_size <- 3
#' sequences <- c("ATGCGCTA", "ATGCGCTC", "ATGCGCTC")
#' genera <- c("A", "B", "B")
#'
#' db <- build_kmer_database(sequences, genera, kmer_size)
#' unknown_sequence <- "ATGCGCTC"
#'
#' classify_sequence(
#' unknown_sequence = unknown_sequence,
#' database = db,
#' kmer_size = kmer_size
#' )
classify_sequence <- function(unknown_sequence, database,
kmer_size = 8, num_bootstraps = 100) {
kmers <- detect_kmers(sequence = unknown_sequence, kmer_size)
bs_class <- numeric(length = num_bootstraps)
for (i in 1:num_bootstraps) {
bs_kmers <- bootstrap_kmers(kmers, kmer_size)
bs_class[[i]] <- classify_bs(bs_kmers, database$conditional_prob)
}
consensus_bs_class(bs_class, database$genera)
}
#' @noRd
#' @importFrom stringi stri_length stri_sub
get_all_kmers <- function(x, kmer_size = 8) {
seq_length <- stringi::stri_length(x)
n_kmers <- seq_length - kmer_size + 1
seq_kmers <- stringi::stri_sub(x, 1:n_kmers, kmer_size:seq_length)
return(seq_kmers)
}
#' @noRd
#' @importFrom stringi stri_trans_toupper stri_replace_all_charclass
#' stri_trans_char
seq_to_base4 <- function(sequence) {
stringi::stri_trans_toupper(sequence) |>
stringi::stri_replace_all_charclass(
str = _,
pattern = "[^ACGT]",
replacement = "N"
) |>
stringi::stri_trans_char(str = _, pattern = "ACGT", replacement = "0123")
}
#' @noRd
#' @importFrom stats na.omit
base4_to_index <- function(base4_str) {
# I want output to be indexed to start at position 1 rather than 0 so we're
# adding 1 to all base10 values
stats::na.omit(strtoi(base4_str, base = 4) + 1) |> as.numeric()
}
#' @noRd
detect_kmers <- function(sequence, kmer_size = 8) {
seq_to_base4(sequence) |>
get_all_kmers(kmer_size) |>
base4_to_index() |>
unique()
}
#' @noRd
detect_kmers_across_sequences <- function(sequences, kmer_size = 8) {
n_sequences <- length(sequences)
kmer_list <- vector(mode = "list", length = n_sequences)
for (i in seq_along(1:n_sequences)) {
kmer_list[[i]] <- detect_kmers(sequences[[i]], kmer_size = kmer_size)
}
return(kmer_list)
}
#' @noRd
calc_word_specific_priors <- function(detect_list, kmer_size) {
priors <- detect_list |>
unlist() |>
tabulate(bin = _, nbins = 4^kmer_size)
(priors + 0.5) / (length(detect_list) + 1)
}
#' @noRd
# The forumula for calculating the conditional probability for each genus
# is equal to (m(wi) + Pi) / (M + 1) #nolint: commented_code_linter
# genera argument needs to be an integer not char
calc_genus_conditional_prob <- function(detect_list,
genera,
word_specific_priors) {
genus_counts <- tabulate(genera)
n_genera <- length(genus_counts)
n_sequences <- length(genera)
n_kmers <- length(word_specific_priors)
kmer_genus_count <- matrix(0,
nrow = n_kmers,
ncol = n_genera
)
for (i in 1:n_sequences) {
kmer_genus_count[detect_list[[i]], genera[i]] <-
kmer_genus_count[detect_list[[i]], genera[i]] + 1
}
# this calls the Rcpp version of the function
calculate_log_probability(
kmer_genus_count,
word_specific_priors,
genus_counts
) |>
t()
}
#' @noRd
genera_str_to_index <- function(string) {
factor(string) |> as.numeric()
}
#' @noRd
get_unique_genera <- function(string) {
factor(string) |> levels()
}
#' @noRd
bootstrap_kmers <- function(kmers, kmer_size = 8) {
n_kmers <- as.integer(length(kmers) / kmer_size)
sample(kmers, n_kmers, replace = TRUE)
}
#' @noRd
#' @importFrom Rfast rowsums
classify_bs <- function(unknown_kmers, conditional_prob) {
probabilities <- Rfast::rowsums(conditional_prob[, unknown_kmers])
which.max(probabilities)
}
#' @noRd
consensus_bs_class <- function(bs_class, genera) {
taxonomy <- genera[bs_class]
taxonomy_split <- stringi::stri_split_fixed(taxonomy, pattern = ";")
n_levels <- length(taxonomy_split[[1]])
consensus_list <- lapply(
1:n_levels,
\(i) {
sapply(taxonomy_split, \(p) paste(p[1:i], collapse = ";")) |>
get_consensus()
}
)
list(
taxonomy = stringi::stri_split_fixed(consensus_list[[n_levels]][["id"]],
pattern = ";"
) |>
unlist(),
confidence = sapply(consensus_list, `[[`, "frac")
)
}
#' @noRd
get_consensus <- function(taxonomy) {
n_bs <- length(taxonomy)
taxonomy_table <- table(taxonomy)
max_index <- which.max(taxonomy_table)
list(
frac = 100 * taxonomy_table[[max_index]] / n_bs,
id = names(max_index)
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.