#' Extract a gene sequence from a genbank file
#'
#' If duplicates are detected via `detect_dups` and `rename` is TRUE, duplicates
#' will be differentiated by appending a numeral to the name, e.g.
#' "accession-gene-1", "accession-gene-2", etc.
#'
#' See https://github.com/gschofl/biofiles
#'
#' @param gene_name Name of gene to extract. Must match genbank data exactly,
#' case-sensitive.
#' @param rec An instance of the gbRecord class.
#' Typically a (sub)genome containing the gene of interest.
#' @param rename Logical; should the sequence be renamed as "accession-gene"?
#' @param detect_dups Logical; should duplicates be detected? If TRUE, a text
#' warning will be returned in the case that this gene has duplicate copies
#' in the genome; if false, all copies will be returned.
#'
#' @return object of class DNAbin
#' @examples
#' \dontrun{
#' gb_file <- reutils::efetch(uid = "KY427346", db = "nuccore",
#' rettype = "gbwithparts", retmode = "text")
#' # Parse GenBank record text file
#' this_rec <- biofiles::gbRecord(gb_file)
#' fetch_gene("rbcL", this_rec)
#' }
#'
fetch_gene <- function (gene_name, rec, rename = TRUE, detect_dups = TRUE) {
# Check for reutils
if (!requireNamespace("reutils", quietly = TRUE)) {
stop("Package \"reutils\" needed for this function to work. Please install it.",
call. = FALSE)
}
# Check for biofiles
if (!requireNamespace("biofiles", quietly = TRUE)) {
stop("Package \"biofiles\" needed for this function to work. Please install it.",
call. = FALSE)
}
# Check input
assertthat::assert_that(assertthat::is.string(gene_name))
assertthat::assert_that(inherits(rec, what = "gbRecord"))
# Check that there are "gene" records
num_gene_features <- biofiles::filter(rec, key = "gene") %>% biofiles::getFeatures() %>% length()
# If no genes detected, return NULL.
if (num_gene_features == 0) return (NULL)
# Filter for the gene of interest
pattern <- paste0("^", gene_name, "$")
filtered_rec <- biofiles::filter(rec, key = "gene", gene = pattern)
# Extract the sequence
seq <- biofiles::getSequence(biofiles::ft(filtered_rec))
# If no gene detected, return NULL.
if(is.null(seq)) return (NULL)
# Currently we are only interested in one gene per query, so return
# a text warning if this is not the case. May want to make more
# general in the future (return multiple hits).
if(length(seq) > 1 & isTRUE(detect_dups)) return ("Multiple genes detected")
# Convert to ape::DNAbin
seq <- ape::as.DNAbin(seq)
# Optionally rename
if (isTRUE(rename)) {
if(length(seq) > 1) {
names(seq) <- paste(biofiles::getAccession(rec), gene_name, 1:length(seq), sep = "-")
} else {
names(seq) <- paste(biofiles::getAccession(rec), gene_name, sep = "-")
}
}
seq
}
#' Extract a list of gene sequences from a genbank file
#'
#' @param gene Characte vector; gene names to extract.
#' @param accession String; GenBank accession number of (full or partial) genome
#' containing the gene(s) of interest.
#' @param rename Logical; should the sequence be renamed as "accession-gene"?
#' @param detect_dups Logical; should duplicates be detected? If TRUE, a text
#' warning will be returned in the case that this gene has duplicate copies
#' in the genome instead of the sequence; if false, all copies will be returned,
#' differentiated by appending numbers to the gene name.
#'
#' @return A list
#'
#' @examples
#' \dontrun{
#' # KY427346 is the GenBank accession no. for the Diplazium striatum plastome
#' # https://www.ncbi.nlm.nih.gov/nuccore/KY427346
#'
#' genes_to_get <- c("rbcL", "matK", "psbA")
#'
#' fetch_gene_from_genome(genes_to_get, "KY427346")
#' }
#'
#' @export
fetch_gene_from_genome <- function (gene, accession, rename = TRUE, detect_dups = TRUE) {
# Check for reutils
if (!requireNamespace("reutils", quietly = TRUE)) {
stop("Package \"reutils\" needed for this function to work. Please install it.",
call. = FALSE)
}
# Check for biofiles
if (!requireNamespace("biofiles", quietly = TRUE)) {
stop("Package \"biofiles\" needed for this function to work. Please install it.",
call. = FALSE)
}
# Check input
assertthat::assert_that(is.character(gene))
assertthat::assert_that(assertthat::is.string(accession))
assertthat::assert_that(is.logical(rename))
assertthat::assert_that(is.logical(detect_dups))
# Download GenBank record as text
# (do this before the loop so it isn't downloaded
# each time for every gene).
gb_file <- reutils::efetch(
uid = accession, db = "nuccore",
rettype = "gbwithparts", retmode = "text")
# Parse GenBank record text file
rec <- biofiles::gbRecord(gb_file)
# Use purrr to "vectorize" fetch_gene, and return output named by gene.
if(length(gene) > 1) {
gene %>% magrittr::set_names(., gene) %>%
purrr::map(fetch_gene,
rec = rec, rename = rename, detect_dups = detect_dups)
} else {
fetch_gene(gene, rec = rec, rename = rename, detect_dups = detect_dups)
}
}
#' Assemble a list of genes from annotated genbank files
#'
#' Sometimes genes may be repeated within a genome, such as chloroplast
#' genes in the inverted repeat. If `drop_dups` is set to TRUE, these will
#' be excluded from the results (with a warning). This is useful for
#' assembling chloroplast gene matrices of single-copy genes.
#'
#' When running in parallel (`parallel` option is set to TRUE),
#' it may be necessary to set the parallel backend first using
#' \code{\link[future]{plan}}, or the code will still run sequentially.
#'
#' @param accessions Vector of genbank accession numbers of (partial)
#' genomes including the genes of interest
#' @param genes Vector of gene names to assemble
#' @param parallel Logical; should \code{\link[furrr]{future_map}} be used to
#' fetch genes in parallel?
#' @param drop_dups Logical; should genes with duplicate copies be excluded
#' from the results?
#'
#' @return List. Each item in the list is a gene, which contains a list of
#' sequences of class DNAbin.
#'
#' @examples
#' \dontrun{
#' # KP136830 is the GenBank accession no. for the Cystopteris protrusa plastome
#' # https://www.ncbi.nlm.nih.gov/nuccore/KP136830
#'
#' # KP136830 is the GenBank accession no. for the Diplazium striatum plastome
#' # https://www.ncbi.nlm.nih.gov/nuccore/KY427346
#'
#' # Assemble a list of DNA sequences for three genes from these two species.
#' # Note that psbA is duplicated since it is in the Inverted Repeat.
#'
#' assemble_gene_set(
#' c("KP136830", "KY427346"),
#' c("accD", "atpA", "psbA", "not_a_proper_gene_name")
#' )
#'
#' assemble_gene_set(
#' c("KP136830", "KY427346"),
#' c("accD", "atpA", "psbA", "not_a_proper_gene_name"),
#' drop_dups = FALSE
#' )
#' }
#'
#' @export
assemble_gene_set <- function (accessions, genes,
parallel = FALSE, drop_dups = TRUE) {
assertthat::assert_that(is.character(accessions))
assertthat::assert_that(is.character(genes))
assertthat::assert_that(is.logical(parallel))
# Extract target genes from each plastome.
# Wei et al 2017 data (83 coding genes from 40 plastomes)
# took 23 min using furrr with four CPUs.
# future::plan("multiprocess")
if (isTRUE(parallel)) {
all_genes <- furrr::future_map(
accessions,
~ fetch_gene_from_genome(
accession = ., gene = genes, detect_dups = drop_dups, rename = TRUE)
) %>%
purrr::transpose()
} else {
all_genes <- purrr::map(
accessions,
~ fetch_gene_from_genome(
accession = ., gene = genes, detect_dups = drop_dups, rename = TRUE)
) %>%
purrr::transpose()
}
# Some genes are dublicated because they are in the IR (Inverted Repeat).
# Check for these.
repeats_detected <- purrr::map_lgl(all_genes,
~ purrr::map(., class) %>%
unique %>%
stringr::str_detect("character") %>%
any)
if(isTRUE(drop_dups) & sum(repeats_detected) > 0) {
print(paste(
"These genes were duplicated within at least one accession and dropped:",
genes[repeats_detected]
))
all_genes <- all_genes[!repeats_detected]
}
# Prepare gene list for output
all_genes %>%
# Remove NULL values (when a gene wasn't found)
purrr::map(purrr::compact) %>%
purrr::keep(~ length(.x) > 0) %>%
# Remove redundant names
purrr::map(~ purrr::set_names(., NULL)) %>%
# Convert each list of sequence lists within a gene
# into a single list per gene
purrr::reduce(., c)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.