Nothing
#PrimerTree
#Copyright (C) 2013 Jim Hester
#' Retrieves a fasta sequence from NCBI nucleotide database.
#'
#' @param accession nucleotide accession to retrieve.
#' @param start start base to retrieve, numbered beginning at 1. If NULL the
#' beginning of the sequence.
#' @param stop last base to retrieve, numbered beginning at 1. if NULL the end of
#' the sequence.
#' @param api_key NCBI api-key to allow faster sequence retrieval.
#' @return an DNAbin object.
#' @seealso \code{\link[ape]{DNAbin}}
#' @export
get_sequence <- function(
accession,
start = NULL,
stop = NULL,
api_key = Sys.getenv("NCBI_API_KEY")
) {
fetch_url <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
query <- list(
db = "nuccore",
rettype = "fasta",
retmode = "text",
id = accession
)
if (!is.null(start)) {
query$seq_start <- start
}
if (!is.null(stop)) {
query$seq_stop <- stop
}
if (nzchar(api_key)) {
query$api_key <- api_key
}
response <- POST_retry(fetch_url, body = query)
#stop if response failed
stop_for_status(response)
content <- content(response, as = "raw")
#from ape package read.FASTA
res <- .Call("rawStreamToDNAbin", content)
names(res) <- sub("^ +", "", names(res))
class(res) <- "DNAbin"
res
}
#' Retrieves fasta sequences from NCBI nucleotide database.
#'
#' @param accession the accession number of the sequence to retrieve
#' @param start start bases to retrieve, numbered beginning at 1. If NULL the
#' beginning of the sequence.
#' @param stop stop bases to retrieve, numbered beginning at 1. if NULL the stop
#' of the sequence.
#' @param api_key NCBI api-key to allow faster sequence retrieval.
#' @param simplify simplify the FASTA headers to include only the genbank
#' accession.
#' @param .parallel if 'TRUE', perform in parallel, using parallel backend
#' provided by foreach
#' @param .progress name of the progress bar to use, see 'create_progress_bar'
#' @return an DNAbin object.
#' @seealso \code{\link[ape]{DNAbin}}
#' @export
get_sequences <- function(
accession,
start = NULL,
stop = NULL,
api_key = Sys.getenv("NCBI_API_KEY"),
simplify = TRUE,
.parallel = FALSE,
.progress = "none"
) {
#expand arguments by recycling
args <- expand_arguments(accession = accession, start = start, stop = stop)
#assign expanded arguments to actual arguments
lapply(seq_along(args), function(i) names(args)[i] <<- args[i])
#define rate to query NCBI servers with get_sequences
query_rate <- 3 #queries per second
if (nzchar(api_key)) {
query_rate <- 10
} else {
warning(
paste0(
"Sequence retrieval limited to 3 per second. Provide an",
"api_key to increase this to 10. See:",
"https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/",
"new-api-keys-for-the-e-utilities/"
),
immediate. = TRUE
)
}
# size <- length(accession)
get_sequence_itr <- function(i) {
start_time <- Sys.time()
sequence <- get_sequence(accession[i], start[i], stop[i], api_key)
stop_time <- Sys.time()
if ((stop_time - start_time) < (1 / query_rate)) {
#sleep to limit query rate :-(
Sys.sleep((1 / query_rate) - (stop_time - start_time))
}
sequence
}
sequences <- plyr::alply(
seq_along(accession),
.margins = 1,
.parallel = .parallel,
.progress = .progress,
failwith(NA, f = get_sequence_itr)
)
names <- if (simplify) accession else laply(sequences, names)
sequences <- llply(sequences, `[[`, 1)
names(sequences) <- names
class(sequences) <- "DNAbin"
sequences
}
# from http://stackoverflow.com/questions/9335099/implementation-of-standard-recycling-rules
expand_arguments <- function(...) {
dot_list <- list(...)
max.length <- max(sapply(dot_list, length))
suppressWarnings(lapply(dot_list, rep, length = max.length))
}
#' Construct a neighbor joining tree from a dna alignment
#'
#' @param dna fasta dna object the tree is to be constructed from
#' @param pairwise.deletion a logical indicating if the distance matrix should
#' be constructed using pairwise deletion
#' @param ... furthur arguments to dist.dna
#' @seealso \code{\link[ape]{dist.dna}}, \code{\link[ape]{nj}}
#' @export
tree_from_alignment <- function(dna, pairwise.deletion = TRUE, ...) {
ape::nj(
ape::dist.dna(
dna,
model = "N",
pairwise.deletion = pairwise.deletion,
...
)
)
}
#' Multiple sequence alignment with clustal omega
#'
#' Calls clustal omega to align a set of sequences of class DNAbin. Run
#' without any arguments to see all the options you can pass to the command
#' line clustal omega.
#' @param x an object of class 'DNAbin'
#' @param exec a character string with the name or path to the program
#' @param quiet whether to supress output to stderr or stdout
#' @param original.ordering use the original ordering of the sequences
#' @param ... additional arguments passed to the command line clustalo
#' @export
clustalo <- function(
x,
exec = "clustalo",
quiet = TRUE,
original.ordering = TRUE,
...
) {
help_text <- system(paste(exec, "--help"), intern = TRUE)
all_options <- get_command_options(help_text)
inf <- tempfile(fileext = ".fas")
outf <- tempfile(fileext = ".aln")
options <- c(infile = inf, outfile = outf, list(...))
match_args <- pmatch(
names(options),
names(all_options),
duplicates.ok = TRUE
)
bad_args <- is.na(match_args)
if (missing(x)) {
message(paste(help_text, collapse = "\n"))
stop("No input")
}
if (any(bad_args)) {
stop(
paste(names(options)[bad_args], collapse = ","),
" not valid option\n"
)
}
write.dna(x, inf, "fasta")
args <- paste(paste(all_options[match_args], options, collapse = " "))
system2(
exec,
args = args,
stdout = ifelse(quiet, FALSE, ""),
stderr = ifelse(quiet, FALSE, "")
)
res <- read.dna(outf, "fasta")
if (original.ordering) {
res <- res[labels(x), ]
}
res
}
#parses the usage and enumerates the commands
get_command_options <- function(usage) {
m <- gregexpr("-+\\w+", usage)
arguments <- unlist(regmatches(usage, m))
names(arguments) <- gsub("-+", "", arguments)
arguments
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.