R/sequence.R

Defines functions get_command_options clustalo tree_from_alignment expand_arguments get_sequences get_sequence

Documented in clustalo get_sequence get_sequences tree_from_alignment

#PrimerTree
#Copyright (C) 2013 Jim Hester

#' Retrieves a fasta sequence from NCBI nucleotide database.
#'
#' @param accession nucleotide accession to retrieve.
#' @param start start base to retrieve, numbered beginning at 1.  If NULL the
#'        beginning of the sequence.

#' @param stop last base to retrieve, numbered beginning at 1. if NULL the end of
#'        the sequence.
#' @param api_key NCBI api-key to allow faster sequence retrieval.
#' @return an DNAbin object.
#' @seealso \code{\link[ape]{DNAbin}}
#' @export

get_sequence <- function(
    accession,
    start = NULL,
    stop = NULL,
    api_key = Sys.getenv("NCBI_API_KEY")
) {
    fetch_url <- "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    query <- list(
        db = "nuccore",
        rettype = "fasta",
        retmode = "text",
        id = accession
    )

    if (!is.null(start)) {
        query$seq_start <- start
    }

    if (!is.null(stop)) {
        query$seq_stop <- stop
    }

    if (nzchar(api_key)) {
        query$api_key <- api_key
    }

    response <- POST_retry(fetch_url, body = query)

    #stop if response failed
    stop_for_status(response)

    content <- content(response, as = "raw")

    #from ape package read.FASTA
    res <- .Call("rawStreamToDNAbin", content)
    names(res) <- sub("^ +", "", names(res))
    class(res) <- "DNAbin"
    res
}

#' Retrieves fasta sequences from NCBI nucleotide database.
#'
#' @param accession the accession number of the sequence to retrieve
#' @param start start bases to retrieve, numbered beginning at 1.  If NULL the
#'        beginning of the sequence.

#' @param stop stop bases to retrieve, numbered beginning at 1. if NULL the stop
#'        of the sequence.
#' @param api_key NCBI api-key to allow faster sequence retrieval.
#' @param simplify simplify the FASTA headers to include only the genbank
#'        accession.
#' @param .parallel if 'TRUE', perform in parallel, using parallel backend
#'        provided by foreach
#' @param .progress name of the progress bar to use, see 'create_progress_bar'
#' @return an DNAbin object.
#' @seealso \code{\link[ape]{DNAbin}}
#' @export

get_sequences <- function(
    accession,
    start = NULL,
    stop = NULL,
    api_key = Sys.getenv("NCBI_API_KEY"),
    simplify = TRUE,
    .parallel = FALSE,
    .progress = "none"
) {
    #expand arguments by recycling
    args <- expand_arguments(accession = accession, start = start, stop = stop)
    #assign expanded arguments to actual arguments
    lapply(seq_along(args), function(i) names(args)[i] <<- args[i])

    #define rate to query NCBI servers with get_sequences
    query_rate <- 3 #queries per second

    if (nzchar(api_key)) {
        query_rate <- 10
    } else {
        warning(
            paste0(
                "Sequence retrieval limited to 3 per second. Provide an",
                "api_key to increase this to 10. See:",
                "https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/",
                "new-api-keys-for-the-e-utilities/"
            ),
            immediate. = TRUE
        )
    }

    # size <- length(accession)
    get_sequence_itr <- function(i) {
        start_time <- Sys.time()
        sequence <- get_sequence(accession[i], start[i], stop[i], api_key)
        stop_time <- Sys.time()
        if ((stop_time - start_time) < (1 / query_rate)) {
            #sleep to limit query rate :-(
            Sys.sleep((1 / query_rate) - (stop_time - start_time))
        }
        sequence
    }
    sequences <- plyr::alply(
        seq_along(accession),
        .margins = 1,
        .parallel = .parallel,
        .progress = .progress,
        failwith(NA, f = get_sequence_itr)
    )
    names <- if (simplify) accession else laply(sequences, names)
    sequences <- llply(sequences, `[[`, 1)
    names(sequences) <- names
    class(sequences) <- "DNAbin"
    sequences
}
# from http://stackoverflow.com/questions/9335099/implementation-of-standard-recycling-rules
expand_arguments <- function(...) {
    dot_list <- list(...)
    max.length <- max(sapply(dot_list, length))
    suppressWarnings(lapply(dot_list, rep, length = max.length))
}
#' Construct a neighbor joining tree from a dna alignment
#'
#' @param dna fasta dna object the tree is to be constructed from
#' @param pairwise.deletion a logical indicating if the distance matrix should
#' be constructed using pairwise deletion
#' @param ... furthur arguments to dist.dna
#' @seealso \code{\link[ape]{dist.dna}}, \code{\link[ape]{nj}}
#' @export
tree_from_alignment <- function(dna, pairwise.deletion = TRUE, ...) {
    ape::nj(
        ape::dist.dna(
            dna,
            model = "N",
            pairwise.deletion = pairwise.deletion,
            ...
        )
    )
}
#' Multiple sequence alignment with clustal omega
#'
#' Calls clustal omega to align a set of sequences of class DNAbin.  Run
#' without any arguments to see all the options you can pass to the command
#' line clustal omega.
#' @param x an object of class 'DNAbin'
#' @param exec a character string with the name or path to the program
#' @param quiet whether to supress output to stderr or stdout
#' @param original.ordering use the original ordering of the sequences
#' @param ... additional arguments passed to the command line clustalo
#' @export
clustalo <- function(
    x,
    exec = "clustalo",
    quiet = TRUE,
    original.ordering = TRUE,
    ...
) {
    help_text <- system(paste(exec, "--help"), intern = TRUE)
    all_options <- get_command_options(help_text)

    inf <- tempfile(fileext = ".fas")
    outf <- tempfile(fileext = ".aln")

    options <- c(infile = inf, outfile = outf, list(...))
    match_args <- pmatch(
        names(options),
        names(all_options),
        duplicates.ok = TRUE
    )
    bad_args <- is.na(match_args)

    if (missing(x)) {
        message(paste(help_text, collapse = "\n"))
        stop("No input")
    }
    if (any(bad_args)) {
        stop(
            paste(names(options)[bad_args], collapse = ","),
            " not valid option\n"
        )
    }

    write.dna(x, inf, "fasta")
    args <- paste(paste(all_options[match_args], options, collapse = " "))
    system2(
        exec,
        args = args,
        stdout = ifelse(quiet, FALSE, ""),
        stderr = ifelse(quiet, FALSE, "")
    )
    res <- read.dna(outf, "fasta")
    if (original.ordering) {
        res <- res[labels(x), ]
    }
    res
}

#parses the usage and enumerates the commands
get_command_options <- function(usage) {
    m <- gregexpr("-+\\w+", usage)
    arguments <- unlist(regmatches(usage, m))
    names(arguments) <- gsub("-+", "", arguments)
    arguments
}

Try the primerTree package in your browser

Any scripts or data that you put into this service are public.

primerTree documentation built on Nov. 5, 2025, 6:22 p.m.