Nothing
#PrimerTree
#Copyright (C) 2013 Jim Hester
#' \pkg{primerTree} Visually Assessing the Specificity and Informativeness of Primer Pairs
#'
#' \code{primerTree} has two main commands:
#' \code{\link{search_primer_pair}} which takes a primer pair and returns an
#' primerTree object of the search results
#' \code{\link{plot.primerTree}} a S3 method for plotting the primerTree object
#' obtained using \code{\link{search_primer_pair}}
#' @name primerTree
#' @docType package
#' @aliases primerTree-package
#' @import ggplot2 XML ape httr plyr directlabels gridExtra
#' stringr foreach
#' @importFrom lubridate %--% seconds now
#' @importFrom grid grid.locator
#' @importFrom scales expand_range
#' @importFrom grDevices dev.cur dev.off dev.set pdf
#' @importFrom stats na.omit quantile
#' @importFrom utils capture.output
#' @useDynLib primerTree rawStreamToDNAbin
NULL
#' PrimerTree results for the mammalian 16S primers
#' @name mammals_16S
#' @docType data
NULL
#' PrimerTree results for the bryophyte trnL primers
#' @name bryophytes_trnL
#' @docType data
NULL
#' @export
print.primerTree <- function(x, ...) {
cat(
"Name: ",
x$name,
"\n",
" Arguments: ",
paste(names(x$arguments), x$arguments, sep = ":", collapse = " "),
"\n"
)
cat("\nHTTP Response\n")
print(x$response[[1]])
cat("\nPrimer Products\n")
print(x$sequence)
cat("\nAligned Products\n")
print(x$alignment)
cat("\nPhylogenetic Tree\n")
print(x$tree)
}
#' plot function for a primerTree object, calls plot_tree_ranks
#' @param x primerTree object to plot
#' @param ranks The ranks to include, defaults to all common ranks, if NULL
#' print all ranks. If 'none' just print the layout.
#' @param main optional title to display, if NULL displays the name as the title
#' @param ... additional arguments passed to plot_tree_ranks
#' @export
#' @seealso \code{\link{plot_tree_ranks}}, \code{\link{plot_tree}}
#' @examples
#' library(gridExtra)
#' library(directlabels)
#' #plot with all common ranks
#' plot(mammals_16S)
#'
#' #plot only the class
#' plot(mammals_16S, 'class')
#'
#' #plot the layout only
#' plot(mammals_16S, 'none')
plot.primerTree <- function(x, ranks = NULL, main = NULL, ...) {
if (is.null(ranks)) {
if (is.null(main)) {
main <- x$name
}
plot_tree_ranks(x$tree, x$taxonomy, main = main, ...)
} else if (length(ranks) > 1) {
if (is.null(main)) {
main <- x$name
}
plot_tree_ranks(x$tree, x$taxonomy, ranks = ranks, main = main, ...)
} else {
if (ranks == "none") {
plot_tree(x$tree, main = main, ...)
} else {
plot_tree(
x$tree,
taxonomy = x$taxonomy,
rank = ranks,
main = main,
...
)
}
}
}
#' Automatic primer search Search a given primer pair, retrieving the alignment
#' results, their product sequences, taxonomic information for the sequences,
#' a multiple alignment of the products
#' @param name name to give to the primer pair
#' @param simplify use simple names for primer hit results or complex
#' @param .progress name of the progress bar to use, see
#' \code{\link[plyr]{create_progress_bar}}
#' @param clustal_options a list of options to pass to clustal omega, see
#' \code{link{clustalo}} for a list of options
#' @param distance_options a list of options to pass to dist.dna, see
#' \code{link[ape]{dist.dna}} for a list of options
#' @param api_key NCBI api-key to allow faster sequence retrieval
#' @inheritParams primer_search
#' @return A list with the following elements,
#' \item{name}{name of the primer pair}
#' \item{BLAST_result}{html blast results from Primer-BLAST as
#' 'a \code{\link[httr]{response}}} object.
#' \item{taxonomy}{taxonomy for the primer products from NCBI}
#' \item{sequence}{sequence of the primer products}
#' \item{alignment}{multiple alignment of the primer products}
#' \item{tree}{phylogenetic tree of the reconstructed from the
#' 'multiple alignment}
#' @seealso \code{\link{primer_search}}, \code{\link{clustalo}}
#' @export
#' @examples
#' \dontrun{
#' #simple search
#' mammals_16S = search_primer_pair(name='Mammals 16S',
#' 'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT')
#' #returning 1000 alignments, allow up to 3 mismatches in primer
#' mammals_16S = search_primer_pair(name='Mammals 16S',
#' 'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT',
#' num_aligns=1000, total_primer_specificity_mismatch=3)
#' }
search_primer_pair <- function(
forward,
reverse,
name = NULL,
num_aligns = 500,
num_permutations = 25,
simplify = TRUE,
clustal_options = list(),
distance_options = list(model = "N", pairwise.deletion = TRUE),
api_key = Sys.getenv("NCBI_API_KEY"),
...,
.parallel = FALSE,
.progress = "none"
) {
#HACK, primerTree is an environment rather than a list so we can treat it as
#a pointer, I could make it a reference class, but that seems to be overkill
#as I am converting to a list at the end of the function anyway...
if (missing(forward) || missing(reverse)) {
BLAST_primer()
return()
}
primer_search <- new.env(parent = globalenv())
#list all primers used to search
primer_search <- env2list(
try_default(
{
primer_search$name <- if (!is.null(name)) {
name
} else {
name <- paste(forward, reverse, sep = "_")
}
primer_search$arguments <-
c(
forward = forward,
reverse = reverse,
name = name,
num_aligns = num_aligns,
num_permutations = num_permutations,
simplify = simplify,
clustal_options = clustal_options,
list(...)
)
primer_search$response <- primer_search(
forward,
reverse,
num_permutations = num_permutations,
.progress = .progress,
.parallel = .parallel,
num_aligns = num_aligns,
...
)
start_time <- now()
primer_search$BLAST_result <-
filter_duplicates(plyr::ldply(
primer_search$response,
parse_primer_hits,
.parallel = .parallel
))
message(
nrow(primer_search$BLAST_result),
" BLAST alignments parsed in ",
seconds_elapsed_text(start_time)
)
start_time <- now()
primer_search$taxonomy <- get_taxonomy(
primer_search$BLAST_result$accession
)
message(
"taxonomy retrieved in ",
seconds_elapsed_text(start_time)
)
start_time <- now()
primer_search$sequence <- get_sequences(
primer_search$BLAST_result$accession,
primer_search$BLAST_result$product_start,
primer_search$BLAST_result$product_stop,
api_key = api_key,
simplify = simplify,
.parallel = .parallel
)
lengths <- plyr::laply(primer_search$sequence, length)
message(
length(primer_search$sequence),
" sequences retrieved from NCBI",
" in ",
seconds_elapsed_text(start_time),
", product length",
" min:",
min(lengths),
" mean:",
round(mean(lengths), 2),
" max:",
max(lengths)
)
start_time <- now()
primer_search$alignment <- do.call(
clustalo,
c(
list(
primer_search$sequence,
threads = getDoParWorkers()
),
clustal_options
)
)
message(
nrow(primer_search$alignment),
" sequences aligned in ",
seconds_elapsed_text(start_time),
" length:",
ncol(primer_search$alignment)
)
start_time <- now()
primer_search$distances <- do.call(
ape::dist.dna,
c(list(primer_search$alignment), distance_options)
)
message(
"pairwise DNA distances calculated in ",
seconds_elapsed_text(start_time)
)
start_time <- now()
primer_search$tree <- tree_from_alignment(
primer_search$alignment
)
message(
"constructed neighbor joining tree in ",
seconds_elapsed_text(start_time)
)
primer_search
},
default = primer_search
)
)
class(primer_search) <- "primerTree"
primer_search
}
#given a start time print out the number of seconds which have elapsed
seconds_elapsed_text <- function(start_time) {
paste((start_time %--% now()) %/% seconds(1), "seconds")
}
#fast way to convert a environment to a list
env2list <- function(env) {
names <- ls(env)
mget(names, env)
}
#' identify the point closest to the mouse click
#' only works on single ranks
#' @param x the plot to identify
#' @param ... additional arguments passed to annotate
#' @export
identify.primerTree_plot <- function(x, ...) {
point <- gglocator(x$layers[[4]])
distances <- distance(point, x$layers[[4]]$data[, c("x", "y")])
closest <- which(distances == min(distances))[1]
point$label <- x$layers[[4]]$data[
closest,
deparse(x$layers[[4]]$mapping$colour)
]
x + annotate("text", label = point$label, x = point$x, y = point$y, ...)
}
gglocator <- function(object) {
loc <- as.numeric(grid.locator("npc"))
xrng <- with(object, range(data[, deparse(mapping$x)]))
yrng <- with(object, range(data[, deparse(mapping$y)]))
point <- data.frame(
xrng[1] + loc[1] * diff(xrng),
yrng[1] + loc[2] * diff(yrng)
)
names(point) <- with(object, c(deparse(mapping$x), deparse(mapping$y)))
point
}
#returns the distance from a point in point to the points in points
distance <- function(point, points) {
sqrt((point$x - points$x)^2 + (point$y - points$y)^2)
}
#' Summarize a primerTree result, printing quantiles of sequence length and
#' pairwise differences.
#' @param object the primerTree object to summarise
#' @param ... Ignored options
#' @param probs quantile probabilities to compute, defaults to 0, 5, 50, 95,
#' and 100 probabilities.
#' @param ranks ranks to show unique counts for, defaults to the common ranks
#' @return invisibly returns a list containing the printed results
#' @export
summary.primerTree <- function(
object,
...,
probs = c(0, .05, .5, .95, 1),
ranks = common_ranks
) {
res <- list()
res[["lengths"]] <- t(data.frame(
"Sequence lengths" = labeled_quantile(
laply(object$sequence, length),
sprintf("%.0f%%", probs * 100),
probs = probs
),
check.names = FALSE
))
print(res[["lengths"]])
res[["distances"]] <- t(data.frame(
"Pairwise differences" = labeled_quantile(
object$distances,
sprintf("%.0f%%", probs * 100),
probs = probs
),
check.names = FALSE
))
cat("\n")
print(res[["distances"]])
res[["rankDistances"]] <- calc_rank_dist_ave(object, common_ranks)
print(res[["rankDistances"]])
res[["ranks"]] <- laply(object$taxonomy[common_ranks], function(x) {
length(unique(x))
})
cat(
"\n",
"Unique taxa out of ",
length(object$sequence),
" sequences\n",
sep = ""
)
names(res[["ranks"]]) <- ranks
print(res[["ranks"]])
invisible(res)
}
labeled_quantile <- function(x, labels, ...) {
res <- quantile(x, ...)
names(res) <- labels
res
}
#' Summarize pairwise differences.
#' @param x a primerTree object
#' @param ranks ranks to show unique counts for, defaults to the common ranks
#' @return returns a data frame of results
#' @details
#' The purpose of this function is to calculate the average number
#' of nucleotide differences between species within each taxa of given taxonomic
#' level.
#'
#' For example, at the genus level, the function calculates the average number
#' of nucleotide differences between all species within each genus and reports
#' the mean of those values.
#'
#' There are several key assumptions and calculations made in this
#' function.
#'
#' First, the function randomly selects one sequence from each species
#' in the primerTree results. This is to keep any one species (e.g.
#' human, cow, etc.) with many hits from skewing the results.
#'
#' Second, for each taxonomic level tested, the function divides the
#' sequences by each taxon at that level and calculates the mean
#' number of nucleotide differences within that taxa, then returns the
#' mean of those values.
#'
#' Third, when calculating the average distance, any taxa for which
#' there is only one species is omitted, as the number of nucleotide
#' differences will always be 0.
#'
#' @examples
#' \dontrun{
#' calc_rank_dist_ave(mammals_16S)
#'
#' calc_rank_dist_ave(bryophytes_trnL)
#'
#' # Note that the differences between the results from these two primers
#' # the mean nucleotide differences is much higher for the mammal primers
#' # than the byrophyte primers. This suggests that the mammal primers have
#' # better resolution to distinguish individual species.
#' }
#' @importFrom reshape2 melt
#' @export
# using tree data format info from http://www.phytools.org/eqg/Exercise_3.2/
calc_rank_dist_ave <- function(x, ranks = common_ranks) {
used_ranks <- grep("species", ranks, invert = TRUE, value = TRUE)
rank_dist_mean <- data.frame(matrix(nrow = 1, ncol = 0))
# Raw taxonomy data
taxa <- as.data.frame(x$taxonomy)
# Randomize the order of the taxa data frame
taxa <- taxa[sample(nrow(taxa)), ]
rownames(taxa) <- taxa$accession
# Pick random example per species and add back in the taxa info
unique_factors <- as.data.frame(unique(taxa$species))
colnames(unique_factors) <- "species"
unique_factors <- plyr::join(
unique_factors,
taxa,
type = "left",
match = "first",
by = "species"
)
# Get sequences for randomly selected species
seqs <- x$sequence
seqs <- seqs[names(seqs) %in% unique_factors$accession]
# Align and calculate pairwise distances and convert dists to dataframe
align <- clustalo(seqs)
dists <- as.data.frame(as.matrix(ape::dist.dna(
align,
model = "N",
pairwise.deletion = TRUE
)))
dists$accession <- row.names(dists)
# Melt the dists dataframe so I can drop any distance that isn't within
# the (rank)
melted <- reshape2::melt(
dists,
id = "accession",
variable.name = "accession2",
value.name = "dist"
)
for (rank in used_ranks) {
# Gather only the needed taxa data
unique_factors_sub <- unique_factors[,
colnames(unique_factors) %in% c("accession", "species", rank)
]
# Drop any row in (rank) where there is only one species represented
# Any instance of this leads to a distance within that rank of 0,
# skewing the results downward
counts <- as.data.frame(table(unique_factors_sub[[rank]]))
colnames(counts) <- c(rank, "count")
unique_factors_sub <- plyr::join(unique_factors_sub, counts, by = rank)
unique_factors_sub <- unique_factors_sub[unique_factors_sub$count > 1, ]
# Pull the nucleotide distance data in
# Replace the rank1 accession with the rank1 taxa
melted_sub <- plyr::join(melted, unique_factors_sub, by = "accession")
melted_sub$rank1 <- as.factor(melted_sub[[rank]])
# Drop all columns except the three needed so the next join doesn't get
# messed up
melted_sub <- melted_sub[,
colnames(melted_sub) %in%
c("accession2", "dist", "rank1", "species")
]
colnames(melted_sub)[1] <- "accession"
# Replace the rank2 accession with the rank2 taxa
melted_sub <- plyr::join(
melted_sub,
unique_factors_sub,
by = "accession"
)
melted_sub$rank2 <- as.factor(melted_sub[[rank]])
# Drop all columns except the three needed
melted_sub <- melted_sub[,
colnames(melted_sub) %in% c("rank2", "dist", "rank1", "species")
]
# Drop all rows with missing information
melted_sub <- na.omit(melted_sub)
# We only want distances within a taxa, so drop all comparisons between
# taxa
# We also want to drop any comparisons of a species to itself, which
# will have dist == 0
melted_sub <- melted_sub[
melted_sub$rank1 == melted_sub$rank2 &
melted_sub$species != melted_sub$species.1,
]
# Calculate the mean distance for each taxa compared
# We calculate each separately to avoid any one taxa with lots of hits
# (like human seqs)
# from skewing the mean
melted_sub$group <- paste(melted_sub$rank1, melted_sub$rank2)
# Plug the means into the storage dataframe
rank_dist_mean[[rank]] <- mean(melted_sub$dist)
}
message(
"\nAverage number of nucleotide differences between sequences within a given taxonomic group"
)
message("See function description for further details")
rank_dist_mean
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.