#' Abundance timeseries
#'
#' Generates a timeseries plot showing relative read abundances over time.
#'
#' @param data (\emph{required}) Data list as loaded with \code{\link{amp_load}}.
#' @param time_variable (required) The name of the column in the metadata containing the time variables, e.g. \code{"Date"}. Must be directly compatible with \code{\link[lubridate]{as_date}} and preferably of the form \code{"yyyy-mm-dd"} or \code{"\%Y-\%m-\%d"}.
#' @param group_by Group the samples by a variable in the metadata.
#' @param split Split the plot into subplots of each taxa. (\emph{default:} \code{FALSE})
#' @param scales If \code{split = TRUE}, should the axis scales of each subplot be fixed (\code{fixed}), free (\code{"free"}), or free in one dimension (\code{"free_x"} or \code{"free_y"})? (\emph{default:} \code{"fixed"})
#' @param tax_aggregate The taxonomic level to aggregate the OTUs. (\emph{default:} \code{"OTU"})
#' @param tax_add Additional taxonomic level(s) to display, e.g. \code{"Phylum"}. (\emph{default:} \code{"none"})
#' @param tax_show The number of taxa to show, or a vector of taxa names. (\emph{default:} \code{6})
#' @param tax_empty How to show OTUs without taxonomic information. One of the following:
#' \itemize{
#' \item \code{"remove"}: Remove OTUs without taxonomic information.
#' \item \code{"best"}: (\emph{default}) Use the best classification possible.
#' \item \code{"OTU"}: Display the OTU name.
#' }
#' @param tax_class Converts a specific phylum to class level instead, e.g. \code{"p__Proteobacteria"}.
#' @param normalise (\emph{logical}) Transform the OTU read counts to be in percent per sample. (\emph{default:} \code{TRUE})
#' @param plotly (\emph{logical}) Returns an interactive plot instead. (\emph{default:} \code{FALSE})
#' @param ... Additional arguments passed to \code{\link[lubridate]{as_date}} to make the time_variable compatible with the timeseries plot, fx the \code{format} or \code{tz} arguments, see \code{?as_date}.
#'
#' @keywords timeseries
#'
#' @import ggplot2
#' @importFrom dplyr arrange group_by summarise desc summarise_at
#' @importFrom tidyr gather
#' @importFrom data.table as.data.table setkey setDT
#' @importFrom plotly ggplotly
#' @importFrom stats median
#'
#' @return A ggplot2 object.
#'
#' @export
#'
#' @section Preserving relative abundances in a subset of larger data:
#' See \code{?\link{amp_filter_samples}} or the \href{https://kasperskytte.github.io/ampvis2/articles/faq.html#preserving-relative-abundances-in-a-subset-of-larger-data}{ampvis2 FAQ}.
#'
#' @seealso
#' \code{\link{amp_load}}
#'
#' @examples
#'
#' # Load example data
#' data("AalborgWWTPs")
#'
#' # Timeseries of the 5 most abundant OTUs based on the "Date" column
#' amp_timeseries(AalborgWWTPs,
#' time_variable = "Date",
#' tax_aggregate = "OTU"
#' )
#'
#' # As the above warning suggests, there are more than one sample per date in the data,
#' # in this case one from Aalborg East and one from Aalborg West. The average of the
#' # two samples is then shown per date. In such case it is then recommended to either
#' # subset the data, or group the samples by setting group_by = "" and split by tax_aggregate
#' # by setting split = TRUE:
#' amp_timeseries(AalborgWWTPs,
#' time_variable = "Date",
#' group_by = "Plant",
#' split = TRUE,
#' scales = "free_y",
#' tax_show = 9,
#' tax_aggregate = "Genus",
#' tax_add = "Phylum"
#' )
#' @author Julie Klessner Thun Pedersen \email{julieklessnerthun@@gmail.com}
#' @author Kasper Skytte Andersen \email{ksa@@bio.aau.dk}
amp_timeseries <- function(data,
time_variable = NULL,
group_by = NULL,
tax_aggregate = "OTU",
tax_add = NULL,
tax_show = 5,
tax_class = NULL,
tax_empty = "best",
split = FALSE,
scales = "free_y",
normalise = TRUE,
plotly = FALSE,
...) {
checkReqPkg("lubridate")
### Data must be in ampvis2 format
is_ampvis2(data)
#
tax_aggregate <- getLowestTaxLvl(
tax = data$tax,
tax_aggregate = tax_aggregate,
tax_add = NULL
)
## Clean up the taxonomy
data <- amp_rename(
data = data,
tax_class = tax_class,
tax_empty = tax_empty,
tax_level = tax_aggregate
)
## try to find a date column
if (is.null(time_variable)) {
dateCols <- unlist(lapply(data$metadata, lubridate::is.Date))
if (sum(dateCols) == 1) {
time_variable <- colnames(data$metadata)[which(dateCols)]
message("No \"time_variable\" provided, assuming the column \"", time_variable, "\" contains the dates.")
} else {
stop("Please provide a valid date column by the argument time_variable.", call. = FALSE)
}
}
## Coerce the group_by and facet_by variables to factor to always be considered categorical. Fx Year is automatically loaded as numeric by R, but it should be considered categorical.
if (!is.null(group_by)) {
data$metadata[group_by] <- lapply(data$metadata[group_by], factor)
}
# normalise counts
if (isTRUE(normalise)) {
data <- normaliseTo100(data)
}
# Aggregate to a specific taxonomic level
abund3 <- aggregate_abund(
abund = data$abund,
tax = data$tax,
tax_aggregate = tax_aggregate,
tax_add = tax_add,
calcSums = TRUE,
format = "long"
) %>%
as.data.frame()
suppressWarnings(
if (!is.null(group_by)) {
if (length(group_by) > 1) {
grp <- data.frame(Sample = data$metadata[, 1], .Group = apply(data$metadata[, group_by], 1, paste, collapse = "; "))
oldGroup <- unique(cbind.data.frame(data$metadata[, group_by], .Group = grp$.Group))
} else {
grp <- data.frame(Sample = data$metadata[, 1], .Group = data$metadata[, group_by])
}
abund3$.Group <- grp$.Group[match(abund3$Sample, grp$Sample)]
abund5 <- abund3
} else {
abund5 <- data.frame(abund3, .Group = abund3$Sample)
}
)
TotalCounts <- group_by(abund5, Display) %>%
summarise(Median = median(Sum), Total = sum(Sum), Mean = mean(Sum)) %>%
arrange(desc(Mean))
## Subset to the x most abundant levels
if (is.numeric(tax_show)) {
if (tax_show > nrow(TotalCounts)) {
tax_show <- nrow(TotalCounts)
}
abund5$Display <- as.character(abund5$Display)
abund7 <- subset(abund5, Display %in% as.character(unlist(TotalCounts[1:tax_show, "Display"])))
}
## Subset to a list of level names
if (!is.numeric(tax_show)) {
if (length(tax_show) > 1) {
abund7 <- subset(abund5, Display %in% tax_show)
}
if ((length(tax_show) == 1) && (tax_show != "all")) {
abund7 <- subset(abund5, Display %in% tax_show)
}
### Or just show all
if ((length(tax_show) == 1) && (tax_show == "all")) {
tax_show <- nrow(TotalCounts)
abund7 <- subset(abund5, Display %in% as.character(unlist(TotalCounts[1:tax_show, "Display"])))
}
}
abund7 <- as.data.frame(abund7)
abund7$Display <- factor(abund7$Display, levels = TotalCounts$Display)
if (length(group_by) > 1) {
abund7 <- merge(abund7, oldGroup)
}
abund7 <- merge(abund7, data$metadata, by.x = "Sample", by.y = colnames(data$metadata)[1])
abund7[, time_variable] <- lubridate::as_date(abund7[, time_variable], ...)
abund7$DisplayGroup <- paste(abund7$Display, abund7$.Group)
colnames(abund7)[which(colnames(abund7) == "Display")] <- tax_aggregate
colnames(abund7)[which(colnames(abund7) == "Sum")] <- "Value"
abund7 <- abund7[, -c(which(colnames(abund7) == "Abundance"))]
abund7 <- unique(abund7)
if (is.null(group_by)) {
if (any(duplicated(data$metadata[, time_variable]))) {
warning("Duplicate dates in column ", time_variable, ", displaying the average for each date.\n Consider grouping dates using the group_by argument or subset the data using amp_filter_samples.\n", call. = FALSE)
setDT(abund7)
abund7 <- abund7[, .(Value = mean(Value, na.rm = TRUE)), by = c(time_variable, tax_aggregate)]
}
if (isTRUE(split)) {
p <- ggplot(abund7, aes_string(x = time_variable, y = "Value", group = tax_aggregate))
} else if (!isTRUE(split)) {
p <- ggplot(abund7, aes_string(x = time_variable, y = "Value", group = tax_aggregate, color = tax_aggregate))
}
} else if (!is.null(group_by)) {
if (isTRUE(split)) {
p <- ggplot(abund7, aes_string(x = time_variable, y = "Value", color = ".Group", group = "DisplayGroup"))
} else if (!isTRUE(split)) {
p <- ggplot(abund7, aes_string(x = time_variable, y = "Value", color = ".Group", group = "DisplayGroup", linetype = tax_aggregate, shape = tax_aggregate))
}
}
p <- p +
geom_line() +
geom_point() +
xlab("") +
ylim(c(0, NA)) +
theme_classic() +
theme(
axis.text.x = element_text(size = 10, vjust = 0.3, angle = 90),
panel.grid.major.x = element_line(color = "grey90"),
panel.grid.major.y = element_line(color = "grey90")
)
if ((is.null(attributes(data)$normalised) | isFALSE(attributes(data)$normalised)) &
isFALSE(normalise)) {
p <- p + ylab("Read counts")
} else {
p <- p + ylab("Read abundance (%)")
}
if (split == T) {
p <- p + facet_wrap(tax_aggregate, scales = scales) +
theme(
strip.background = element_rect(colour = NA, fill = "grey95"),
panel.grid.major.x = element_line(color = "grey90"),
panel.grid.major.y = element_line(color = "grey90"),
legend.position = "bottom",
strip.text = element_text(size = 10),
legend.title = element_blank()
)
}
if (isTRUE(plotly)) {
return(ggplotly(p, tooltip = c(".Group", tax_aggregate, "Date", "Value")))
} else if (!isTRUE(plotly)) {
return(p)
}
}
#' @rdname amp_timeseries
#' @export
amp_time_series <- amp_timeseries
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.