library(Cairo)
knitr::opts_chunk$set(cache = FALSE,
                      fig.width = 9,
                      dpi=300,
                      dev = "png",
                      dev.args = list(type = "cairo-png"),
                      message = FALSE,
                      warning = FALSE)
library(mia)
library(miaTime)
library(dplyr)
library(lubridate)
library(knitr)
library(TreeSummarizedExperiment)

Minimal gut microbiome

Dense samples of the minimal gut microbiome. In the initial hours, MDb-MM was grown under batch condition and 24 h onwards, continuous feeding of media with pulse feeding cycles. This information is stored in the colData.

library(miaTime)
data(minimalgut)
tse <- minimalgut

# quick check of number of samples 
kable(table(colData(tse)$StudyIdentifier,colData(tse)$condition_1))

Visualize samples available for each of the bioreactors. This allows to identify if there are any missing samples for specific times.

library(ggplot2)
library(tidySummarizedExperiment)
tse |> 
    ggplot(aes(as.factor(Time.hr), StudyIdentifier)) +
    geom_tile(aes(fill=condition_1), color="white") +
    scale_fill_manual("Condition Sampled", 
                      values = c("#ff006e", "#e07a5f", "#457b9d")) +
    theme_minimal() +
    theme(axis.text.x = element_text(size=8, angle = 90),
          legend.position = "top") +
    labs(x="Time (h)", y="")

Community dynamics

The minimalgut dataset, mucus-diet based minimal microbiome (MDbMM-16), consists of 16 species assembled in three bioreactors. We can investigate the succession of mdbMM16 from the start of experiment here hour zero until the end of the experiment.

## Divergence from baseline i.e from hour zero.
tse <- mia::relAbundanceCounts(minimalgut) # get relative abundance
tse <- getBaselineDivergence(tse,
                             group = "StudyIdentifier",
                             time_field = "Time.hr",
                             name_divergence = "divergence_from_baseline",
                             name_timedifference = "time_from_baseline",
                             assay_name="relabundance",
                             FUN = vegan::vegdist,
                             method="bray")

Visualize the divergence

# First define nice colors for bioreactors
bioreac_cols <- c(`Bioreactor A`= "#b2182b",
                  `Bioreactor B`= "#2166ac",
                  `Bioreactor C` = "#35978f")

tse |>
    ggplot(aes(x=Time.hr, y=divergence_from_baseline))+
    geom_point(aes(color=StudyIdentifier), size=2, alpha=0.5) +
    geom_line(aes(color=StudyIdentifier)) +
    theme_minimal() +
    scale_color_manual(values = bioreac_cols) +
    labs(x="Time (h)", y="Divergence \nfrom baseline") +
    # highlight specific timepoints
    geom_vline(xintercept = 152, lty=2, color="#991720") + 
    geom_vline(xintercept = 248, lty=2, color= "#0963bd")+
    annotate("text",x=c(152, 248),y=c(0.8, 0.8),
             label=c("Addition of\nB.hydrogenotrophica","Acetate Discontinued"),
             hjust=c(1.05,-0.05))

Visualizing selected taxa

Now visualize abundance of Blautia hydrogenotrophica using the miaViz::plotSeries function.

library(miaViz)
plotSeries(mia::relAbundanceCounts(minimalgut),
           x = "Time.hr",
           y = "Blautia_hydrogenotrophica",
           colour_by = "Species",
           assay_name = "relabundance")+
    geom_vline(xintercept = 152, lty=2, color="#991720") + 
    geom_vline(xintercept = 248, lty=2, color= "#0963bd")+
    annotate("text",x=c(152, 248),y=c(0.2, 0.15),
             label=c("Addition of\nB.hydrogenotrophica","Acetate Discontinued"),
             hjust=c(1.05,-0.05))+
    labs(x="Time (h)", y="B.hydrogenotrophica\nRelative Abundance") +
    theme(legend.position = "none") 

Visualize the rate (slope) of divergence

Sample dissimilarity between consecutive time steps(step size n >= 1) within a group(subject, age, reaction chamber, etc.) can be calculated by getStepwiseDivergence. If we normalize this by the time interval, this gives an approximate slope for the change.

# Load libraries
library(miaTime)
library(dplyr)

# Sort samples by time (necessary for getStepwiseDivergence)
tse <- tse[, order(colData(tse)$Time_hr_num)]

# Divergence between consecutive time points
tse <- getStepwiseDivergence(tse, group = "StudyIdentifier",
                               time_interval = 1,
                               time_field = "Time_hr_num",
                               name_divergence = "divergence_from_previous_step",
                               name_timedifference = "time_from_previous_step",
                               assay_name="relabundance",
                               FUN = vegan::vegdist,
                               method="bray")

# We have now new fields added in the colData:
# time_from_previous_step, divergence_from_previous_step
# print(colData(tse))

# Visualize the slope of dissimilarity between consecutive time points as a function of time (from baseline)
library(ggplot2)
theme_set(theme_bw(10))
p <- tse |> ggplot(aes(x=time_from_baseline,
                   y=divergence_from_previous_step/time_from_previous_step,
                   color=StudyIdentifier)) +
       geom_point() +
       geom_line() +
       labs(x="Time (hours)", y="Slope of dissimilarity (Bray-Curtis)") +
       geom_hline(aes(yintercept=0), linetype=2)

print(p)

Moving average of the slope

This shows how to calculate and plot moving average for the variable of interest (here: slope).

# Add slope explicitly in colData
colData(tse)$slope <- colData(tse)$divergence_from_previous_step / colData(tse)$time_from_previous_step

# Split by group and perform operation
tselist <- mia::splitOn(tse, "StudyIdentifier")

# colData(tse)$divergence_from_previous_step
addmean <- function (x, k, field, field_name) {
  # Calculate rolling mean
  m <- zoo::rollmean(x[[field]], k = k)
  # Initialize empty field
  colData(x)[[field_name]] <- rep(NA, ncol(x))
  # Fill in the rolling mean (length does not match with original data in rolling mean)
  colData(x)[1:length(m), field_name] <- m
  # Return the object with a new field added
  x
}

# Calculate sliding average for the field "divergence_from_previous_step"
# and store the result in a new field with the name "sliding_average"
tselist2 <- lapply(tselist, function (x) {addmean(x, k=3, field = "slope", field_name = "sliding_average")})

# Merge back
tse <- mia::mergeSEs(tselist2)

# Visualize
theme_set(theme_bw(10))
p <- tse |> ggplot(aes(x = time_from_baseline,
                   y = sliding_average,
                   color=StudyIdentifier)) +
     geom_point() +
     geom_line() +
     labs(x="Time (hours)", y="Mean slope of dissimilarity (Bray-Curtis)") +
     geom_hline(aes(yintercept=0), linetype=2)
print(p)

Error bars

This shows how to visualize error bars on top of time series. In this example we create artificial replicates and variation for a brief example.

## Source: https://www.geeksforgeeks.org/adding-error-bars-to-a-line-graph-with-ggplot2-in-r/
## Gives count, mean, standard deviation, standard error of the mean,
## and confidence interval (default 95%).
## data: a data frame.
## measurevar: the name of a column that contains the variable to be summariezed
## groupvars: a vector containing names of columns that contain grouping variables
## na.rm: a boolean that indicates whether to ignore NA's
## conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {
    library(plyr)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # This does the summary. For each group's data frame, return a vector with
    # N, mean, and sd
    datac <- ddply(data, groupvars, .drop=.drop,
      .fun = function(xx, col) {
        c(N    = length2(xx[[col]], na.rm=na.rm),
          mean = mean   (xx[[col]], na.rm=na.rm),
          sd   = sd     (xx[[col]], na.rm=na.rm)
        )
      },
      measurevar
    )

    # Rename the "mean" column    
    datac <- rename(datac, c("mean" = measurevar))

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval: 
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2 + .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}

# Create artificial example on replicates because we don't have any in this demo data
set.seed(3422)
df <- as.data.frame(colData(tse))
sdlevel <- 0.1*mean(df$sliding_average, na.rm=TRUE)
df1 <- df
df2 <- df; df2$sliding_average <- rnorm(mean=df1$sliding_average, sd = sdlevel, n=nrow(df1)) 
df3 <- df; df3$sliding_average <- rnorm(mean=df1$sliding_average, sd = sdlevel, n=nrow(df1))
df <- bind_rows(list(df1, df2, df3))

# Calculate deviations
df <- summarySE(df, measurevar="sliding_average", groupvars=c("StudyIdentifier", "time_from_baseline"))

# Visualize
theme_set(theme_bw(10))
p <- ggplot(df,
         aes(x = time_from_baseline, y = sliding_average, color=StudyIdentifier)) +
     geom_point() +
     geom_line() +
     geom_errorbar(aes(ymin=sliding_average-sd, ymax=sliding_average+sd), width=.2,
                 position=position_dodge(0.05)) +    
     labs(x="Time (hours)", y="Mean slope of dissimilarity (Bray-Curtis)") +
     geom_hline(aes(yintercept=0), linetype=2)
print(p)

Session info

sessionInfo()


microbiome/miaTime documentation built on May 7, 2023, 2:06 p.m.