This file contains the code used for making the figures in https://arxiv.org/abs/2008.02662. The data and the interpretation of the figures are described in more detail in the paper.
I'm including the code as a vignette in the localBiplots
package, but the computations for the local biplot axes take long enough to run that I'm only including the code and not having it be evaluated by default.
I've included some notes about which ones take a long time and how long they took on my laptop.
library(knitr) opts_chunk$set(eval = FALSE)
We start with the packages we need and read in the data.
library(localBiplots) library(Matrix) library(phyloseq) library(ape) library(gridExtra) library(adaptiveGPCA) library(treeDA) source("unifrac-distances-and-derivs.R") data("antibiotic")
Then color palette definition and some data cleaning/recoding of factor levels and a function that we will need later for plotting.
# from http://tools.medialab.sciences-po.fr/iwanthue/ otuPalette = c("#D44251","#77CD39","#6AAECD","#CB53D4","#CDB13E","#577B39","#CA96CC","#53B99F","#C6488F","#AA6E37","#7670CF","#63658B","#D95728","#78C872","#B4606B") species_point_size = .6 sample_point_size = .8 tab = table(tax_table(abt)[,"Taxon_5"]) FamilySubset = tax_table(abt)[,"Taxon_5"] FamilySubset[FamilySubset %in% names(which(tab < 5))] = "" unknown = c("Incertae Sedis", "uncultured") FamilySubset[FamilySubset %in% unknown] = "" FamilySubset[FamilySubset == "Clostridiaceae_1"] = "Clostridiaceae" FamilySubset[FamilySubset == "Peptostreptococcaceae_1"] = "Peptostreptococcaceae" FamilySubset[FamilySubset == "Peptococcaceae_1"] = "Peptococcaceae" FamilySubset[FamilySubset == "BrenneriaBCCCDEEEEEHKKLLPPRRSYersinia"] = "Yersinia" FamilySubset[FamilySubset == ""] = "Unknown" FamilySubset = as.factor(FamilySubset) FamilySubset = relevel(FamilySubset, "Unknown") sd = as(sample_data(abt), "data.frame") sd$type = ifelse(sd$condition %in% c("Pre Cp", "Interim", "Post Cp"), "no abx", "abx") g_legend = function(a.gplot) { tmp = ggplot_gtable(ggplot_build(a.gplot)) leg = which(sapply(tmp$grobs, function(x) x$name) == "guide-box") legend = tmp$grobs[[leg]] return(legend) }
Here we do the setup to compute the local biplot axes for UniFrac.
A
and L
are matrices related to the phylogenetic tree associated with this dataset.
The code that I'm using does some pre-computations of values that are used multiple times later, this takes about 45 minutes on my laptop.
X = t(as(otu_table(abt), "matrix")) A = t(treeDA:::makeDescendantMatrix(phy_tree(abt))) L = treeDA:::getBranchLengths(phy_tree(abt)) uf_cache = uf_lb_computations(X, A, L)
Then we actually create the local biplot axes for UniFrac.
This is the part that takes the longest, on my laptop it took about two hours.
We define a function that makes the UniFrac distance, and then the local biplot axes are created by the uf_lb
function.
We don't make use of the local_biplot
interface here because it's too slow: uf_lb
is built specificalyl for UniFrac and does things a little more efficiently.
uf_dist_ours = function(X) uf_dist(X, t(A), L) uf_mds_matrices = localBiplots:::make_mds_matrices(X, uf_dist_ours) samples_for_biplot = seq(1,nrow(X), by = 3) uf_sens_all = lapply(samples_for_biplot, function(i) { uf_lb(X, sample_idx = i, uf_cached = uf_cache, mds_matrices = uf_mds_matrices, n_axes = 2) })
Next we do the same thing for weighted UniFrac.
We define a distance function and a derivative function for weighted UniFrac and the tree associated with this dataset.
Once the distance and derivative functions are defined, we can use the local_biplot
function to get the biplot data frame.
The local biplot axis computations here take about 10 minutes.
wuf_dist_ours = function(X) wuf_dist(X, t(A), L) wuf_deriv_ours = function(x, y) wuf_dist_deriv(x, y, t(A), L) biplot_df = local_biplot(X, wuf_dist_ours, wuf_deriv_ours)
Now that all the computations are done, we can make the plots.
We start off by just making the plot of the samples for weighted UniFrac.
The sample embeddings are one of the elements in the output from make_mds_matrices
.
wuf_mds_matrices = localBiplots:::make_mds_matrices(X, wuf_dist_ours) wuf_sample_plot = ggplot(data.frame(subset(biplot_df, variable == "UncShi72"), sd)) + geom_point(aes(x = x, y = y, color = type, shape = ind), size = sample_point_size) + xlab(sprintf("Axis 1: %2.1f%%", wuf_mds_matrices$Lambda[1] / sum(wuf_mds_matrices$Lambda) * 100)) + ylab(sprintf("Axis 2: %2.1f%%", wuf_mds_matrices$Lambda[2] / sum(wuf_mds_matrices$Lambda) * 100)) pdf("wuf_sample_plot.pdf", width = 4, height = 2.5) wuf_sample_plot dev.off()
Then we define a function that makes a pair of plots corresponding to the local biplot axes for weighted UniFrac at one sample point. In the pair of plots, the left-hand plot shows the sample embeddings with the sample corresponding to the local biplot axes highlighted. The right-hand plot shows the local biplot axes for the highlighted sample.
make_pair_at_sample_wuf = function(s_idx, plotList = FALSE) { highlight.color = "#E58601" ## this is wes_palette("FantasticFox1", 5)[4] wuf_xlab = xlab(sprintf("Axis 1: %2.1f%%", wuf_mds_matrices$Lambda[1] / sum(wuf_mds_matrices$Lambda) * 100)) wuf_ylab = ylab(sprintf("Axis 2: %2.1f%%", wuf_mds_matrices$Lambda[2] / sum(wuf_mds_matrices$Lambda) * 100)) sample_plot_highlighted = ggplot(data.frame(subset(biplot_df, variable == "UncShi72"), sample_data(abt))) + geom_point(aes(x = x, y = y, color = sample == s_idx, size = sample == s_idx)) + scale_color_manual(values = c("TRUE" = highlight.color, "FALSE" = "grey60"), guide = FALSE) + scale_size_manual(values = c("TRUE" = 2, "FALSE" = .6), guide = FALSE) + wuf_xlab + wuf_ylab species_plot = ggplot(data.frame(subset(biplot_df, sample == s_idx), FamilySubset)) + geom_point(aes(x = xend - x, y = yend - y, color = FamilySubset), size = species_point_size) + theme(legend.position = "none") + wuf_xlab + wuf_ylab if(plotList) { return(list(sample_plot_highlighted = sample_plot_highlighted, species_plot = species_plot)) } multiplot(sample_plot_highlighted, species_plot, cols = 2) }
These figures aren't included in the paper, but are included here as examples.
make_pair_at_sample_wuf(44) make_pair_at_sample_wuf(124) make_pair_at_sample_wuf(72) make_pair_at_sample_wuf(86) make_pair_at_sample_wuf(161) make_pair_at_sample_wuf(19) make_pair_at_sample_wuf(152)
We can then move to the sample embedding plots for unweighted UniFrac.
The sample embeddings are again one of the elements in the output from make_mds_matrices
.
uf_sample_plot = ggplot(data.frame(uf_mds_matrices$Y, sd)) + geom_point(aes(x = Axis1, y = Axis2, color = type, shape = ind), size = sample_point_size) + xlab(sprintf("Axis 1: %2.1f%%", uf_mds_matrices$Lambda[1] / sum(uf_mds_matrices$Lambda) * 100)) + ylab(sprintf("Axis 2: %2.1f%%", uf_mds_matrices$Lambda[2] / sum(uf_mds_matrices$Lambda) * 100)) uf_sample_plot
As for weighted UniFrac, we define a function that makes a pair of plots corresponding to the local biplot axes for weighted UniFrac at one sample point. As before, the left-hand plot shows the sample embeddings with the sample corresponding to the local biplot axes highlighted, and the right-hand plot shows the local biplot axes for the highlighted sample.
make_pair_at_sample_uf = function(s_idx, plotList = FALSE) { if(!(s_idx %in% samples_for_biplot)) { stop("didn't compute axes for that point") } else { idx = which(samples_for_biplot == s_idx) } highlight.color = "#E58601" ## this is wes_palette("FantasticFox1", 5)[4] uf_xlab = xlab(sprintf("Axis 1: %2.1f%%", uf_mds_matrices$Lambda[1] / sum(uf_mds_matrices$Lambda) * 100)) uf_ylab = ylab(sprintf("Axis 2: %2.1f%%", uf_mds_matrices$Lambda[2] / sum(uf_mds_matrices$Lambda) * 100)) sample_plot_highlighted = ggplot(data.frame(uf_mds_matrices$Y, sample = 1:nrow(uf_mds_matrices$Y))) + geom_point(aes(x = Axis1, y = Axis2, color = sample == s_idx, size = sample == s_idx)) + scale_color_manual(values = c("TRUE" = highlight.color, "FALSE" = "grey60"), guide = FALSE) + scale_size_manual(values = c("TRUE" = 2, "FALSE" = .6), guide = FALSE) + uf_xlab + uf_ylab species_plot = ggplot(data.frame(uf_sens_all[[idx]], FamilySubset)) + geom_point(aes(x = X1, y = X2, color = FamilySubset), size = species_point_size) + theme(legend.position = "none") + uf_xlab + uf_ylab if(plotList) { return(list(sample_plot_highlighted = sample_plot_highlighted, species_plot = species_plot)) } multiplot(sample_plot_highlighted, species_plot, cols = 2) }
These plots are not in the paper, but included as some examples.
make_pair_at_sample_uf(133) make_pair_at_sample_uf(154) make_pair_at_sample_uf(76) make_pair_at_sample_uf(103)
As described in the paper, for the generalized Euclidean distances the local biplot axes are constant and can be computed using a generalized PCA. The first two lines are transformations of the data matrix so that it can be used for gPCA. The first line is transforming the data to compositions --- this isn't necessarily a transformation we would normally use, but doing it makes the results more comparable to those for weighted UniFrac, which implicitly transforms to compositions. The second line is just column centering the data, which we always need to do before PCA.
Xscaled = sweep(X, 1, STATS = rowSums(X), FUN = "/") Xscaled = sweep(Xscaled, 2, STATS = colMeans(Xscaled), FUN = "-")
Once we have transformed the data matrix, we define the matrix Q
used for the generalized Euclidean distance corresponding to DPCoA, and perform gPCA with Xscaled
and Q
.
This corresponds to MDS with the generalized Euclidean distance d(x,y)^2 = (x - y)^T Q (x - y)
dpcoa_samples
and dpcoa_species
are the sample embeddings and local biplot axes, respectively.
Q = ape::vcv(phy_tree(abt)) Qeig = eigen(Q, symmetric = TRUE) out_dpcoa = adaptiveGPCA:::gpcaEvecs(X = Xscaled, evecs = Qeig$vectors, evals = Qeig$values, k = 2) dpcoa_xlab = xlab(sprintf("Axis 1: %2.1f%%", out_dpcoa$lambda[1]^2 / sum(out_dpcoa$lambda^2) * 100)) dpcoa_ylab = ylab(sprintf("Axis 2: %2.1f%%", out_dpcoa$lambda[2]^2 / sum(out_dpcoa$lambda^2) * 100)) dpcoa_samples = ggplot(data.frame(out_dpcoa$U, sd)) + geom_point(aes(x = Axis1, y = Axis2, color = type, shape = ind), size = sample_point_size) + dpcoa_xlab + dpcoa_ylab dpcoa_species = ggplot(data.frame(out_dpcoa$QV, Family = FamilySubset)) + geom_point(aes(x = Axis1, y = Axis2, color = Family), size = species_point_size) + dpcoa_xlab + dpcoa_ylab + theme(legend.key.height=unit(.7,"line")) grid.arrange(g_legend(dpcoa_samples), dpcoa_samples + theme(legend.position = "none"), dpcoa_species + theme(legend.position = "none"), g_legend(dpcoa_species), ncol = 4, widths = c(.4, 1, 1, .8))
For PCA, we do the same thing: we use the same data matrix, pca_samples
are the sample embeddings, and pca_species
are the local biplot axes.
out_pca = adaptiveGPCA:::gpcaEvecs(X = Xscaled, evecs = diag(rep(1, ncol(X))), evals = rep(1, ncol(X)), k = 2) pca_xlab = xlab(sprintf("Axis 1: %2.1f%%", out_pca$lambda[1]^2 / sum(out_pca$lambda^2) * 100)) pca_ylab = ylab(sprintf("Axis 2: %2.1f%%", out_pca$lambda[2]^2 / sum(out_pca$lambda^2) * 100)) pca_samples = ggplot(data.frame(out_pca$U, sd)) + geom_point(aes(x = Axis1, y = Axis2, color = type, shape = ind), size = sample_point_size) + pca_xlab + pca_ylab pca_species = ggplot(data.frame(out_pca$QV, Family = FamilySubset)) + geom_point(aes(x = Axis1, y = Axis2, color = Family), size = species_point_size) + pca_xlab + pca_ylab grid.arrange(g_legend(pca_samples), pca_samples + theme(legend.position = "none"), pca_species + theme(legend.position = "none"), g_legend(pca_species), ncol = 4, widths = c(.4, 1, 1, .8))
Finally, we have the local biplot axes for the generalized Euclidean distance corresponding to adaptive gPCA.
As before, agpca_samples
are the sample embeddings, and agpca_species
are the local biplot axes.
out_agpca = adaptivegpca(X = Xscaled, Q = Q, k = 2) agpca_xlab = xlab(sprintf("Axis 1: %2.1f%%", out_agpca$lambda[1]^2 / sum(out_agpca$lambda^2) * 100)) agpca_ylab = ylab(sprintf("Axis 2: %2.1f%%", out_agpca$lambda[2]^2 / sum(out_agpca$lambda^2) * 100)) agpca_samples = ggplot(data.frame(out_agpca$U, sd)) + geom_point(aes(x = Axis1, y = Axis2, color = type, shape = ind), size = sample_point_size) + agpca_xlab + agpca_ylab agpca_species = ggplot(data.frame(out_agpca$QV, Family = FamilySubset)) + geom_point(aes(x = Axis1, y = Axis2, color = Family), size = species_point_size) + agpca_xlab + agpca_ylab + theme(legend.key.height = unit(.7, "line")) grid.arrange(g_legend(agpca_samples), agpca_samples + theme(legend.position = "none"), agpca_species + theme(legend.position = "none"), g_legend(agpca_species), ncol = 4, widths = c(.4, 1.1, 1.2, .8))
Finally, we can collect all of the sample embedding plots together into one to facilitate comparison.
pdf("all-samples.pdf", width = 6.5, height = 4) grid.arrange( pca_samples + coord_fixed() + ggtitle("Euclidean distance") + theme(legend.position = "none"), uf_sample_plot + coord_fixed() + ggtitle("UniFrac") + theme(legend.position = "none"), agpca_samples + coord_fixed() + ggtitle("Adaptive gPCA") + theme(legend.position = "none"), wuf_sample_plot + coord_fixed() + ggtitle("Weighted UniFrac") + theme(legend.position = "none"), dpcoa_samples + coord_fixed() + ggtitle("DPCoA") + theme(legend.position = "none"), g_legend(agpca_samples + scale_shape("Subject") + scale_color_discrete("Antibiotic Treatment")), ncol = 3 ) dev.off()
We can do the same thing with same of the local biplot axes. Each panel is a set of local biplot axes for one of the distance, and we put them all together in one plot so that we can compare them easily.
biplot_theme = theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 6), axis.text.y = element_text(size = 6), legend.position = "none") biplot_legend_theme = theme(legend.key.size = unit(.1, 'cm'), legend.spacing.y = unit(0, 'cm')) pdf("all-biplots.pdf", width = 7, height = 5) grid.arrange( pca_species + coord_fixed() + ggtitle("Euclidean distance") + biplot_theme, make_pair_at_sample_uf(103, plotList = TRUE)[[2]] + coord_fixed() + ggtitle("UniFrac") + biplot_theme, agpca_species + coord_fixed() + ggtitle("Adaptive gPCA") + biplot_theme, make_pair_at_sample_wuf(161, plotList = TRUE)[[2]] + coord_fixed() + ggtitle("Weighted UniFrac") + biplot_theme, dpcoa_species + coord_fixed() + ggtitle("DPCoA") + biplot_theme, g_legend(dpcoa_species + biplot_legend_theme), ncol = 3, heights = c(1, 1.2) ) dev.off()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.