source("path_fix.R")
library(tidyverse)
library(rulesoflife)
library(fido)
library(driver)
library(shapes)
library(frechet)
library(vegan) # for Mantel test
library(pedtools)
library(kinship2)
library(RColorBrewer)
library(cowplot)
library(ggridges)
# ------------------------------------------------------------------------------
#
# R^2 association testing
#
# ------------------------------------------------------------------------------
# Specify model output directory
output_dir <- "asv_days90_diet25_scale1"
p <- 999 # number of permutations
# ------------------------------------------------------------------------------
# Load data (MAP estimates), compute dynamics distances
# ------------------------------------------------------------------------------
# Load ASV-level count data and metadata
data <- load_data(tax_level = "ASV")
md <- data$metadata
hosts <- sort(unique(md$sname))
groups <- get_host_social_groups(host_list = hosts)
N <- length(hosts)
Sigmas <- pull_Sigmas(output_dir)
D <- dim(Sigmas)[1] + 1
# Drop correlation pairs with low confidence (many joint 0-0 observations)
rug_asv <- summarize_Sigmas(output_dir = output_dir)
filtered_pairs <- filter_joint_zeros(data$counts, threshold_and = 0.05, threshold_or = 0.5)
filtered_obj <- rug_asv
filtered_obj$tax_idx1 <- filtered_obj$tax_idx1[filtered_pairs$threshold]
filtered_obj$tax_idx2 <- filtered_obj$tax_idx2[filtered_pairs$threshold]
filtered_obj$rug <- filtered_obj$rug[,filtered_pairs$threshold]
# Calculate pairwise host-host dynamics distances (Euclidean distance between
# hosts' filtered correlation vectors)
host_combos <- combn(length(hosts), m = 2)
grp_assignments <- get_host_social_groups(hosts)
group_combos <- character(ncol(host_combos))
for(i in 1:ncol(host_combos)) {
h1 <- hosts[host_combos[1,i]]
h2 <- hosts[host_combos[2,i]]
grp1 <- grp_assignments$grp[grp_assignments$sname == h1]
grp2 <- grp_assignments$grp[grp_assignments$sname == h2]
if(grp1 == grp2) {
group_combos[i] <- grp1
} else {
group_combos[i] <- NA
}
}
dynamics_distances <- matrix(NA, N, N)
for(i in 1:ncol(host_combos)) {
if(i %% 100 == 0) {
cat(paste0("Host combo ", i, " / ", ncol(host_combos), "\n"))
}
a <- host_combos[1,i]
b <- host_combos[2,i]
# Just Euclidean distance
# d <- dist4cov(Sigmas[,,a] + diag(D-1)*1e-06,
# Sigmas[,,b] + diag(D-1)*1e-06)$dist
# vec1 <- Sigmas[,,a][upper.tri(Sigmas[,,a])]
# vec1 <- vec1[!is.na(vec1)]
# vec2 <- Sigmas[,,b][upper.tri(Sigmas[,,b])]
# vec2 <- vec2[!is.na(vec2)]
# dmat <- matrix(c(vec1, vec2), byrow = T, 2, length(vec1))
d <- as.numeric(dist(filtered_obj$rug[c(a,b),]))
dynamics_distances[a,b] <- d
dynamics_distances[b,a] <- d
}
dynamics_dist_vec <- dynamics_distances[lower.tri(dynamics_distances)]
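# Optional sanity check (added for illustration, not part of the original
# analysis): combn() enumerates host pairs in the same order as the
# column-major traversal of lower.tri(), so group_combos[i] and
# dynamics_dist_vec[i] refer to the same host pair.
stopifnot(all(
  t(host_combos) ==
    which(lower.tri(dynamics_distances), arr.ind = TRUE)[, c("col", "row")]
))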
# ------------------------------------------------------------------------------
#
# R^2: Pedigree associations
#
# ------------------------------------------------------------------------------
# Load pedigree
pedigree <- readRDS(file.path("input", "pedigree_56hosts.RDS"))
mapping <- data.frame(sname = sort(hosts)) %>%
left_join(data.frame(sname = rownames(pedigree), order = 1:nrow(pedigree)), by = "sname")
pedigree <- pedigree[mapping$order,mapping$order] # subset and order as Sigmas
ped_dist <- 1 - pedigree
ped_vec <- ped_dist[lower.tri(ped_dist, diag = FALSE)]
obs_Rsq_ped <- c(cor(scale(ped_vec), scale(dynamics_dist_vec))**2)
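# Note (added): Pearson correlation is invariant to centering and scaling, so
# the scale() calls are cosmetic; cor(ped_vec, dynamics_dist_vec)^2 gives the
# same R^2.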
# R^2 plot
p1_ped <- ggplot() +
geom_point(data = data.frame(x = ped_vec, y = dynamics_dist_vec, grp = group_combos) %>%
filter(is.na(grp)),
mapping = aes(x = x, y = y),
size = 3, shape = 21, fill = "#bbbbbb") +
geom_point(data = data.frame(x = ped_vec, y = dynamics_dist_vec, grp = group_combos) %>%
filter(!is.na(grp)),
mapping = aes(x = x, y = y, fill = grp),
size = 3, shape = 21) +
geom_smooth(data = data.frame(x = ped_vec, y = dynamics_dist_vec),
mapping = aes(x = x, y = y),
color = "black", alpha = 0.2, method = "lm") +
scale_fill_brewer(palette = "Spectral") +
theme_bw() +
theme(legend.position = "none",
axis.title.y = element_text(size = 10)) +
labs(#x = "kinship dissimilarity (1 - % relatedness)",
x = "1 - coefficient of relatedness",
# y = "host-host dynamics distance",
y = "microbe-microbe correlation matrix distances",
fill = "Paired social group")
# ------------------------------------------------------------------------------
# Mantel test
#
# This answers the question: how much variation in dynamics distances is
# explained by X, *in a population this related*?
# ------------------------------------------------------------------------------
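# vegan::mantel() computes the Pearson correlation between the two distance
# matrices and assesses significance by permuting the rows/columns of one of
# them (999 permutations by default).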
res1_ped <- mantel(as.dist(ped_dist), as.dist(dynamics_distances))
cat(paste0("P-value from Mantel test (kinship): ", round(res1_ped$signif, 3), "\n"))
# ------------------------------------------------------------------------------
# Null distribution - kinship/pedigree
#
# Generalized approximately to populations with any amount of relatedness
# ------------------------------------------------------------------------------
# Note: If the number of founders (which must apparently be << N) is not
# specified, this is drawn from a Poisson distribution. A smaller number of
# founders makes the population more related.
if(FALSE) {
simulate_K <- function(g) {
# Simulate a random pedigree
temp <- randomPed(g = g)
# Combine components
if(any(str_detect(names(temp), "^_comp"))) {
temp2 <- NULL
for(i in 1:length(temp)) {
temp2 <- rbind(as.data.frame(temp2),
as.data.frame(temp[[i]]))
}
temp2 <- temp2[order(as.numeric(temp2$id)),]
} else {
temp2 <- as.data.frame(temp)
}
# Convert it to a kinship matrix
temp3 <- data.frame(id = temp2$id,
mom = temp2$mid,
dad = temp2$fid,
sex = temp2$sex)
f <- sum(temp3$dad == 0 | temp3$mom == 0)
temp3$mom[1:f] <- NA
temp3$dad[1:f] <- NA
K <- kinship(with(temp3, pedigree(id, dad, mom, sex)))*2
K <- K[(f+1):nrow(K),(f+1):ncol(K)]
K
}
draw_random_Rsq_ped <- function(g) {
K <- simulate_K(g)
ped_dist <- 1 - K
ped_vec <- ped_dist[lower.tri(ped_dist, diag = FALSE)]
c(cor(scale(ped_vec), scale(dynamics_dist_vec))**2)
}
null_Rsq_ped <- sapply(1:p, function(pp) draw_random_Rsq_ped(56))
# Calculate a pseudo p-value as the fraction of null draws exceeding the observed R^2
res2 <- sum(null_Rsq_ped > obs_Rsq_ped) / p
cat(paste0("Observed R^2: ", round(obs_Rsq_ped, 3), "\n"))
cat(paste0("P-value from random null test: ", round(res2, 3), "\n"))
p2_ped <- ggplot() +
geom_density(data = data.frame(x = null_Rsq_ped),
mapping = aes(x = x)) +
geom_point(data = data.frame(x = obs_Rsq_ped, y = 1),
mapping = aes(x = x, y = y),
size = 4,
shape = 21,
fill = "#d99e57") +
theme_bw() +
labs(x = "R-squared (random kinship dissimilarity x dynamics distance)",
y = "density")
p3_ped <- plot_grid(p1_ped, p2_ped, ncol = 2,
rel_widths = c(1,1), labels = c("A", "B"),
scale = 0.9,
label_size = 18)
}
# ------------------------------------------------------------------------------
#
# R^2: Baseline composition associations
#
# ------------------------------------------------------------------------------
# Calculate baseline composition as mean CLR-transformed composition for each host
baselines <- matrix(NA, length(hosts), D)
for(i in 1:length(hosts)) {
# Average of CLR-transformed samples for this host is their baseline
host_samples <- clr_array(data$counts[,which(md$sname == hosts[i])] + 0.5,
parts = 1)
baselines[i,] <- apply(host_samples, 1, mean)
}
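# For reference, the CLR of a single (pseudocount-adjusted) count vector w is
# log(w) centered by its mean log-value. This is a minimal sketch of the
# transform that clr_array() applies along the `parts` margin (an assumption
# about driver's implementation, shown here only for illustration).
clr_single <- function(w) log(w) - mean(log(w))
# e.g., head(clr_single(data$counts[, 1] + 0.5))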
baseline_distances <- dist(baselines)
baseline_dist_vec <- c(baseline_distances)
obs_Rsq_comp <- c(cor(baseline_dist_vec, dynamics_dist_vec)**2)
# R^2 plot
p1_comp <- ggplot() +
geom_point(data = data.frame(x = baseline_dist_vec, y = dynamics_dist_vec, grp = group_combos) %>%
filter(is.na(grp)),
mapping = aes(x = x, y = y),
size = 3, shape = 21, fill = "#bbbbbb") +
geom_point(data = data.frame(x = baseline_dist_vec, y = dynamics_dist_vec, grp = group_combos) %>%
filter(!is.na(grp)),
mapping = aes(x = x, y = y, fill = grp),
size = 3, shape = 21) +
geom_smooth(data = data.frame(x = baseline_dist_vec, y = dynamics_dist_vec),
mapping = aes(x = x, y = y),
color = "black", alpha = 0.2, method = "lm") +
scale_fill_brewer(palette = "Spectral") +
theme_bw() +
theme(legend.position = "bottom",
axis.title.y = element_text(size = 10)) +
labs(x = "Aitchison distance over baselines",
# y = "host-host dynamics distance",
y = "microbe-microbe correlation matrix distances",
fill = "Paired social group")
legend <- get_legend(p1_comp)
p1_comp <- p1_comp +
theme(legend.position = "none")
# ------------------------------------------------------------------------------
# Mantel test
#
# This answers the question: how much variation in dynamics distances is
# explained by X, *in a population this related*?
# ------------------------------------------------------------------------------
res1_comp <- mantel(baseline_distances, as.dist(dynamics_distances))
cat(paste0("P-value from Mantel test (composition): ", round(res1_comp$signif, 3), "\n"))
# Partial Mantel on pedigree
# Let's control for this KNOWN host-host similarity in baseline composition
res1_ped_partial <- mantel.partial(as.dist(ped_dist), as.dist(dynamics_distances), baseline_distances)
cat(paste0("P-value from PARTIAL Mantel test (kinship): ", round(res1_ped_partial$signif, 3), "\n"))
# Estimate R^2 for pedigree from the partial correlation coefficient after
# removing the effect of baseline distance
rxy <- cor(ped_vec, dynamics_dist_vec)
rxz <- cor(ped_vec, baseline_dist_vec)
ryz <- cor(dynamics_dist_vec, baseline_dist_vec)
# From vegan package partial.mantel source:
# https://rdrr.io/cran/vegan/src/R/mantel.partial.R
r_partial <- (rxy - rxz * ryz)/sqrt(1-rxz*rxz)/sqrt(1-ryz*ryz)
# r_partial**2 # r^2 = 0.009 after controlling for baseline composition
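# Sanity check (added, illustrative): the hand-computed partial correlation
# should match the partial Mantel statistic reported by vegan, up to floating
# point:
# all.equal(unname(r_partial), unname(res1_ped_partial$statistic))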
# ------------------------------------------------------------------------------
# Null distribution - baseline composition
#
# Generalized approximately to populations with any amount of relatedness
# ------------------------------------------------------------------------------
draw_random_Rsq_comp <- function() {
# Build a null baseline matrix by independently permuting each taxon's values
# across hosts, preserving taxon marginals while breaking host identity
new_baselines <- baselines
for(i in 1:ncol(baselines)) {
new_baselines[,i] <- sample(new_baselines[,i])
}
new_baseline_dist_vec <- c(dist(new_baselines))
c(cor(new_baseline_dist_vec, dynamics_dist_vec)**2)
}
null_Rsq_comp <- numeric(p)
for(pp in 1:p) {
null_Rsq_comp[pp] <- draw_random_Rsq_comp()
}
# Calculate a pseudo p-value as the fraction of null draws exceeding the observed R^2
res2 <- sum(null_Rsq_comp > obs_Rsq_comp) / p
cat(paste0("Observed R^2: ", round(obs_Rsq_comp, 3), "\n"))
cat(paste0("P-value from random null test: ", round(res2, 3), "\n"))
if(FALSE) {
p2_comp <- ggplot() +
geom_density(data = data.frame(x = null_Rsq_comp),
mapping = aes(x = x)) +
geom_point(data = data.frame(x = obs_Rsq_comp, y = 1),
mapping = aes(x = x, y = y),
size = 4,
shape = 21,
fill = "#d99e57") +
theme_bw() +
labs(x = "R-squared (random baseline x dynamics distance)",
y = "density")
p3_comp <- plot_grid(p1_comp, p2_comp, ncol = 2,
rel_widths = c(1,1), labels = c("C", "D"),
scale = 0.9,
label_size = 18)
}
# p_all <- plot_grid(p3_ped, p3_comp, ncol = 1)
p_row <- plot_grid(p1_comp, p1_ped,
ncol = 2,
rel_widths = c(1, 1),
labels = c("A", "B"),
label_size = 16,
label_y = 1.00,
label_x = -0.01,
scale = 0.95)
p_all <- plot_grid(p_row, legend,
ncol = 1,
rel_heights = c(1, 0.1))
ggsave(file.path("output", "figures", "Figure_5.png"),
plot = p_all,
dpi = 200,
units = "in",
height = 4,
width = 8,
bg = "white")
# ------------------------------------------------------------------------------
#
# R^2: Lifespan
#
# ------------------------------------------------------------------------------
females_only <- FALSE
# Load lifespan data
lifespan <- read.delim(file.path("input", "lifespanForKim_4Dec2021.csv"), sep = ",")
if(females_only) {
lifespan <- lifespan %>%
filter(sex == "F")
}
host_list <- lifespan %>% pull(sname)
host_include <- sort(hosts) %in% host_list
# use_Sigmas <- Sigmas[,,host_include]
use_Sigmas <- filtered_obj$rug[host_include,]
# Map the included hosts to rows of the lifespan table
mapping <- data.frame(sname = sort(hosts)[host_include]) %>%
left_join(data.frame(sname = host_list, order = 1:length(host_list)), by = "sname")
lifespan <- lifespan$age_at_death_censor[mapping$order]
lifespan_dist <- as.matrix(dist(lifespan))
lifespan_vec <- lifespan_dist[lower.tri(lifespan_dist)]
# Subset dynamics distances to these hosts
use_dd <- dynamics_distances[host_include,host_include]
use_dd_vec <- use_dd[lower.tri(use_dd)]
obs_Rsq_lifespan <- c(cor(scale(lifespan_vec), scale(use_dd_vec))**2)
# R^2 plot
# p1_ped <- ggplot(data.frame(x = scale(ped_vec), y = scale(dynamics_dist_vec)),
p1_life <- ggplot(data.frame(x = lifespan_vec, y = use_dd_vec),
aes(x = x, y = y)) +
# geom_smooth(color = "gray", alpha = 0.5) +
geom_point(size = 3, shape = 21, fill = "#999999") +
theme_bw() +
labs(x = paste0("lifespan distance", ifelse(females_only, " (females only)", "")),
y = "host-host dynamics distance")
# ------------------------------------------------------------------------------
# Mantel test
#
# This answers the question: how much variation in dynamics distances is
# explained by X, *in a population this related*?
# ------------------------------------------------------------------------------
res1_life <- mantel(as.dist(lifespan_dist), as.dist(use_dd))
cat(paste0("P-value from Mantel test (lifespan): ", round(res1_life$signif, 3), "\n"))
cat(paste0("Observed R^2: ", round(obs_Rsq_lifespan, 3), "\n"))
# ------------------------------------------------------------------------------
# Null distribution - lifespan
#
# Note: I think permutations of observed lifespan will have the same problem
# as the Mantel test!
# Essentially we would need an independent model of lifespan here.
# But luckily, this isn't remotely significant by Mantel.
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
#
# Pseudo-ANOVA
#
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# Load dynamics estimates and Frechet mean
# ------------------------------------------------------------------------------
# Re-pull metadata and the host list
md <- data$metadata
hosts <- sort(unique(md$sname))
# Previously: pull the parsed MAP dynamics estimates and Frechet mean computed
# by `analysis_compute_Frechets.R` (now replaced by the filtered correlation
# vectors loaded above)
# F1 <- readRDS(file.path("output", "Frechet_1_corr.rds"))
# Calculate the mean using the `frechet` package
# F1$mean <- CovFMean(F1$Sigmas)$Mout[[1]]
# Compute frechet mean with `shapes` package
# cov_mean <- estcov(Sigmas, method = "Euclidean")
cov_mean <- colMeans(filtered_obj$rug)
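# Note (added): under the vectorized Euclidean distance used throughout this
# script, the Frechet mean reduces to the element-wise arithmetic mean, which
# is why colMeans() stands in for CovFMean()/estcov() above.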
N <- dim(Sigmas)[3]
# ------------------------------------------------------------------------------
# Pull host social group / sex metadata
# ------------------------------------------------------------------------------
host_groups <- get_host_social_groups(hosts)
groups <- unique(host_groups$grp)
metadata <- load_data()$metadata
host_sex <- metadata %>%
dplyr::select(sname, sex) %>%
distinct() %>%
filter(sname %in% hosts) %>%
arrange(sname)
sexes <- unique(host_sex$sex)
# Compute group means
group_means <- list()
for(g in 1:length(groups)) {
# group_means[[g]] <- CovFMean(F1$Sigmas[,,host_groups$grp == groups[g]])$Mout[[1]]
# group_means[[g]] <- estcov(Sigmas[,,host_groups$grp == groups[g]], method = "Euclidean")$mean
group_means[[g]] <- colMeans(filtered_obj$rug[host_groups$grp == groups[g],])
}
# Compute sex means
sex_means <- list()
for(s in 1:length(sexes)) {
# sex_means[[s]] <- CovFMean(F1$Sigmas[,,host_sex$sex == sexes[s]])$Mout[[1]]
# sex_means[[s]] <- estcov(Sigmas[,,host_sex$sex == sexes[s]], method = "Euclidean")$mean
sex_means[[s]] <- colMeans(filtered_obj$rug[host_sex$sex == sexes[s],])
}
# ------------------------------------------------------------------------------
# ANOVA on social group
# ------------------------------------------------------------------------------
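# The pseudo-F statistic below mirrors the classical one-way ANOVA F, but uses
# squared Euclidean distances between vectorized (filtered) correlation matrices:
#   F = [ sum_k n_k * d(mean_k, grand mean)^2 / (K - 1) ] /
#       [ sum_i d(x_i, mean of host i's group)^2 / (N - K) ]
# Comparing this ratio to an F(K-1, N-K) reference distribution is an
# approximation, since these are not sums of squares of independent Gaussian
# errors.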
K <- length(groups)
N <- nrow(host_groups)
between_sum <- 0
for(i in 1:K) {
grp <- groups[i]
n_i <- sum(host_groups$grp == grp)
# d <- dist4cov(group_means[[i]], cov_mean$mean)$dist**2
d <- as.numeric(dist(matrix(c(group_means[[i]],
cov_mean),
byrow = T, 2, length(cov_mean))))**2
between_sum <- between_sum + (n_i * d)
}
numerator <- between_sum / (K-1)
within_sum <- 0
for(i in 1:K) {
grp <- groups[i]
idx_i <- which(host_groups$grp == grp)
for(j in idx_i) {
# d <- dist4cov(Sigmas[,,j], group_means[[i]])$dist**2
d <- as.numeric(dist(matrix(c(filtered_obj$rug[j,],
group_means[[i]]),
byrow = T, 2, length(group_means[[i]]))))**2
within_sum <- within_sum + d
}
}
denominator <- within_sum / (N-K)
ratio <- numerator / denominator
cat(paste0("P-value for pseudo-ANOVA on GROUP (F=",
round(ratio, 3),
"): ",
round(1 - pf(ratio, K-1, N-K), 3),
"\n"))
# Visualize as boxplots
# Get all distances
# coords <- cmdscale(dist(filtered_obj$rug))
# ggplot(data.frame(x = coords[,1], y = coords[,2], label = host_groups$grp),
# aes(x = x, y = y, fill = factor(label))) +
# geom_point(size = 2, shape = 21)
# ------------------------------------------------------------------------------
# ANOVA on sex
# ------------------------------------------------------------------------------
K <- length(sexes)
N <- nrow(host_sex)
between_sum <- 0
for(i in 1:K) {
sex <- sexes[i]
n_i <- sum(host_sex$sex == sex)
# d <- dist4cov(sex_means[[i]], cov_mean$mean)$dist**2
d <- as.numeric(dist(matrix(c(sex_means[[i]],
cov_mean),
byrow = T, 2, length(cov_mean))))**2
between_sum <- between_sum + (n_i * d)
}
numerator <- between_sum / (K-1)
within_sum <- 0
for(i in 1:K) {
sex <- sexes[i]
idx_i <- which(host_sex$sex == sex)
for(j in idx_i) {
# d <- dist4cov(Sigmas[,,j], sex_means[[i]])$dist**2
d <- as.numeric(dist(matrix(c(filtered_obj$rug[j,],
sex_means[[i]]),
byrow = T, 2, length(sex_means[[i]]))))**2
within_sum <- within_sum + d
}
}
denominator <- within_sum / (N-K)
ratio <- numerator / denominator
cat(paste0("P-value for pseudo-ANOVA on SEX (F=",
round(ratio, 3),
"): ",
round(1 - pf(ratio, K-1, N-K), 3),
"\n"))
# Visualize as boxplots
# Get all distances
# ggplot(data.frame(x = coords[,1], y = coords[,2], label = host_sex$sex),
# aes(x = x, y = y, fill = factor(label))) +
# geom_point(size = 2, shape = 21)