In abichat/zazou: Z-Scores As Ornstein-Uhlenbeck

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>", 
  fig.align = "center",
  eval = TRUE
)

library(ggplot2)
library(ggrepel)
library(tibble)
library(evabic)
library(tictoc)
library(dplyr)
library(tidyr)
library(zazou)
library(purrr)
library(ape)
theme_set(theme_minimal())

Deterministic situation

tree <- read.tree(text = "(((A:1,B:1):1,C:2):1,(D:1,E:1):2);")
tree$tip.label

incidence_mat <- incidence_matrix(tree)
incidence_mat + 0

true_shifts <- c(0, -3, 0, 0, 0, 0, -2, 0)
true_zscores <- incidence_mat %*% true_shifts
obs_zscores <- true_zscores <- true_zscores[, 1]
covar_mat <- diag(nrow = 5, ncol = 5)

covarianceOU_matrix(tree, alphaOU = 10^3)

plot_shifts(tree, true_shifts, true_scores = true_zscores)

estimation <- estimate_shifts(zscores = obs_zscores, 
                              tree = tree, alphaOU = 10^3, lambda = 0)
estimation
plot(estimation)

Solution without penalty is not sparse due to ill-conditioning of the incidence matrix.

estimation <- estimate_shifts(zscores = obs_zscores, 
                              tree = tree, alphaOU = 10^3, lambda = 2)
estimation
plot(estimation)

With a low penalty, the solution is much sparser and closer to the true shifts on branches.

Simulations on a toy example

tree <- read.tree(text = "(((A:1,B:1):1,C:2):1,(D:1,E:1):2);")
set.seed(2019)
covar_mat <- covarianceOU_matrix(tree, alphaOU = 1)
corrplot::corrplot(covar_mat, type = "upper")
true_shifts <- c(-3, -3, 0, -2, 0)
sqrtcovar <- t(chol(covar_mat))
zscores <- true_shifts + sqrtcovar %*% rnorm(5) # simulate 
zscores <- zscores[, 1]
zscores

Estimate shifts without penalty

The fit is perfect since all observed $z$-scores are negative.

estimation <- estimate_shifts(zscores = zscores, 
                              tree = tree, alphaOU = 1, lambda = 0)
estimation
plot(estimation, true_scores = true_shifts)

round(data.frame(true = true_shifts, observed  = zscores,
                 estimated = estimation$zscores_est), 
      digits = 4)

The fit degrades as we increase sparsity. [Well, not so much...]

estimation <- estimate_shifts(zscores = zscores, 
                              tree = tree, alphaOU = 1, lambda = 0.1)
estimation
plot(estimation, true_scores = true_shifts)

round(data.frame(true = true_shifts, observed  = zscores,
                 estimated = estimation$zscores_est), 
      digits = 4)

The fit does not depend a lot (on this toy example) on the initial solution.

estimation <- estimate_shifts(zscores = zscores, tree = tree, 
                              beta0 = c(0, -3, 0, 0, 0, 0, -2, 0), 
                              alphaOU = 1, lambda = 0.1)

estimation
plot(estimation, true_scores = true_shifts)

round(data.frame(true = true_shifts, observed  = zscores,
                 estimated = estimation$zscores_est), 
      digits = 4)

Simulations from real data

set.seed(42)
data(alcohol)
abund <- alcohol$X[, alcohol$Y == "Low"]
groups <- sample(c("A", "B"), size = ncol(abund), replace = TRUE)
tree <- force_ultrametric(alcohol$tree)
otu_to_keep <- names(which(rowSums(abund > 0) > 20))
abund <- abund[otu_to_keep, ]
tree <- drop.tip(tree, setdiff(tree$tip.label, otu_to_keep))
N_branch <- length(tree$edge.length)

pvalues_original <- test_wilcoxon(abund, groups)$p.value
zscores_original <- p2z(pvalues_original)
plot_shifts(tree, shifts = NA, obs_scores = zscores_original)

clustering <- create_clusters(tree, N_clusters = 20, method = "paraphyletic")

clusters <- sample(10, 4)
table(clustering[which(clustering %in% clusters)])
otus_da <- names(clustering[which(clustering %in% clusters)])
pi0 <- 1 - length(otus_da) / length(otu_to_keep)
abund[otus_da, groups == "B"] <- 4 * abund[otus_da,  groups == "B"]

pvalues <- test_wilcoxon(abund, groups)$p.value
zscores <- p2z(pvalues)
plot_shifts(tree, NA, obs_scores = zscores, 
            sup_scores = list(list(scores = clustering,
                                   title = "Clusters",
                                   color = as.character(clustering)),
                              list(scores = zscores - zscores_original,
                                   title = "Difference in z-scores after fold-change",
                                   color = as.character(sign(zscores - zscores_original)))))

tic()
estimation_shooting <- estimate_shifts(zscores = zscores, 
                                       tree = tree, alphaOU = c(0.2, 0.5, 1, 2, 5), 
                                       method = "lasso", constraint_type = "beta")
estimation_scla <- estimate_shifts(zscores = zscores, 
                                   tree = tree, alphaOU = c(0.2, 0.5, 1, 2, 5), 
                                   method = "scaled lasso", constraint_type = "beta")

confint_scoresystem <- estimate_confint(estimation_scla, method = "score system")
toc()
estimation_shooting
estimation_scla
confint_scoresystem
plot(estimation_shooting, 
     sup_scores = list(list(scores = zscores - zscores_original,
                            title = "Difference in z-scores after fold-change",
                            color = as.character(sign(zscores - zscores_original)))))
plot(estimation_scla, 
     sup_scores = list(list(scores = zscores - zscores_original,
                            title = "Difference in z-scores after fold-change",
                            color = as.character(sign(zscores - zscores_original)))))
plot(confint_scoresystem)
estimation_shooting$optim_info$bic_selection %>%
  mutate(n_shifts = map_dbl(shifts_est, ~ sum(. != 0))) %>% 
  select(-shifts_est)

data.frame(punct_scla = estimation_scla$shifts_est, 
           punct_scoresystem = confint_scoresystem$shifts_est$estimate) %>% 
  ggplot() + 
  aes(punct_scla, punct_scoresystem) +
  geom_point()
data.frame(punct_scla = estimation_scla$zscores_est, 
           punct_scoresystem = confint_scoresystem$zscores_est$estimate) %>% 
  ggplot() + 
  aes(punct_scla, punct_scoresystem) +
  geom_point()

detected_sh <- names(which(estimation_shooting$zscores_est != 0))
pvalues_bh <- p.adjust(pvalues, method = "BH")
detected_bh <- names(which(pvalues_bh < 0.05))
pvalues_bonf <- p.adjust(pvalues, method = "bonferroni")
detected_bonf <- names(which(pvalues_bonf < 0.05))
mm <- c("TPR", "FPR", "FDR", "ACC", "F1", "BACC")
ebc_tidy(detected_sh, otus_da, m = length(otu_to_keep), measures = mm)
ebc_tidy(detected_bh, otus_da, m = length(otu_to_keep), measures = mm)
ebc_tidy(detected_bonf, otus_da, m = length(otu_to_keep), measures = mm)
ebc_tidy(c(), otus_da, m = length(otu_to_keep), measures = mm)
ebc_tidy(otu_to_keep, otus_da, m = length(otu_to_keep), measures = mm)

map_dfr(list(detected_sh, detected_bh, detected_bonf, c(), otu_to_keep), ebc_tidy, 
        detected = otus_da, m = length(otu_to_keep), measures = mm) %>% 
  mutate(method = c("zazou", "bh", "bonf", "nothing", "everything")) %>% 
  select(method, everything())

plot(estimation_shooting, 
     sup_scores = list(list(scores = zscores - zscores_original,
                            title = "Difference in z-scores after fold-change",
                            color = as.character(sign(zscores - zscores_original))),
                       list(scores = pvalues_bh, title = "Detected by BH",
                            color = pvalues_bh < 0.05),
                       list(scores = pvalues_bonf, title = "Detected by Bonferroni",
                            color = pvalues_bonf < 0.05)))

df_measures_zazou <-
  estimation_shooting$zscores_est %>% 
  ebc_tidy_by_threshold(true = otus_da, 
                        m  = length(otu_to_keep), measures = mm) %>% 
  mutate(method = "zazou")
df_measures_bh <-
  ebc_tidy_by_threshold(pvalues_bh, true = otus_da, 
                        m  = length(otu_to_keep), measures = mm) %>% 
  mutate(method = "bh")
df_measures_bonf <-
  ebc_tidy_by_threshold(pvalues_bonf, true = otus_da, 
                        m  = length(otu_to_keep), measures = mm) %>% 
  mutate(method = "bonferonni")

df_measures <- rbind(df_measures_zazou, df_measures_bh, df_measures_bonf)

df_measures %>%
  arrange(FPR, TPR) %>%
  ggplot() +
  aes(x = FPR, y = TPR, color = method) +
  geom_point() +
  geom_line()

estimation_shooting$zscores_est %>% 
  ebc_AUC(true = otus_da, m  = length(otu_to_keep))

ebc_AUC(pvalues_bh, true = otus_da, m  = length(otu_to_keep))
ebc_AUC(pvalues_bonf, true = otus_da, m  = length(otu_to_keep))

several_confint <-
  seq(from = 0, to = 1, by = 0.01) %>% 
  enframe(value = "level", name = NULL) %>% 
  mutate(model = map(level, update_confint, x = confint_scoresystem),
         da_species = map(model, extract_significant_leaves, side = "left"),
         N = map_dbl(.data$da_species, length)) %>%
  group_split(N) %>% 
  map(head, 1) %>% 
  reduce(bind_rows) %>% 
  select(-N) %>% 
  add_row(level = Inf, model = list(NULL), da_species = list(otu_to_keep)) %>% 
  mutate(nested_measures = map(da_species, ebc_tidy, 
                               true = .env$otus_da, m = length(otu_to_keep))) %>% 
  unnest(nested_measures) %>% 
  mutate(method = "despars")

several_confint

evabic::ebc_AUC_from_measures(several_confint)

all_measures <-
  df_measures %>% 
  select(level = threshold, TPR:method) %>% 
  bind_rows(select(several_confint, -model, -da_species))

all_measures %>% 
  group_by(method) %>% 
  summarise(AUC = evabic:::area_rect(FPR, TPR))

ggplot(all_measures) +
  aes(x = FPR, y = TPR, color = method) +
  geom_point() +
  geom_line() +
  geom_text_repel(aes(label = round(FDR, 2)), show.legend = FALSE) 

ggplot(all_measures) +
  aes(x = level, y = FDR, color = method) +
  geom_point() +
  geom_line() +
  geom_abline(slope = pi0, intercept = 0) +
  facet_grid(cols = vars(method), scales = "free_x")