In syma-research/microTensor: Tensor Decomposition for Longitudinal Microbiomes

rm(list = ls())
library(magrittr)
library(ggplot2)

dir_output <- "results/DIABIMMUNE"
dir.create(dir_output, showWarnings = TRUE, recursive = TRUE)

load("data/DIABIMMUNE/X_array.RData")

l_fit_microTensor <- list()
for(i in seq(1, 5)) {
  set.seed(i)
  l_fit_microTensor[[i]] <- microTensor::microTensor(
    X = X_array, R = 3, 
    nn_t = TRUE, ortho_m = TRUE,
    weighted = TRUE,
    control = list(L_init = 2,
                   gamma = 2,
                   maxit = 1000,
                   verbose = TRUE,
                   debug_dir = paste0(dir_output, "/fit/", i)))
}
fit_microTensor <- l_fit_microTensor %>% 
  purrr::map_dbl(~ min(.x$fitting$obj)) %>% 
  {order(.)[1]} %>% 
  {l_fit_microTensor[[.]]}
save(fit_microTensor, file = paste0(dir_output, "/fit_microTensor.RData"))

l_fit_ctf <- list()
for(i in seq(1, 5)) {
  l_fit_ctf[[i]] <- 
    microTensor::ctf(
    X = X_array, R = 3)
}
fit_ctf <- l_fit_ctf %>% 
  purrr::map_dbl(~ min(.x$fitting$obj)) %>% 
  {order(.)[1]} %>% 
  {l_fit_ctf[[.]]}
save(fit_ctf, file = paste0(dir_output, "/fit_ctf.RData"))

fit_pca <- microTensor::pca(X_array, R = 3)
save(fit_pca, file = paste0(dir_output, "/fit_pca.RData"))

load(paste0(dir_output, "/fit_microTensor.RData"))
load(paste0(dir_output, "/fit_ctf.RData"))
load(paste0(dir_output, "/fit_pca.RData"))
load("data/DIABIMMUNE/X_array.RData")

Yhat_microTensor <- microTensor::get_Yhat(fit_microTensor)
expY <- exp(Yhat_microTensor)
expYsum_m <- apply(expY, c(2, 3), sum)
phat_microTensor <- 
  vapply(seq_len(dim(X_array)[1]),
         function(i) expY[i, , ] / expYsum_m,
         matrix(0.0, nrow = dim(X_array)[2], ncol = dim(X_array)[3]))
phat_microTensor <- 
  aperm(phat_microTensor, perm = c(3, 1, 2))

Yhat_ctf <- microTensor::get_Yhat(fit_ctf)
expY <- exp(Yhat_ctf)
expYsum_m <- apply(expY, c(2, 3), sum)
phat_ctf <- 
  vapply(seq_len(dim(X_array)[1]),
         function(i) expY[i, , ] / expYsum_m,
         matrix(0.0, nrow = dim(X_array)[2], ncol = dim(X_array)[3]))
phat_ctf <- 
  aperm(phat_ctf, perm = c(3, 1, 2))

Yhat_pca_mat <-  fit_pca$m %*% (t(fit_pca$s_full) * fit_pca$lambda)
Yhat_pca <- array(NA, dim(X_array))
for(i in seq_len(nrow(Yhat_pca_mat))) {
  Yhat_pca[i, , ] <-  Yhat_pca_mat[i, ]
}
expY <- exp(Yhat_pca)
expYsum_m <- apply(expY, c(2, 3), sum)
phat_pca <- 
  vapply(seq_len(dim(X_array)[1]),
         function(i) expY[i, , ] / expYsum_m,
         matrix(0.0, nrow = dim(X_array)[2], ncol = dim(X_array)[3]))
phat_pca <- 
  aperm(phat_pca, perm = c(3, 1, 2))


phat_data <- array(NA, dim(X_array))
Xsum_m <- apply(X_array, c(2, 3), sum)
phat_data <- 
  vapply(seq_len(dim(X_array)[1]),
         function(i) X_array[i, , ] / Xsum_m,
         matrix(0.0, nrow = dim(X_array)[2], ncol = dim(X_array)[3]))
phat_data <- 
  aperm(phat_data, perm = c(3, 1, 2))

tb_diff <- 
  list(phat_microTensor,
       phat_ctf,
       phat_pca) %>% 
  purrr::map2_dfr(
    c("microTensor", "CTF", "PCA"),
    function(i_phat, i_method) {
      tibble::tibble(
        diff = abs(i_phat - phat_data) %>% 
          apply(c(2, 3), mean) %>% 
          as.vector(),
        method = i_method
      )
    }
  ) %>% 
  dplyr::filter(!is.na(diff)) %>% 
  dplyr::mutate(method = factor(method, levels = c("PCA", "CTF", "microTensor")))
p_diff <- tb_diff %>% 
  ggplot(aes(x = method, y = diff)) +
  geom_boxplot() +
  theme_bw() +
  theme(axis.title.x = element_blank()) +
  ylab("Absolute difference in observed and \n method-based microbial profiles")

load(paste0(dir_output, "/fit_microTensor.RData"))
load(paste0(dir_output, "/fit_ctf.RData"))
load(paste0(dir_output, "/fit_pca.RData"))
load("data/DIABIMMUNE/df_samples.RData")
df_subjects <- df_samples %>% 
  dplyr::group_by(subjectid) %>% 
  dplyr::slice(1) %>% 
  dplyr::ungroup()
load("data/DIABIMMUNE/X_array.RData")
feature_names <- dimnames(X_array)[[1]]
subject_names <- dimnames(X_array)[[2]]
time_names <- dimnames(X_array)[[3]]


mat_tax <- feature_names %>% 
  stringr::str_split_fixed(stringr::fixed("|"), n = 6)
percs <- fit_microTensor$lambda^2 / sum(fit_microTensor$lambda^2) * 100
ps <- c(1, 2) %>% 
  purrr::map(function(r) {
    # feature
    df_m <- microTensor::create_loading(fit_microTensor,
                           feature_names = feature_names,
                           subject_names = subject_names,
                           time_names = time_names,
                           class = "feature") %>% 
      dplyr::mutate(
        feature_plot = 
          microTensor::betterGeneraNames(mat_tax[, 3],
                            mat_tax[, 4],
                            mat_tax[, 5],
                            mat_tax[, 6])) %>% 
      dplyr::mutate(m = !!sym(paste0("Axis ", r)))

    df_plot <- 
      rbind(
      df_m %>%
        dplyr::arrange(-df_m[["m"]]) %>%
        dplyr::slice(seq_len(5)),
      df_m %>%
        dplyr::arrange(df_m[["m"]]) %>%
        dplyr::slice(seq_len(5)) %>%
        {.[seq(5, 1), ]}
    ) %>%
      dplyr::mutate(feature_plot = factor(feature_plot, 
                                          levels = rev(feature_plot)))

    p_feature <- df_plot %>% 
      ggplot(aes(y = m, x = feature_plot)) +
      geom_bar(stat = "identity") +
      coord_flip() +
      theme_bw() +
      theme(axis.title.y = element_blank()) +
      ylab("Feature loading") +
      ggtitle(" ")

    # subject
    tb_loading <- microTensor::create_loading(
      fit_decomp = fit_microTensor,
                                 feature_names = feature_names,
                                 subject_names = subject_names,
                                 time_names = time_names,
                                 class = "subject")
    tb_loading <- tb_loading %>% 
      dplyr::left_join(df_samples %>% 
                         dplyr::group_by(subjectid) %>% 
                         dplyr::slice(1) %>% 
                         dplyr::ungroup(), 
                       by = c("Subject" = "subjectid")) %>% 
      dplyr::mutate(Csection = ifelse(csection,
                                      "C-section",
                                      "Vaginal"),
                    Loading = !!sym(paste0("Axis ", r)))
    p_subject <- tb_loading %>% 
      ggplot(aes(x = Csection,
                 y = Loading)) +
      geom_boxplot(outlier.shape = NA) +
      geom_point(position = position_jitter(width = 0.25)) +
      theme_bw() +
      xlab("") +
      ylab("Subject loading") + 
      ggtitle("")

     return(list(p_feature,
                p_subject))
  })


p_main <- 
  cowplot::plot_grid(ps[[1]][[1]], ps[[1]][[2]],
                     nrow = 1,
                     rel_widths = c(1, 0.6),
                     labels = c(paste0("B) Axis ", 1,
                                       " (", round(percs[1], 2),
                                       "%)"),
                                "")) %>% 
  cowplot::plot_grid(p_diff, ., labels = c("A)", ""),
                     nrow = 1, rel_widths = c(0.5, 1))
ggsave(p_main, file = "figures/figure7/figure_DIABIMMUNE.pdf",
       width = 12, height = 4.5)

p_supps <- list(fit_pca,
                fit_ctf) %>% 
  purrr::map2(
    c("PCA", "CTF"),
    function(fit_decomp,
             method) {
      percs <- fit_decomp$lambda^2 / sum(fit_decomp$lambda^2) * 100
      ps <- c(1, 2) %>% 
        purrr::map(function(r) {
          # feature
          df_m <- microTensor::create_loading(
            fit_decomp,
                                 feature_names = feature_names,
                                 subject_names = subject_names,
                                 time_names = time_names,
                                 class = "feature") %>% 
            dplyr::mutate(
              feature_plot = 
                microTensor::betterGeneraNames(mat_tax[, 3],
                                  mat_tax[, 4],
                                  mat_tax[, 5],
                                  mat_tax[, 6])) %>% 
            dplyr::mutate(m = !!sym(paste0("Axis ", r)))

          df_plot <- rbind(
            df_m %>% 
              dplyr::arrange(-df_m[["m"]]) %>% 
              dplyr::slice(seq_len(5)),
            df_m %>% 
              dplyr::arrange(df_m[["m"]]) %>% 
              dplyr::slice(seq_len(5)) %>% 
              {.[seq(5, 1), ]}
          ) %>% 
            dplyr::mutate(feature_plot = factor(feature_plot, 
                                                levels = rev(feature_plot)))

          p_feature <- df_plot %>% 
            ggplot(aes(y = m, x = feature_plot)) +
            geom_bar(stat = "identity") +
            coord_flip() +
            theme_bw() +
            theme(axis.title.y = element_blank()) +
            ylab("Feature loading") +
            ggtitle(" ")

          # subject
          tb_loading <- microTensor::create_loading(
            fit_decomp = fit_decomp,
                                       feature_names = feature_names,
                                       subject_names = subject_names,
                                       time_names = time_names,
                                       class = "subject")
          tb_loading <- tb_loading %>% 
            dplyr::left_join(df_samples %>% 
                               dplyr::group_by(subjectid) %>% 
                               dplyr::slice(1) %>% 
                               dplyr::ungroup(), 
                             by = c("Subject" = "subjectid")) %>% 
            dplyr::mutate(Csection = ifelse(csection,
                                            "C-section",
                                            "Vaginal"),
                          Loading = !!sym(paste0("Axis ", r)))
          p_subject <- tb_loading %>% 
            ggplot(aes(x = Csection,
                       y = Loading)) +
            geom_boxplot(outlier.shape = NA) +
            geom_point(position = position_jitter(width = 0.25)) +
            theme_bw() +
            xlab("") +
            ylab("Subject loading") + 
            ggtitle("")

          return(list(p_feature,
                      p_subject))
        })

      p <- 
        cowplot::plot_grid(ps[[1]][[1]], ps[[1]][[2]],
                           nrow = 1,
                           rel_widths = c(1, 0.6),
                           labels = c(paste0("    Axis ", 1,
                                             " (", round(percs[1], 2),
                                             "%)"),
                                      ""))

      return(p)
    })

p_supp <- cowplot::plot_grid(plotlist = p_supps,
                             labels = c("A) PCA", "B) CTF"),
                             ncol = 1)
ggsave(p_supp, file = "figures/suppFigure/figure_DIABIMMUNE_supp.pdf",
       width = 7, height = 5)

list(fit_microTensor, fit_pca, fit_ctf) %>% 
  purrr::map(function(fit_decomp) {
    tb_loading <- microTensor::create_loading(fit_decomp = fit_decomp,
                                 feature_names = feature_names,
                                 subject_names = subject_names,
                                 time_names = time_names,
                                 class = "subject")
    tb_loading <- tb_loading %>% 
      dplyr::left_join(df_samples %>% 
                         dplyr::group_by(subjectid) %>% 
                         dplyr::slice(1) %>% 
                         dplyr::ungroup(), 
                       by = c("Subject" = "subjectid")) %>% 
      dplyr::mutate(Csection = ifelse(csection,
                                      "C-section",
                                      "Vaginal"),
                    Loading = `Axis 1`)
    t.test(tb_loading$Loading[tb_loading$csection], tb_loading$Loading[!tb_loading$csection])
  })

load("data/DIABIMMUNE/df_samples.RData")
df_subjects <- df_samples %>% 
  dplyr::group_by(subjectid) %>% 
  dplyr::slice(1) %>% 
  dplyr::ungroup()
load("data/DIABIMMUNE/X_array.RData")
feature_names <- dimnames(X_array)[[1]]
subject_names <- dimnames(X_array)[[2]]
time_names <- dimnames(X_array)[[3]]


tb_loading <- microTensor::create_loading(fit_decomp = fit_microTensor,
                                          feature_names = feature_names,
                                          subject_names = subject_names,
                                          time_names = time_names,
                                          class = "sample")
tb_loading <- tb_loading %>% 
  dplyr::mutate(Time = as.integer(as.character(Time))) %>% 
  dplyr::left_join(df_subjects, 
                   by = c("Subject" = "subjectid")) 

tb_loading_avg <- tb_loading %>% 
  dplyr::mutate(`Birth mode` = ifelse(csection, "C-section", "Vaginal")) %>% 
  dplyr::group_by(Time, `Birth mode`) %>% 
  dplyr::summarise(mean_loading = mean(`Axis 1`, na.rm = TRUE),
                   sd_loading = sd(`Axis 1`, na.rm = TRUE) /
                     sqrt(dplyr::n())) %>% 
  dplyr::ungroup() %>% 
  dplyr::mutate(Month = Time - 1)

p_time <- tb_loading_avg %>% 
  dplyr::filter(!is.na(`Birth mode`)) %>% 
  ggplot(aes(x = Month, y = mean_loading,
             color = `Birth mode`,
             shape = `Birth mode`)) +
  geom_line(position = position_dodge(width = 0.5)) +
  geom_point(position = position_dodge(width = 0.5), size = 2) +
  geom_errorbar(aes(ymax = mean_loading + sd_loading,
                    ymin = mean_loading - sd_loading),
                position = position_dodge(width = 0.5),
                width = 0.5) +
  theme_bw() +
  ylab("Average sample loading") +
  ggtitle("Time trajectory") +
  theme(legend.position = c(0, 1),
        legend.justification = c(0, 1),
        legend.background = element_blank())

ggsave(p_time, filename = "figures/suppFigure/time_DIABIMMUNE.pdf",
       width = 5, height = 4)