visual16S: Visualization functions for 16S and Metagenomics analysis

Documented in plot_beta_diversity

#' plot_beta_diversity
#'
#' This is a function for plotting beta diversity.
#'
#' @param phyloseq A phyloseq object contain OTU table, taxonomy table, sample metadata and
#' phylogenetic tree, or a phyloseq object only contain constructed OTU table and metadata.
#'
#' @param feature The column name of the feature you want to select from metadata, e.g. "Phenotype".
#'
#' @param feature2 The column name of another feature you want to select from metadata, which will
#' show in different shape, e.g. "Gender". Default is NA.
#'
#' @param feature2_shape The shape values for feature2. See ggplot2::scale_shape_manual.
#'
#' @param level Which taxonomy level will be used to calculate beta diversity. Default is NA. If NA,
#' the original OTU table in phyloseq will be used. If level is given, first an OTU table with
#' taxonomy will be constructed using construct_otu_table, then the OTU table will be used to
#' calculate beta diversity through phyloseq::distance. The level name should be one of "Kingdom",
#' "Phylum", "Class", "Order", "Family", "Genus", "Species".
#'
#' @param method The method to calculate beta diversity. Method should be one of c("bray", "jaccard",
#' "unifrac", "wunifrac"). Default is "bray". PS: "unifrac" and "wunifrac" require a phylogenetic
#' tree.
#'
#' @param colors A vector of colors. The number of colors should be larger than the number of
#' feature. Default is NULL, if NULL, plot_beta_diversity will use ggsci::jco theme for the plot.
#'
#' @param size Value for geom_point(size). Default is 3.
#'
#' @param print_to_screen Default is FALSE. If TRUE, print beta-diversity table to the screen.
#'
#' @param label Default is FALSE. If TRUE, add sample name labels to plot.
#'
#' @param group_column The first parameter for adding arrows to the plot. Choose a column to group.
#' The arrows will be added to each group.
#'
#' @param arrange_column The second parameter for adding arrows to the plot. Choose a column to
#' arrange. This column should be continuous variables. Each group will be arrange by variables in
#' this column, the first variable is where the arrow starts, pointing to the second and to the last
#' variable.
#'
#' @param add_pc1 Default is FALSE. If TRUE, plot out top 3 most correlated OTUs to PC1.
#'
#' @param add_pc1_otu_level Default is FALSE. If FALSE, add_pc1 will extract otu_table from phyloseq
#' directly. If phyloseq object has both otu_table and tax_table, add_pc1_otu_level can get
#' different taxonomy for top 3 most correlated OTUs in add_pc1. If "all", will plot all taxonomy
#' level. Note: If `level` argument is provided, the corresponding OTU with taxonomy will be used to
#' compute correlation with PC1 and PC2 values, `add_pc1_otu_level` and `add_pc2_otu_level` need to
#' be set to FALSE.
#'
#' @param add_pc1_cor_method The method for computing correlation between PC1 and OTUs. One of
#' "spearman" (default), "kendall", or "pearson".
#'
#' @param add_pc2 Same as add_pc1.
#'
#' @param add_pc2_otu_level Same as add_pc1_otu_level.
#'
#' @param add_pc2_cor_method Same as add_pc1_cor_method.
#'
#' @export
#'
#' @examples
#' plot_beta_diversity(demo_phyloseq_object, feature = "diagnosis")

plot_beta_diversity <- function(
  phyloseq,
  feature,
  feature2 = NA,
  feature2_shape = c(0:21),
  level = NA,
  method = "bray",
  colors = NULL,
  size = 3,
  label = FALSE,
  group_column = NA,
  arrange_column = NA,
  add_pc1 = FALSE,
  add_pc1_otu_level = FALSE,
  add_pc1_cor_method = "spearman",
  add_pc2 = FALSE,
  add_pc2_otu_level = FALSE,
  add_pc2_cor_method = "spearman",
  print_to_screen = FALSE
  ) {
  set.seed(99)
  ## Step 1: Calculate beta diversity
  if(!method %in% c("bray", "jaccard", "unifrac", "wunifrac")) {
    stop(
      paste0('Beta diversity method should be one of c("bray", "jaccard", "unifrac", "wunifrac").')
    )
  } else {
    if(is.na(level)) {
      # 如果没有给level，就直接用phyloseq的otu计算beta diversity
      beta <- cmdscale(phyloseq::distance(physeq = phyloseq, method = method), eig = TRUE)
    } else if(!level %in% c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")) {
      # 如果level输入错误，停止运行
      stop(paste0('Argument "level" should be one of "Kingdom", "Phylum", "Class", "Order", ',
                  '"Family", "Genus", "Species".'))
    } else {
      # 如果level输入正确：
      # 1.先判断是否是"unifrac"或"wunifrac"
      if(method %in% c("unifrac", "wunifrac")) {
        # "unifrac"和"wunifrac"需要input有phy_tree的phyloseq object才能计算
        stop(
          paste0('Calculate Unifrac and Weighted-Unifrac requires phyloseq object with phy_tree.')
        )
      }
      # 2.判断是否重复提供add_pc1_otu_level和add_pc2_otu_level
      if(!isFALSE(add_pc1_otu_level) | !isFALSE(add_pc2_otu_level)) {
        # 如果提供了`level`，就强制用`level`去画pc1，否则会变成画pcoa时用的`level`，画pc1计算
        # correlation时用otu，产生误导。
        # 如果没有提供`level`但提供了`add_pc1_otu_level`，那么就用otu计算correlation，和画pcoa时用
        # otu level保持一致，再用得到的前3个OTU去找taxonomy。
        stop(
          paste0("If `level` argument is provided, the corresponding OTU with taxonomy will be used",
                 " to compute correlation with PC1/PC2 values, `add_pc1_otu_level` and ",
                 "`add_pc2_otu_level` need to be set to FALSE.")
        )
      }
      # 生成对应level的otu再计算beta
      otu <- construct_otu_table(phyloseq, level = level)
      beta <- cmdscale(
        phyloseq::distance(physeq = otu_table(otu, taxa_are_rows = F), method = method),
        eig = TRUE
      )
    }

  }
  if(method == "bray") {
    plot_title <- "Bray–Curtis"
  } else if(method == "jaccard") {
    plot_title <- "Jaccard"
  } else if(method == "unifrac") {
    plot_title <- "UniFrac"
  } else if(method == "wunifrac") {
    plot_title <- "Weighted Unifrac"
  }
  ## Step 2: Construct plot table
  # Extract PC1 and PC2
  PC <- as.data.frame(beta$points)
  colnames(PC) <- c("PC1", "PC2")
  PC <- rownames_to_column(PC, var = "SampleID")
  # Extract metadata
  metadata <- extract_metadata_phyloseq(phyloseq)
  # Join two tables
  beta_plot <- left_join(PC, metadata)
  # Print beta-diversity table
  if(print_to_screen) {
    if(is.na(feature2)) {
      beta_plot %>%
        dplyr::select(SampleID, !!feature, PC1, PC2) %>%
        arrange_(feature) %>%
        print()
    } else {
      beta_plot %>%
        dplyr::select(SampleID, !!feature, !!feature2, PC1, PC2) %>%
        arrange_(feature) %>%
        print()
    }
  }
  ## Step 3: Plot beta diversity
  # Make x-axis and y-axis names for aes_string
  x_name <- "PC1"
  y_name <- "PC2"
  # Make factor for color, prevent "Error: Continuous value supplied to discrete scale"
  beta_plot[[feature]] <- beta_plot[[feature]] %>% as.factor()
  # 为add_pc1_info添加facet_grid(rows = vars(...))所需的column
  beta_plot$PCoA <- "PCoA"
  # Plot beta diversity
  if(is.na(feature2)) {
    p <- ggplot(data = beta_plot,
                # In the first layer (i.e. ggplot()), aes() can only contain x and y,
                # color, shape, etc. should be in upper layer (i.e. geom_point).
                aes_string(x = x_name, y = y_name)) + # Use aes_string() to pass variables to ggplot
      # Use 'color' instead of 'fill' in geom_point
      geom_point(aes_string(color = feature), size = size) +
      xlab(paste("PC1:", round(100*as.numeric(beta$eig[1]/sum(beta$eig)), 2), "%", sep = " ")) +
      ylab(paste("PC2:", round(100*as.numeric(beta$eig[2]/sum(beta$eig)), 2), "%", sep = " ")) +
      #labs(title = plot_title) +
      theme_classic() +
      theme(panel.grid = element_blank(),
            axis.text.y = element_text(size = 14),
            axis.text.x = element_text(size = 14),
            axis.title = element_text(size = 16),
            legend.text = element_text(size = 12))
  } else {
    p <- ggplot(data = beta_plot,
                # In the first layer (i.e. ggplot()), aes() can only contain x and y,
                # color, shape, etc. should be in upper layer (i.e. geom_point).
                aes_string(x = x_name, y = y_name)) + # Use aes_string() to pass variables to ggplot
      # Use 'color' instead of 'fill' in geom_point
      geom_point(aes_string(color = feature, shape = feature2), size = size) +
      xlab(paste("PC1:", round(100*as.numeric(beta$eig[1]/sum(beta$eig)), 2), "%", sep = " ")) +
      ylab(paste("PC2:", round(100*as.numeric(beta$eig[2]/sum(beta$eig)), 2), "%", sep = " ")) +
      #labs(title = plot_title) +
      scale_shape_manual(values = feature2_shape) +
      theme_classic() +
      theme(panel.grid = element_blank(),
            axis.text.y = element_text(size = 14),
            axis.text.x = element_text(size = 14),
            axis.title = element_text(size = 16),
            legend.text = element_text(size = 12))
  }
  # 添加label
  if(!is.na(level)) {
    p <- p + labs(title = paste0(plot_title, ", ", level, " Level"))
  } else {
    p <- p + labs(title = plot_title)
  }
  if(label) {
    label_name = "SampleID"
    p <- p + ggrepel::geom_label_repel(aes_string(x = x_name, y = y_name, label = label_name,
                                                  color = feature))
  }
  if(is.null(colors)) {
    p <- p + ggsci::scale_fill_jco() + ggsci::scale_color_jco()
  } else if(length(colors) < length(unique(beta_plot[[feature]]))) {
    stop(paste0("The number of colors is smaller than the number of ", feature, "."))
  } else {
    p <- p + scale_fill_manual(values = colors) + scale_color_manual(values = colors)
  }
  # Add arrows to plot
  if(!is.na(group_column) & !is.na(arrange_column)) {
    # 将string转换成name，才能通过!!符号传入dplyr
    group_column_sym <- rlang::sym(group_column)
    arrange_column_sym <- rlang::sym(arrange_column)
    p_data <- p$data %>%
      # 按照group_column进行group
      dplyr::group_by(!!group_column_sym) %>%
      # 按照arrange_column排序，添加.by_group才能按照group排序
      dplyr::arrange(!!arrange_column_sym, .by_group = TRUE)
    # 用for loop将每根线都添加到plot
    for(i in 1:nrow(p_data)) {
      # 先按照group slice，提取X，Y end的坐标
      xyend <- p_data %>%
        dplyr::slice((i + 1))
      # 如果所有样本都没有X，Y end的坐标，则停止
      if(nrow(xyend) == 0) {
        break
      }
      # 通过group_column找到X，Y end对应的X，Y坐标
      xy <- p_data %>%
        dplyr::filter(!!group_column_sym %in% as.character(xyend[[group_column]])) %>%
        dplyr::slice(i)
      # 创建geom_segment所需的dataframe，必须有 x, xend, y, yend 4列
      arrows_tab <- left_join(
        dplyr::select(xy, !!group_column_sym, x = PC1, y = PC2),
        dplyr::select(xyend, !!group_column_sym, xend = PC1, yend = PC2)
      )
      # 添加到beta diversity plot
      p <- p + geom_segment(
        data = arrows_tab, aes(x = x, xend = xend, y = y, yend = yend),
        arrow = arrow(length = unit(0.2, "cm")), color = "darkgrey"
      )
    }
  } else if(is.na(group_column) & !is.na(arrange_column)) {
    stop(paste0("group_column is missing."))
  } else if(!is.na(group_column) & is.na(arrange_column)) {
    stop(paste0("arrange_column is missing."))
  }
  if(add_pc1 | add_pc2) {
    # 为add_pc1和add_pc2准备好OTU table
    # 取出OTU（rows是SampleID，和PC一样）
    if(is.na(level)) {
      if(!taxa_are_rows(phyloseq)) {
        otu <- otu_table(phyloseq) %>% as.matrix() %>% as.data.frame()
      } else {
        otu <- otu_table(phyloseq) %>% as.matrix() %>% t() %>% as.data.frame()
      }
    } else if(!level %in% c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")) {
      # 如果level输入错误，停止运行
      stop(paste0('Argument "level" should be one of "Kingdom", "Phylum", "Class", "Order", ',
                  '"Family", "Genus", "Species".'))
    } else {
      # 如果`level`不是NA，那么一开始计算distance的时候就会生成对应level的otu
      otu <- otu
    }
  }
  if(add_pc1) {
    # 计算所有OTU与PC1的correlation
    cor_pc1 <- NULL
    for(i in 1:ncol(otu)) {
      cor_pc1[i] <- cor(otu[i], PC$PC1, method = add_pc1_cor_method)
    }
    # 取出correlation最高的3个OTU
    cor_pc1 <- data.frame(cor = abs(cor_pc1), row.names = colnames(otu)) %>%
      rownames_to_column("OTU") %>%
      dplyr::arrange(desc(cor)) %>%
      dplyr::filter(rownames(.) %in% rownames(.)[1:3])
    top3otu <- cor_pc1$OTU
    # 取出top 3 otu的abundance
    otu_pc1 <- dplyr::select(otu, !!top3otu) %>% rownames_to_column("SampleID")
    # 将3个OTU合为一列，用于facet_wrap；将abundance合为一列，用于plot
    data_pc1 <- left_join(PC, otu_pc1) %>%
      tidyr::gather(key = "OTU", value = "abundance", !!top3otu)
    data_pc1$OTU <- factor(data_pc1$OTU, levels = top3otu)
    # 用计算correlation后得到的OTU去找taxonomy
    if(!isFALSE(add_pc1_otu_level)) {
      if(is.null(tax_table(phyloseq, errorIfNULL = F))) {
        stop(paste0("Taxonomy table is empty."))
      } else {
        tax <- tax_table(phyloseq) %>% as.data.frame() %>% rownames_to_column("OTU")
        # 如果add_pc1_otu_level == "all"，则将所有level都合并成一个level
        if(add_pc1_otu_level == "all" | add_pc1_otu_level == "All") {
          tax <- tax %>% tidyr::unite(2:ncol(.), col = !!add_pc1_otu_level, sep = "|")
        }
        tax <- tax %>% dplyr::select(OTU, !!add_pc1_otu_level)
        data_pc1 <- left_join(data_pc1, tax) %>%
          dplyr::select(-OTU) %>%
          dplyr::rename(OTU = !!add_pc1_otu_level)
      }
    }
    # plot pc1
    pc1 <- ggplot(data = data_pc1, aes(x = PC1, y = abundance, color = OTU)) +
      geom_line() +
      facet_grid(rows = vars(OTU), # rows相当于表格里的行，所以facet后plot会分成几行
                 scales = "free") +
      xlab(paste("PC1:", round(100*as.numeric(beta$eig[1]/sum(beta$eig)), 2), "%", sep = " ")) +
      ylab("Abundance") +
      theme_bw() +
      theme(
        panel.grid = element_blank(),
        axis.text.y = element_text(size = 14),
        axis.text.x = element_text(size = 14),
        axis.title = element_text(size = 16),
        legend.text = element_text(size = 12),
        strip.text.y = element_blank() #去掉facet的label
      )
  }
  if(add_pc2) {
    # 计算所有OTU与PC2的correlation
    cor_pc2 <- NULL
    for(i in 1:ncol(otu)) {
      cor_pc2[i] <- cor(otu[i], PC$PC2, method = add_pc2_cor_method)
    }
    # 取出correlation最高的3个OTU
    cor_pc2 <- data.frame(cor = abs(cor_pc2), row.names = colnames(otu)) %>%
      rownames_to_column("OTU") %>%
      dplyr::arrange(desc(cor)) %>%
      dplyr::filter(rownames(.) %in% rownames(.)[1:3])
    top3otu <- cor_pc2$OTU
    # 取出top 3 otu的abundance
    otu_pc2 <- dplyr::select(otu, !!top3otu) %>% rownames_to_column("SampleID")
    # 将3个OTU合为一列，用于facet_wrap；将abundance合为一列，用于plot
    data_pc2 <- left_join(PC, otu_pc2) %>%
      tidyr::gather(key = "OTU", value = "abundance", !!top3otu)
    data_pc2$OTU <- factor(data_pc2$OTU, levels = top3otu)
    # 用计算correlation后得到的OTU去找taxonomy
    if(!isFALSE(add_pc2_otu_level)) {
      if(is.null(tax_table(phyloseq, errorIfNULL = F))) {
        stop(paste0("Taxonomy table is empty."))
      } else {
        tax <- tax_table(phyloseq) %>% as.data.frame() %>% rownames_to_column("OTU")
        # 如果add_pc2_otu_level == "all"，则将所有level都合并成一个level
        if(add_pc2_otu_level == "all" | add_pc2_otu_level == "All") {
          tax <- tax %>% tidyr::unite(2:ncol(.), col = !!add_pc2_otu_level, sep = "|")
        }
        tax <- tax %>% dplyr::select(OTU, !!add_pc2_otu_level)
        data_pc2 <- left_join(data_pc2, tax) %>%
          dplyr::select(-OTU) %>%
          dplyr::rename(OTU = !!add_pc2_otu_level)
      }
    }
    # plot pc2
    pc2 <- ggplot(data = data_pc2, aes(x = PC2, y = abundance, color = OTU)) +
      geom_line() +
      facet_grid(cols = vars(OTU), # cols相当于表格里的列，所以facet后plot会分成几列
                 scales = "free") +
      xlab(paste("PC2:", round(100*as.numeric(beta$eig[2]/sum(beta$eig)), 2), "%", sep = " ")) +
      ylab("Abundance") +
      theme_bw() +
      theme(
        panel.grid = element_blank(),
        axis.text.y = element_text(size = 14),
        axis.text.x = element_text(size = 14, angle = 270),
        axis.title = element_text(size = 16),
        legend.text = element_text(size = 12),
        legend.position = "left",
        strip.text.x = element_blank() #去掉facet的label
      ) +
      coord_flip()
  }
  if(add_pc1 & add_pc2) {
    # 同时添加pc1和pc2
    #p <- ggpubr::ggarrange(p, pc2, pc1, nrow = 2, ncol = 2)
    p_legend <- cowplot::get_legend(p)
    p <- p +
      facet_wrap(vars(PCoA)) +
      theme(axis.title.x = element_blank(),
            axis.text.x = element_blank(),
            axis.title.y = element_blank(),
            axis.text.y = element_blank(),
            strip.text = element_blank(),
            legend.position = "none",
            title = element_blank())
    # 去掉pc1的Y轴信息
    pc1 <- pc1 + theme(axis.title.y = element_blank(), axis.text.y = element_blank(),
                       legend.position = "bottom")
    # 去掉pc2的X轴信息
    pc2 <- pc2 + theme(axis.title.x = element_blank(), axis.text.x = element_blank())
    # 合并
    p <- cowplot::plot_grid(pc2, p, p_legend, pc1, ncol = 2)
  } else if(isTRUE(add_pc1) & isFALSE(add_pc2)) {
    # 只添加pc1
    p <- p +
      facet_grid(rows = vars(PCoA)) + #和pc1的操作一样
      theme(axis.title.x = element_blank(),
            axis.text.x = element_blank(),
            strip.text.y = element_blank())
    p <- cowplot::plot_grid(p, pc1, nrow = 2, align = "v")
  } else if(isFALSE(add_pc1) & isTRUE(add_pc2)) {
    # 只添加pc2
    p <- p +
      facet_grid(cols = vars(PCoA)) + #和pc2的操作一样
      theme(axis.title.y = element_blank(),
            axis.text.y = element_blank(),
            strip.text.x = element_blank())
    p <- cowplot::plot_grid(pc2, p, ncol = 2, align = "h")
  }
  # Show plot
  p
}