amplicon: Statistics and visualization for microbiome data

Documented in alpha_boxplot

# 绘制alpha多样性箱线图并添加统计分组 Alpha boxplot & ANOVA+Tukey/LSD test
#
# This is the first function named 'alpha_boxplot'
# which draw box plot with alpha and metadata, and return a ggplot2 object

#' @title Plotting alpha diversity boxplot for each group with ANOVA statistics
#' @description Input alpha index and metadata, and manual set alpha index and metadata column name
#' ANOVA+TukeyHSD/agricolae::LSD.test calculate p-value, and dplyr summary each group max for p-value groups position.
#' ggplot2 show boxplot, jitter and stat groups.
#' @param alpha_div alpha diversity matrix, typical output of usearch(-alpha_div)/QIIME/vegan
#' rowname is sample ID, colname is index of alpha diversity;
#' @param index index type of alpha diversity;
#' @param metadata matrix or data frame, including sample ID and group ID;
#' @param groupID column name for group ID.
#' @details
#' By default, returns richness diversity index
#' The available diversity indices include the following:
#' \itemize{
#' \item{most used indices: richness, simpson, shannon_2}
#' \item{other used indices: berger_parker, buzas_gibson, dominance, equitability, jost, jost1, reads, robbins, simpson, shannon_2, shannon_10}
#' }
#' @return ggplot2 object.
#' @author Contact: Yong-Xin Liu \email{yxliu@@genetics.ac.cn}
#' @references
#'
#' Yong-Xin Liu, Yuan Qin, Tong Chen, Meiping Lu, Xubo Qian, Xiaoxuan Guo & Yang Bai.
#' A practical guide to amplicon and metagenomic analysis of microbiome data.
#' Protein Cell, 2020(41), 1-16, DOI: \url{https://doi.org/10.1007/s13238-020-00724-8}
#'
#' @seealso alpha_barplot
#' @examples
#' # Input alpha index (alpha_div) and metadata, select richness (index type) and Group (catagory)
#' alpha_boxplot(alpha_div, metadata, "richness", "Group")
#' # Select shannon_2 (index type) and Site (catagory)
#' alpha_boxplot(alpha_div = alpha_div, metadata = metadata, index = "shannon_2", groupID = "Site")
#' @export

alpha_boxplot <- function(alpha_div, metadata, index = "richness", groupID = "Group",
                          outlier = TRUE
                          ) {
  # 依赖关系检测与安装
  p_list = c("ggplot2", "dplyr", "multcompView") # "agricolae"
  for(p in p_list){
    if (!requireNamespace(p)){
    install.packages(p)}
    suppressPackageStartupMessages(library(p, character.only = TRUE, quietly = TRUE, warn.conflicts = FALSE))
  }

  # 测试默认参数
  # library(amplicon)
  # index = "richness"
  # groupID = "Group"
  # metadata = read.table("metadata2.txt", header=T, row.names=1, sep="\t", comment.char="", stringsAsFactors = F)
  # metadata = subset(metadata, Group %in% c("KO","OE"))

  # 交叉筛选
  idx = rownames(metadata) %in% rownames(alpha_div)
  metadata = metadata[idx,,drop=F]
  alpha_div = alpha_div[rownames(metadata),]

  # 提取样品组信息,默认为group可指定
  sampFile = as.data.frame(metadata[, groupID],row.names = row.names(metadata))
  # colnames(sampFile)[1] = "group"

  # 合并alpha_div和metadata
  df = cbind(alpha_div[rownames(sampFile),index], sampFile)
  colnames(df) = c(index,"group")

  # 统计各种显著性
  model = aov(df[[index]] ~ group, data=df)
  # 计算Tukey显著性差异检验
  Tukey_HSD = TukeyHSD(model, ordered = TRUE, conf.level = 0.95)
  # 提取比较结果
  Tukey_HSD_table = as.data.frame(Tukey_HSD$group)

  # 保存统计结果
  # 保存一个制表符，解决存在行名时，列名无法对齐的问题
  write.table(paste(date(), "\nGroup\t", groupID, "\n\t", sep=""), file=paste("alpha_boxplot_TukeyHSD.txt",sep=""),append = T, quote = F, eol = "", row.names = F, col.names = F)
  # 保存统计结果，有warning正常
  suppressWarnings(write.table(Tukey_HSD_table, file=paste("alpha_boxplot_TukeyHSD.txt",sep=""), append = T, quote = F, sep="\t", eol = "\n", na = "NA", dec = ".", row.names = T, col.names = T))

  # 函数：将Tukey检验结果P值转换为显著字母分组
  # 输入文件为图基检验结果和分组
  generate_label_df = function(TUKEY, variable){
    # library(multcompView)
    # 转换P值为字母分组
    ## 提取图基检验中分组子表的第4列P adjust值
    Tukey.levels = TUKEY[[variable]][,4]
    # 方法1.multcompLetters函数将两两p值转换为字母，data.frame并生成列名为Letters的数据框
    Tukey.labels = data.frame(multcompLetters(Tukey.levels)['Letters'])
    # 方法2. 解决字母顺序相反的问题
    # library(multcomp)
    # tuk <- cld(glht(model, alternative = 'two.sided', linfct = mcp(group = 'Tukey')), sig = p, decreasing = TRUE)
    # Tukey.labels <- data.frame(Letters=tuk$mcletters$Letters, stringsAsFactors = FALSE)

    # 按分组名字母顺序
    ## 提取字母分组行名为group组名
    Tukey.labels$group = rownames(Tukey.labels)
    # 按组名的字母顺序排列，默认的Levels
    Tukey.labels=Tukey.labels[order(Tukey.labels$group), ]
    return(Tukey.labels)
  }

  # 当只有两组时，用LSD标注字母
  if (length(unique(df$group)) == 2){
    # LSD检验，添加差异组字母
    library(agricolae)
    out = LSD.test(model, "group", p.adj="none")
    stat = out$groups
    # 分组结果添入Index
    df$stat=stat[as.character(df$group),]$groups
    # 当大于两组时，用multcompView标注字母
  }else{
    # library(multcompView)
    LABELS = generate_label_df(Tukey_HSD , "group")
    df$stat=LABELS[as.character(df$group),]$Letters
  }

  # 设置分组位置为各组y最大值+高的5%
  max=max(df[,c(index)])
  min=min(df[,index])
  x = df[,c("group",index)]
  y = x %>% group_by(group) %>% summarise_(Max=paste('max(',index,')',sep=""))
  y=as.data.frame(y)
  rownames(y)=y$group
  df$y=y[as.character(df$group),]$Max + (max-min)*0.05


  if (outlier) {
    # 绘图 plotting
    p = ggplot(df, aes(x=group, y=.data[[index]], color=group)) +
      geom_boxplot(alpha=1,
                   # outlier.shape = NA,
                   # outlier.size=0,
                   size=0.7,
                   width=0.5, fill="transparent") +
      labs(x="Groups", y=paste(index, "index"), color=groupID) + theme_classic() +
      geom_text(data=df, aes(x=group, y=y, color=group, label=stat)) +
      geom_jitter(position=position_jitter(0.17), size=1, alpha=0.7)+
      theme(text=element_text(family="sans", size=7))
    p
  } else{
    # 绘图 plotting
    p = ggplot(df, aes(x=group, y=.data[[index]], color=group)) +
      geom_boxplot(alpha=1,
                   outlier.shape = NA,
                   outlier.size=0,
                   size=0.7,
                   width=0.5, fill="transparent") +
      labs(x="Groups", y=paste(index, "index"), color=groupID) + theme_classic() +
      geom_text(data=df, aes(x=group, y=y, color=group, label=stat)) +
      geom_jitter(position=position_jitter(0.17), size=1, alpha=0.7)+
      theme(text=element_text(family="sans", size=7))
    p
  }


}