R/16S_analysis_functions_Yip.R

#!/usr/bin/env Rscript

######################### 16S Analysis Functions ##################################
## Copyright (C) Xbiome Biotech Co., Ltd. - All Rights Reserved
## Unauthorized copying of this file, via any medium is strictly prohibited
## Proprietary and confidential
##
## Functions of 16S analysis
##
## Written by Guanhua Ye <yeguanhua@xbiome.cn>
## Updated by Guanhua Ye <yeguanhua@xbiome.cn>

# Set environment -----------------------------------------------------------------
if(F) {
  #rm(list = ls())
  options(stringsAsFactors = FALSE)
  library(pacman)
  p_load(dada2, phyloseq)
}

# Set colors ----------------------------------------------------------------------
if (F) {
  # Joined all qualitative palettes from RColorBrewer package. Qualitative palettes are supposed to
  # provide X most distinctive colors each. Of course, mixing them joins into one palette also
  # similar colours, but that's the best I can get (74 colors).
  library(RColorBrewer)
  qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
  col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
  # Take all R colors from graphical devices and sample from them.
  # Removed shades of grey as they are too similar.
  # This gives 433 colors
  color = grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = T)] %>% as.character()
  distinctive_colors <- c(col_vector, color)
}

# Convert to percentage -----------------------------------------------------------
if (F) {
  convert_to_percentage <- function (df, row_sum = TRUE) {
    # df must be data frame only contain numeric
    if (row_sum) {
      # 将df中每一行求和,然后用每一行中的每一个数字除以行和
      sweep(df, 1, rowSums(df), '/')
    } else {
      # 将df中每一列求和,然后用每一列中的每一个数字除以列和
      sweep(df, 2, colSums(df), '/')
    }
  }
}

# Distinctive colors in R (maximum 74 distinctive colors) -------------------------
if (F) {
  distinctive_colors <- function(number = 74) {
    library(RColorBrewer)
    qual_col_pals <-  brewer.pal.info[brewer.pal.info$category == 'qual',]
    distinctive_colors <-  unlist(mapply(brewer.pal, qual_col_pals$maxcolors,
                                         rownames(qual_col_pals)))
    distinctive_colors <- distinctive_colors[1:number]
    return(distinctive_colors)
  }
  distinctive_colors <- distinctive_colors()
}

# Raw reads from fastqc -----------------------------------------------------------
# plot total & unique reads
if (F) {
  # do the next 2 steps to obtain the input file:
  # 1. download the fastqc_sequence_counts_plot data from multiqc HTML.
  # 2. add a 'Total Reads' column, which is the sum of Unique Reads & Duplicate Reads.
  # the input table should at least contain 'Category', 'Unique Reads', 'Total Reads'.
  plot_raw_reads <- function(raw_reads) {
    # create a long table with 3 column: Category, ReadsType, Reads
    reads_table <- gather(raw_reads, `Unique Reads`, `Total Reads`,
                          key = 'ReadsType', value = 'Reads')
    reads_table$Reads <- log10(reads_table$Reads)
    ggplot(reads_table, aes(x = Category, y = Reads, fill = ReadsType)) +
      geom_bar(position = 'dodge', stat = "identity") +
      theme_bw() +
      theme(panel.grid = element_blank(),
            axis.text.x = element_text(angle = 90))
  }
}

# DADA2 workflow ------------------------------------------------------------------
if (FALSE) {
  dada2_workflow <- function(
    file_path,
    read_type = "se",
    # Truncate reads longer than this truncLen. Reads shorter than this are discarded.
    trunc_len = 0,
    # After truncation, reads with higher than maxEE "expected errors" will be discarded.
    max_ee = Inf,
    ref_fasta = "/home/yeguanhua/database/dada2/silva_nr_v132_train_set.fa.gz",
    ref_species = "/home/yeguanhua/database/dada2/silva_species_assignment_v132.fa.gz",
    pyrosequencing = FALSE,
    max_len = Inf, #Remove reads with length greater than maxLen.
    #Additional filtering of 454 sequences by maximum length.
    min_len = 20, #Remove reads with length less than minLen.
    #minLen is enforced after trimming and truncation.
    big_data = FALSE,
    trim_left = 0
  ) {
    dada2_res <- list()
    if (read_type == "pe") {
      # Forward and reverse fastq filenames must be following format:
      # SAMPLENAME_R1.fastq.gz / SAMPLENAME_R2.fastq.gz or SAMPLENAME_R1.fastq / SAMPLENAME_R2.fastq
      fnFs <- sort(
        list.files(file_path, pattern = "\\_R1\\.fastq\\.gz|\\_R1\\.fastq", full.names = T)
      )
      fnRs <- sort(
        list.files(file_path, pattern = "\\_R2\\.fastq\\.gz|\\_R2\\.fastq", full.names = T)
      )
      if (length(fnFs) == 0) stop ("No file in given file_path.")
      if (length(fnFs) != length(fnRs)) stop ("Forward and reverse files do not match.")
      # Extract sample names, assuming filenames have format:
      # SAMPLENAME_R1.fastq.gz / SAMPLENAME_R1.fastq or SAMPLENAME_R1.fastq / SAMPLENAME_R2.fastq
      sample.names <- sapply(strsplit(basename(fnFs), "_R1.fastq.gz|_R1.fastq"), `[`, 1)
      # Filter & Trim
      filtFs <- file.path(file_path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
      filtRs <- file.path(file_path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
      out <- filterAndTrim(fwd = fnFs,
                           filt = filtFs,
                           rev = fnRs,
                           filt.rev = filtRs,
                           truncLen = trunc_len,
                           maxEE = max_ee,
                           trimLeft = trim_left,
                           maxLen = max_len,
                           minLen = min_len,
                           rm.phix = TRUE,
                           compress = TRUE,
                           multithread = TRUE,
                           verbose = TRUE)
      ## Check filter result
      #out_print <- out %>% as.data.frame() %>% select(reads.in, reads.out)
      #out_print
      # Learn the Error Rates
      names(filtFs) <- sample.names
      names(filtRs) <- sample.names
      set.seed(100)
      errF <- learnErrors(filtFs, nbases = 1e8, multithread = TRUE)
      errR <- learnErrors(filtRs, nbases = 1e8, multithread = TRUE)
      if (big_data) {
        # Sample inference and merger of paired-end reads
        mergers <- vector("list", length(sample.names))
        names(mergers) <- sample.names
        for(sam in sample.names) {
          cat("Processing:", sam, "\n")
          derepF <- derepFastq(filtFs[[sam]])
          derepR <- derepFastq(filtRs[[sam]])
          if (pyrosequencing) {
            dadaFs <- dada(derepR, err = errR, HOMOPOLYMER_GAP_PENALTY = -1,
                           BAND_SIZE = 32, multithread = TRUE)
            dadaRs <- dada(derepF, err = errF, HOMOPOLYMER_GAP_PENALTY = -1,
                           BAND_SIZE = 32, multithread = TRUE)
          } else {
            dadaFs <- dada(derepR, err = errR, multithread = TRUE)
            dadaRs <- dada(derepF, err = errF, multithread = TRUE)
          }
          merger <- mergePairs(dadaRs, derepF, dadaFs, derepR)
          mergers[[sam]] <- merger
        }
        rm(derepF); rm(derepR)
      } else {
        # Dereplication
        derepFs <- derepFastq(filtFs, verbose = TRUE)
        derepRs <- derepFastq(filtRs, verbose = TRUE)
        # Name the derep-class objects by the sample names
        names(derepFs) <- sample.names
        names(derepRs) <- sample.names
        # Sample Inference
        if (pyrosequencing) {
          dadaFs <- dada(derepFs, err = errF, HOMOPOLYMER_GAP_PENALTY = -1,
                         BAND_SIZE = 32, multithread = TRUE)
          dadaRs <- dada(derepRs, err = errR, HOMOPOLYMER_GAP_PENALTY = -1,
                         BAND_SIZE = 32, multithread = TRUE)
        } else {
          dadaFs <- dada(derepFs, err = errF, multithread = TRUE)
          dadaRs <- dada(derepRs, err = errR, multithread = TRUE)
        }
        # Merge paired reads
        mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose = TRUE)
      }
      # Construct Sequence Table
      seqtab <- makeSequenceTable(mergers)
      # Remove Chimera
      seqtab_nochim <- removeBimeraDenovo(seqtab, method = "consensus",
                                          multithread = TRUE, verbose = TRUE)
      # Track reads through the pipeline
      getN <- function(x) sum(getUniques(x))
      if (big_data) {
        dada2_stat_F = NULL
        dada2_stat_R = NULL
        merge_stat = NULL
        dada2_stat_F[length(dada2_stat_F)+1] <- getN(dadaFs)
        dada2_stat_R[length(dada2_stat_R)+1] <- getN(dadaRs)
        merge_stat[length(merge_stat)+1] <- getN(mergers[[sam]])
      } else {
        dada2_stat_F <- sapply(dadaFs, getN)
        dada2_stat_R <- sapply(dadaRs, getN)
        merge_stat <- sapply(mergers, getN)
      }
      track <- cbind(out, dada2_stat_F, dada2_stat_R, merge_stat, rowSums(seqtab_nochim))
      # If processing a single sample, remove the sapply calls
      # e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
      colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
      rownames(track) <- sample.names
    } else if (read_type == "se") {
      fns <- list.files(file_path, pattern = "\\.fastq\\.gz|\\.fastq")
      filtpath <- file.path(file_path, "filtered")
      # Filter & Trim
      out <- filterAndTrim(fwd = file.path(file_path, fns),
                           filt = file.path(filtpath, fns),
                           truncLen = trunc_len[1],
                           maxEE = max_ee[1],
                           trimLeft = trim_left[1],
                           maxLen = max_len,
                           minLen = min_len,
                           rm.phix = TRUE,
                           compress = TRUE,
                           verbose = TRUE,
                           multithread = TRUE)
      ## Check filter result
      #out_print <- out %>% as.data.frame() %>% select(reads.in, reads.out)
      #out_print
      # Learn error rates
      filts <- list.files(filtpath, pattern = ".fastq.gz|.fastq", full.names = TRUE)
      # Assumes filename = sample.fastq.gz
      sample.names <- sapply(strsplit(basename(fns), ".fastq.gz|.fastq"), `[`, 1)
      names(filts) <- sample.names
      set.seed(100)
      # Learn error rates
      err <- learnErrors(filts, nbases = 1e8, multithread = TRUE, randomize = TRUE)
      if (big_data) {
        # Infer sequence variants
        dds <- vector("list", length(sample.names))
        names(dds) <- sample.names
        if (pyrosequencing) {
          for(sam in sample.names) {
            cat("Processing:", sam, "\n")
            derep <- derepFastq(filts[[sam]])
            dds[[sam]] <- dada(derep, err = err, multithread = TRUE,
                               HOMOPOLYMER_GAP_PENALTY = -1,
                               BAND_SIZE = 32)
          }
        } else {
          for(sam in sample.names) {
            cat("Processing:", sam, "\n")
            derep <- derepFastq(filts[[sam]])
            dds[[sam]] <- dada(derep, err = err, multithread = TRUE)
          }
        }
      } else {
        # Dereplication
        derep <- derepFastq(filts, verbose = TRUE)
        names(derep) <- sample.names
        # Sample Inference
        if (pyrosequencing) {
          dds <- dada(derep, err = err, multithread = TRUE,
                      HOMOPOLYMER_GAP_PENALTY = -1,
                      BAND_SIZE = 32)
        } else {
          dds <- dada(derep, err = err, multithread = TRUE)
        }
      }
      # Construct sequence table
      seqtab <- makeSequenceTable(dds)
      # Remove Chimeras
      seqtab_nochim <- removeBimeraDenovo(seqtab, method = "consensus",
                                          multithread = TRUE, verbose = TRUE)
      ## Check the demension
      #dim(seqtab_nochim)
      ## Inspect sequence remove rate
      #sum(seqtab_nochim)/sum(seqtab)
      # Track reads through pipeline
      getN <- function(x) sum(getUniques(x))
      if (big_data) {
        dada2_stat <- getN(dds[[sam]])
      } else {
        dada2_stat <- sapply(dds, getN)
      }
      track <- cbind(out, dada2_stat, rowSums(seqtab_nochim))
      colnames(track) <- c("input", "filtered", "dereplicated", "nonchim")
      rownames(track) <- sample.names
    }
    if (!is.na(ref_fasta)) {
      # Assign taxonomy
      taxa <- assignTaxonomy(seqs = seqtab_nochim, refFasta = ref_fasta, multithread=TRUE)
      if (!is.na(ref_species)) {
        taxa <- addSpecies(taxtab = taxa, refFasta = ref_species)
      }
      # Results
      dada2_res$seq_tab <- seqtab_nochim
      dada2_res$tax_tab <- taxa
      dada2_res$reads_track <- track
      return(dada2_res)
    } else {
      # Results
      dada2_res$seq_tab <- seqtab_nochim
      dada2_res$reads_track <- track
      return(dada2_res)
    }
  }
}

# Track reads through pipeline ----------------------------------------------------
# dada2_reads_track
if (F) {
  dada2_reads_track <- function(track, single_end = TRUE) {
    track <- rownames_to_column(as.data.frame(track), var = "subject_id")
    track$subject_id <- factor(track$subject_id)
    track <- gather(track, 2:ncol(track), key = "stages", value = "reads")
    if (single_end) {
      track$stages <- factor(track$stages, levels = c("input", "filtered",
                                                      "dereplicated", "nonchim"))
    } else {
      track$stages <- factor(track$stages, levels = c("input", "filtered",
                                                      "denoisedF", "denoisedR",
                                                      "merged", "nonchim"))
    }
    ggplot(track, aes(x = stages, y = reads, color = subject_id)) +
      scale_color_manual(values = distinctive_colors) +
      geom_point() +
      geom_line(aes(group = subject_id)) +
      # move legend position to the top
      theme(legend.position = "top")
  }
}

# Export DADA2 sequences to fasta file --------------------------------------------
if(F) {
  seqs2fasta <-  function(seq_tab, out_path) {
    require(seqinr)
    seqtab.t = as.data.frame(t(seq_tab))
    seqs = row.names(seqtab.t)
    row.names(seqtab.t) = paste0("OTU", 1:nrow(seqtab.t))
    seqs = as.list(seqs)
    seqinr::write.fasta(sequences = seqs, names = row.names(seqtab.t),
                        file.out = out_path)
  }
}

# Extract phyloseq sample data ----------------------------------------------------
if (F) {
  extract_metadata_from_phyloseq <- function(phyloseq, feature = NA) {

    ## This is a function for extracting metadata from phyloseq object
    ## Arguments:
    ## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
    ##           metadata and phylogenetic tree.
    ## feature: The column name of the feature you want to select.

    require(tidyverse)
    require(phyloseq)
    ## Step 1: Extract metadata from phyloseq and turn into tibble
    metadata <- phyloseq %>%
      sample_data() %>%
      as.matrix() %>%
      as.data.frame()
    if ("subject_id" %in% colnames(metadata)) {
      select(metadata, -subject_id) %>%
        rownames_to_column(var = "subject_id")
      warning("replace 'subject_id' column with rownames")
    } else {
      metadata <- rownames_to_column(metadata, var = "subject_id")
    }
    if (is.na(feature)) {
      return(metadata)
    } else {
      # Select column by feature name
      metadata <- dplyr::select(metadata, subject_id, !!feature)
      # Add levels to column
      metadata[[feature]] <- metadata[[feature]] %>% as.character() %>% factor()
      return(metadata)
    }
  }
}

# Construct OTU table with DADA2 result -------------------------------------------
if (F) {
  construct_otu_table <- function(phyloseq, level = "all") {

    ## This is a function for converting DADA2 reaults to OTU table
    ## Arguments:
    ## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
    ##           metadata and phylogenetic
    ##           tree.
    ## level: If "all" then retain all taxonomy level and seperate by ";",
    ##        else ONLY retain the given taxonomy level, drop everything else.

    # Set options, prevent R turnning numeric value to factor
    options(stringsAsFactors = FALSE)
    # Check if input 'level' is correct
    if (!level %in% c("all", "Kingdom", "Phylum", "Class", "Order", "Family",
                      "Genus", "Species")) {
      stop('level should be one of "all", "Kingdom", "Phylum", "Class",
           "Order", "Family", "Genus", "Species".')}
    # Read in sequence table and taxonomy table from phyloseq
    otu <- otu_table(phyloseq) %>% as.data.frame() %>% t() %>% as.data.frame() %>%
      rownames_to_column(var = "Feature_ID")
    taxa <- tax_table(phyloseq) %>% as.data.frame() %>%
      rownames_to_column(var = "Feature_ID")
    levels <- colnames(taxa)[2:ncol(taxa)]
    ## Step 1: Clean sequence table and taxonomy table
    # If a sequence's taxonomy is all NA, remove it
    all_NA_taxa <- taxa %>% filter_at(vars(levels), all_vars(is.na(.)))
    taxa <- taxa %>% filter(!Feature_ID %in% all_NA_taxa$Feature_ID)
    ## Step 2: Construct OTU table
    if (level == "all") {
      # If level == "all", collapse all taxonomy levels and separate by "|"
      level <- "Taxonomy"
      taxa <- taxa %>% unite(Taxonomy, 2:ncol(taxa), sep = ";")
    } else {
      # If level != "all", select the given taxonomy level
      taxa <- taxa %>% select(Feature_ID, level) %>% filter(., !is.na(.[[level]]))
      otu <- otu %>% filter(Feature_ID %in% taxa$Feature_ID)
    }
    ## Step 3: Merge sequence table and taxonomy table
    otu <- left_join(otu, taxa)
    ## Step 4: Add abundance of those have the same taxonomy names and convert
    ##         it to data frame
    otu <- otu %>%
      group_by_(level) %>% #group_by_() can pass variable to goup_by() function
      summarise_if(is.numeric, sum, na.rm=TRUE) %>%
      column_to_rownames(var = level) %>%
      t() %>%
      as.data.frame()
    return(otu)
    }
}

# Construct LEfSe OTU table -------------------------------------------------------
if (F) {
  # See http://huttenhower.sph.harvard.edu/galaxy/ ->
  # LEfSe -> Format Data for LEfSe for more details.
  construct_lefse_table <- function(phyloseq, feature, level = "all") {
    # Set options, prevent R turnning numeric value to factor
    options(stringsAsFactors = FALSE)
    # Read in phyloseq object
    # Convert OTU to relative abundance
    otu <- otu_table(phyloseq) %>% as.data.frame() %>% convert_to_percentage() %>%
      t() %>% as.data.frame() %>% rownames_to_column(var = "Feature_ID")
    taxa <- tax_table(phyloseq) %>% as.data.frame() %>%
      rownames_to_column(var = "Feature_ID")
    # Select taxa by given level
    if (level == "all") {
      levels <- colnames(taxa)[2:ncol(taxa)]
    } else {
      taxa <- taxa %>% select(Feature_ID:!!level)
      levels <- colnames(taxa)[2:ncol(taxa)]
    }
    # Remove rows in taxa that is all NA
    all_NA_taxa <- taxa %>% filter_at(vars(levels), all_vars(is.na(.)))
    taxa <- taxa %>% filter(!Feature_ID %in% all_NA_taxa$Feature_ID)
    # Convert metadata and create lefse table
    metadata <- extract_metadata_from_phyloseq(phyloseq = phyloseq, feature = feature)
    lefse <- metadata %>% t() %>% as.data.frame()
    colnames(lefse) <- lefse %>% .[rownames(.) == "subject_id",]
    # Combine otu to lefse table
    for(i in levels) {
      taxa_tmp <- taxa %>% as.data.frame() %>% select(1:which(colnames(taxa) == i)) %>%
        filter_at(vars(i), all_vars(!is.na(.)))
      taxa_tmp <- taxa_tmp %>% unite(Taxonomy, 2:ncol(taxa_tmp), sep = "|")
      otu_tmp <- otu %>% filter(Feature_ID %in% taxa_tmp$Feature_ID) %>%
        left_join(taxa_tmp)
      otu_tmp <- otu_tmp %>%
        group_by(Taxonomy) %>% #group_by_() can pass variable to goup_by() function
        summarise_if(is.numeric, sum, na.rm=TRUE) %>%
        column_to_rownames(var = "Taxonomy")
      lefse <- rbind(lefse, otu_tmp)
    }
    lefse <- rownames_to_column(lefse)
    colnames(lefse) <- 1:ncol(lefse)
    return(lefse)
  }
}

# Plot sparsity -------------------------------------------------------------------
# input file: OTU table (rownames are SampleID, colnames are taxa)
if (F) {
  plot_sparsity <- function(otu, binwidth) {
    # remove all na taxa and transposes otu table
    otu <- otu %>% rownames_to_column() %>%
      filter_if(is.numeric, any_vars(. != 0)) %>%
      column_to_rownames() %>%
      t() %>% as.data.frame()
    # replace 0 with NA
    otu[otu == 0] <- NA
    # calculate sparsity
    otu <- apply(otu, 1, function(x) round((sum(!is.na(x))/ncol(otu))*100, 0)) %>%
      as.data.frame() %>%
      rownames_to_column(var = 'otu_id')
    colnames(otu)[2] <- 'not_0_percentage'
    ggplot(otu, aes(not_0_percentage)) +
      geom_histogram(binwidth = binwidth) +
      xlab('Prevalence of each OTU') +
      ylab('Count') +
      theme_bw() +
      theme(panel.grid = element_blank())
  }
}

# Stacked bar plot of phylogenetic composition ------------------------------------
if (F) {
  stacked_bar_plot <- function(phyloseq, feature, level) {

    ## This is a function for stacked bar plot
    ## Arguments:
    ## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
    ##           metadata and phylogenetic tree.
    ## level: Taxonomy level to plot.
    ## feature: The feature that shows in x-axis text with different colors.

    ## Step 1: First construct otu then convert to percentage
    otu_percent <- construct_otu_table(phyloseq, level) %>%
      convert_to_percentage(row_sum = TRUE) %>%
      as.data.frame()
    ## Step 2: Construct table for stacked bar plot
    plot_tab <- otu_percent %>%
      # First re-order taxonomy by total counts
      .[,order(colSums(.), decreasing = TRUE)] %>%
      # Then re-order sample by the most abundance taxonomy
      .[order(.[,1], decreasing = TRUE),] %>%
      # Turn subject_id to a column
      rownames_to_column(var = "subject_id")
    # Prepare levels for subject_id
    levels_subject_id <- plot_tab$subject_id
    # Prepare levels for taxonomy levels
    levels_level <- colnames(plot_tab)[2:ncol(plot_tab)]
    ## Step 3: Join metadata
    sample_feature <- extract_metadata_from_phyloseq(phyloseq, feature)
    plot_tab <- left_join(plot_tab, sample_feature)
    # Prepare levels for feature
    levels_feature <- plot_tab[,ncol(plot_tab)]
    # Add levels to subject_id
    plot_tab$subject_id <- factor(plot_tab$subject_id, levels = levels_subject_id)
    ## Step 4: Turn plot_table to a long table for plotting
    plot_tab <- gather(plot_tab,
                       colnames(plot_tab)[2:(ncol(plot_tab)-1)],
                       key = level,
                       value = "abundance")
    # Add levels to taxonomy levels
    plot_tab$level <- factor(plot_tab$level, levels = levels_level)
    ## Step 5: Bar plot
    if (length(levels_level) >= 74) {
      stop("Taxonomy values are more than 74, not enough distinctive colors to
           plot. Please choose another level.")
    } else {
      ggplot(plot_tab, aes(x = subject_id, y = abundance)) +
        theme(axis.text.x = element_text(angle = 90, size = 8, color = levels_feature)) +
        scale_fill_manual(values = distinctive_colors) +
        geom_bar(mapping = aes(fill = level), position = "fill", stat = "identity") +
        # move legend position to the top
        theme(legend.position = "top")
    }
  }
}

# Plot correlation ----------------------------------------------------------------
if (F) {
  plot_cor <- function(cor_tab, x, y, method = "pearson") {
    # Notice: Colnames of the input table can only be letters or numbers.
    if (any(str_detect(colnames(cor_tab), '\\W'))) {
      stop("Colnames of the input table can only be letters or numbers.")
    }
    # Calculate correlation
    cor_res <- cor.test(cor_tab[[x]], cor_tab[[y]], method = method)
    if (cor_res$p.value < 2.2e-16) {
      ggplot(data = cor_tab, aes_string(x = x, y = y)) +
        geom_point() +
        geom_smooth(method = lm) +
        annotate(geom = 'text',
                 x = mean(cor_tab[[x]]),
                 y = max(cor_tab[[y]]) * 1.1,
                 label = paste0('rho = ', round(cor_res$estimate, 3))) +
        annotate(geom = 'text',
                 x = mean(cor_tab[[x]]),
                 y = max(cor_tab[[y]]) * 1.05,
                 label = paste0('p-value < 2.2e-16')) +
        labs(x = as.character(x), y = as.character(y)) +
        theme_bw() +
        theme(panel.grid = element_blank(),
              axis.text.y = element_text(size = 12),
              axis.title = element_text(size = 14),
              axis.text.x = element_text(size = 12),
              legend.text = element_text(size = 12),
              strip.text.x = element_text(size = 14))
    } else {
      ggplot(data = cor_tab, aes_string(x = x, y = y)) +
        geom_point() +
        geom_smooth(method = lm) +
        annotate(geom = 'text',
                 x = mean(cor_tab[[x]]),
                 y = max(cor_tab[[y]]) * 1.1,
                 label = paste0('rho = ', round(cor_res$estimate, 3))) +
        annotate(geom = 'text',
                 x = mean(cor_tab[[x]]),
                 y = max(cor_tab[[y]]) * 1.05,
                 label = paste0('p-value = ', round(cor_res$p.value, 3))) +
        labs(x = as.character(x), y = as.character(y)) +
        theme_bw() +
        theme(panel.grid = element_blank(),
              axis.text.y = element_text(size = 12),
              axis.title = element_text(size = 14),
              axis.text.x = element_text(size = 12),
              legend.text = element_text(size = 12),
              strip.text.x = element_text(size = 14))
    }
  }
}

# Alpha diversity -----------------------------------------------------------------
if (F) {
  alpha_diversity_plot <- function(phyloseq, feature, measures, p_test = "wilcox"){

    ## This is a function for plotting alpha diversity
    ## Arguments:
    ## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
    ##           metadata and phylogenetic tree.
    ## feature: The column name of the feature you want to select from
    ##          metadata, e.g. "Phenotype".
    ## measures: The measures to calculate alpha diversity (see "plot_richness" function,
    ##           "measures" argument for detail).
    ## p_test: The p-value to test alpha diversity.

    ## Step 1: Use plot_richness function to calculate alpha diversity
    if (!measures %in% c("Observed", "Chao1", "ACE", "Shannon", "Simpson",
                         "InvSimpson", "Fisher")) {
      stop('measures should be one of "Observed", "Chao1", "ACE", "Shannon",
           "Simpson", "InvSimpson", "Fisher".')
    } else {
      alpha_diversity <- plot_richness(phyloseq, x = feature, measures = measures)
    }
    ## Step 2: Calculate p-value
    if (p_test == "wilcox") {
      # Prepare feature table for calculating Mann-Whitney U test(for 2 groups only)
      feature_tab_4_MWtest <- extract_metadata_from_phyloseq(phyloseq, feature)
      # Extract feature levels
      feature1 <- feature_tab_4_MWtest[[feature]] %>% unique() %>% .[1]
      feature2 <- feature_tab_4_MWtest[[feature]] %>% unique() %>% .[2]
      replace_feature <- c(as.character(feature1), as.character(feature2))
      # Revalue feature levels as 0 and 1
      feature_tab_4_MWtest[[feature]] <- plyr::mapvalues(feature_tab_4_MWtest[[feature]],
                                                         from = replace_feature,
                                                         to = c(0, 1))
      p_value <- wilcox.test(alpha_diversity$data$value ~
                               feature_tab_4_MWtest[[feature]])$p.value
    } else if (p_test == "kruskal") {
      # Kruskal test(for 2 or more groups)
      p_value <- kruskal.test(alpha_diversity$data$value,
                              factor(alpha_diversity$data[,feature]))$p.value
    } else {
      stop("The input p_test is not supported")
    }
    ## Step 3: Plot alpha diversity
    require(ggpubr)
    ggboxplot(alpha_diversity$data,
              x = feature,
              y = "value",
              add = "jitter",
              add.params = list(size = 3),
              color = feature,
              outlier.shape = NA,
              palette = c("#00AFBB", "#FC4E07", "#7FC97F", "#BEAED4")) +
      ylab(paste0(measures, " Diversity")) +
      annotate("text",
               x = ((alpha_diversity$data[[feature]] %>% unique() %>% length() + 1)/2),
               y = (max(alpha_diversity$data$value) * 1.1),
               label = paste0(p_test, " p-value = ", round(p_value, 4)),
               size = 3)
  }
  }

# Beta diversity ------------------------------------------------------------------
if (F) {
  beta_diversity_plot <- function(phyloseq, feature, feature2 = NA, method, size = 3){

    ## This is a function for plotting beta diversity (modified from plot_pcoa
    ## function created by Jiangwei)
    ## Arguments:
    ## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
    ##           metadata and phylogenetic tree.
    ## method: The method to calculate beta diversity. Method should be one of
    ##         "bray", "jaccard", "unifrac", "wunifrac".
    ## feature: The column name of the feature you want to select from
    ##          metadata, e.g. "Phenotype".
    ## feature2: The column name of another feature you want to select from
    ##           metadata, e.g. "Gender".

    require(phyloseq)
    ## Step 1: Calculate beta diversity
    if (!method %in% c("bray", "jaccard", "unifrac", "wunifrac")) {
      stop('beta diversity method should be one of "bray", "jaccard",
           "unifrac", "wunifrac".')
    } else if (method %in% c("unifrac", "wunifrac")) {
      beta_diversity <- cmdscale(phyloseq::distance(physeq = phyloseq,
                                                    method = method), eig = TRUE)
    } else {
      beta_diversity <- cmdscale(vegan::vegdist(otu_table(phyloseq),
                                                method = method), eig = TRUE)
    }
    ## Step 2: Construct plot table
    # Extract PC1 and PC2
    PC <- as.data.frame(beta_diversity$points)
    colnames(PC) <- c("PC1", "PC2")
    PC <- rownames_to_column(PC, var = "subject_id")
    # Extract metadata
    metadata <- extract_metadata_from_phyloseq(phyloseq)
    # Mrtge
    beta_diversity_tab <- left_join(PC, metadata)
    ## Step 3: Plot beta diversity
    # Make x-axis and y-axis names for aes_string
    x_name <- "PC1"
    y_name <- "PC2"
    # Make factor for color, prevent "Error: Continuous value supplied to discrete scale"
    beta_diversity_tab[[feature]] <- beta_diversity_tab[[feature]] %>% as.factor()
    # Plot beta diversity
    # aes_string() can pass variables to ggplot, aes() can't
    if (is.na(feature2)) {
      ggplot(data = beta_diversity_tab, aes_string(x = x_name, y = y_name,
                                                   color = feature)) +
        geom_point(size = size) +
        xlab(paste("PC1:",
                   round(100*as.numeric(beta_diversity$eig[1]/sum(beta_diversity$eig)), 2),
                   "%",
                   sep = " ")) +
        ylab(paste("PC2:",
                   round(100*as.numeric(beta_diversity$eig[2]/sum(beta_diversity$eig)), 2),
                   "%",
                   sep = " ")) +
        scale_color_manual(values = c("#00AFBB", "#FC4E07", "#7FC97F", "#BEAED4"))
    } else {
      ggplot(data = beta_diversity_tab, aes_string(x = x_name, y = y_name, color = feature,
                                                   shape = feature2)) +
        geom_point(size = size) +
        xlab(paste("PC1:",
                   round(100*as.numeric(beta_diversity$eig[1]/sum(beta_diversity$eig)), 2),
                   "%",
                   sep = " ")) +
        ylab(paste("PC2:",
                   round(100*as.numeric(beta_diversity$eig[2]/sum(beta_diversity$eig)), 2),
                   "%",
                   sep = " ")) +
        scale_color_manual(values = c("#00AFBB", "#FC4E07", "#7FC97F", "#BEAED4")) +
        scale_shape_manual(values = c(0:6))
    }
  }
}

# Log2 Fold Change ----------------------------------------------------------------
if (F) {
  log2fc <- function(phyloseq, feature, level = NA, p_value = 0.01, size = 2,
                     save_res = FALSE) {

    ## This is a function for plotting log2 fold change.
    ## Arguments:
    ## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
    ##           metadata and phylogenetic tree.
    ## feature: The column name of the feature you want to select from
    ##          metadata, e.g. "Phenotype".

    require(DESeq2)
    ## Step 1: Construt table for DESeq2
    # Create a string to parse feature argument to DESeq
    feature_formula <- paste0("~ ", feature)
    # Create a DESeq Data Set
    if (is.na(level)) {
      dds <- phyloseq_to_deseq2(phyloseq, design = as.formula(feature_formula))
      cts <- counts(dds)
      geoMeans <- apply(
        cts, 1,
        function(row) if (all(row == 0)) 0
        else exp(sum(log(row[row != 0]))/length(row))
      )
      dds <- estimateSizeFactors(dds, geoMeans = geoMeans)
    } else {
      countData <- construct_otu_table(phyloseq, level) %>% t()
      colData <- extract_metadata_from_phyloseq(phyloseq = phyloseq, feature = feature)
      dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData,
                                    design = as.formula(feature_formula))
    }
    dds <- DESeq(dds)
    ## Step 2: Calculate log2 fold change
    res <- results(dds, addMLE = FALSE)
    res <- res[order(res$padj),]
    if (save_res) {
      # save res object to current working directory
      saveRDS(res, paste0('DESeq2_res_', Sys.Date(), ".rds"))
      print('The DESeq result has been saved to current working directory.')
    }
    ## Step 3: Plot fold change
    # Create table for plotting
    ifelse(is.na(level), var <- "OTU", var <- level)
    log2fc <- res %>% as.data.frame() %>% rownames_to_column(var = var) %>%
      filter(!is.na(padj))
    # Identify differential expression
    log2fc$significant = ifelse(log2fc$padj > p_value,
                                paste0('p-value > ', p_value),
                                paste0('p-value < ', p_value))
    # Extract table that meet the p-value threshold
    log2fc_print <- log2fc %>%
      .[.$padj < p_value,] %>%
      select(!!var, log2FoldChange, padj) %>%
      .[order(.[,2], decreasing = TRUE),]
    print(log2fc_print)
    # EnhancedVolcano plot
    require(EnhancedVolcano)
    EnhancedVolcano(log2fc, lab = log2fc[[var]], x = "log2FoldChange", y = "padj",
                    FCcutoff = 1, pCutoff = p_value,
                    transcriptPointSize = size,
                    col = c("darkgrey", "#00AFBB", "#FC4E07", "red2"),
                    legend = c("Not Significant",
                               paste0("p > ", p_value, ", log2 fold change > 1"),
                               paste0("p < ", p_value, ", log2 fold change < 1"),
                               paste0("p < ", p_value, ", log2 fold change > 1"))
    )
  }
}

# Separate qiime2 taxonomy --------------------------------------------------------
if (F) {
  separate_qiime2_taxa <- function(qiime2_taxa_path) {
    separate_taxa <- read_tsv(qiime2_taxa_path) %>%
      separate(Taxon,
               into = c("Kingdom", "Phylum", "Class", "Order", "Family",
                        "Genus", "Species"),
               sep = ";") %>%
      select(-Confidence) %>%
      column_to_rownames(var = "Feature ID")
    return(separate_taxa)
  }
}

# Permutation test ----------------------------------------------------------------
if (F) {
  #原文:https://blog.csdn.net/u011467621/article/details/47971917
  A_group <- c(24, 43, 58, 67, 61, 44, 67, 49, 59, 52, 62, 50)
  B_group <- c(42, 43, 65, 26, 33, 41, 19, 54, 42, 20, 17, 60, 37, 42, 55, 28)
  # 零假设:A/B group没有显著差异
  # 若零假设成立,那么A组数据的分布和B组数据的分布是一样的,也就是服从同个分布
  # 将A/B group的值合并,前12个是A group,后16个是B group
  a <- c(A_group, B_group)
  # 添加factor到对应的值,并生成data frame
  group <- factor(c(rep("A", 12), rep("B", 16)))
  data <- data.frame(group, a)
  # 创建计算A_group mean值减去B_group mean值的function
  find.mean <- function (x) {
    mean(x[group == "A",2]) - mean(x[group == "B",2])
  }
  # 随机取12个值作为A,16个值作为B,计算mean值的差,并重复999次,建立分布图
  set.seed(100) #随机取值会有波动,所以需要set.seed函数
  results <- replicate(999, find.mean(data.frame(group, sample(data[,2]))))
  # 查看随机取值的分布情况
  hist(results, breaks = 20, prob = TRUE)
  lines(density(results))
  # 计算假设mean值比真实mean值大的概率
  p.value <- length(
    results[
      results > mean(data[group == "A", 2]) - mean(data[group == "B", 2])
      ]
  ) / 1000 #999+1
  # 若小于0.5,则说明真实mean值在假设情况中出现的概率小于0.5,原假设不成立,
  # A/B group有显著差异
}

# ComplexHeatmap ----------------------------------------------------------------
if (FALSE) {
  plot_ComplexHeatmap <- function(df, method = "pearson", row_order = NULL, column_order = NULL,
                                  fontsize = 12) {
    #  df <- df %>%
    #    log2() %>%
    #    apply(., 2, scale) %>% #保留两位小数
    #    as.data.frame()
    cormat <- cor(df, method = method)# %>% signif(digits = 2)
    library(ComplexHeatmap)
    library(circlize)
    col_fun = circlize::colorRamp2(c(-1, 0, 1), c("blue", "white", "red"))
    if (is.null(row_order) | is.null(column_order)) {
      Heatmap(
        cormat,
        name = "correlation",
        col = col_fun, #set color range
        # Add value to each cell
        cell_fun = function(j, i, x, y, width, height, fill) {
          #"%.1f": 保留一位小数
          grid.text(sprintf("%.1f", cormat[i, j]), x, y, gp = gpar(fontsize = fontsize))
        }
      )
    } else {
      Heatmap(
        cormat,
        name = "correlation",
        col = col_fun,
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        row_order = row_order,
        column_order = column_order,
        cell_fun = function(j, i, x, y, width, height, fill) {
          grid.text(sprintf("%.1f", cormat[i, j]), x, y, gp = gpar(fontsize = fontsize))
        }
      )
    }
  }
}

# Divide Genus OTU table to 3 group ---------------------------------------------
if(F) {
  divide_genus <- function(
    otutable,
    samplenames = colnames(otutable),
    method = NA,
    probs = c(0.35, 0.7)
  ) {
    # divide a vector into 0-35%/35%-70%/70%-100% 3 group
    dividedGenus <- NULL
    if(is.na(method)) {
      otugenus <- otutable %>% rownames_to_column("Genus")
      # 数值大于等于70%分位数的genus table
      otugenustop <- select(otugenus, Genus)
      for(i in samplenames) {
        quantilevalue <- quantile(otugenus[[i]], probs = probs) %>% as.numeric()
        topi <- otugenus %>% select(Genus, !!i) %>%
          filter_if(is.numeric, all_vars(. >= quantilevalue[2]))
        otugenustop <- left_join(otugenustop, topi)
      }
      otugenustop <- otugenustop %>%
        filter_if(is.numeric, any_vars(!is.na(.))) %>%
        column_to_rownames("Genus")
      otugenustop[is.na(otugenustop)] <- 0
      # 数值在70%-35%分位数之间的genus table
      otugenusmid <- select(otugenus, Genus)
      for(i in samplenames) {
        quantilevalue <- quantile(otugenus[[i]][otugenus[[i]]!=0], probs = probs) %>% as.numeric()
        midi <- otugenus %>% select(Genus, !!i) %>%
          filter_if(is.numeric, all_vars(. > quantilevalue[1] & . < quantilevalue[2]))
        otugenusmid <- left_join(otugenusmid, midi)
      }
      otugenusmid <- otugenusmid %>%
        filter_if(is.numeric, any_vars(!is.na(.))) %>%
        column_to_rownames("Genus")
      otugenusmid[is.na(otugenusmid)] <- 0
      # 数值小于等于35%分位数的genus table
      otugenusbot <- select(otugenus, Genus)
      for(i in samplenames) {
        quantilevalue <- quantile(otugenus[[i]], probs = probs) %>% as.numeric()
        boti <- otugenus %>% select(Genus, !!i) %>%
          filter_if(is.numeric, all_vars(. <= quantilevalue[1]))
        otugenusbot <- left_join(otugenusbot, boti)
      }
      otugenusbot <- otugenusbot %>%
        filter_if(is.numeric, any_vars(!is.na(.))) %>%
        column_to_rownames("Genus")
      otugenusbot[is.na(otugenusbot)] <- 0
      dividedGenus$top <- otugenustop
      dividedGenus$mid <- otugenusmid
      dividedGenus$bottom <- otugenusbot
      return(dividedGenus)
    } else if(method == "mean") {

      genus_value = as.numeric(apply(otutable, 1, mean))

    } else if(method == "median") {

      genus_value = as.numeric(apply(otutable, 1, median))

    }

    quantilevalue <- quantile(genus_value, probs = probs) %>% as.numeric()

    top_genus <- otutable %>%
      rownames_to_column("Genus") %>%
      mutate(genus_value=genus_value) %>%
      select(Genus, genus_value) %>%
      filter_if(is.numeric, all_vars(. >= quantilevalue[2])) %>% .$Genus

    mid_genus <-  otutable %>%
      rownames_to_column("Genus") %>%
      mutate(genus_value=genus_value) %>%
      select(Genus, genus_value) %>%
      filter_if(is.numeric, all_vars(. > quantilevalue[1] & . < quantilevalue[2])) %>% .$Genus

    bot_genus <-  otutable %>%
      rownames_to_column("Genus") %>%
      mutate(genus_value=genus_value) %>%
      select(Genus, genus_value) %>%
      filter_if(is.numeric, all_vars(. <= quantilevalue[1])) %>% .$Genus

    otugenustop <- otutable %>%
      rownames_to_column("Genus") %>%
      filter(Genus %in% top_genus) %>%
      column_to_rownames("Genus")

    otugenusmid <- otutable %>%
      rownames_to_column("Genus") %>%
      filter(Genus %in% mid_genus) %>%
      column_to_rownames("Genus")

    otugenusbot <- otutable %>%
      rownames_to_column("Genus") %>%
      filter(Genus %in% bot_genus) %>%
      column_to_rownames("Genus")

    dividedGenus$top <- otugenustop
    dividedGenus$mid <- otugenusmid
    dividedGenus$bottom <- otugenusbot

    return(dividedGenus)
  }
}
yeguanhuav/visualization416S documentation built on March 22, 2022, 9:03 p.m.