#!/usr/bin/env Rscript
######################### 16S Analysis Functions ##################################
## Copyright (C) Xbiome Biotech Co., Ltd. - All Rights Reserved
## Unauthorized copying of this file, via any medium is strictly prohibited
## Proprietary and confidential
##
## Functions of 16S analysis
##
## Written by Guanhua Ye <yeguanhua@xbiome.cn>
## Updated by Guanhua Ye <yeguanhua@xbiome.cn>
# Set environment -----------------------------------------------------------------
if(F) {
#rm(list = ls())
options(stringsAsFactors = FALSE)
library(pacman)
p_load(dada2, phyloseq)
}
# Set colors ----------------------------------------------------------------------
if (F) {
# Joined all qualitative palettes from RColorBrewer package. Qualitative palettes are supposed to
# provide X most distinctive colors each. Of course, mixing them joins into one palette also
# similar colours, but that's the best I can get (74 colors).
library(RColorBrewer)
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
# Take all R colors from graphical devices and sample from them.
# Removed shades of grey as they are too similar.
# This gives 433 colors
color = grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = T)] %>% as.character()
distinctive_colors <- c(col_vector, color)
}
# Convert to percentage -----------------------------------------------------------
if (F) {
convert_to_percentage <- function (df, row_sum = TRUE) {
# df must be data frame only contain numeric
if (row_sum) {
# 将df中每一行求和,然后用每一行中的每一个数字除以行和
sweep(df, 1, rowSums(df), '/')
} else {
# 将df中每一列求和,然后用每一列中的每一个数字除以列和
sweep(df, 2, colSums(df), '/')
}
}
}
# Distinctive colors in R (maximum 74 distinctive colors) -------------------------
if (F) {
distinctive_colors <- function(number = 74) {
library(RColorBrewer)
qual_col_pals <- brewer.pal.info[brewer.pal.info$category == 'qual',]
distinctive_colors <- unlist(mapply(brewer.pal, qual_col_pals$maxcolors,
rownames(qual_col_pals)))
distinctive_colors <- distinctive_colors[1:number]
return(distinctive_colors)
}
distinctive_colors <- distinctive_colors()
}
# Raw reads from fastqc -----------------------------------------------------------
# plot total & unique reads
if (F) {
# do the next 2 steps to obtain the input file:
# 1. download the fastqc_sequence_counts_plot data from multiqc HTML.
# 2. add a 'Total Reads' column, which is the sum of Unique Reads & Duplicate Reads.
# the input table should at least contain 'Category', 'Unique Reads', 'Total Reads'.
plot_raw_reads <- function(raw_reads) {
# create a long table with 3 column: Category, ReadsType, Reads
reads_table <- gather(raw_reads, `Unique Reads`, `Total Reads`,
key = 'ReadsType', value = 'Reads')
reads_table$Reads <- log10(reads_table$Reads)
ggplot(reads_table, aes(x = Category, y = Reads, fill = ReadsType)) +
geom_bar(position = 'dodge', stat = "identity") +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text.x = element_text(angle = 90))
}
}
# DADA2 workflow ------------------------------------------------------------------
if (FALSE) {
dada2_workflow <- function(
file_path,
read_type = "se",
# Truncate reads longer than this truncLen. Reads shorter than this are discarded.
trunc_len = 0,
# After truncation, reads with higher than maxEE "expected errors" will be discarded.
max_ee = Inf,
ref_fasta = "/home/yeguanhua/database/dada2/silva_nr_v132_train_set.fa.gz",
ref_species = "/home/yeguanhua/database/dada2/silva_species_assignment_v132.fa.gz",
pyrosequencing = FALSE,
max_len = Inf, #Remove reads with length greater than maxLen.
#Additional filtering of 454 sequences by maximum length.
min_len = 20, #Remove reads with length less than minLen.
#minLen is enforced after trimming and truncation.
big_data = FALSE,
trim_left = 0
) {
dada2_res <- list()
if (read_type == "pe") {
# Forward and reverse fastq filenames must be following format:
# SAMPLENAME_R1.fastq.gz / SAMPLENAME_R2.fastq.gz or SAMPLENAME_R1.fastq / SAMPLENAME_R2.fastq
fnFs <- sort(
list.files(file_path, pattern = "\\_R1\\.fastq\\.gz|\\_R1\\.fastq", full.names = T)
)
fnRs <- sort(
list.files(file_path, pattern = "\\_R2\\.fastq\\.gz|\\_R2\\.fastq", full.names = T)
)
if (length(fnFs) == 0) stop ("No file in given file_path.")
if (length(fnFs) != length(fnRs)) stop ("Forward and reverse files do not match.")
# Extract sample names, assuming filenames have format:
# SAMPLENAME_R1.fastq.gz / SAMPLENAME_R1.fastq or SAMPLENAME_R1.fastq / SAMPLENAME_R2.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_R1.fastq.gz|_R1.fastq"), `[`, 1)
# Filter & Trim
filtFs <- file.path(file_path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(file_path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
out <- filterAndTrim(fwd = fnFs,
filt = filtFs,
rev = fnRs,
filt.rev = filtRs,
truncLen = trunc_len,
maxEE = max_ee,
trimLeft = trim_left,
maxLen = max_len,
minLen = min_len,
rm.phix = TRUE,
compress = TRUE,
multithread = TRUE,
verbose = TRUE)
## Check filter result
#out_print <- out %>% as.data.frame() %>% select(reads.in, reads.out)
#out_print
# Learn the Error Rates
names(filtFs) <- sample.names
names(filtRs) <- sample.names
set.seed(100)
errF <- learnErrors(filtFs, nbases = 1e8, multithread = TRUE)
errR <- learnErrors(filtRs, nbases = 1e8, multithread = TRUE)
if (big_data) {
# Sample inference and merger of paired-end reads
mergers <- vector("list", length(sample.names))
names(mergers) <- sample.names
for(sam in sample.names) {
cat("Processing:", sam, "\n")
derepF <- derepFastq(filtFs[[sam]])
derepR <- derepFastq(filtRs[[sam]])
if (pyrosequencing) {
dadaFs <- dada(derepR, err = errR, HOMOPOLYMER_GAP_PENALTY = -1,
BAND_SIZE = 32, multithread = TRUE)
dadaRs <- dada(derepF, err = errF, HOMOPOLYMER_GAP_PENALTY = -1,
BAND_SIZE = 32, multithread = TRUE)
} else {
dadaFs <- dada(derepR, err = errR, multithread = TRUE)
dadaRs <- dada(derepF, err = errF, multithread = TRUE)
}
merger <- mergePairs(dadaRs, derepF, dadaFs, derepR)
mergers[[sam]] <- merger
}
rm(derepF); rm(derepR)
} else {
# Dereplication
derepFs <- derepFastq(filtFs, verbose = TRUE)
derepRs <- derepFastq(filtRs, verbose = TRUE)
# Name the derep-class objects by the sample names
names(derepFs) <- sample.names
names(derepRs) <- sample.names
# Sample Inference
if (pyrosequencing) {
dadaFs <- dada(derepFs, err = errF, HOMOPOLYMER_GAP_PENALTY = -1,
BAND_SIZE = 32, multithread = TRUE)
dadaRs <- dada(derepRs, err = errR, HOMOPOLYMER_GAP_PENALTY = -1,
BAND_SIZE = 32, multithread = TRUE)
} else {
dadaFs <- dada(derepFs, err = errF, multithread = TRUE)
dadaRs <- dada(derepRs, err = errR, multithread = TRUE)
}
# Merge paired reads
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose = TRUE)
}
# Construct Sequence Table
seqtab <- makeSequenceTable(mergers)
# Remove Chimera
seqtab_nochim <- removeBimeraDenovo(seqtab, method = "consensus",
multithread = TRUE, verbose = TRUE)
# Track reads through the pipeline
getN <- function(x) sum(getUniques(x))
if (big_data) {
dada2_stat_F = NULL
dada2_stat_R = NULL
merge_stat = NULL
dada2_stat_F[length(dada2_stat_F)+1] <- getN(dadaFs)
dada2_stat_R[length(dada2_stat_R)+1] <- getN(dadaRs)
merge_stat[length(merge_stat)+1] <- getN(mergers[[sam]])
} else {
dada2_stat_F <- sapply(dadaFs, getN)
dada2_stat_R <- sapply(dadaRs, getN)
merge_stat <- sapply(mergers, getN)
}
track <- cbind(out, dada2_stat_F, dada2_stat_R, merge_stat, rowSums(seqtab_nochim))
# If processing a single sample, remove the sapply calls
# e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
} else if (read_type == "se") {
fns <- list.files(file_path, pattern = "\\.fastq\\.gz|\\.fastq")
filtpath <- file.path(file_path, "filtered")
# Filter & Trim
out <- filterAndTrim(fwd = file.path(file_path, fns),
filt = file.path(filtpath, fns),
truncLen = trunc_len[1],
maxEE = max_ee[1],
trimLeft = trim_left[1],
maxLen = max_len,
minLen = min_len,
rm.phix = TRUE,
compress = TRUE,
verbose = TRUE,
multithread = TRUE)
## Check filter result
#out_print <- out %>% as.data.frame() %>% select(reads.in, reads.out)
#out_print
# Learn error rates
filts <- list.files(filtpath, pattern = ".fastq.gz|.fastq", full.names = TRUE)
# Assumes filename = sample.fastq.gz
sample.names <- sapply(strsplit(basename(fns), ".fastq.gz|.fastq"), `[`, 1)
names(filts) <- sample.names
set.seed(100)
# Learn error rates
err <- learnErrors(filts, nbases = 1e8, multithread = TRUE, randomize = TRUE)
if (big_data) {
# Infer sequence variants
dds <- vector("list", length(sample.names))
names(dds) <- sample.names
if (pyrosequencing) {
for(sam in sample.names) {
cat("Processing:", sam, "\n")
derep <- derepFastq(filts[[sam]])
dds[[sam]] <- dada(derep, err = err, multithread = TRUE,
HOMOPOLYMER_GAP_PENALTY = -1,
BAND_SIZE = 32)
}
} else {
for(sam in sample.names) {
cat("Processing:", sam, "\n")
derep <- derepFastq(filts[[sam]])
dds[[sam]] <- dada(derep, err = err, multithread = TRUE)
}
}
} else {
# Dereplication
derep <- derepFastq(filts, verbose = TRUE)
names(derep) <- sample.names
# Sample Inference
if (pyrosequencing) {
dds <- dada(derep, err = err, multithread = TRUE,
HOMOPOLYMER_GAP_PENALTY = -1,
BAND_SIZE = 32)
} else {
dds <- dada(derep, err = err, multithread = TRUE)
}
}
# Construct sequence table
seqtab <- makeSequenceTable(dds)
# Remove Chimeras
seqtab_nochim <- removeBimeraDenovo(seqtab, method = "consensus",
multithread = TRUE, verbose = TRUE)
## Check the demension
#dim(seqtab_nochim)
## Inspect sequence remove rate
#sum(seqtab_nochim)/sum(seqtab)
# Track reads through pipeline
getN <- function(x) sum(getUniques(x))
if (big_data) {
dada2_stat <- getN(dds[[sam]])
} else {
dada2_stat <- sapply(dds, getN)
}
track <- cbind(out, dada2_stat, rowSums(seqtab_nochim))
colnames(track) <- c("input", "filtered", "dereplicated", "nonchim")
rownames(track) <- sample.names
}
if (!is.na(ref_fasta)) {
# Assign taxonomy
taxa <- assignTaxonomy(seqs = seqtab_nochim, refFasta = ref_fasta, multithread=TRUE)
if (!is.na(ref_species)) {
taxa <- addSpecies(taxtab = taxa, refFasta = ref_species)
}
# Results
dada2_res$seq_tab <- seqtab_nochim
dada2_res$tax_tab <- taxa
dada2_res$reads_track <- track
return(dada2_res)
} else {
# Results
dada2_res$seq_tab <- seqtab_nochim
dada2_res$reads_track <- track
return(dada2_res)
}
}
}
# Track reads through pipeline ----------------------------------------------------
# dada2_reads_track
if (F) {
dada2_reads_track <- function(track, single_end = TRUE) {
track <- rownames_to_column(as.data.frame(track), var = "subject_id")
track$subject_id <- factor(track$subject_id)
track <- gather(track, 2:ncol(track), key = "stages", value = "reads")
if (single_end) {
track$stages <- factor(track$stages, levels = c("input", "filtered",
"dereplicated", "nonchim"))
} else {
track$stages <- factor(track$stages, levels = c("input", "filtered",
"denoisedF", "denoisedR",
"merged", "nonchim"))
}
ggplot(track, aes(x = stages, y = reads, color = subject_id)) +
scale_color_manual(values = distinctive_colors) +
geom_point() +
geom_line(aes(group = subject_id)) +
# move legend position to the top
theme(legend.position = "top")
}
}
# Export DADA2 sequences to fasta file --------------------------------------------
if(F) {
seqs2fasta <- function(seq_tab, out_path) {
require(seqinr)
seqtab.t = as.data.frame(t(seq_tab))
seqs = row.names(seqtab.t)
row.names(seqtab.t) = paste0("OTU", 1:nrow(seqtab.t))
seqs = as.list(seqs)
seqinr::write.fasta(sequences = seqs, names = row.names(seqtab.t),
file.out = out_path)
}
}
# Extract phyloseq sample data ----------------------------------------------------
if (F) {
extract_metadata_from_phyloseq <- function(phyloseq, feature = NA) {
## This is a function for extracting metadata from phyloseq object
## Arguments:
## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
## metadata and phylogenetic tree.
## feature: The column name of the feature you want to select.
require(tidyverse)
require(phyloseq)
## Step 1: Extract metadata from phyloseq and turn into tibble
metadata <- phyloseq %>%
sample_data() %>%
as.matrix() %>%
as.data.frame()
if ("subject_id" %in% colnames(metadata)) {
select(metadata, -subject_id) %>%
rownames_to_column(var = "subject_id")
warning("replace 'subject_id' column with rownames")
} else {
metadata <- rownames_to_column(metadata, var = "subject_id")
}
if (is.na(feature)) {
return(metadata)
} else {
# Select column by feature name
metadata <- dplyr::select(metadata, subject_id, !!feature)
# Add levels to column
metadata[[feature]] <- metadata[[feature]] %>% as.character() %>% factor()
return(metadata)
}
}
}
# Construct OTU table with DADA2 result -------------------------------------------
if (F) {
construct_otu_table <- function(phyloseq, level = "all") {
## This is a function for converting DADA2 reaults to OTU table
## Arguments:
## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
## metadata and phylogenetic
## tree.
## level: If "all" then retain all taxonomy level and seperate by ";",
## else ONLY retain the given taxonomy level, drop everything else.
# Set options, prevent R turnning numeric value to factor
options(stringsAsFactors = FALSE)
# Check if input 'level' is correct
if (!level %in% c("all", "Kingdom", "Phylum", "Class", "Order", "Family",
"Genus", "Species")) {
stop('level should be one of "all", "Kingdom", "Phylum", "Class",
"Order", "Family", "Genus", "Species".')}
# Read in sequence table and taxonomy table from phyloseq
otu <- otu_table(phyloseq) %>% as.data.frame() %>% t() %>% as.data.frame() %>%
rownames_to_column(var = "Feature_ID")
taxa <- tax_table(phyloseq) %>% as.data.frame() %>%
rownames_to_column(var = "Feature_ID")
levels <- colnames(taxa)[2:ncol(taxa)]
## Step 1: Clean sequence table and taxonomy table
# If a sequence's taxonomy is all NA, remove it
all_NA_taxa <- taxa %>% filter_at(vars(levels), all_vars(is.na(.)))
taxa <- taxa %>% filter(!Feature_ID %in% all_NA_taxa$Feature_ID)
## Step 2: Construct OTU table
if (level == "all") {
# If level == "all", collapse all taxonomy levels and separate by "|"
level <- "Taxonomy"
taxa <- taxa %>% unite(Taxonomy, 2:ncol(taxa), sep = ";")
} else {
# If level != "all", select the given taxonomy level
taxa <- taxa %>% select(Feature_ID, level) %>% filter(., !is.na(.[[level]]))
otu <- otu %>% filter(Feature_ID %in% taxa$Feature_ID)
}
## Step 3: Merge sequence table and taxonomy table
otu <- left_join(otu, taxa)
## Step 4: Add abundance of those have the same taxonomy names and convert
## it to data frame
otu <- otu %>%
group_by_(level) %>% #group_by_() can pass variable to goup_by() function
summarise_if(is.numeric, sum, na.rm=TRUE) %>%
column_to_rownames(var = level) %>%
t() %>%
as.data.frame()
return(otu)
}
}
# Construct LEfSe OTU table -------------------------------------------------------
if (F) {
# See http://huttenhower.sph.harvard.edu/galaxy/ ->
# LEfSe -> Format Data for LEfSe for more details.
construct_lefse_table <- function(phyloseq, feature, level = "all") {
# Set options, prevent R turnning numeric value to factor
options(stringsAsFactors = FALSE)
# Read in phyloseq object
# Convert OTU to relative abundance
otu <- otu_table(phyloseq) %>% as.data.frame() %>% convert_to_percentage() %>%
t() %>% as.data.frame() %>% rownames_to_column(var = "Feature_ID")
taxa <- tax_table(phyloseq) %>% as.data.frame() %>%
rownames_to_column(var = "Feature_ID")
# Select taxa by given level
if (level == "all") {
levels <- colnames(taxa)[2:ncol(taxa)]
} else {
taxa <- taxa %>% select(Feature_ID:!!level)
levels <- colnames(taxa)[2:ncol(taxa)]
}
# Remove rows in taxa that is all NA
all_NA_taxa <- taxa %>% filter_at(vars(levels), all_vars(is.na(.)))
taxa <- taxa %>% filter(!Feature_ID %in% all_NA_taxa$Feature_ID)
# Convert metadata and create lefse table
metadata <- extract_metadata_from_phyloseq(phyloseq = phyloseq, feature = feature)
lefse <- metadata %>% t() %>% as.data.frame()
colnames(lefse) <- lefse %>% .[rownames(.) == "subject_id",]
# Combine otu to lefse table
for(i in levels) {
taxa_tmp <- taxa %>% as.data.frame() %>% select(1:which(colnames(taxa) == i)) %>%
filter_at(vars(i), all_vars(!is.na(.)))
taxa_tmp <- taxa_tmp %>% unite(Taxonomy, 2:ncol(taxa_tmp), sep = "|")
otu_tmp <- otu %>% filter(Feature_ID %in% taxa_tmp$Feature_ID) %>%
left_join(taxa_tmp)
otu_tmp <- otu_tmp %>%
group_by(Taxonomy) %>% #group_by_() can pass variable to goup_by() function
summarise_if(is.numeric, sum, na.rm=TRUE) %>%
column_to_rownames(var = "Taxonomy")
lefse <- rbind(lefse, otu_tmp)
}
lefse <- rownames_to_column(lefse)
colnames(lefse) <- 1:ncol(lefse)
return(lefse)
}
}
# Plot sparsity -------------------------------------------------------------------
# input file: OTU table (rownames are SampleID, colnames are taxa)
if (F) {
plot_sparsity <- function(otu, binwidth) {
# remove all na taxa and transposes otu table
otu <- otu %>% rownames_to_column() %>%
filter_if(is.numeric, any_vars(. != 0)) %>%
column_to_rownames() %>%
t() %>% as.data.frame()
# replace 0 with NA
otu[otu == 0] <- NA
# calculate sparsity
otu <- apply(otu, 1, function(x) round((sum(!is.na(x))/ncol(otu))*100, 0)) %>%
as.data.frame() %>%
rownames_to_column(var = 'otu_id')
colnames(otu)[2] <- 'not_0_percentage'
ggplot(otu, aes(not_0_percentage)) +
geom_histogram(binwidth = binwidth) +
xlab('Prevalence of each OTU') +
ylab('Count') +
theme_bw() +
theme(panel.grid = element_blank())
}
}
# Stacked bar plot of phylogenetic composition ------------------------------------
if (F) {
stacked_bar_plot <- function(phyloseq, feature, level) {
## This is a function for stacked bar plot
## Arguments:
## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
## metadata and phylogenetic tree.
## level: Taxonomy level to plot.
## feature: The feature that shows in x-axis text with different colors.
## Step 1: First construct otu then convert to percentage
otu_percent <- construct_otu_table(phyloseq, level) %>%
convert_to_percentage(row_sum = TRUE) %>%
as.data.frame()
## Step 2: Construct table for stacked bar plot
plot_tab <- otu_percent %>%
# First re-order taxonomy by total counts
.[,order(colSums(.), decreasing = TRUE)] %>%
# Then re-order sample by the most abundance taxonomy
.[order(.[,1], decreasing = TRUE),] %>%
# Turn subject_id to a column
rownames_to_column(var = "subject_id")
# Prepare levels for subject_id
levels_subject_id <- plot_tab$subject_id
# Prepare levels for taxonomy levels
levels_level <- colnames(plot_tab)[2:ncol(plot_tab)]
## Step 3: Join metadata
sample_feature <- extract_metadata_from_phyloseq(phyloseq, feature)
plot_tab <- left_join(plot_tab, sample_feature)
# Prepare levels for feature
levels_feature <- plot_tab[,ncol(plot_tab)]
# Add levels to subject_id
plot_tab$subject_id <- factor(plot_tab$subject_id, levels = levels_subject_id)
## Step 4: Turn plot_table to a long table for plotting
plot_tab <- gather(plot_tab,
colnames(plot_tab)[2:(ncol(plot_tab)-1)],
key = level,
value = "abundance")
# Add levels to taxonomy levels
plot_tab$level <- factor(plot_tab$level, levels = levels_level)
## Step 5: Bar plot
if (length(levels_level) >= 74) {
stop("Taxonomy values are more than 74, not enough distinctive colors to
plot. Please choose another level.")
} else {
ggplot(plot_tab, aes(x = subject_id, y = abundance)) +
theme(axis.text.x = element_text(angle = 90, size = 8, color = levels_feature)) +
scale_fill_manual(values = distinctive_colors) +
geom_bar(mapping = aes(fill = level), position = "fill", stat = "identity") +
# move legend position to the top
theme(legend.position = "top")
}
}
}
# Plot correlation ----------------------------------------------------------------
if (F) {
plot_cor <- function(cor_tab, x, y, method = "pearson") {
# Notice: Colnames of the input table can only be letters or numbers.
if (any(str_detect(colnames(cor_tab), '\\W'))) {
stop("Colnames of the input table can only be letters or numbers.")
}
# Calculate correlation
cor_res <- cor.test(cor_tab[[x]], cor_tab[[y]], method = method)
if (cor_res$p.value < 2.2e-16) {
ggplot(data = cor_tab, aes_string(x = x, y = y)) +
geom_point() +
geom_smooth(method = lm) +
annotate(geom = 'text',
x = mean(cor_tab[[x]]),
y = max(cor_tab[[y]]) * 1.1,
label = paste0('rho = ', round(cor_res$estimate, 3))) +
annotate(geom = 'text',
x = mean(cor_tab[[x]]),
y = max(cor_tab[[y]]) * 1.05,
label = paste0('p-value < 2.2e-16')) +
labs(x = as.character(x), y = as.character(y)) +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
axis.text.x = element_text(size = 12),
legend.text = element_text(size = 12),
strip.text.x = element_text(size = 14))
} else {
ggplot(data = cor_tab, aes_string(x = x, y = y)) +
geom_point() +
geom_smooth(method = lm) +
annotate(geom = 'text',
x = mean(cor_tab[[x]]),
y = max(cor_tab[[y]]) * 1.1,
label = paste0('rho = ', round(cor_res$estimate, 3))) +
annotate(geom = 'text',
x = mean(cor_tab[[x]]),
y = max(cor_tab[[y]]) * 1.05,
label = paste0('p-value = ', round(cor_res$p.value, 3))) +
labs(x = as.character(x), y = as.character(y)) +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text.y = element_text(size = 12),
axis.title = element_text(size = 14),
axis.text.x = element_text(size = 12),
legend.text = element_text(size = 12),
strip.text.x = element_text(size = 14))
}
}
}
# Alpha diversity -----------------------------------------------------------------
if (F) {
alpha_diversity_plot <- function(phyloseq, feature, measures, p_test = "wilcox"){
## This is a function for plotting alpha diversity
## Arguments:
## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
## metadata and phylogenetic tree.
## feature: The column name of the feature you want to select from
## metadata, e.g. "Phenotype".
## measures: The measures to calculate alpha diversity (see "plot_richness" function,
## "measures" argument for detail).
## p_test: The p-value to test alpha diversity.
## Step 1: Use plot_richness function to calculate alpha diversity
if (!measures %in% c("Observed", "Chao1", "ACE", "Shannon", "Simpson",
"InvSimpson", "Fisher")) {
stop('measures should be one of "Observed", "Chao1", "ACE", "Shannon",
"Simpson", "InvSimpson", "Fisher".')
} else {
alpha_diversity <- plot_richness(phyloseq, x = feature, measures = measures)
}
## Step 2: Calculate p-value
if (p_test == "wilcox") {
# Prepare feature table for calculating Mann-Whitney U test(for 2 groups only)
feature_tab_4_MWtest <- extract_metadata_from_phyloseq(phyloseq, feature)
# Extract feature levels
feature1 <- feature_tab_4_MWtest[[feature]] %>% unique() %>% .[1]
feature2 <- feature_tab_4_MWtest[[feature]] %>% unique() %>% .[2]
replace_feature <- c(as.character(feature1), as.character(feature2))
# Revalue feature levels as 0 and 1
feature_tab_4_MWtest[[feature]] <- plyr::mapvalues(feature_tab_4_MWtest[[feature]],
from = replace_feature,
to = c(0, 1))
p_value <- wilcox.test(alpha_diversity$data$value ~
feature_tab_4_MWtest[[feature]])$p.value
} else if (p_test == "kruskal") {
# Kruskal test(for 2 or more groups)
p_value <- kruskal.test(alpha_diversity$data$value,
factor(alpha_diversity$data[,feature]))$p.value
} else {
stop("The input p_test is not supported")
}
## Step 3: Plot alpha diversity
require(ggpubr)
ggboxplot(alpha_diversity$data,
x = feature,
y = "value",
add = "jitter",
add.params = list(size = 3),
color = feature,
outlier.shape = NA,
palette = c("#00AFBB", "#FC4E07", "#7FC97F", "#BEAED4")) +
ylab(paste0(measures, " Diversity")) +
annotate("text",
x = ((alpha_diversity$data[[feature]] %>% unique() %>% length() + 1)/2),
y = (max(alpha_diversity$data$value) * 1.1),
label = paste0(p_test, " p-value = ", round(p_value, 4)),
size = 3)
}
}
# Beta diversity ------------------------------------------------------------------
if (F) {
beta_diversity_plot <- function(phyloseq, feature, feature2 = NA, method, size = 3){
## This is a function for plotting beta diversity (modified from plot_pcoa
## function created by Jiangwei)
## Arguments:
## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
## metadata and phylogenetic tree.
## method: The method to calculate beta diversity. Method should be one of
## "bray", "jaccard", "unifrac", "wunifrac".
## feature: The column name of the feature you want to select from
## metadata, e.g. "Phenotype".
## feature2: The column name of another feature you want to select from
## metadata, e.g. "Gender".
require(phyloseq)
## Step 1: Calculate beta diversity
if (!method %in% c("bray", "jaccard", "unifrac", "wunifrac")) {
stop('beta diversity method should be one of "bray", "jaccard",
"unifrac", "wunifrac".')
} else if (method %in% c("unifrac", "wunifrac")) {
beta_diversity <- cmdscale(phyloseq::distance(physeq = phyloseq,
method = method), eig = TRUE)
} else {
beta_diversity <- cmdscale(vegan::vegdist(otu_table(phyloseq),
method = method), eig = TRUE)
}
## Step 2: Construct plot table
# Extract PC1 and PC2
PC <- as.data.frame(beta_diversity$points)
colnames(PC) <- c("PC1", "PC2")
PC <- rownames_to_column(PC, var = "subject_id")
# Extract metadata
metadata <- extract_metadata_from_phyloseq(phyloseq)
# Mrtge
beta_diversity_tab <- left_join(PC, metadata)
## Step 3: Plot beta diversity
# Make x-axis and y-axis names for aes_string
x_name <- "PC1"
y_name <- "PC2"
# Make factor for color, prevent "Error: Continuous value supplied to discrete scale"
beta_diversity_tab[[feature]] <- beta_diversity_tab[[feature]] %>% as.factor()
# Plot beta diversity
# aes_string() can pass variables to ggplot, aes() can't
if (is.na(feature2)) {
ggplot(data = beta_diversity_tab, aes_string(x = x_name, y = y_name,
color = feature)) +
geom_point(size = size) +
xlab(paste("PC1:",
round(100*as.numeric(beta_diversity$eig[1]/sum(beta_diversity$eig)), 2),
"%",
sep = " ")) +
ylab(paste("PC2:",
round(100*as.numeric(beta_diversity$eig[2]/sum(beta_diversity$eig)), 2),
"%",
sep = " ")) +
scale_color_manual(values = c("#00AFBB", "#FC4E07", "#7FC97F", "#BEAED4"))
} else {
ggplot(data = beta_diversity_tab, aes_string(x = x_name, y = y_name, color = feature,
shape = feature2)) +
geom_point(size = size) +
xlab(paste("PC1:",
round(100*as.numeric(beta_diversity$eig[1]/sum(beta_diversity$eig)), 2),
"%",
sep = " ")) +
ylab(paste("PC2:",
round(100*as.numeric(beta_diversity$eig[2]/sum(beta_diversity$eig)), 2),
"%",
sep = " ")) +
scale_color_manual(values = c("#00AFBB", "#FC4E07", "#7FC97F", "#BEAED4")) +
scale_shape_manual(values = c(0:6))
}
}
}
# Log2 Fold Change ----------------------------------------------------------------
if (F) {
log2fc <- function(phyloseq, feature, level = NA, p_value = 0.01, size = 2,
save_res = FALSE) {
## This is a function for plotting log2 fold change.
## Arguments:
## phyloseq: A phyloseq object contain otu table, taxonomy table, sample
## metadata and phylogenetic tree.
## feature: The column name of the feature you want to select from
## metadata, e.g. "Phenotype".
require(DESeq2)
## Step 1: Construt table for DESeq2
# Create a string to parse feature argument to DESeq
feature_formula <- paste0("~ ", feature)
# Create a DESeq Data Set
if (is.na(level)) {
dds <- phyloseq_to_deseq2(phyloseq, design = as.formula(feature_formula))
cts <- counts(dds)
geoMeans <- apply(
cts, 1,
function(row) if (all(row == 0)) 0
else exp(sum(log(row[row != 0]))/length(row))
)
dds <- estimateSizeFactors(dds, geoMeans = geoMeans)
} else {
countData <- construct_otu_table(phyloseq, level) %>% t()
colData <- extract_metadata_from_phyloseq(phyloseq = phyloseq, feature = feature)
dds <- DESeqDataSetFromMatrix(countData = countData, colData = colData,
design = as.formula(feature_formula))
}
dds <- DESeq(dds)
## Step 2: Calculate log2 fold change
res <- results(dds, addMLE = FALSE)
res <- res[order(res$padj),]
if (save_res) {
# save res object to current working directory
saveRDS(res, paste0('DESeq2_res_', Sys.Date(), ".rds"))
print('The DESeq result has been saved to current working directory.')
}
## Step 3: Plot fold change
# Create table for plotting
ifelse(is.na(level), var <- "OTU", var <- level)
log2fc <- res %>% as.data.frame() %>% rownames_to_column(var = var) %>%
filter(!is.na(padj))
# Identify differential expression
log2fc$significant = ifelse(log2fc$padj > p_value,
paste0('p-value > ', p_value),
paste0('p-value < ', p_value))
# Extract table that meet the p-value threshold
log2fc_print <- log2fc %>%
.[.$padj < p_value,] %>%
select(!!var, log2FoldChange, padj) %>%
.[order(.[,2], decreasing = TRUE),]
print(log2fc_print)
# EnhancedVolcano plot
require(EnhancedVolcano)
EnhancedVolcano(log2fc, lab = log2fc[[var]], x = "log2FoldChange", y = "padj",
FCcutoff = 1, pCutoff = p_value,
transcriptPointSize = size,
col = c("darkgrey", "#00AFBB", "#FC4E07", "red2"),
legend = c("Not Significant",
paste0("p > ", p_value, ", log2 fold change > 1"),
paste0("p < ", p_value, ", log2 fold change < 1"),
paste0("p < ", p_value, ", log2 fold change > 1"))
)
}
}
# Separate qiime2 taxonomy --------------------------------------------------------
if (F) {
separate_qiime2_taxa <- function(qiime2_taxa_path) {
separate_taxa <- read_tsv(qiime2_taxa_path) %>%
separate(Taxon,
into = c("Kingdom", "Phylum", "Class", "Order", "Family",
"Genus", "Species"),
sep = ";") %>%
select(-Confidence) %>%
column_to_rownames(var = "Feature ID")
return(separate_taxa)
}
}
# Permutation test ----------------------------------------------------------------
if (F) {
#原文:https://blog.csdn.net/u011467621/article/details/47971917
A_group <- c(24, 43, 58, 67, 61, 44, 67, 49, 59, 52, 62, 50)
B_group <- c(42, 43, 65, 26, 33, 41, 19, 54, 42, 20, 17, 60, 37, 42, 55, 28)
# 零假设:A/B group没有显著差异
# 若零假设成立,那么A组数据的分布和B组数据的分布是一样的,也就是服从同个分布
# 将A/B group的值合并,前12个是A group,后16个是B group
a <- c(A_group, B_group)
# 添加factor到对应的值,并生成data frame
group <- factor(c(rep("A", 12), rep("B", 16)))
data <- data.frame(group, a)
# 创建计算A_group mean值减去B_group mean值的function
find.mean <- function (x) {
mean(x[group == "A",2]) - mean(x[group == "B",2])
}
# 随机取12个值作为A,16个值作为B,计算mean值的差,并重复999次,建立分布图
set.seed(100) #随机取值会有波动,所以需要set.seed函数
results <- replicate(999, find.mean(data.frame(group, sample(data[,2]))))
# 查看随机取值的分布情况
hist(results, breaks = 20, prob = TRUE)
lines(density(results))
# 计算假设mean值比真实mean值大的概率
p.value <- length(
results[
results > mean(data[group == "A", 2]) - mean(data[group == "B", 2])
]
) / 1000 #999+1
# 若小于0.5,则说明真实mean值在假设情况中出现的概率小于0.5,原假设不成立,
# A/B group有显著差异
}
# ComplexHeatmap ----------------------------------------------------------------
if (FALSE) {
plot_ComplexHeatmap <- function(df, method = "pearson", row_order = NULL, column_order = NULL,
fontsize = 12) {
# df <- df %>%
# log2() %>%
# apply(., 2, scale) %>% #保留两位小数
# as.data.frame()
cormat <- cor(df, method = method)# %>% signif(digits = 2)
library(ComplexHeatmap)
library(circlize)
col_fun = circlize::colorRamp2(c(-1, 0, 1), c("blue", "white", "red"))
if (is.null(row_order) | is.null(column_order)) {
Heatmap(
cormat,
name = "correlation",
col = col_fun, #set color range
# Add value to each cell
cell_fun = function(j, i, x, y, width, height, fill) {
#"%.1f": 保留一位小数
grid.text(sprintf("%.1f", cormat[i, j]), x, y, gp = gpar(fontsize = fontsize))
}
)
} else {
Heatmap(
cormat,
name = "correlation",
col = col_fun,
cluster_rows = FALSE,
cluster_columns = FALSE,
row_order = row_order,
column_order = column_order,
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(sprintf("%.1f", cormat[i, j]), x, y, gp = gpar(fontsize = fontsize))
}
)
}
}
}
# Divide Genus OTU table to 3 group ---------------------------------------------
if(F) {
divide_genus <- function(
otutable,
samplenames = colnames(otutable),
method = NA,
probs = c(0.35, 0.7)
) {
# divide a vector into 0-35%/35%-70%/70%-100% 3 group
dividedGenus <- NULL
if(is.na(method)) {
otugenus <- otutable %>% rownames_to_column("Genus")
# 数值大于等于70%分位数的genus table
otugenustop <- select(otugenus, Genus)
for(i in samplenames) {
quantilevalue <- quantile(otugenus[[i]], probs = probs) %>% as.numeric()
topi <- otugenus %>% select(Genus, !!i) %>%
filter_if(is.numeric, all_vars(. >= quantilevalue[2]))
otugenustop <- left_join(otugenustop, topi)
}
otugenustop <- otugenustop %>%
filter_if(is.numeric, any_vars(!is.na(.))) %>%
column_to_rownames("Genus")
otugenustop[is.na(otugenustop)] <- 0
# 数值在70%-35%分位数之间的genus table
otugenusmid <- select(otugenus, Genus)
for(i in samplenames) {
quantilevalue <- quantile(otugenus[[i]][otugenus[[i]]!=0], probs = probs) %>% as.numeric()
midi <- otugenus %>% select(Genus, !!i) %>%
filter_if(is.numeric, all_vars(. > quantilevalue[1] & . < quantilevalue[2]))
otugenusmid <- left_join(otugenusmid, midi)
}
otugenusmid <- otugenusmid %>%
filter_if(is.numeric, any_vars(!is.na(.))) %>%
column_to_rownames("Genus")
otugenusmid[is.na(otugenusmid)] <- 0
# 数值小于等于35%分位数的genus table
otugenusbot <- select(otugenus, Genus)
for(i in samplenames) {
quantilevalue <- quantile(otugenus[[i]], probs = probs) %>% as.numeric()
boti <- otugenus %>% select(Genus, !!i) %>%
filter_if(is.numeric, all_vars(. <= quantilevalue[1]))
otugenusbot <- left_join(otugenusbot, boti)
}
otugenusbot <- otugenusbot %>%
filter_if(is.numeric, any_vars(!is.na(.))) %>%
column_to_rownames("Genus")
otugenusbot[is.na(otugenusbot)] <- 0
dividedGenus$top <- otugenustop
dividedGenus$mid <- otugenusmid
dividedGenus$bottom <- otugenusbot
return(dividedGenus)
} else if(method == "mean") {
genus_value = as.numeric(apply(otutable, 1, mean))
} else if(method == "median") {
genus_value = as.numeric(apply(otutable, 1, median))
}
quantilevalue <- quantile(genus_value, probs = probs) %>% as.numeric()
top_genus <- otutable %>%
rownames_to_column("Genus") %>%
mutate(genus_value=genus_value) %>%
select(Genus, genus_value) %>%
filter_if(is.numeric, all_vars(. >= quantilevalue[2])) %>% .$Genus
mid_genus <- otutable %>%
rownames_to_column("Genus") %>%
mutate(genus_value=genus_value) %>%
select(Genus, genus_value) %>%
filter_if(is.numeric, all_vars(. > quantilevalue[1] & . < quantilevalue[2])) %>% .$Genus
bot_genus <- otutable %>%
rownames_to_column("Genus") %>%
mutate(genus_value=genus_value) %>%
select(Genus, genus_value) %>%
filter_if(is.numeric, all_vars(. <= quantilevalue[1])) %>% .$Genus
otugenustop <- otutable %>%
rownames_to_column("Genus") %>%
filter(Genus %in% top_genus) %>%
column_to_rownames("Genus")
otugenusmid <- otutable %>%
rownames_to_column("Genus") %>%
filter(Genus %in% mid_genus) %>%
column_to_rownames("Genus")
otugenusbot <- otutable %>%
rownames_to_column("Genus") %>%
filter(Genus %in% bot_genus) %>%
column_to_rownames("Genus")
dividedGenus$top <- otugenustop
dividedGenus$mid <- otugenusmid
dividedGenus$bottom <- otugenusbot
return(dividedGenus)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.