Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----setup--------------------------------------------------------------------
library(ulrb)
library(cluster)
library(dplyr)
library(ggplot2)
library(tidyr)
# a vector with some colors
qualitative_colors <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
## -----------------------------------------------------------------------------
# Load raw OTU table from N-ICE
data("nice_raw", package = "ulrb")
# Change name of first column
nice_clean <- rename(nice_raw, Taxonomy = "X.SampleID")
# Select 16S rRNA amplicon sequencing samples
selected_samples <- c("ERR2044662", "ERR2044663", "ERR2044664",
"ERR2044665", "ERR2044666", "ERR2044667",
"ERR2044668", "ERR2044669", "ERR2044670")
# Add a column with phylogenetic units ID (OTU in this case)
nice_clean <- mutate(nice_clean, OTU = paste0("OTU_", row_number()))
# Select relevant collumns
nice_clean <- select(nice_clean, selected_samples, OTU, Taxonomy)
# Separate Taxonomy column into each taxonomic level
nice_clean <- separate(nice_clean,
Taxonomy,
c("Domain","Kingdom","Phylum",
"Class","Order","Family",
"Genus","Species"),
sep=";")
# Remove Kingdom column, because it is not used for prokaryotes
nice_clean <- select(nice_clean, -Kingdom)
# Remove eukaryotes
nice_clean <- filter(nice_clean, Domain != "sk__Eukaryota")
# Remove unclassified OTUs at phylum level
nice_clean <- filter(nice_clean, !is.na(Phylum))
# Simplify name
nice <- nice_clean
# Tidy data
nice_tidy <- prepare_tidy_data(nice,
sample_names = selected_samples,
samples_in = "cols")
## -----------------------------------------------------------------------------
rb_default <- define_rb(nice_tidy)
## ----fig.width=5, fig.height=4------------------------------------------------
plot_ulrb_clustering(rb_default,
sample_id = "ERR2044662",
taxa_col = "OTU",
log_scaled = TRUE)
## ----fig.width=8, fig.height=8------------------------------------------------
plot_ulrb_clustering(rb_default,
taxa_col = "OTU",
log_scaled = TRUE,
plot_all = TRUE)
## -----------------------------------------------------------------------------
rb_k2 <- define_rb(nice_tidy, classification_vector = c("Rare", "Abundant"))
## ----fig.width = 5, fig.height= 4---------------------------------------------
plot_ulrb_clustering(rb_k2,
taxa_col = "OTU",
plot_all = TRUE,
log_scaled = TRUE,
colors = c("#009E73", "#F0E442"))
## ----fig.width=15-------------------------------------------------------------
gridExtra::grid.arrange(
plot_ulrb_clustering(rb_default,
taxa_col = "OTU",
log_scaled = TRUE,
plot_all = TRUE),
plot_ulrb_clustering(rb_k2,
taxa_col = "OTU",
plot_all = TRUE,
log_scaled = TRUE,
colors = c("#009E73", "#F0E442")),
nrow = 1
)
## -----------------------------------------------------------------------------
#
rb_k4 <- define_rb(nice_tidy,
classification_vector = c("very rare", "rare", "abundant", "very abundant"))
#
## ----fig.width=15, fig.height=4-----------------------------------------------
# One sample as example
plot_ulrb(rb_k4,
sample_id = "ERR2044662",
taxa_col = "OTU",
colors = c("#009E73", "#F0E442", "grey","#CC79A7"),
log_scaled = TRUE)
## ----fig.width=15, fig.height=8-----------------------------------------------
# all samples
plot_ulrb(rb_k4,
taxa_col = "OTU",
colors = c("#009E73", "#F0E442", "grey","#CC79A7"),
log_scaled = TRUE,
plot_all = TRUE)
## -----------------------------------------------------------------------------
#
rb_k5 <- define_rb(nice_tidy,
classification_vector = c("very rare", "rare", "undetermined", "abundant", "very abundant"))
## ----fig.width=15, fig.height=4-----------------------------------------------
# One sample as example
plot_ulrb(rb_k5,
sample_id = "ERR2044662",
taxa_col = "OTU",
colors = qualitative_colors[1:5],
log_scaled = TRUE)
## ----fig.width=15, fig.height=8-----------------------------------------------
# All samples
plot_ulrb(rb_k5,
taxa_col = "OTU",
colors = qualitative_colors[1:5],
log_scaled = TRUE,
plot_all = TRUE)
## -----------------------------------------------------------------------------
#
rb_k1 <- define_rb(nice_tidy, classification_vector = c("rare"))
## ----fig.width=5, fig.height=4------------------------------------------------
plot_ulrb_clustering(rb_k1,
taxa_col = "OTU",
colors = "green4",
plot_all = TRUE,
log_scaled = TRUE)
## -----------------------------------------------------------------------------
rb_sample1 <- nice_tidy %>% filter(Sample == "ERR2044662")
# Calculate maximum k
max_k_sample1 <- rb_sample1 %>% pull(Abundance) %>% unique() %>% length()
#
max_k_sample1
# Improvise a classification vector for maximum k
# that is just any vector with the same length
classification_vector_max_k_sample1 <- seq_along(1:max_k_sample1)
#
rb_sample1_max_k <-
define_rb(rb_sample1,
classification_vector = classification_vector_max_k_sample1)
#
rb_sample1_max_k %>% select(OTU, Classification, Abundance) %>% head(10)
## -----------------------------------------------------------------------------
suggest_k(nice_tidy, detailed = TRUE)
## -----------------------------------------------------------------------------
suggest_k(nice_tidy, detailed = TRUE, range = 10:20)
## -----------------------------------------------------------------------------
## One sample
# To get values
check_avgSil(nice_tidy, sample_id = selected_samples[1])
# To plot results
check_avgSil(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE)
## -----------------------------------------------------------------------------
## Davie-Boulding index
# To get values
check_DB(nice_tidy, sample_id = selected_samples[1])
# To plot results
check_DB(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE)
## Calinski-Harabasz index
# To get values
check_CH(nice_tidy, sample_id = selected_samples[1])
# To plot results
check_CH(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE)
## -----------------------------------------------------------------------------
evaluate_sample_k(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE)
## -----------------------------------------------------------------------------
## To get values
evaluate_k(nice_tidy)
## To plot
evaluate_k(nice_tidy, with_plot = TRUE)
## -----------------------------------------------------------------------------
# default option with average Silhouette score
suggest_k(nice_tidy)
# best k for Davies-Bouldin
suggest_k(nice_tidy, index = "Davies-Bouldin")
# best k for Calinski-Harabasz
suggest_k(nice_tidy, index = "Calinski-Harabasz")
## -----------------------------------------------------------------------------
automatic_classification <- define_rb(nice_tidy, automatic = TRUE)
# Plot automatic result
plot_ulrb(automatic_classification, taxa_col = "OTU", plot_all = TRUE)
## -----------------------------------------------------------------------------
more_complex_automatic_classification <- define_rb(nice_tidy,
automatic = TRUE,
index = "Calinski-Harabasz",
range = 4:6)
## -----------------------------------------------------------------------------
# Plot automatic result
plot_ulrb(more_complex_automatic_classification,
plot_all = TRUE,
taxa_col = "OTU",
colors = qualitative_colors[1:5],
log_scaled = TRUE)
## -----------------------------------------------------------------------------
# Start by deciding the maximum range across the entire dataset
max_k <- nice_tidy %>%
filter(Abundance > 0, !is.na(Abundance)) %>%
group_by(Sample) %>%
summarise(topK = length(unique(Abundance))) %>%
ungroup() %>%
pull(topK) %>%
min()
# print maximum number of clusters allowed for all samples in the N-ICE dataset
max_k
## -----------------------------------------------------------------------------
evaluate_k(nice_tidy, with_plot = TRUE, range = 2:max_k)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.