explore-classifications.R
In ulrb: Unsupervised Learning Based Definition of Microbial Rare Biosphere

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(ulrb)
library(cluster)
library(dplyr)
library(ggplot2)
library(tidyr)
# a vector with some colors
qualitative_colors <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

## -----------------------------------------------------------------------------
# Load raw OTU table from N-ICE
data("nice_raw", package = "ulrb")

# Change name of first column
nice_clean <- rename(nice_raw, Taxonomy = "X.SampleID")

# Select 16S rRNA amplicon sequencing samples
selected_samples <- c("ERR2044662", "ERR2044663", "ERR2044664",
                      "ERR2044665", "ERR2044666", "ERR2044667",
                      "ERR2044668", "ERR2044669", "ERR2044670")

# Add a column with phylogenetic units ID (OTU in this case)
nice_clean <- mutate(nice_clean, OTU = paste0("OTU_", row_number()))

# Select relevant collumns
nice_clean <- select(nice_clean, selected_samples, OTU, Taxonomy)

# Separate Taxonomy column into each taxonomic level
nice_clean <- separate(nice_clean, 
                       Taxonomy,
                       c("Domain","Kingdom","Phylum",
                         "Class","Order","Family",
                         "Genus","Species"), 
                       sep=";")

# Remove Kingdom column, because it is not used for prokaryotes
nice_clean <- select(nice_clean, -Kingdom)

# Remove eukaryotes
nice_clean <- filter(nice_clean, Domain != "sk__Eukaryota")

# Remove unclassified OTUs at phylum level
nice_clean <- filter(nice_clean, !is.na(Phylum))

# Simplify name
nice <- nice_clean

# Tidy data
nice_tidy <- prepare_tidy_data(nice, 
                               sample_names = selected_samples, 
                               samples_in = "cols")

## -----------------------------------------------------------------------------
rb_default <- define_rb(nice_tidy)

## ----fig.width=5, fig.height=4------------------------------------------------
plot_ulrb_clustering(rb_default, 
                     sample_id = "ERR2044662",
                     taxa_col = "OTU",
                     log_scaled = TRUE)

## ----fig.width=8, fig.height=8------------------------------------------------
plot_ulrb_clustering(rb_default, 
                     taxa_col = "OTU",
                     log_scaled = TRUE,
                     plot_all = TRUE)

## -----------------------------------------------------------------------------
rb_k2 <- define_rb(nice_tidy, classification_vector = c("Rare", "Abundant"))

## ----fig.width = 5, fig.height= 4---------------------------------------------
plot_ulrb_clustering(rb_k2,
                     taxa_col = "OTU",
                     plot_all = TRUE, 
                     log_scaled = TRUE, 
                     colors = c("#009E73", "#F0E442"))

## ----fig.width=15-------------------------------------------------------------
gridExtra::grid.arrange(
  plot_ulrb_clustering(rb_default, 
                     taxa_col = "OTU",
                     log_scaled = TRUE,
                     plot_all = TRUE),
  plot_ulrb_clustering(rb_k2,
                     taxa_col = "OTU",
                     plot_all = TRUE, 
                     log_scaled = TRUE, 
                     colors = c("#009E73", "#F0E442")),
  nrow = 1
)

## -----------------------------------------------------------------------------
#
rb_k4 <- define_rb(nice_tidy, 
                   classification_vector = c("very rare", "rare", "abundant", "very abundant"))
#

## ----fig.width=15, fig.height=4-----------------------------------------------
# One sample as example
plot_ulrb(rb_k4,
          sample_id = "ERR2044662", 
          taxa_col = "OTU",
          colors = c("#009E73", "#F0E442", "grey","#CC79A7"),
          log_scaled = TRUE)

## ----fig.width=15, fig.height=8-----------------------------------------------
# all samples
plot_ulrb(rb_k4,
          taxa_col = "OTU",
          colors = c("#009E73", "#F0E442", "grey","#CC79A7"),
          log_scaled = TRUE,
          plot_all = TRUE)

## -----------------------------------------------------------------------------
#
rb_k5 <- define_rb(nice_tidy, 
                   classification_vector = c("very rare", "rare", "undetermined", "abundant", "very abundant"))

## ----fig.width=15, fig.height=4-----------------------------------------------
# One sample as example
plot_ulrb(rb_k5,
          sample_id = "ERR2044662", 
          taxa_col = "OTU",
          colors = qualitative_colors[1:5],
          log_scaled = TRUE)

## ----fig.width=15, fig.height=8-----------------------------------------------
# All samples
plot_ulrb(rb_k5,
          taxa_col = "OTU",
          colors = qualitative_colors[1:5],
          log_scaled = TRUE,
          plot_all = TRUE)

## -----------------------------------------------------------------------------
#
rb_k1 <- define_rb(nice_tidy, classification_vector = c("rare"))

## ----fig.width=5, fig.height=4------------------------------------------------
plot_ulrb_clustering(rb_k1, 
                     taxa_col = "OTU", 
                     colors = "green4", 
                     plot_all = TRUE, 
                     log_scaled = TRUE)

## -----------------------------------------------------------------------------
rb_sample1 <- nice_tidy %>% filter(Sample == "ERR2044662")

# Calculate maximum k
max_k_sample1 <- rb_sample1 %>% pull(Abundance) %>% unique() %>% length()
#
max_k_sample1
# Improvise a classification vector for maximum k
# that is just any vector with the same length
classification_vector_max_k_sample1 <- seq_along(1:max_k_sample1)
#
rb_sample1_max_k <- 
  define_rb(rb_sample1,
            classification_vector = classification_vector_max_k_sample1)
#
rb_sample1_max_k %>% select(OTU, Classification, Abundance) %>% head(10)

## -----------------------------------------------------------------------------
suggest_k(nice_tidy, detailed = TRUE)  

## -----------------------------------------------------------------------------
suggest_k(nice_tidy, detailed = TRUE, range = 10:20)  

## -----------------------------------------------------------------------------
## One sample
# To get values
check_avgSil(nice_tidy, sample_id = selected_samples[1])

# To plot results
check_avgSil(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE)

## -----------------------------------------------------------------------------
## Davie-Boulding index
# To get values
check_DB(nice_tidy, sample_id = selected_samples[1])

# To plot results
check_DB(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE)

## Calinski-Harabasz index
# To get values
check_CH(nice_tidy, sample_id = selected_samples[1])

# To plot results
check_CH(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE)

## -----------------------------------------------------------------------------
evaluate_sample_k(nice_tidy, sample_id = selected_samples[1], with_plot = TRUE)

## -----------------------------------------------------------------------------
## To get values
evaluate_k(nice_tidy)

## To plot
evaluate_k(nice_tidy, with_plot = TRUE)

## -----------------------------------------------------------------------------
# default option with average Silhouette score
suggest_k(nice_tidy)

# best k for Davies-Bouldin
suggest_k(nice_tidy, index = "Davies-Bouldin")

# best k for Calinski-Harabasz
suggest_k(nice_tidy, index = "Calinski-Harabasz")

## -----------------------------------------------------------------------------
automatic_classification <- define_rb(nice_tidy, automatic = TRUE)

# Plot automatic result

plot_ulrb(automatic_classification, taxa_col = "OTU", plot_all = TRUE)

## -----------------------------------------------------------------------------
more_complex_automatic_classification <- define_rb(nice_tidy, 
                                                   automatic = TRUE,
                                                   index = "Calinski-Harabasz",
                                                   range = 4:6)

## -----------------------------------------------------------------------------
# Plot automatic result
plot_ulrb(more_complex_automatic_classification, 
          plot_all = TRUE, 
          taxa_col = "OTU", 
          colors = qualitative_colors[1:5],
          log_scaled = TRUE)

## -----------------------------------------------------------------------------
# Start by deciding the maximum range across the entire dataset
max_k <- nice_tidy %>%
    filter(Abundance > 0, !is.na(Abundance)) %>%
    group_by(Sample) %>%
    summarise(topK = length(unique(Abundance))) %>%
    ungroup() %>%
    pull(topK) %>%
    min()
# print maximum number of clusters allowed for all samples in the N-ICE dataset
max_k

## -----------------------------------------------------------------------------
evaluate_k(nice_tidy, with_plot = TRUE, range = 2:max_k)

Any scripts or data that you put into this service are public.

ulrb documentation built on April 11, 2025, 5:42 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ulrb
Unsupervised Learning Based Definition of Microbial Rare Biosphere

inst/doc/explore-classifications.R
In ulrb: Unsupervised Learning Based Definition of Microbial Rare Biosphere

Try the ulrb package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ulrb Unsupervised Learning Based Definition of Microbial Rare Biosphere

inst/doc/explore-classifications.R In ulrb: Unsupervised Learning Based Definition of Microbial Rare Biosphere

Try the ulrb package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ulrb
Unsupervised Learning Based Definition of Microbial Rare Biosphere

inst/doc/explore-classifications.R
In ulrb: Unsupervised Learning Based Definition of Microbial Rare Biosphere