Taxon phenotypes data

library(tidyverse)
library(usethis)

Allowed values

# Taxonomic ranks that a record's `rank` column may take, listed from the
# broadest rank to the narrowest.
allowed_taxonomic_rank_values <- c(
  "Kingdom",
  "Phylum",
  "Class",
  "Order",
  "Family",
  "Genus",
  "Species"
)
# Aerobic status labels accepted in the final table. A missing value is
# allowed too, so it is listed explicitly as a character NA.
allowed_aerobic_status_values <- c(
  "obligate anaerobe", "facultative anaerobe", "aerobe", NA_character_)
# Collapse the free-text labels in the `aerobic_status` column of `df` onto
# the three definite categories ("obligate anaerobe", "facultative anaerobe",
# "aerobe") or NA. Labels not mentioned below pass through untouched.
# Returns `df` with the `aerobic_status` column replaced.
refine_aerobic_status_values <- function (df) {
  mutate(
    df,
    aerobic_status = case_when(
      # "microaerobe" and "microaerobe or anaerobe" are assumed to have genes
      # to neutralize O2 without carrying out aerobic respiration, so they
      # are grouped with the unspecified anaerobes
      aerobic_status %in% c(
        "anaerobe (unspecified)",
        "microaerobe",
        "microaerobe or anaerobe") ~ "obligate anaerobe",
      # "microaerobe or aerobe" is assumed to be capable of aerobic
      # respiration
      aerobic_status == "microaerobe or aerobe" ~ "aerobe",
      # An organism that grows under all three regimes is treated as a
      # facultative anaerobe, and "aerotolerant" is assumed to mean the same
      aerobic_status %in% c(
        "aerobe, microaerobe, or anaerobe",
        "aerotolerant") ~ "facultative anaerobe",
      # "variable" and "not indicated" give no definite answer, so they
      # become missing values
      aerobic_status %in% c("variable", "not indicated") ~ NA_character_,
      # Everything else (including NA) is kept as it is
      TRUE ~ aerobic_status))
}
# Gram stain labels accepted in the final table; a missing value is allowed.
allowed_gram_stain_values <- c(
  "Gram-positive",
  "Gram-negative",
  NA_character_
)
# Blank out the Gram stain labels that give no definite answer: in the
# `gram_stain` column of `df`, "Gram-variable" and "not indicated" become NA
# and every other value is kept. Returns `df` with that column replaced.
refine_gram_stain_values <- function (df) {
  mutate(
    df,
    gram_stain = case_when(
      gram_stain %in% c("Gram-variable", "not indicated") ~ NA_character_,
      TRUE ~ gram_stain))
}

Curated database from Celeste Gaughan

# Genus-level records from the curated table. Cells holding "N/A" are read as
# missing. Only the columns needed downstream are kept, every row is tagged
# with rank "Genus", and the status labels are normalised.
phenotype_genus <- read_tsv(file = "genera_0831.txt", na = "N/A") %>%
  select(taxon = name, aerobic_status, gram_stain, doi) %>%
  mutate(rank = "Genus", .after = taxon) %>%
  refine_aerobic_status_values() %>%
  refine_gram_stain_values() %>%
  # Keep a genus as long as at least one of the two phenotypes is known
  filter(!is.na(aerobic_status) | !is.na(gram_stain))

# Tally the genus records per aerobic status and per Gram stain
count(phenotype_genus, aerobic_status)
count(phenotype_genus, gram_stain)
# Species-level records from the curated table. Cells holding "N/A" are read
# as missing. Only the columns needed downstream are kept, every row is tagged
# with rank "Species", and the status labels are normalised.
phenotype_species <- read_tsv(file = "species_0831.txt", na = "N/A") %>%
  select(taxon = name, aerobic_status, gram_stain, doi) %>%
  mutate(rank = "Species", .after = taxon) %>%
  refine_aerobic_status_values() %>%
  refine_gram_stain_values() %>%
  # Keep a species as long as at least one of the two phenotypes is known
  filter(!is.na(aerobic_status) | !is.na(gram_stain))

# List any species name that occurs on more than one row
count(phenotype_species, taxon) %>%
  filter(n > 1)
# Tally the species records per aerobic status and per Gram stain
count(phenotype_species, aerobic_status)
count(phenotype_species, gram_stain)

Misc database

# Additional records kept in a separate CSV file; the column-type message
# that read_csv would otherwise print is switched off.
phenotype_misc <- read_csv(
  file = "phenotype_misc.csv",
  show_col_types = FALSE
)

Final check

# Stack the three sources into one table: miscellaneous records first, then
# the genus-level and species-level records.
taxon_phenotypes <- list(phenotype_misc, phenotype_genus, phenotype_species) %>%
  bind_rows()
# Tally the combined records per rank, aerobic status and Gram stain
count(taxon_phenotypes, rank)
count(taxon_phenotypes, aerobic_status)
count(taxon_phenotypes, gram_stain)

Check for duplicate taxa

# List every taxon name that occurs on more than one row of the combined
# table, together with its number of occurrences.
count(taxon_phenotypes, taxon) %>%
  filter(n > 1)

Check for unexpected taxonomic ranks

# Show the rows whose rank is not one of the allowed taxonomic ranks.
filter(taxon_phenotypes, !rank %in% allowed_taxonomic_rank_values)

Check for unexpected aerobic status values

# Show the rows whose aerobic status is not one of the allowed values
# (NA is among the allowed values, so missing entries are not listed).
filter(taxon_phenotypes, !aerobic_status %in% allowed_aerobic_status_values)

Check for unexpected Gram stain values

# Show the rows whose Gram stain is not one of the allowed values
# (NA is among the allowed values, so missing entries are not listed).
filter(taxon_phenotypes, !gram_stain %in% allowed_gram_stain_values)
# Fill colours for the plots: the 4th to 9th shades of the 9-colour "GnBu"
# palette, named after the 2nd to 7th allowed ranks (Phylum through Species).
rank_colors <- setNames(
  RColorBrewer::brewer.pal(9, "GnBu")[4:9],
  allowed_taxonomic_rank_values[2:7])

# Number of taxa per rank and aerobic status, ignoring taxa with no aerobic
# status. The status is stored in a generic `value` column (a factor ordered
# by count) and the rows are labelled "Aerobic status" in `count_by`.
n_aerobic <- taxon_phenotypes %>%
  filter(!is.na(aerobic_status)) %>%
  count(rank, value = aerobic_status) %>%
  mutate(
    value = fct_reorder(value, n),
    count_by = "Aerobic status")

# Number of taxa per rank and Gram stain, ignoring taxa with no Gram stain.
# The stain is stored in a generic `value` column (a factor ordered by count)
# and the rows are labelled "Gram stain" in `count_by`.
n_gram <- taxon_phenotypes %>%
  filter(!is.na(gram_stain)) %>%
  count(rank, value = gram_stain) %>%
  mutate(
    value = fct_reorder(value, n),
    count_by = "Gram stain")

# Stacked horizontal bars of taxon counts per phenotype value, coloured by
# rank, with one facet row per phenotype type.
list(n_aerobic, n_gram) %>%
  bind_rows() %>%
  mutate(
    # Order the phenotype values by their total count across ranks
    value = fct_reorder(value, n, .fun = sum),
    # Put the ranks in taxonomic order
    rank = fct_relevel(rank, allowed_taxonomic_rank_values[2:7])) %>%
  ggplot(mapping = aes(x = n, y = value, fill = rank)) +
  geom_col() +
  scale_fill_manual(values = rank_colors) +
  facet_grid(count_by ~ ., scales = "free_y", space = "free_y") +
  labs(x = "Number of taxa", y = "", fill = "") +
  theme_bw() +
  theme(strip.background = element_blank())
# No plot is passed, so ggsave writes the most recent plot
ggsave(filename = "phenotype_counts.png", width = 8, height = 4, dpi = 300)
# Horizontal bars of the number of annotated taxa per rank, coloured by rank
# and drawn without a legend.
count(taxon_phenotypes, rank) %>%
  # Put the ranks in taxonomic order
  mutate(rank = fct_relevel(rank, allowed_taxonomic_rank_values[2:7])) %>%
  ggplot(mapping = aes(x = n, y = rank, fill = rank)) +
  geom_col() +
  scale_fill_manual(values = rank_colors) +
  labs(x = "Number of taxa annotated", y = "") +
  theme_bw() +
  theme(legend.position = "none")
# No plot is passed, so ggsave writes the most recent plot
ggsave(filename = "taxon_counts.png", width = 4, height = 2, dpi = 300)

Export

# Convert the combined table to a plain data.frame and hand it to usethis
# for saving, replacing any previously saved copy.
taxon_phenotypes <- as.data.frame(taxon_phenotypes)
# Spell out TRUE: `T` is an ordinary variable that can be reassigned, whereas
# `TRUE` is a reserved word.
usethis::use_data(taxon_phenotypes, overwrite = TRUE)


tuv292/abx_idx documentation built on Jan. 12, 2023, 12:48 a.m.