library(tidyverse) library(usethis)
# Taxonomic ranks accepted in the `rank` column, ordered from the broadest
# rank to the most specific one.
allowed_taxonomic_rank_values <- c(
  "Kingdom",
  "Phylum",
  "Class",
  "Order",
  "Family",
  "Genus",
  "Species"
)
# Values accepted in the `aerobic_status` column after refinement. NA is
# included so that taxa without a definite status pass the `%in%` check.
allowed_aerobic_status_values <- c(
  "obligate anaerobe",
  "facultative anaerobe",
  "aerobe",
  NA
)

# Recode the raw `aerobic_status` labels of a data frame.
#
# df: a data frame with a character column `aerobic_status`.
#
# Returns `df` with `aerobic_status` rewritten as follows; any label not
# mentioned below (including NA) is passed through unchanged.
refine_aerobic_status_values <- function(df) {
  df %>%
    mutate(
      aerobic_status = case_when(
        # "microaerobe" and "microaerobe or anaerobe" are assumed to have genes
        # to neutralize O2 without carrying out aerobic respiration, so they
        # are filed with the unspecified anaerobes as obligate anaerobes.
        aerobic_status %in% c(
          "anaerobe (unspecified)",
          "microaerobe",
          "microaerobe or anaerobe"
        ) ~ "obligate anaerobe",
        # "microaerobe or aerobe" is assumed to be capable of aerobic
        # respiration.
        aerobic_status == "microaerobe or aerobe" ~ "aerobe",
        # Growth across the whole oxygen range, as well as "aerotolerant", is
        # taken to mean facultative anaerobe.
        aerobic_status %in% c(
          "aerobe, microaerobe, or anaerobe",
          "aerotolerant"
        ) ~ "facultative anaerobe",
        # "variable" and "not indicated" give no definite answer, so they
        # become missing values.
        aerobic_status %in% c("variable", "not indicated") ~ NA_character_,
        TRUE ~ aerobic_status
      )
    )
}
# Values accepted in the `gram_stain` column after refinement. NA is included
# so that taxa without a definite result pass the `%in%` check.
allowed_gram_stain_values <- c(
  "Gram-positive",
  "Gram-negative",
  NA
)

# Recode the raw `gram_stain` labels of a data frame.
#
# df: a data frame with a character column `gram_stain`.
#
# Returns `df` with "Gram-variable" and "not indicated" replaced by NA; every
# other label is passed through unchanged.
refine_gram_stain_values <- function(df) {
  df %>%
    mutate(
      gram_stain = case_when(
        gram_stain %in% c("Gram-variable", "not indicated") ~ NA_character_,
        TRUE ~ gram_stain
      )
    )
}
# Genus-level phenotypes, read from a tab-separated file in which the string
# "N/A" marks a missing value. Every row is labelled with rank "Genus", the
# `name` column becomes `taxon`, and the raw labels are refined.
phenotype_genus <- read_tsv("genera_0831.txt", na = "N/A") %>%
  mutate(rank = "Genus") %>%
  select(taxon = name, rank, aerobic_status, gram_stain, doi) %>%
  refine_aerobic_status_values() %>%
  refine_gram_stain_values() %>%
  # Keep a taxon only if at least one of the two phenotypes is known
  filter(!is.na(aerobic_status) | !is.na(gram_stain))

# Tallies of the refined values, for inspection
phenotype_genus %>%
  count(aerobic_status)

phenotype_genus %>%
  count(gram_stain)
# Species-level phenotypes, read from a tab-separated file in which the string
# "N/A" marks a missing value. Every row is labelled with rank "Species", the
# `name` column becomes `taxon`, and the raw labels are refined.
phenotype_species <- read_tsv("species_0831.txt", na = "N/A") %>%
  mutate(rank = "Species") %>%
  select(taxon = name, rank, aerobic_status, gram_stain, doi) %>%
  refine_aerobic_status_values() %>%
  refine_gram_stain_values() %>%
  # Keep a taxon only if at least one of the two phenotypes is known
  filter(!is.na(aerobic_status) | !is.na(gram_stain))

# Species that occur in more than one row
phenotype_species %>%
  count(taxon) %>%
  filter(n > 1)

# Tallies of the refined values, for inspection
phenotype_species %>%
  count(aerobic_status)

phenotype_species %>%
  count(gram_stain)
# Additional phenotype annotations kept in a CSV file; the column-type message
# that read_csv() would otherwise print is switched off.
phenotype_misc <- read_csv(
  file = "phenotype_misc.csv",
  show_col_types = FALSE
)
# Stack the three phenotype tables into one; columns are matched by name.
taxon_phenotypes <- bind_rows(
  list(phenotype_misc, phenotype_genus, phenotype_species)
)

# Tallies of the combined table, for inspection
taxon_phenotypes %>%
  count(rank)

taxon_phenotypes %>%
  count(aerobic_status)

taxon_phenotypes %>%
  count(gram_stain)
# Check for duplicate rows (taxa listed more than once)
# Lists every taxon that occurs in more than one row of the combined table,
# together with its number of occurrences.
taxon_phenotypes %>%
  count(taxon) %>%
  filter(n > 1)
# Check for unexpected taxonomic ranks
# Lists the rows whose rank is not one of allowed_taxonomic_rank_values.
# (`!` binds more loosely than `%in%`, so no extra parentheses are needed.)
taxon_phenotypes %>%
  filter(!rank %in% allowed_taxonomic_rank_values)
# Check for unexpected aerobic status values
# Lists the rows whose aerobic status is not one of
# allowed_aerobic_status_values.
# (`!` binds more loosely than `%in%`, so no extra parentheses are needed.)
taxon_phenotypes %>%
  filter(!aerobic_status %in% allowed_aerobic_status_values)
# Check for unexpected Gram stain values
# Lists the rows whose Gram stain is not one of allowed_gram_stain_values.
# (`!` binds more loosely than `%in%`, so no extra parentheses are needed.)
taxon_phenotypes %>%
  filter(!gram_stain %in% allowed_gram_stain_values)
# Fill colors for the plots: shades 4 through 9 of the 9-class "GnBu"
# ColorBrewer palette, named by elements 2 through 7 of
# allowed_taxonomic_rank_values.
rank_colors <- setNames(
  RColorBrewer::brewer.pal(9, "GnBu")[4:9],
  allowed_taxonomic_rank_values[2:7]
)

# Number of taxa for each combination of rank and (non-missing) aerobic status
n_aerobic <- taxon_phenotypes %>%
  filter(!is.na(aerobic_status)) %>%
  count(rank, aerobic_status) %>%
  rename(value = aerobic_status) %>%
  mutate(
    value = fct_reorder(value, n),
    count_by = "Aerobic status"
  )

# Number of taxa for each combination of rank and (non-missing) Gram stain
n_gram <- taxon_phenotypes %>%
  filter(!is.na(gram_stain)) %>%
  count(rank, gram_stain) %>%
  rename(value = gram_stain) %>%
  mutate(
    value = fct_reorder(value, n),
    count_by = "Gram stain"
  )

# Stacked horizontal bars, one facet row per phenotype. Values are ordered by
# their total count and ranks follow the order used for rank_colors.
bind_rows(n_aerobic, n_gram) %>%
  mutate(
    value = fct_reorder(value, n, .fun = sum),
    rank = fct_relevel(rank, allowed_taxonomic_rank_values[2:7])
  ) %>%
  ggplot(aes(x = n, y = value, fill = rank)) +
  geom_col() +
  facet_grid(count_by ~ ., scales = "free_y", space = "free_y") +
  scale_fill_manual(values = rank_colors) +
  labs(x = "Number of taxa", y = "", fill = "") +
  theme_bw() +
  theme(strip.background = element_blank())

# No plot is passed, so ggsave() writes the most recent plot
ggsave("phenotype_counts.png", width = 8, height = 4, dpi = 300)
# Horizontal bar chart of the number of annotated taxa per rank, colored with
# rank_colors; the legend is hidden because the y axis already names the ranks.
taxon_phenotypes %>%
  count(rank) %>%
  mutate(rank = fct_relevel(rank, allowed_taxonomic_rank_values[2:7])) %>%
  ggplot(aes(x = n, y = rank, fill = rank)) +
  geom_col() +
  scale_fill_manual(values = rank_colors) +
  labs(x = "Number of taxa annotated", y = "") +
  theme_bw() +
  theme(legend.position = "none")

# No plot is passed, so ggsave() writes the most recent plot
ggsave("taxon_counts.png", width = 4, height = 2, dpi = 300)
# Convert the combined table to a plain data.frame and save it with
# usethis::use_data(), replacing any previously saved copy.
taxon_phenotypes <- as.data.frame(taxon_phenotypes)

# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned,
# whereas `TRUE` is a reserved word.
usethis::use_data(taxon_phenotypes, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.