In scottdaniel/bacphene: Imports Bacterial Phenotype Data Into R

Code to prepare datasets

Also saves a large type strain list for later use if needed

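Pro-tip: to search for a given BacDive ID, use list.filter(list_holder, General$`BacDive-ID` == "141146"). The backticks are required because the field name contains a hyphen; without them R parses the expression as General$BacDive minus ID.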

library(BacDive)
library(tidyverse)
library(usethis)
library(magrittr)
library(here)
library(bacphene)
library(rlist)

# Download the strain records; typestrain_only = FALSE keeps non-type strains too.
list_holder <- getStrains(typestrain_only = FALSE)

# Build a small fixture for the unit tests.
# The positions below were originally drawn with
#   random_set <- sample(seq_along(list_holder), 10)
# and are hard-coded here instead of being re-sampled on every run.
random_set <- c(46551L, 376L, 78203L, 51907L, 20151L, 41583L, 11608L, 36060L, 38371L, 6841L)
test_list <- list()
for (i in random_set) {
  test_list <- list.append(test_list, list_holder[[i]])
}

# Records used to test getAbx: two type strains looked up by species name
# (the second one covers the antibiogram format) ...
test_list <- list.append(
  test_list,
  getStrainLocal(list_holder, query = "Acidicapsa dinghuensis", typestrain_only = TRUE)[[1]]
)
test_list <- list.append(
  test_list,
  getStrainLocal(list_holder, query = "Acinetobacter calcoaceticus", typestrain_only = TRUE)[[1]]
)
# ... followed by five records looked up by numeric query, appended in this order.
for (abx_query in c(21374, 280, 8094, 159751, 158420)) {
  test_list <- list.append(
    test_list,
    getStrainLocal(list_holder, query = abx_query, typestrain_only = FALSE)[[1]]
  )
}

# Record used to test getEnzymes.
test_list <- list.append(
  test_list,
  getStrainLocal(list_holder, query = "Abyssibacter profundi", typestrain_only = TRUE)[[1]]
)

# Two records of the same species, for testing getStrainLocal when several
# strains share a species: the first type-strain hit and the second hit of the
# unrestricted search.
test_list <- list.append(
  test_list,
  getStrainLocal(list_holder, query = "Bacteroides fragilis", typestrain_only = TRUE)[[1]]
)
test_list <- list.append(
  test_list,
  getStrainLocal(list_holder, query = "Bacteroides fragilis", typestrain_only = FALSE)[[2]]
)

# Carry the download date over to the fixture.
attr(test_list, "date_downloaded") <- attr(list_holder, "date_downloaded")

# NOTE: write_rds() writes a single object in RDS format even though the file
# name ends in .rda, so the fixture has to be read back with readRDS() /
# read_rds(), not load(). The path is left unchanged on purpose.
write_rds(test_list, file = here("tests/testthat/test_data.rda"))

Now we want to extract the important bits: Gram stain, oxygen tolerance, antibiotic (abx) sensitivity / resistance, and urease activity.

bacdive_morphology

Cell morphology -> gram_stain

# Morphology table computed by getMorphology() from the downloaded strain records.
bacdive_morphology <- list_holder %>%
  getMorphology()

# Store the table as package data, replacing any previously saved copy.
usethis::use_data(
  bacdive_morphology,
  overwrite = TRUE
)

bacdive_oxygen

Oxygen tolerance -> aerobic_status

# Oxygen-tolerance table computed by getOxygen() from the downloaded strain records.
bacdive_oxygen <- list_holder %>%
  getOxygen()

# Store the table as package data, replacing any previously saved copy.
usethis::use_data(
  bacdive_oxygen,
  overwrite = TRUE
)

bacdive_phenotypes

Here we are combining the essential parts of bacdive_morphology and bacdive_oxygen to get a dataframe that is compatible with the abxidx package.

# Combine the morphology and oxygen tables into one phenotype table.
bacdive_phenotypes <- getPhenotypes(
  oxygen_df = bacdive_oxygen,
  morphology_df = bacdive_morphology
)

# Store the table as package data, replacing any previously saved copy.
usethis::use_data(
  bacdive_phenotypes,
  overwrite = TRUE
)

# maybe concatenate multiple entries?
# summarise(
#   gram_stain = paste(gram_stain, collapse = ", "),
#   aerobic_status = paste(aerobic_status, collapse = ", "),
#   ID = paste(ID, collapse = ", ")
# )

# If we want to exactly match the values for abxidx
# bacdive_phenotypes <- bacdive_phenotypes %>%
#   mutate(gram_stain = case_when(gram_stain %in% "positive" ~ "Gram-positive",
#                                 gram_stain %in% "negative" ~ "Gram-negative"))
# this turns all the "variable" values for gram_stain into NA's

bacdive_susceptibility

bacdive_abx contains antibiotics information from bacdive, including multiple strains and details for antibiotic sensitivity / resistance. Antibiotics information that is in the "antibiogram" format is in the bacdive_antibiogram data frame. bacdive_susceptibility combines the preceding two data frames and contains a simplified version that gets the most common values for each taxon. bacdive_susceptibility is compatible with the abxidx package.

# Per-strain antibiotic tables: one from getAbx() and one from getAntibiogram().
bacdive_abx <- getAbx(list_holder)
bacdive_antibiogram <- getAntibiogram(list_holder)

# Optional overview (not run): which strains appear in each of the two tables.
# strains_with_abx <- bacdive_abx %>%
#   select(ID, taxon, rank, type_strain) %>%
#   bind_rows(bacdive_antibiogram %>% select(ID, taxon, rank, type_strain)) %>%
#   distinct() %>%
#   mutate(abx_info = ID %in% bacdive_abx$ID) %>%
#   mutate(antibiogram = ID %in% bacdive_antibiogram$ID)

# Stack both tables so they can be simplified together.
bacdive_combined_abx <- bind_rows(bacdive_abx, bacdive_antibiogram)

# Simplified table built from the combined data. TRUE/FALSE are spelled out
# because T and F are ordinary variables that can be reassigned.
bacdive_susceptibility <- getSimplifiedAbx(
  data = bacdive_combined_abx,
  extra_info = FALSE,
  most_common = TRUE,
  remove_unknown = TRUE
)

# Store all three tables as package data, replacing any previously saved copies.
usethis::use_data(bacdive_abx, overwrite = TRUE)
usethis::use_data(bacdive_antibiogram, overwrite = TRUE)
usethis::use_data(bacdive_susceptibility, overwrite = TRUE)

bacdive_enzymes

# Enzyme table computed by getEnzymes() from the downloaded strain records.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
bacdive_enzymes <- getEnzymes(
  list_holder,
  most_common = TRUE,
  remove_unknown = TRUE
)

# Store the table as package data, replacing any previously saved copy.
usethis::use_data(bacdive_enzymes, overwrite = TRUE)

Shen2021

# Read the Shen2021 CSV shipped in data-raw/ without printing the column specification.
Shen2021 <- here("data-raw/Shen2021.csv") %>%
  read_csv(show_col_types = FALSE)

# Store the table as package data, replacing any previously saved copy.
usethis::use_data(
  Shen2021,
  overwrite = TRUE
)

Celeste Gaughan's data

# Read one manually curated, tab-separated phenotype table and normalise it.
#
# path       Path of the file to read; it must contain the columns `name` and
#            `aerobic_status`.
# rank_label Value written to the new `rank` column of every row.
#
# Returns the table with `name` renamed to `taxon`, the `rank` column added and
# the placeholder "not indicated" in `aerobic_status` replaced by NA. All other
# `aerobic_status` values, including existing NAs, are kept as they are.
read_manual_curation <- function(path, rank_label) {
  read_tsv(path) %>%
    rename(taxon = name) %>%
    # `!!` injects the argument's value so a column that happens to be called
    # `rank_label` cannot shadow it.
    mutate(rank = !!rank_label) %>%
    mutate(aerobic_status = case_when(aerobic_status %in% "not indicated" ~ NA_character_,
                                      !is.na(aerobic_status) ~ aerobic_status))
}

manual_curation_species <- read_manual_curation(here("data-raw/species_0831.txt"), "Species")
manual_curation_genera <- read_manual_curation(here("data-raw/genera_0831.txt"), "Genus")

# Store both tables as package data, replacing any previously saved copies.
usethis::use_data(manual_curation_species, overwrite = TRUE)
usethis::use_data(manual_curation_genera, overwrite = TRUE)

IJSEM

# Read the IJSEM phenotype table (tab-separated). Every spelling of
# "not available" listed in na.strings is read as NA, and text columns are read
# as factors so that their levels can be recoded below.
ijsem <- read.delim(
  here("data-raw", "IJSEM/IJSEM_pheno_db_v1.0.txt"),
  sep = "\t",
  header = TRUE,
  check.names = FALSE,
  fill = TRUE,
  na.strings = c(
    "NA", "", "Not indicated", " Not indicated", "not indicated",
    "Not Indicated", "n/a", "N/A", "Na", "Not given", "not given",
    "Not given for yeasts", "not indicated, available in the online version",
    "Not indicated for yeasts", "Not Stated", "Not described for yeasts",
    "Not determined", "Not determined for yeasts"
  ),
  stringsAsFactors = TRUE
)

# Simplify the column names (positional: the file must have exactly 30 columns).
colnames(ijsem) <- c(
  "Habitat", "Year", "DOI", "rRNA16S", "GC", "Oxygen",
  "Length", "Width", "Motility", "Spore", "MetabAssays", "Genus", "Species",
  "Strain", "pH_optimum", "pH_range", "Temp_optimum", "Temp_range",
  "Salt_optimum", "Salt_range", "Pigment", "Shape", "Aggregation", "FirstPage",
  "CultureCollection", "CarbonSubstrate", "Genome", "Gram", "Subhabitat", "Biolog"
)

# Clean the Habitat column by shortening two level labels.
# NOTE(review): the second label has no closing parenthesis; presumably this
# mirrors the raw file - confirm before "fixing" it.
levels(ijsem$Habitat)[levels(ijsem$Habitat) == "freshwater (river, lake, pond)"] <- "freshwater"
levels(ijsem$Habitat)[levels(ijsem$Habitat) == "freshwater sediment (river, lake, pond"] <- "freshwater sediment"

# Clean the Oxygen column by recoding three level labels.
levels(ijsem$Oxygen)[levels(ijsem$Oxygen) == "aerobic"] <- "obligate aerobe"
levels(ijsem$Oxygen)[levels(ijsem$Oxygen) == "anaerobic"] <- "obligate anaerobe"
levels(ijsem$Oxygen)[levels(ijsem$Oxygen) == "microerophile"] <- "microaerophile"

# Convert entries such as "6-8" to the mean of their "-"-separated parts.
#
# values  Vector (factor, character or numeric) of single values or ranges.
#
# Returns an unnamed numeric vector of the same length. Entries that are not
# numeric become NA (as.numeric() warns about the coercion). vapply() keeps the
# result numeric even for empty input and, unlike sapply(), does not attach the
# raw strings as element names.
# NOTE: a leading minus sign is treated as a separator as well, so an entry
# such as "-5" is split into "" and "5" and ends up as NA.
mean_of_range <- function(values) {
  vapply(
    as.character(values),
    function(entry) {
      mean(as.numeric(unlist(strsplit(entry, split = "-", fixed = TRUE))))
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}

# Clean the pH_optimum column and drop pH values < 0 or > 10.
ijsem$pH_optimum <- mean_of_range(ijsem$pH_optimum)
ph_out_of_range <- !is.na(ijsem$pH_optimum) &
  (ijsem$pH_optimum < 0 | ijsem$pH_optimum > 10)
ijsem$pH_optimum[ph_out_of_range] <- NA

# Clean the Temp_optimum column.
ijsem$Temp_optimum <- mean_of_range(ijsem$Temp_optimum)

# Clean the Salt_optimum column.
ijsem$Salt_optimum <- mean_of_range(ijsem$Salt_optimum)
# There are some formatting issues that should still be solved.

# Store the table as package data, replacing any previously saved copy.
usethis::use_data(ijsem, overwrite = TRUE)