Also saves a large type strain list for later use if needed
Pro-tip: to search for a given BacDive ID, use ``list.filter(list_holder, General$`BacDive-ID` == "141146")``.
library(BacDive) library(tidyverse) library(usethis) library(magrittr) library(here) library(bacphene) library(rlist)
# Fetch the strain list with the type-strain-only filter switched off.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
list_holder <- getStrains(typestrain_only = FALSE)
# Build the fixture used by the unit tests ----

# The indices below are frozen so the fixture is reproducible; they were
# originally drawn with:
# random_set <- sample(seq(1,length(list_holder)), 10)
random_set <- c(
  46551L, 376L, 78203L, 51907L, 20151L,
  41583L, 11608L, 36060L, 38371L, 6841L
)

# Pull the selected strains in one pass instead of growing a list in a loop.
# `[[` is kept so that an out-of-range index still raises an error.
random_strains <- lapply(random_set, function(i) list_holder[[i]])

# Look up one strain in the local list: the `index`-th hit for `query`.
pick_strain <- function(query, typestrain_only, index = 1L) {
  getStrainLocal(
    list_holder,
    query = query,
    typestrain_only = typestrain_only
  )[[index]]
}

# Hand-picked strains, in the same order the tests expect them.
targeted_strains <- list(
  pick_strain("Acidicapsa dinghuensis", TRUE), # for testing getAbx
  pick_strain("Acinetobacter calcoaceticus", TRUE), # for testing getAbx (antibiogram)
  pick_strain(21374, FALSE), # for testing getAbx
  pick_strain(280, FALSE), # for testing getAbx
  pick_strain(8094, FALSE), # for testing getAbx
  pick_strain(159751, FALSE), # for testing getAbx
  pick_strain(158420, FALSE), # for testing getAbx
  pick_strain("Abyssibacter profundi", TRUE), # for testing getEnzymes
  pick_strain("Bacteroides fragilis", TRUE),
  # for testing getStrainLocal and having multiple strains of the same species
  pick_strain("Bacteroides fragilis", FALSE, index = 2L)
)

test_list <- c(random_strains, targeted_strains)

# Carry over the download date recorded on the full strain list.
attr(test_list, "date_downloaded") <- attr(list_holder, "date_downloaded")

# NOTE(review): write_rds() produces an RDS file even though the path ends in
# ".rda"; the path is left unchanged because the tests read this exact file.
write_rds(test_list, file = here("tests/testthat/test_data.rda"))
Now we want to extract the important bits (gram stain, oxygen tolerance, abx sensitivity / resistance, and urease activity)
Cell morphology -> gram_stain
# Derive the morphology table from the downloaded strains and store it as a
# package dataset, replacing any previously saved copy.
bacdive_morphology <- list_holder %>%
  getMorphology()

usethis::use_data(bacdive_morphology, overwrite = TRUE)
Oxygen tolerance -> aerobic_status
# Derive the oxygen-tolerance table from the downloaded strains and store it
# as a package dataset, replacing any previously saved copy.
bacdive_oxygen <- list_holder %>%
  getOxygen()

usethis::use_data(bacdive_oxygen, overwrite = TRUE)
Here we combine the essential parts of `bacdive_morphology` and `bacdive_oxygen` to get a data frame that is compatible with the abxidx package.
# Merge the morphology and oxygen tables into one phenotype table and store
# it as a package dataset.
bacdive_phenotypes <- getPhenotypes(
  morphology_df = bacdive_morphology,
  oxygen_df = bacdive_oxygen
)

usethis::use_data(bacdive_phenotypes, overwrite = TRUE)

# Idea kept for reference: maybe concatenate multiple entries?
# summarise(
#   gram_stain = paste(gram_stain, collapse = ", "),
#   aerobic_status = paste(aerobic_status, collapse = ", "),
#   ID = paste(ID, collapse = ", ")
# )

# Alternative kept for reference: if we want to exactly match the values for
# abxidx. This turns all the "variable" values for gram_stain into NA's.
# bacdive_phenotypes <- bacdive_phenotypes %>%
#   mutate(gram_stain = case_when(gram_stain %in% "positive" ~ "Gram-positive",
#                                 gram_stain %in% "negative" ~ "Gram-negative"))
`bacdive_abx` contains antibiotics information from BacDive, including multiple strains and details for antibiotic sensitivity / resistance. Antibiotics information that is in the "antibiogram" format is in the `bacdive_antibiogram` data frame. `bacdive_susceptibility` combines the preceding two data frames and contains a simplified version that gets the most common values for each taxon. `bacdive_susceptibility` is compatible with the abxidx package.
# Antibiotic records and antibiogram-format records, both taken from the
# downloaded strains.
bacdive_abx <- getAbx(list_holder)
bacdive_antibiogram <- getAntibiogram(list_holder)

# Kept for reference: which strains have which kind of antibiotic data.
# strains_with_abx <- bacdive_abx %>%
#   select(ID, taxon, rank, type_strain) %>%
#   bind_rows(bacdive_antibiogram %>% select(ID, taxon, rank, type_strain)) %>%
#   distinct() %>%
#   mutate(abx_info = if_else(ID %in% bacdive_abx$ID, T, F)) %>%
#   mutate(antibiogram = if_else(ID %in% bacdive_antibiogram$ID, T, F))

# Stack both sources, then reduce them to the simplified susceptibility table.
# Logical flags are spelled out because `T` and `F` can be reassigned.
bacdive_combined_abx <- bind_rows(bacdive_abx, bacdive_antibiogram)
bacdive_susceptibility <- getSimplifiedAbx(
  data = bacdive_combined_abx,
  extra_info = FALSE,
  most_common = TRUE,
  remove_unknown = TRUE
)

usethis::use_data(bacdive_abx, overwrite = TRUE)
usethis::use_data(bacdive_antibiogram, overwrite = TRUE)
usethis::use_data(bacdive_susceptibility, overwrite = TRUE)
# Enzyme table derived from the downloaded strains, stored as a package
# dataset. Logical flags are spelled out because `T` can be reassigned.
bacdive_enzymes <- getEnzymes(
  list_holder,
  most_common = TRUE,
  remove_unknown = TRUE
)

usethis::use_data(bacdive_enzymes, overwrite = TRUE)
# Load the Shen2021 table shipped in data-raw (column-type message silenced)
# and store it as a package dataset.
Shen2021 <- here("data-raw/Shen2021.csv") %>%
  read_csv(show_col_types = FALSE)

usethis::use_data(Shen2021, overwrite = TRUE)
# Read one manually curated TSV and normalise it:
#   * the `name` column becomes `taxon`
#   * a constant `rank` column is added
#   * "not indicated" in `aerobic_status` becomes NA (existing NAs stay NA)
read_manual_curation <- function(path, rank_label) {
  read_tsv(path) %>%
    rename(taxon = name) %>%
    mutate(
      # `!!` forces the function argument, never a same-named file column.
      rank = !!rank_label,
      aerobic_status = case_when(
        aerobic_status %in% "not indicated" ~ NA_character_,
        !is.na(aerobic_status) ~ aerobic_status
      )
    )
}

manual_curation_species <- read_manual_curation(
  here("data-raw/species_0831.txt"),
  rank_label = "Species"
)
manual_curation_genera <- read_manual_curation(
  here("data-raw/genera_0831.txt"),
  rank_label = "Genus"
)

usethis::use_data(manual_curation_species, overwrite = TRUE)
usethis::use_data(manual_curation_genera, overwrite = TRUE)
# Convert entries such as "6.5" or "6-8" to a single number: split on "-" and
# take the mean of the pieces. Pieces that are not numeric become NA (with the
# usual coercion warning), which makes the whole entry NA.
# vapply() always returns an unnamed double vector, also for zero rows, where
# sapply() returned a list and attached the raw strings as names.
# NOTE(review): a leading minus sign is treated as a separator, so a negative
# entry such as "-5" becomes NA, exactly as in the original code.
range_midpoint <- function(values) {
  pieces <- strsplit(as.character(values), split = "-", fixed = TRUE)
  vapply(
    pieces,
    function(parts) mean(as.numeric(parts)),
    numeric(1),
    USE.NAMES = FALSE
  )
}

# Read the IJSEM table; every listed spelling of "missing" is read as NA.
# Strings are read as factors because the cleaning below edits factor levels.
ijsem <- read.delim(
  here("data-raw", "IJSEM/IJSEM_pheno_db_v1.0.txt"),
  sep = "\t",
  header = TRUE,
  check.names = FALSE,
  fill = TRUE,
  na.strings = c(
    "NA", "", "Not indicated", " Not indicated", "not indicated",
    "Not Indicated", "n/a", "N/A", "Na", "Not given", "not given",
    "Not given for yeasts", "not indicated, available in the online version",
    "Not indicated for yeasts", "Not Stated", "Not described for yeasts",
    "Not determined", "Not determined for yeasts"
  ),
  stringsAsFactors = TRUE
)

# Simplify column names (positional: the file must have exactly these columns).
colnames(ijsem) <- c(
  "Habitat", "Year", "DOI", "rRNA16S", "GC", "Oxygen", "Length", "Width",
  "Motility", "Spore", "MetabAssays", "Genus", "Species", "Strain",
  "pH_optimum", "pH_range", "Temp_optimum", "Temp_range", "Salt_optimum",
  "Salt_range", "Pigment", "Shape", "Aggregation", "FirstPage",
  "CultureCollection", "CarbonSubstrate", "Genome", "Gram", "Subhabitat",
  "Biolog"
)

# Clean the Habitat column by shortening two verbose level labels.
# The second label has no closing parenthesis; it is matched as written.
levels(ijsem$Habitat)[levels(ijsem$Habitat) == "freshwater (river, lake, pond)"] <- "freshwater"
levels(ijsem$Habitat)[levels(ijsem$Habitat) == "freshwater sediment (river, lake, pond"] <- "freshwater sediment"

# Clean the Oxygen column: harmonise labels and fix a misspelling.
levels(ijsem$Oxygen)[levels(ijsem$Oxygen) == "aerobic"] <- "obligate aerobe"
levels(ijsem$Oxygen)[levels(ijsem$Oxygen) == "anaerobic"] <- "obligate anaerobe"
levels(ijsem$Oxygen)[levels(ijsem$Oxygen) == "microerophile"] <- "microaerophile"

# Clean the pH_optimum column, then remove pH values <0 and >10.
ijsem$pH_optimum <- range_midpoint(ijsem$pH_optimum)
ijsem$pH_optimum[ijsem$pH_optimum < 0 | ijsem$pH_optimum > 10] <- NA

# Clean the Temp_optimum and Salt_optimum columns the same way (no range cut).
ijsem$Temp_optimum <- range_midpoint(ijsem$Temp_optimum)
ijsem$Salt_optimum <- range_midpoint(ijsem$Salt_optimum)

# There are some formatting issues that should be solved.
usethis::use_data(ijsem, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.