# Names of the taxon groups used in the `taxon` column of the classification
# table built below; stored as package data.
taxa_supported = c("amphibian", "bird", "fish", "mammal", "plant", "reptile", "shark_ray", "bee", "butterfly")
# spell out TRUE: `T` is an ordinary variable and can be reassigned
usethis::use_data(taxa_supported, overwrite = TRUE)
library(tidyverse)
library(ape)
library(tidytree)
# classifications of tips in the mega-trees ----
# # plants ----
# load(rawConnection(RCurl::getBinaryURL("https://raw.githubusercontent.com/jinyizju/V.PhyloMaker/master/data/nodes.info.1.rda")))
# classification_plants = select(nodes.info.1, genus, family) %>%
# unique() %>%
# filter(genus != "") %>%
# as_tibble()
# # by looking at the phylogeny, two species do not have classification info:
# # Malaisia_scandens, Lithraea_molleoides
# # https://en.wikipedia.org/wiki/Trophis_scandens
# # tips$family[tips$species == "Malaisia_scandens"] = "Euphorbiaceae" # wiki
# # but the plant list said it is an annoymous...which makes me wonder how
# # V.PhyloMaker did their name standardization. I probably should do it by myself.
# # http://www.theplantlist.org/tpl1.1/record/kew-5837
# # tips$family[tips$species == "Lithraea_molleoides"] = "Anacardiaceae" # wiki
# # https://en.wikipedia.org/wiki/Lithraea_molleoides
# if(!"Malaisia" %in% classification_plants$genus)
# classification_plants = add_row(classification_plants,
# genus = "Malaisia",
# family = "Euphorbiaceae")
# if(!"Lithraea" %in% classification_plants$genus)
# classification_plants = add_row(classification_plants,
# genus = "Lithraea",
# family = "Anacardiaceae")
# # all genus and family from The Plant List ----
# tpl_family = xml2::read_html("http://theplantlist.org/1.1/browse/-/") %>%
# rvest::html_nodes(".family") %>%
# rvest::html_text() # 652 families
#
# get_sp_per_family = function(x = "Didiereaceae"){
# cat(x, "\t")
# base_url = paste0("http://theplantlist.org/1.1/browse/A/", x, "/", x, ".csv")
# out = try(read_csv(base_url))
# if(inherits(out, "try-error")){
# base_url = paste0("http://theplantlist.org/1.1/browse/B/", x, "/", x, ".csv")
# out = try(read_csv(base_url))
# }
# if(inherits(out, "try-error")){
# base_url = paste0("http://theplantlist.org/1.1/browse/P/", x, "/", x, ".csv")
# out = try(read_csv(base_url))
# }
# if(inherits(out, "try-error")){
# base_url = paste0("http://theplantlist.org/1.1/browse/G/", x, "/", x, ".csv")
# out = try(read_csv(base_url))
# }
# out = try(out %>%
# select(genus = Genus, family = Family) %>%
# unique())
# out
# }
# classification_plant = map(tpl_family, get_sp_per_family)
# classification_plant_TPL = bind_rows(classification_plant)
# usethis::use_data(classification_plant_TPL)
# classification_plant = bind_rows(classification_plant_TPL,
# # from Jin & Qian, 2019
# tibble::tribble(~genus,~family,
# "Davilanthus","Asteraceae",
# "Ewartiothamnus","Asteraceae",
# "Myrovernix","Asteraceae",
# "Gongyloglossa","Asteraceae",
# "Laevicarpa","Asteraceae",
# "Monticapra","Asteraceae",
# "Leucosyris","Asteraceae",
# "Kieslingia","Asteraceae",
# "Tephrothamnus","Asteraceae",
# "Kurziella","Asteraceae",
# "Sampera","Asteraceae",
# "Platycarphella","Asteraceae",
# "Pseudocodon","Campanulaceae",
# "Pankycodon","Campanulaceae",
# "Himalacodon","Campanulaceae",
# "Rivasmartinezia","Apiaceae",
# "Schultzia","Apiaceae",
# "Rughidia","Apiaceae",
# "Szovitsia","Apiaceae",
# "Spuriopimpinella","Apiaceae",
# "Trichera","Caprifoliaceae",
# "Erythranthe","Phrymaceae",
# "Diceratotheca","Acanthaceae",
# "Chayamaritia","Gesneriaceae",
# "Somrania","Gesneriaceae",
# "Tribounia","Gesneriaceae",
# "Lesia","Gesneriaceae",
# "Trichodrymonia","Gesneriaceae",
# "Johnstonella","Boraginaceae",
# "Greeneocharis","Boraginaceae",
# "Foonchewia","Rubiaceae",
# "Edrastima","Rubiaceae",
# "Dimetia","Rubiaceae",
# "Rubiaceae","Rubiaceae",
# "Rhachicallis","Rubiaceae",
# "Anemotrochus","Apocynaceae",
# "Monsanima","Apocynaceae",
# "Calciphila","Apocynaceae",
# "Richtersveldia","Apocynaceae",
# "White-sloanea","Apocynaceae",
# "Agiortia","Ericaceae",
# "Acrothamnus","Ericaceae",
# "Leptecophylla","Ericaceae",
# "Pleioluma","Sapotaceae",
# "Bemangidia","Sapotaceae",
# "Kewa","Kewaceae",
# "Pseudocherleria","Caryophyllaceae",
# "Sedobassia","Amaranthaceae",
# "Bactria","Polygonaceae",
# "Solori","Fabaceae",
# "Oberholzeria","Fabaceae",
# "Gabonius","Fabaceae",
# "Symbegonia","Begoniaceae",
# "Synostemon","Phyllanthaceae",
# "Gitara","Euphorbiaceae",
# "Hartogiopsis","Celastraceae",
# "Thelypodieae","Brassicaceae",
# "Phyllolepidum","Brassicaceae",
# "Kitaibela","Malvaceae",
# "Anthocarapa","Meliaceae",
# "Tetracarpaea","Tetracarpaeaceae",
# "Eucarpha","Proteaceae",
# "Oncidiinae","Orchidaceae",
# "Schlimmia","Orchidaceae",
# "Orthochilus","Orchidaceae",
# "Pendulorchis","Orchidaceae",
# "Neooreophilus","Orchidaceae",
# "Dracontia","Orchidaceae",
# "Sansonia","Orchidaceae",
# "Danxiaorchis","Orchidaceae",
# "Orchidaceae","Orchidaceae",
# "Tsaiorchis","Orchidaceae",
# "Dithrix","Orchidaceae",
# "Sinocurculigo","Hypoxidaceae",
# "Dupontiopsis","Poaceae",
# "Koordersiochloa","Poaceae",
# "Calliscirpus","Cyperaceae",
# "Wallisia","Bromeliaceae",
# "Stigmatodon","Bromeliaceae",
# "Zizkaea","Bromeliaceae",
# "Josemania","Bromeliaceae",
# "Borneocola","Zingiberaceae",
# "Ripogonum","Ripogonaceae",
# "Onixotis","Colchicaceae",
# "Schottarum","Araceae",
# "Fenestratarum","Araceae",
# "Hottarum","Araceae",
# "Guamia","Annonaceae",
# "Winitia","Annonaceae",
# "Huberantha","Annonaceae",
# "Sirdavidia","Annonaceae",
# "Hypodematium","Hypodematiaceae",
# "Desmophlebium","Desmophlebiaceae")
# )
# filter(classification_plant_TPL, genus %in% filter(classification_plant_TPL, duplicated(genus))$genus)
# any(duplicated(classification_plant_TPL$genus)) # all genus monophyletic? No...
# fish ----
# Genus/family pairs for fishes, taken from the taxonomy table downloaded from
# fishtreeoflife.org.
fishurl3 = "https://fishtreeoflife.org/downloads/PFC_taxonomy.csv.xz"
tempf2 = tempfile()
download.file(fishurl3, tempf2)
fish_names = read.csv(tempf2, stringsAsFactors = FALSE) %>% as_tibble() %>%
  select(genus.species, genus, family)
unlink(tempf2)
# sanity check (printed only): dropping everything after a first space in
# `genus` should leave the column unchanged
fish_names = mutate(fish_names, genus2 = gsub("^([-A-Za-z]*) .*$", "\\1", genus))
all(fish_names$genus == fish_names$genus2)
# species names use "_" instead of a space
fish_names = select(fish_names, -genus2) %>%
  rename(species = genus.species) %>%
  mutate(species = gsub(" ", "_", species))
classification_fish = select(fish_names, genus, family) %>%
  unique()
any(duplicated(classification_fish$genus)) # all genus monophyletic? T
# birds ----
# Genus/family pairs from the HBW-BirdLife checklist (v3), completed with the
# genera that only appear in the BirdTree master taxonomy.
tempf = tempfile(fileext = ".zip")
download.file("http://datazone.birdlife.org/userfiles/file/Species/Taxonomy/HBW-BirdLife_Checklist_v3_Nov18.zip",
              tempf)
unzip(tempf, list = TRUE)
# `files` spelled out in full (the original `file =` relied on partial argument
# matching); the spreadsheet is extracted into the working directory and is
# removed again right after it has been read
unzip(tempf, files = "HBW-BirdLife_Checklist_Version_3.xlsx")
bird_names = readxl::read_excel("HBW-BirdLife_Checklist_Version_3.xlsx", skip = 1)
unlink(tempf)
unlink("HBW-BirdLife_Checklist_Version_3.xlsx")
bird_names = unique(dplyr::select(bird_names, species = `Scientific name`, family = `Family name`)) %>%
  mutate(species = gsub(" ", "_", species))
# genus = part of the species name before the first "_"
bird_names = mutate(bird_names, genus = gsub("^([-A-Za-z]*)_.*$", "\\1", species))
classification_bird = select(bird_names, genus, family) %>% unique()
# BirdTree master taxonomy: tip label and Latin family name
tips = read_csv("https://data.vertlife.org/birdtree/BLIOCPhyloMasterTax.csv") %>%
  select(species = TipLabel, family = BLFamilyLatin) %>%
  mutate(genus = gsub("^([-A-Za-z]*)_.*$", "\\1", species))
# interactive comparisons of the two sources (results are only printed)
select(tips, genus, family) %>% unique()
all(tips$genus %in% classification_bird$genus)
setdiff(tips$genus, classification_bird$genus)
setdiff(classification_bird$genus, tips$genus)
full_join(unique(select(tips, genus, family)), classification_bird)
# filter(classification_bird, genus == "Bias")
# filter(tips, genus == "Bias")
# It seems the online version of birdlife is more accurate, if a genus has different family,
# use birdlife's version.
# genera that are in the BirdTree table but not in the BirdLife checklist
classification_bird2 = filter(unique(select(tips, genus, family)),
                              !genus %in% classification_bird$genus)
filter(tips, genus %in%
         classification_bird2$genus[duplicated(classification_bird2$genus)])
# double checked, and it should be an error,
# genus Chlorothraupis belongs to family Cardinalidae
classification_bird2 = filter(classification_bird2, !(genus == "Chlorothraupis" & family == "Thraupidae"))
classification_bird = bind_rows(
  classification_bird,
  classification_bird2
)
any(duplicated(classification_bird$genus))
# mammals ----
# Species, genus and family (version 1.2 columns) from the PHYLACINE synonymy
# table; the genus/family pairs are tagged with taxon = "mammal".
names_mammal = read_csv("https://raw.githubusercontent.com/MegaPast2Future/PHYLACINE_1.2/master/Data/Taxonomy/Synonymy_table_valid_species_only.csv") %>%
  select(species = Binomial.1.2, genus = Genus.1.2, family = Family.1.2) %>%
  unique()
classification_mammal = names_mammal %>%
  select(genus, family) %>%
  unique() %>%
  mutate(taxon = "mammal")
# printed check: does any genus occur more than once?
any(duplicated(classification_mammal$genus))
# combine classifications ----
# Stack the per-taxon genus/family tables into one table with a `taxon` column.
# NOTE(review): in the visible script `classification_plants` is only created
# inside the commented-out plants section, so it has to exist in the session
# already -- confirm where it comes from.
classifications = bind_rows(
  mutate(classification_plants, taxon = "plant"),
  mutate(classification_fish, taxon = "fish"),
  mutate(classification_bird, taxon = "bird"),
  classification_mammal
) %>%
  # other genus based on later tests
  add_row(genus = c("Epifagus", "Elytrigia", "Rumex"),
          family = c("Orobanchaceae", "Poaceae", "Polygonaceae"),
          taxon = "plant")
# extra mammal genera, given inline as CSV text
classifications = classifications %>%
  bind_rows(read_csv("genus,family,taxon\nParacoelops,Hipposideridae,mammal\nDesmalopex,Pteropodidae,mammal\nSubmyotodon,Vespertilionidae,mammal\nEudiscoderma,Megadermatidae,mammal\nParastrellus,Vespertilionidae,mammal\nDryadonycteris,Phyllostomidae,mammal\nHsunycteris,Phyllostomidae,mammal\nPseudalopex,Canidae,mammal\nLeontocebus,Cebidae,mammal\nCallibella,Callitrichidae,mammal\nPipanacoctomys,Octodontidae,mammal\nLiomys,Heteromyidae,mammal\nMusseromys,Muridae,mammal\nMirzamys,Muridae,mammal\nHalmaheramys,Muridae,mammal\nWaiomys,Muridae,mammal\nDrymoreomys,Cricetidae,mammal\nCalassomys,Cricetidae,mammal\nParalomys,Cricetidae,mammal\n")) %>%
  unique() %>%
  arrange(taxon, genus)
# taxonlookup ----
# devtools::install_github("wcornwell/taxonlookup")
# mainly from TPL too.
# Add the genus/family pairs of taxonlookup's plant lookup table as plants.
aplant = taxonlookup::plant_lookup() %>%
  select(genus, family) %>%
  mutate(taxon = "plant") %>%
  as_tibble()
classifications = bind_rows(classifications, aplant) %>%
  unique() %>%
  arrange(taxon, family, genus)
# store this family name without the diaeresis
classifications$family[classifications$family == "Isoëtaceae"] = "Isoetaceae"
# manual additions and corrections
classifications = tibble::add_row(classifications,
                                  genus = c("Dasyatis", "Entosphenus", "Ichthyomyzon", "Lampetra", "Lethenteron", "Petromyzon"),
                                  family = c("Dasyatidae", rep("Petromyzontidae", 5)), taxon = "fish")
classifications = unique(classifications)
classifications = filter(classifications, !(genus == "Nyssa" & family == "Cornaceae"))
classifications = add_row(classifications,
                          genus = c("Podagrostis", "Hesperostipa"), family = c("Poaceae", "Poaceae"),
                          taxon = c("plant", "plant"))
# spell out TRUE: `T` is an ordinary variable and can be reassigned
usethis::use_data(classifications, overwrite = TRUE, compress = "xz")
# duplicated genera ----
# plant genera that occupy more than one row of the classification table
dp = classifications %>%
  filter(taxon == "plant") %>%
  count(genus) %>%
  filter(n > 1) %>%
  pull(genus)
# # checked with POWO http://www.plantsoftheworldonline.org/
# # several genus not there or have different family
# # used Wiki or TPL instead
# taxize::get_pow_("Balbisia")[[1]] %>%
# filter(rank == "Genus", accepted) %>%
# pull(family)
# taxize::pow_lookup("urn:lsid:ipni.org:names:7831-1")$meta$taxonomicStatus
# taxize::pow_lookup("urn:lsid:ipni.org:names:329554-2")$meta$taxonomicStatus
# taxize::pow_synonyms(id = 'urn:lsid:ipni.org:names:7831-1')
# taxize::pow_synonyms(id = 'urn:lsid:ipni.org:names:329554-2')
# xb = taxize::pow_search(q = "Balbisia")
# xb$meta
# View(xb$data)
# Hand-entered genus -> family table for plants (one row per genus); see the
# notes above on POWO / Wiki / TPL for how the families were chosen.
gfam_powo = tribble(
~genus, ~family, ~taxon,
"Alzatea","Alzateaceae","plant",
"Anthobolus","Santalaceae","plant",
"Apodytes","Metteniusaceae","plant",
"Axinandra","Crypteroniaceae","plant",
"Balbisia","Francoaceae","plant",
"Batis","Bataceae","plant",
"Bersama","Francoaceae","plant",
"Bornmuellerantha","Orobanchaceae","plant",
"Borthwickia","Resedaceae","plant",
"Bottegoa","Rutaceae","plant",
"Brandisia","Orobanchaceae","plant",
"Calandrinia","Montiaceae","plant",
"Calatola","Metteniusaceae","plant",
"Calophyllum","Calophyllaceae","plant",
"Camptotheca","Nyssaceae","plant",
"Cedrelopsis","Rutaceae","plant",
"Chaetocarpus","Peraceae","plant",
"Chamaescilla","Asphodelaceae","plant",
"Cissarobryon","Francoaceae","plant",
"Cistanthe","Montiaceae","plant",
"Cleoserrata","Cleomaceae","plant",
"Corrigiola","Molluginaceae","plant",
"Crypteronia","Crypteroniaceae","plant",
"Cubitanthus","Gesneriaceae","plant",
"Cyathobasis","Amaranthaceae","plant",
"Dactylocladus","Crypteroniaceae","plant",
"Davidia","Nyssaceae","plant",
"Dendrobangia","Metteniusaceae","plant",
"Diplopanax","Nyssaceae","plant",
"Dodartia","Mazaceae","plant",
"Emmotum","Metteniusaceae","plant",
"Euryodendron","Theaceae","plant",
"Forchhammeria","Resedaceae","plant",
"Francoa","Francoaceae","plant",
"Gallesia","Petiveriaceae","plant",
"Greyia","Francoaceae","plant",
"Griselinia","Griseliniaceae","plant",
"Hemidictyum","Aspleniaceae","plant",
"Hilleria","Petiveriaceae","plant",
"Kaliphora","Cornaceae","plant",
"Lancea","Mazaceae","plant",
"Lapiedra","Amaryllidaceae","plant",
"Ledenbergia","Petiveriaceae","plant",
"Lindenbergia","Orobanchaceae","plant",
"Macarthuria","Macarthuriaceae","plant",
"Malaisia","Moraceae","plant",
"Mastixia","Nyssaceae","plant",
"Maundia","Maundiaceae","plant",
"Mazus","Mazaceae","plant",
"Melianthus","Francoaceae","plant",
"Microtea","Microteaceae","plant",
"Nuttallia","Rosaceae","plant",
"Oecopetalum","Metteniusaceae","plant",
"Ottoschulzia","Metteniusaceae","plant",
"Peltanthera","Gesneriaceae","plant",
"Petiveria","Petiveriaceae","plant",
"Philcoxia","Plantaginaceae","plant",
"Pittosporopsis","Icacinaceae","plant",
"Platea","Icacinaceae","plant",
"Poraqueiba","Metteniusaceae","plant",
"Purdiaea","Clethraceae","plant",
"Pyrsonota","Elaeocarpaceae","plant",
"Rehmannia","Orobanchaceae","plant",
"Rhaphiostylis","Metteniusaceae","plant",
"Rhipogonum","Rhipogonaceae","plant",
"Rhynchotheca","Geraniaceae","plant",
"Richea","Rhizophoraceae","plant",
"Rivina","Petiveriaceae","plant",
"Seguieria","Petiveriaceae","plant",
"Stemodiopsis","Linderniaceae","plant",
"Stilbocarpa","Apiaceae","plant",
"Stixis","Resedaceae","plant",
"Tetilla","Saxifragaceae","plant",
"Triaenophora","Plantaginaceae","plant",
"Trichopodium","Dioscoreaceae","plant",
"Trichostigma","Petiveriaceae","plant",
"Trigonopleura","Peraceae","plant",
"Viviania","Vivianiaceae","plant",
"Wendtia","Francoaceae","plant"
)
# Rebuild the table: plant rows whose genus is not in `dp`, then the curated
# `gfam_powo` rows, then every non-plant row.
# NOTE(review): a genus that is in `dp` but absent from `gfam_powo` disappears
# from the plant table here -- confirm that `gfam_powo` covers all of `dp`.
classifications = bind_rows(
  filter(classifications, taxon == "plant",
         !genus %in% dp),
  gfam_powo
) %>%
  bind_rows(filter(classifications, taxon != "plant"))
# spell out TRUE: `T` is an ordinary variable and can be reassigned
usethis::use_data(classifications, overwrite = TRUE, compress = "xz")
# catalogue of life 2019 ----
## https://www.catalogueoflife.org/content/annual-checklist-archive
# Reads a local copy of the 2019 annual checklist and adds plant genera that
# are not yet present in `classifications`.
catl_2019 = vroom::vroom("~/Downloads/2019-annual/taxa.txt")
sort(unique(catl_2019$kingdom))
xc = catl_2019 %>% filter(kingdom == "Plantae")
xc %>% filter(taxonomicStatus == "accepted name")
# distinct family/genus pairs with both fields present
xc_cls = xc %>%
  select(family, genus) %>%
  drop_na(family, genus) %>%
  distinct()
n_distinct(xc_cls$family)
n_distinct(xc_cls$genus)
# pairs without a match among the current plant rows end up with taxon = NA
xc_cls2 = xc_cls %>%
  left_join(filter(classifications, taxon == "plant"))
xc_cls2 %>%
  filter(is.na(taxon)) %>%
  View()
xc_cls3 = xc_cls2 %>%
  filter(is.na(taxon), family != "Not assigned") %>%
  mutate(taxon = "plant")
filter(xc_cls3, !genus %in% classifications$genus) # 26 new genus
classifications = classifications %>%
  bind_rows(filter(xc_cls3, !genus %in% classifications$genus))
# printed check: plant genera listed under more than one family
classifications %>%
  filter(taxon == "plant") %>%
  group_by(genus) %>%
  summarise(n_f = n_distinct(family)) %>%
  arrange(desc(n_f)) %>%
  filter(n_f > 1) # no duplicated genus
# Plant of World online data ----
# https://github.com/RBGKew/powo-data/blob/master/data-prod.json
# download and unzip (it is large):
# https://storage.googleapis.com/powop-content/backbone/powoNames.zip
# The table is read from a local copy; columns are unnamed (V1, V2, ...).
powo = data.table::fread("/media/dli/Data/common_data/taxon_powo.txt")
powo2 = filter(powo, V3 == "Genus", V28 == "Accepted")
powo_gf = select(powo2, genus = V6, family = V5) %>% distinct() %>%
  mutate(taxon = "plant") %>% as_tibble()
# interactive comparisons with the current plant rows (results only printed)
setdiff(filter(classifications, taxon == "plant")$genus, powo_gf$genus)
setdiff(powo_gf$genus, filter(classifications, taxon == "plant")$genus)
# for shared genera, flag whether the POWO family equals the current family
tst = filter(powo_gf, genus %in% intersect(powo_gf$genus, filter(classifications, taxon == "plant")$genus)) %>%
  rename(family_powo = family) %>%
  left_join(filter(classifications, taxon == "plant")) %>%
  mutate(same = family_powo == family)
filter(tst, !same) %>% View()
class_plant = bind_rows(
  # unique genus from other sources
  filter(classifications, taxon == "plant", !genus %in% powo_gf$genus),
  # common genus between other sources and POWO, use accepted info from POWO instead
  filter(powo_gf, genus %in% intersect(powo_gf$genus, filter(classifications, taxon == "plant")$genus)),
  # unique genus from POWO
  filter(powo_gf, genus %in% setdiff(powo_gf$genus, filter(classifications, taxon == "plant")$genus))
)
classifications = bind_rows(class_plant,
                            filter(classifications, taxon != "plant")) %>%
  distinct()
tools::showNonASCII(classifications$genus)
# classifications$genus[15153] = "Leptochloopsis" # "Leptochloöpsis"
# classifications$genus = stringi::stri_trans_general(classifications$genus, "Latin-ASCII")
# amphibians ----
# Genus/family pairs from the vertlife amphibian classification table.
# The table is read straight from the URL. The original additionally downloaded
# the same file into the temp path left over from the birds section and then
# deleted it without ever reading it; that redundant download/unlink is dropped.
amph0 = read_csv("https://data.vertlife.org/amphibiantree/download/amph_shl_new_Classification.csv")
amph = filter(amph0, Taxon != "Outgroup") %>%
  dplyr::select(sp = `Scientific Name`, family = Family) %>%
  # genus = everything before the first space of the scientific name
  mutate(genus = str_extract(sp, "^[^ ]*")) %>%
  select(genus, family) %>%
  distinct() %>%
  mutate(taxon = "amphibian") %>%
  arrange(genus) %>%
  filter(genus != "")
classifications = bind_rows(amph, classifications) %>%
  as_tibble() %>%
  distinct()
classifications = tibble::add_row(classifications, genus = "Ooeidozyga",
                                  family = "Dicroglossidae", taxon = "amphibian")
classifications = dplyr::arrange(classifications, taxon, genus)
# mammal vertlife ---
# Genus/family pairs from the vertlife mammal taxonomy; family names are
# lower-cased and then passed through cap_first_letter().
mammal_class_vertlife = read_csv("https://data.vertlife.org/mammaltree/taxonomy_mamPhy_5911species.csv") %>%
  dplyr::select(genus = gen, family = fam) %>%
  distinct() %>%
  mutate(family = cap_first_letter(tolower(family)))
# interactive comparisons with the mammal rows already in the table
setdiff(mammal_class_vertlife$genus,
        filter(classifications, taxon == "mammal")$genus)
mammal_class_vertlife %>%
  left_join(rename(filter(classifications, taxon == "mammal"), f = family)) %>%
  mutate(same = family == f) %>%
  filter(!same) %>%
  View()
# it seems the vertlife classification is more accurate
# drop the existing mammal rows for genera covered by vertlife, then add
# the vertlife rows
classifications = classifications %>%
  filter(taxon != "mammal" | !genus %in% mammal_class_vertlife$genus)
classifications = mammal_class_vertlife %>%
  mutate(taxon = "mammal") %>%
  bind_rows(classifications) %>%
  distinct() %>%
  arrange(taxon, genus)
# shark ---
# Family of every genus in the vertlife shark species list, looked up with
# taxize (NCBI first, ITIS for the genera NCBI could not resolve).
sharks = read_csv("https://data.vertlife.org/sharktree/Species.list.csv")
# genus = part of the species label before the first "_"
shark_genus = sort(unique(str_extract(sharks$Species_list, "^[^_]*")))
setdiff(shark_genus, classifications$genus) # no...
shark_class = vector("list", length(shark_genus))
# Iterate over every genus. The original loop started at a hard-coded index
# (144), which left all earlier genera without a lookup result when the script
# is run from scratch.
for(i in seq_along(shark_genus)){
  shark_class[[i]] = taxize::tax_name(shark_genus[i], get = "family", db = "ncbi")
  # pause after every 10th query
  if(i %% 10 == 0) Sys.sleep(time = 2)
}
shark_class = bind_rows(shark_class)
filter(shark_class, is.na(family))
# second attempt for the unresolved genera, then manual families
x = taxize::tax_name(filter(shark_class, is.na(family))$query, get = "family", db = "itis")
x$family[x$query == "Electrolux"] = "Narkidae"
x$family[x$query == "Makararaja"] = "Dasyatidae"
x$family[x$query == "Spiniraja"] = "Rajidae"
x$family[x$query == "Taeniurops"] = "Dasyatidae"
# printed check: species list vs. tip labels of the first tree in a local file
ss = ape::read.nexus("shark_10.cal.tree.nex")
ss1 = ss[[1]]
setdiff(sharks$Species_list, ss1$tip.label)
setdiff(ss1$tip.label, sharks$Species_list)
shark_class = bind_rows(shark_class, x) %>%
  filter(!is.na(family))
shark_class = dplyr::select(shark_class, -db, genus = query, family) %>%
  mutate(taxon = "shark_ray") %>%
  arrange(genus)
classifications = bind_rows(classifications, shark_class) %>%
  arrange(taxon, genus) %>%
  distinct()
# reptile ----
# Family of every genus in the vertlife squamate name list, looked up in NCBI
# through taxize.
rept = read_csv("https://data.vertlife.org/squamatetree/sqamate_names.csv", col_names = FALSE)
# genus = part of the name before the first space
rept_genus = sort(unique(str_extract(rept$X1, "^[^ ]*")))
setdiff(rept_genus, rtrees::classifications$genus) # no...
rept_class = vector("list", length(rept_genus))
# seq_along() instead of 1:length(), so an empty genus vector means no iterations
for(i in seq_along(rept_genus)){
  rept_class[[i]] = taxize::tax_name(rept_genus[i], get = "family", db = "ncbi", ask = FALSE)
  # pause after every 10th query
  if(i %% 10 == 0) Sys.sleep(time = 30)
}
rept_class = bind_rows(rept_class)
x_rep = taxize::tax_name(filter(rept_class, is.na(family))$query, get = "family", db = "ncbi")
# from here on the table read from this local file replaces the lookup result
rept_class = read_csv("~/Documents/rept_class.csv")
n_distinct(rept_class$family) # 62
# get from wikipedia https://en.wikipedia.org/wiki/List_of_reptile_genera
# Parse a local text file in which each line containing "Family" starts a new
# family and the lines after it are collected as that family's genera.
rep_2 = readLines("~/Documents/reptile.txt")
f2 = grep("Family", rep_2, value = TRUE)
rept_class_wiki = vector("list", length = length(f2))
names(rept_class_wiki) = f2
# j indexes the family currently being filled; the loop starts at the second
# line, so the first family is taken to be open already
j = 1
# seq_along()[-1] instead of 2:length(), which would count backwards for a
# file shorter than two lines
for(i in seq_along(rep_2)[-1]){
  cat("i = ", i, "\t")
  if(rep_2[i] %in% names(rept_class_wiki)){
    # a family header: move on to the next family
    j = j + 1
    cat("j = ", j, "\n")
    next
  }
  if(!grepl(pattern = "Family", x = rep_2[i])){
    rept_class_wiki[[j]] = c(rept_class_wiki[[j]], rep_2[i])
  }
}
# one row per (family, genus); tidy up the text of both columns
rept_class_wiki = bind_rows(lapply(rept_class_wiki, as.data.frame), .id = "family") %>%
  set_names(c("family", "genus")) %>%
  filter(genus != "") %>%
  as_tibble() %>%
  mutate(family = gsub("Family ", "", family),
         family = str_trim(family),
         genus = str_trim(genus))
setdiff(rept_class$genus, rept_class_wiki$genus)
setdiff(rept_class_wiki$genus, rept_class$genus)
# Merge the two reptile tables: the Wikipedia family is used where available,
# the other family otherwise.
rept_class_wiki2 = set_names(rept_class_wiki, c("family_wiki", "genus"))
xx = full_join(rept_class, rept_class_wiki2) %>%
  mutate(same = family == family_wiki)
xx = mutate(xx, family_wiki = ifelse(is.na(family_wiki), family, family_wiki))
reptile_class = select(xx, genus, family = family_wiki) %>%
  mutate(taxon = "reptile") %>%
  distinct()
# genera that still occur more than once (printed)
reptile_class[which(duplicated(reptile_class$genus)),]
# these two genera are removed and re-added with one family each
reptile_class = filter(reptile_class, !genus %in% c("Homoroselaps", "Xylophis"))
reptile_class = add_row(reptile_class,
                        genus = c("Homoroselaps", "Xylophis"),
                        family = c("Atractaspididae", "Pareidae"),
                        taxon = "reptile")
classifications = bind_rows(classifications, reptile_class) %>%
  arrange(taxon, genus) %>%
  distinct()
# further reptile genera added by hand
classifications = add_row(classifications,
                          genus = c("Elachistodon", "Thalesius", "Parahelicops", "Pararhabdophis", "Vietnascincus",
                                    "Haackgreerius", "Geomyersia", "Geoscincus", "Tachygyia", "Leptoseps", "Chabanaudia",
                                    "Scolecoseps", "Chalcidoseps", "Sepsophis", "Nessia", "Jarujinia", "Barkudia", "Rhinogecko"),
                          family = c("Colubridae", "Colubridae", "Colubridae", "Colubridae", "Scincidae",
                                     "Scincidae", "Scincidae", "Scincidae", "Scincidae", "Scincidae", "Scincidae",
                                     "Scincidae", "Scincidae", "Scincidae", "Scincidae", "Scincidae", "Scincidae", "Gekkonidae"
                          ),
                          taxon = "reptile")
# printed spot check; a stray bare symbol `o` that followed this line (an error
# at run time: object not found) has been removed
filter(classifications, taxon == "amphibian", genus == "Homo")
# plants world flora online ----
# Build a one-family-per-genus table from the World Flora Online backbone.
# The archive is downloaded into, and extracted in, the working directory.
xfun::download_file("http://104.198.143.165/files/WFO_Backbone/_WFOCompleteBackbone/WFO_Backbone.zip")
unzip("WFO_Backbone.zip", list = TRUE)
# `files` spelled out in full (the original `file =` relied on partial argument
# matching)
unzip("WFO_Backbone.zip", files = "classification.txt")
pfo = read_delim("classification.txt", delim = "\t")
pfo2 = unique(select(filter(pfo, taxonomicStatus == "ACCEPTED", taxonRank == "SPECIES",
                            majorGroup %in% c("A", "G", "P")), # Angiosperms, Gymnosperms, Pteridophytes
                     genus, family, majorGroup))
file.remove("classification.txt", "WFO_Backbone.zip")
pfo2$genus[which(duplicated(pfo2$genus))] # genus with multiple family ...
filter(pfo2, genus == "Athyrium")
# use the family with higher frequency??
# number of accepted species per (genus, family, majorGroup) for those genera
pfo3 = filter(pfo, genus %in% pfo2$genus[which(duplicated(pfo2$genus))],
              taxonomicStatus == "ACCEPTED", taxonRank == "SPECIES",
              majorGroup %in% c("A", "G", "P")) %>% # Angiosperms, Gymnosperms, Pteridophytes
  group_by(genus, family, majorGroup) %>%
  tally()
# use the family with higher frequency!
# NOTE(review): slice_max() keeps tied rows, so a genus whose families have
# equal counts stays duplicated -- the any(duplicated()) check below shows it.
pfo4 = arrange(pfo3, genus, desc(n)) %>%
  group_by(genus) %>%
  slice_max(order_by = n) %>%
  select(-n)
# genera that had a single family to begin with
pfo5 = filter(pfo2, !genus %in% pfo2$genus[which(duplicated(pfo2$genus))])
pfo_final = bind_rows(pfo5, pfo4) %>%
  arrange(genus)
table(pfo_final$majorGroup)
any(duplicated(pfo_final$genus))
# check with existing data
x = classifications %>% filter(taxon == "plant")
setdiff(pfo_final$genus, x$genus) # 80
# shared genera whose family differs between the two sources
pfo_final %>%
  filter(genus %in% x$genus) %>%
  left_join(x, by = "genus") %>%
  filter(family.x != family.y) %>%
  View()
# keep the WFO version
setdiff(x$genus, pfo_final$genus)
# plant rows = current rows for genera WFO lacks + all WFO rows
pc1 = bind_rows(
  filter(x, !genus %in% pfo_final$genus),
  mutate(select(pfo_final, -majorGroup), taxon = "plant")
)
# swap in the new plant rows and add two fish genera by hand
classifications = bind_rows(filter(classifications, taxon != "plant"), pc1) %>%
  add_row(genus = c("Psalidodon", "Curculionichthys"),
          family = c("Characidae", "Loricariidae"),
          taxon = "fish")
# V.PhyloMaker2
# keep the rows of tips.info.TPL whose genus column equals the part of the
# species label before the first "_"
xv = V.PhyloMaker2::tips.info.TPL %>%
  mutate(genus2 = str_extract(species, "^[^_]+"),
         same = genus == genus2) %>%
  filter(same) %>%
  as_tibble() %>%
  select(group, genus, family) %>%
  distinct()
xc = classifications %>% filter(taxon == "plant")
# v phylomaker only
xv2 = xv %>%
  filter(!genus %in% xc$genus) %>%
  select(-group) %>%
  mutate(taxon = "plant")
# some genus in V.PhyloMaker2 have diff family as the World Flora online,
# I keep the World Flora Online version here
classifications = classifications %>% bind_rows(xv2)
# printed check: duplicated genera per taxon group
classifications %>%
  group_by(taxon) %>%
  summarise(dup = any(duplicated(genus)))
xa = classifications %>% filter(taxon == "amphibian")
xa %>% filter(duplicated(genus))
xa %>% filter(genus == "Ingerana")
# drop the Ingerana / Ceratobatrachidae row
classifications = classifications %>%
  filter(genus != "Ingerana" | family != "Ceratobatrachidae")
# bee ----
# The workbook is downloaded first and then read from disk. The original also
# called readxl::read_excel() directly on the URL before the download; readxl
# reads local files only, so that call stopped the script and is removed.
xfun::download_file("http://beetreeoflife.org/downloads/files/BEE_taxonomic_database.xlsx",
                    "~/Downloads/BEE_taxonomic_database.xlsx")
# the `family` column holds the Subfamily, falling back to Family where the
# subfamily is missing
bees_class = readxl::read_excel("~/Downloads/BEE_taxonomic_database.xlsx", sheet = 2) |>
  dplyr::select(genus = Genus, family = Subfamily, Family) |>
  unique() |>
  mutate(taxon = "bee",
         family = ifelse(is.na(family), Family, family)) |>
  dplyr::select(-Family)
# used subfamily!!
bees_class = add_row(bees_class, genus = "Micralictoides", family = "Rophitinae", taxon = "bee") |>
  add_row(genus = "Neopasites", family = "Nomadinae", taxon = "bee")
classifications = bind_rows(classifications, bees_class)
# butterfly ----
# https://www.nature.com/articles/s41559-023-02041-9#MOESM1
# (the URL above was a bare, uncommented line in the original and stopped the
# file from parsing)
# download its supplementary data
# https://springernature.figshare.com/articles/dataset/A_global_phylogeny_of_butterflies_reveals_their_evolutionary_history_ancestral_host_plants_and_biogeographic_origins/21774899?file=39124943
x = read.nexus("~/Downloads/Data.S24.RevBayes_Papilionoidea_BDS_rates_MAP2.tre")
x
plot(x, show.tip.label = FALSE)
x$tip.label[1:100]
# number of "_" per tip label, to inspect how the labels are structured
x_= str_count(x$tip.label, "_")
table(x_)
x$tip.label[which(x_ == 9)]
x$tip.label[which(x_ == 8)]
xx = x$tip.label[which(x_ == 7)]
# clean the tip labels
x$tip.label = str_replace(x$tip.label, "__", "_")
x$tip.label = str_replace(x$tip.label, "_mulinzii_mulinzii", "_mulinzii")
x$tip.label = str_remove_all(x$tip.label, "'")
x$tip.label = str_remove_all(x$tip.label, "[.]$")
x$tip.label = str_remove(x$tip.label, "_X_ME$|_ME$")
# split each label into its parts; the original label is kept in column `xx`
xx = tibble(xx = x$tip.label) |>
  separate(xx, c("x1", "x2", "family", "subfamily", "tribe", "genus", "species", "subspecies"),
           remove = FALSE)
drop_na(xx, subspecies) # |> View()
xx = mutate(xx,
            species = ifelse(species == "c", paste(species, subspecies, sep = "-"), species),
            sp = paste(genus, species, sep = "_"))
n_distinct(xx$sp)
spd = xx$sp[which(duplicated(xx$sp))]
filter(xx, sp %in% spd) # |> View()
# remove some of the duplicated tips
# (the first tip of every duplicated genus_species name is dropped)
spd2 = filter(xx, sp %in% spd)
tip_to_rm = group_by(spd2, sp) |>
  slice_head(n = 1) |>
  pull(xx)
x = drop.tip(x, tip_to_rm)
# relabel the remaining tips as genus_species
x$tip.label = left_join(tibble(xx = x$tip.label), xx, by = "xx")$sp
any(duplicated(x$tip.label))
plot(x, show.tip.label = FALSE, type = "fan")
classification_butterfly = dplyr::select(xx, genus, family) |>
  distinct() |>
  mutate(taxon = "butterfly")
classifications = bind_rows(classifications, classification_butterfly)
# all together and save ----
# Sort, drop exact duplicate rows and store the table as package data.
# (Two trailing lines of web-page text, "Add the following code to your
# website." / "For more information on customizing the embed code, read
# Embedding Snippets.", were not R code and have been removed.)
classifications = arrange(classifications, taxon, genus) %>%
  distinct()
# spell out TRUE: `T` is an ordinary variable and can be reassigned
usethis::use_data(classifications, overwrite = TRUE, compress = "xz")