# Clear workspace
rm(list = ls())
# Setup
################################################################################
# Packages
library(tidyverse)
# library(rfishbase)
# Directories
indir <- "data-raw/raw"
outdir <- "data-raw/processed"
# Source custom functions
source(file.path("R","custom_functions.R"))
# Read data
data_orig <- readRDS(file.path(outdir, "AFCD_data_pass2.Rds"))
# Read ref key
ref_key <- readRDS(file.path(outdir, "AFCD_reference_key.Rds"))
# FAO-SAU ASFIS Key
fao_sau_key <- read_csv(file.path(indir,"FAO_SAU_merge_key.csv"))
# Prepare data: with taxonomy worked out
################################################################################
# Build initial data
data_sci1 <- data_orig %>%
# Reduce to taxa without taxonomic information
filter(!taxa_name_source %in% c("Food name (English)", "Food name (original)")) %>%
# Rename
rename(sciname=taxa_name, sciname_source=taxa_name_source)
# Identify taxa groups
orders <- sort(unique(data_sci1$order))
families <- sort(unique(data_sci1$family))
genera <- sort(unique(data_sci1$genus))
# Build species key
################################################################################
# Species key 1
spp_key1 <- data_sci1 %>%
# Unique species
select(sciname) %>%
unique() %>%
# Recode species
rename(sciname_orig=sciname) %>%
mutate(sciname=sciname_orig) %>%
# Delete dangling commas
mutate(sciname=gsub(",$|_$", "", sciname)) %>%
# Delete ugly characters
mutate(sciname=gsub("<c2><a0>|<ca>|<c3><8d>", "", sciname)) %>%
# Delete synonyms in brackets
mutate(sciname=gsub("\\s*\\[[^\\)]+\\]", "", sciname)) %>%
# Replace semicolons with commas
mutate(sciname=gsub(';', ",", sciname)) %>%
# Replace underscore with commas
mutate(sciname=gsub(" _ ", ", ", sciname)) %>%
# Replace AND with comma
mutate(sciname=gsub(" and ", ", ", sciname)) %>%
# Add period to end of all SPPs
mutate(sciname=gsub("spp.", "spp", sciname),
sciname=gsub("spp", "spp.", sciname)) %>%
# Add period to end of all trailing SPs
mutate(sciname=gsub(" sp$", " sp.", sciname)) %>%
# Fix a few 1 worders
mutate(
sciname=recode(sciname,
"Anadara<be>spp."="Anadara spp.",
"Anisotremusvirginicus"="Anisotremus virginicus",
"Balistescarolinensis"="Balistes carolinensis",
"Callinectessapidus"="Callinectes sapidus",
# "Callorhynchus"="",
# "Can"="",
"Cancerporteri"="Cancer porteri",
"Caulerpa<be>spp."="Caulerpa spp.",
"Cichlaspp."="Cichla spp.",
"Clupeasardina"="Clupea sardina",
"Diplodusannularis"="Diplodus annularis",
"Donaxvariabilis"="Donax variabilis",
"Durvilleaantarctica"="Durvillea antarctica",
"Eleutheronematetractylum"="Eleutheronema tetradactylum",
"Epinephulussexfasciatus"="Epinephelus fasciatus",
"Gelidiumpusillum"="Gelidium pusillum",
"Genypterusblancodes"="Genypterus blacodes",
"Homarusgammarus"="Homarus gammarus",
"Hoplosternumlittorale"="Hoplosternum littorale",
"Leporinusobtusidens"="Leporinus obtusidens",
"Mugilcephalus"="Mugil cephalus",
"Mylossomaspp."="Mylossoma spp.",
"Mytiluschilensis"="Mytilus chilensis",
"Mytilusedulis"="Mytilus edulis",
"Obliadamelanura"="Oblada melanura",
"Oncorhynchusmykiss"="Oncorhynchus mykiss",
# "Palaemonidae/penaeidae"="",
"Palinurusvulgaris"="Palinurus vulgaris",
"Pleuronectesflesus"="Pleuronectes flesus",
"Pseudoplatystomacoruscans"="Pseudoplatystoma corruscans",
"Rasbora,sp"="Rasbora sp.",
"Sander<c2><a0>lucioperca"="Sander lucioperca",
"Sardasarda"="Sarda sarda",
"Selenevomer"="Selene vomer",
"Seriolellaspecie"="Seriolella spp.",
"Taliepusdentatus"="Taliepus dentatus",
"Thunnusvulgaris"="Thunnus vulgaris",
"Trachuruspicturatus"="Trachurus picturatus",
"Ulvalactuca"="Ulva lactuca")) %>%
# Add SPP to end of 1 word groups
mutate(nwords=nwords_freeR(sciname)) %>%
mutate(sciname=ifelse(nwords==1, paste(sciname, "spp."), sciname)) %>%
select(-nwords) %>%
# Remove blank
filter(sciname!="" & !is.na(sciname)) %>%
# Remove dangling letters
mutate(sciname=gsub(" a\\.", "", sciname)) %>%
mutate(sciname=gsub(" l\\.", "", sciname)) %>%
mutate(sciname=gsub(" b\\.", "", sciname)) %>%
mutate(sciname=gsub(" v\\.", "", sciname)) %>%
mutate(sciname=gsub(" c\\.", "", sciname)) %>%
mutate(sciname=gsub(" h\\.", "", sciname)) %>%
# Fix ones with punctuation
mutate(
sciname=recode(sciname,
"A. nodosum (r.)"="Ascophyllum nodosum",
"A. nodosum (s.)"="Ascophyllum nodosum",
"Amphioctopus fangsiao_"="Amphioctopus fangsiao",
"C. fragile"="Codium fragile", # , and
"C. mosullensis"="Chalcalburnus mosullensis",
"C. capoeta umbla"="Capoeta capoeta umbla",
"C. crucian"="Carassius carassius",
"Cyprinus carpio var. specularis)"="Cyprinus carpio",
"Cystoseira abies-marina"="Treptacantha abies-marina", # hyphen is correct
"Engraulis encrasicolus)"="Engraulis encrasicolus",
"F. spiralis"="Fucus spiralis",
"F. vesiculosus"="Fucus vesiculosus",
"G. chilensis"="Gracilaria chilensis",
# "Gracilaria bursa-pastoris"="", # hyphen is correct
# "Hydrocharis morsus-ranae"="", # hyphen is correct
"L. graellsii"="Luciobarbus graellsii",
"L. xanthochilus"="Lethrinus xanthochilus",
"L. bohar"="Lutjanus bohar",
"M. pyrifera"="Macrocystis pyrifera",
"M. cephalus"="Mugil cephalus",
"Megaloancistrus aculeatus)"="Megaloancistrus aculeatus",
"Melcertus latisculatus (family penaeidae)"="Melicertus latisulcatus",
"Neomeris van -bosseae"="Neomeris vanbosseae",
"Neomeris van-bosseae"="Neomeris vanbosseae",
"O. aureus"="Oreochromis aureus",
"Oncorhynchus mykiss)"="Oncorhynchus mykiss",
"Oreochromis niloticus (juvenile)"="Oreochromis niloticus",
"Pangasianodon hypophthalmus (juvenile)"="Pangasianodon hypophthalmus",
"Paralichthys oli<ea>aceus"="Paralichthys olivaceus",
"Perca -uviatilis"="Perca fluviatilis",
"Pinirampus pinirampu)"="Pinirampus pirinampu",
"Pseudoplatystoma corruscans)"="Pseudoplatystoma corruscans",
"S. sierra"="Scomberomorus sierra",
"Salmo trutta m. lacustris"="Salmo trutta",
"Sepia o.cinalis"="Sepia officinalis",
"Skeletonema marinoi-dohrnii"="Skeletonema dohrnii",
"Spisula (pseudocardium) sachalinensis"="Spisula sachalinensis",
"Tenualosa ilisha (juvenile)"="Tenualosa ilisha")) %>%
# Fix ones with more than two words
mutate(
sciname=recode(sciname,
"Spyridia fi lamentosa"="Spyridia filamentosa",
"T rachurus mediterraneus"="Trachurus mediterraneus")) %>%
# Mark ones with punctuation still
mutate(punct=grepl("[[:punct:]]", sciname)) %>%
# Fix a bunch of long ones
mutate(
sciname=recode(sciname,
"Acanthropagrus australis or butcheri"="Acanthropagrus australis, Acanthropagrus butcheri",
"Centropristes striata andteolabrax japonicus"="Centropristes striata, Andteolabrax japonicus",
"Includes a mix of species belonging to the astacidae"="Astacidae spp.",
"Includes a mix of species belonging to the ommastrephidae family"="Ommastrephidae spp.",
"Includes a mix of species belonging to the palaemonidae family"="Palaemonidae spp.",
"Navodon modestus lephiomus setigerus"="Navodon modestus, Lephiomus setigerus",
"Osmerus mordax dentex steindachner"="Osmerus mordax",
"Ostreidae family including crassostrea gigas"="Ostreidae spp.",
"Species belonging to the portunidae family"="Portunidae spp.",
"Spratelloides robustus or sardinops sagax"="Spratelloides robustus, Sardinops sagax",
"Salmo trutta morpha fario"="Salmo trutta",
"Notarchus indicus armatus baba"="Notarchus punctatus armatus",
"Engraulis anchoita hubbs larini"="Engraulis anchoita")) %>%
# Fix ones that don't get matched in GNR resolve (below)
mutate(
sciname=recode(sciname,
"Acanthoparagus bifasciatus"="Acanthopagrus bifasciatus",
"Apoleichthus taprobanensis"="Paraploactis taprobanensis",
"Artem longinaris"="Artemesia longinaris",
"Aristae omorphafoliacea"="Aristaeomorpha foliacea",
"Brakyptorosis serrulata"="Brachypterois serrulata",
"Bregmaceros mcclellandi"="Bregmaceros mcclellandi",
"Caulpera sertularioides"="Caulerpa sertularioides",
"Carpioides meridionalis"="Carpiodes carpio",
"Clupisoma pseudeutropius atherinoides"="Pachypterus atherinoides",
"Corralina mediterranea"="Corallina mediterranea",
"Coregonus artedisueur"="Coregonus artedi",
"Euchemia cottonii"="Eucheuma cottonii",
"Eriscion nebulosos"="Cynoscion nebulosus",
"Fueguine sardine"="Clupea fueguensis",
"Gadus mangala"="Cirrhinus mrigala",
"Gracilaria pusillum"="Gelidium pusillum",
"Gracilaria turuturu"="Grateloupia turuturu",
"Haliotidae haliotis"="Haliotis spp.",
"Helicolenus dactylopterus labillei"="Helicolenus dactylopterus",
"Hilsa hilsa"="Tenualosa ilisha",
"Holopragus guntheri"="Hoplopagrus guentherii",
"Hyme mulleri"="Hymeniacidon mulleri",
"Hyppoglossus hyppogl"="Hippoglossus hippoglossus",
"Isoctysis galbana"="Isochrysis galbana",
"Johinus borneersis"="Johnius borneensis",
"Laurencia mcdermid"="Laurencia mcdermidiae",
"Leiostomus xanthurascepede"="Leiostomus xanthuras",
"Lithod antarcticus"="Lithodes antarcticus",
"Liza strongy locephalus"="Liza strongylocephalus",
"Merluccius species"="Merluccius spp.",
"Metapenaeus shrimp"="Metapenaeus spp.",
"Monronr americanus"="Morone americana",
"Meurex meurex"="Murex trapa",
"Mullussur muletus"="Mullus surmuletus",
"Mulus barbatus ponticus"="Mullus barbatus",
"Mylopharyngodon piceusch"="Mylopharyngodon piceus",
"Mylopharyngodon piceuschn"="Mylopharyngodon piceus",
"Naticaproble maticareeve"="Neverita didyma",
"Noplopoma timbria pallas"="Anoplopoma fimbria",
"Ostreobrama cotio cotio"="Osteobrama cotio",
"Polyrenus species"="Polyrenus spp.",
"Salvelinus naresi"="Salvelinus alpinus",
"Salmonidae family"="Salmonidae spp.",
"Sardinex saga"="Sardinops sagax",
"Sargussum turbinaria"="Sargassum turbinaria",
"Sadra sarda"="Sarda sarda",
"Scarus ghabon"="Scarus ghobban",
"Scopelegadus mizoiepis mizolepis"="Scopelogadus mizolepis",
"Silvestre milossoma"="Trachinotus goodei",
"Spicara vulgaris"="Spicara smaris",
"Sudananautes africanus africanus"="Sudananautes africanus",
"Sulculus diversicolor aquatieis"="Haliotis diversicolor",
"Octopus vulgarismarck"="Octopus vulgaris",
"Order teuthoidea"="Teuthoidea spp.",
"Osmerus epelanus mordax"="Osmerus mordax",
"Palinurus borealis"="Palinurus elephas",
"Parambassis wollf"="Parambassis wolffii",
"Paeneus kerathurus"="Penaeus kerathurus",
"Parophrys ve tutus"="Parophrys vetulus",
"Paralycthis adpersus"="Paralichthys adspersus",
"Pectinidae family"="Pectinidae spp.",
"Pink perch"="Labeo rohita",
"Polinicies aemingiana"="Polinices aemingiana",
"Pseudopimelodus fasciatum"="Bagre spp.",
"Pimedolus clarias"="Pimelodus clarias",
"Puntis carana"="Puntius sarana",
"Polynema sextarius"="Polynemus sextarius",
"Pseudo pleuronectes"="Pseudopleuronectes spp.",
"Rajja specie"="Rajja spp.",
"Roughear scad"="Decapterus tabl",
"Various species"="Various spp.",
"Tilapia oreochromis"="Oreochromis spp.",
"Tympanostomus fuscatus radula"="Tympanotonos fuscatus",
"Uppenus sulphureus"="Upeneus sulphureus",
"Wak cuja"="Macrospinosa cuja")) %>%
# Fix some ones that bother you
mutate(
sciname=recode(sciname,
"Acanthopagrus schlegeli"="Acanthopagrus schlegelii")) %>%
# Mark species or group specific
mutate(type=ifelse(grepl("spp\\.|sp\\.|,|/| x ", sciname), "group", "species")) %>% # x=hybrids, commas/slashes is multiple
# Count number of words
mutate(nwords_orig=nwords_freeR(sciname_orig),
nwords=nwords_freeR(sciname)) %>%
# Trim
mutate(sciname=stringr::str_trim(sciname)) %>%
# Identify taxa level
mutate(group=ifelse(grepl("sp\\.|spp\\.", sciname), gsub(" sp\\.| spp\\.", "", sciname), NA)) %>%
mutate(taxa_level=ifelse(type=="species", "species",
ifelse(group %in% genera, "genus",
ifelse(group %in% families, "family",
ifelse(group %in% orders, "order", "other")))))
# Inspect groups
group_key <- spp_key1 %>%
filter(type=="group")
# Fix some 1 word groups - these look good
group_key %>% filter(nwords_orig==1 & !(sciname_orig %in% c(families, orders, genera))) %>% pull(sciname)
# Inspect species with punctuation - these 3 are correct
spp_key1 %>% filter(type=="species" & punct==T) %>% pull(sciname) %>% sort()
# Inspect species with more than two words
spp_key1 %>% filter(type=="species" & nwords>2) %>% pull(sciname) %>% sort()
# Check species names
################################################################################
# Species names
spp_names <- spp_key1 %>% filter(type=="species") %>% pull(sciname) %>% unique() %>% sort()
# Get species suggestions
spp_names_chunks <- split(spp_names, ceiling(seq_along(spp_names)/100))
spp_suggestions <- purrr::map_df(1:length(spp_names_chunks), function(x){
spp_names_do <- spp_names_chunks[[x]]
spp_suggest_chunk <- taxize::gnr_resolve(sci = spp_names_do, best_match_only=T, canonical = T, cap_first=T)
})
# Format suggestions
spp_suggestions1 <- spp_suggestions %>%
# Number of words in suggestion
mutate(nwords_in_suggestion=nwords_freeR(matched_name2)) %>%
# Suggestion type
mutate(suggest_type=ifelse(matched_name2==user_supplied_name, "correct", "updated")) %>%
# Simplify
unique()
# Suggestions
table(spp_suggestions1$suggest_type)
# One with one word suggestions
spp_suggestions1 %>% filter(nwords_in_suggestion==1) %>% pull(user_supplied_name) %>% sort()
# Build final key
spp_key2 <- spp_key1 %>%
# Add suggestions
left_join(spp_suggestions1, by=c("sciname"="user_supplied_name")) %>%
# Rename
rename(taxa_type=type, sciname_matched=matched_name2, match_type=suggest_type, sciname_matched_nwords=nwords_in_suggestion) %>%
# Simplify
select(sciname_orig, sciname, sciname_matched, sciname_matched, match_type, sciname_matched_nwords, taxa_type, taxa_level) %>%
# Determine final name
mutate(sciname_final=ifelse(match_type=="correct" | sciname_matched_nwords==1 | is.na(sciname_matched_nwords) | taxa_type=="group", sciname, sciname_matched)) %>%
# Simplify
select(taxa_type, taxa_level, sciname_final, sciname_orig, sciname_matched_nwords, match_type) %>%
# Rename and arrange
rename(sciname=sciname_final) %>%
arrange(taxa_type, taxa_level, sciname) %>%
# Squish
mutate(sciname=stringr::str_squish(sciname))
# Inspect
# freeR::complete(spp_key2)
# Ones without out names
# Cyparica samplomoneta, Sciania hatei, Chichorus virginicus = don't know who these are
spp_key2 %>% filter(taxa_level=="species" & is.na(sciname_matched_nwords)) %>% pull(sciname_orig) %>% sort()
# Inspect remaining species with more than two words
spp_key2 %>% filter(taxa_type=="species" & nwords_freeR(sciname)>2) %>% pull(sciname) %>% sort()
# Inspect remaining species with more than two words
all_fish <- function(){
# Build FB key
taxa_key_fb <- rfishbase::load_taxa(server="fishbase") %>% #new syntax for loading fishbase taxa
as.data.frame() %>%
mutate(type="fish") %>%
select(type, everything()) %>%
setNames(tolower(colnames(.))) %>%
rename(sciname=species) %>%
mutate(species=stringr::word(sciname, start=2, end=sapply(strsplit(sciname, " "), length))) %>%
mutate_all(as.character)
# Build SLB key
taxa_key_slb <- rfishbase::load_taxa(server="sealifebase") %>%#new syntax for loading sealifebase taxa
as.data.frame() %>%
mutate(type="invert") %>%
select(type, everything()) %>%
setNames(tolower(colnames(.))) %>%
mutate(sciname=paste(genus, species)) %>%
mutate_all(as.character)
taxa_key <- taxa_key_fb %>%
bind_rows(taxa_key_slb) %>%
setNames(tolower(names(.))) %>%
select(type, class, order, family, genus, species, sciname) %>%
unique()
return(taxa_key)
}
taxa_table = all_fish() %>% #freeR package needs an update, I fixed it and included the function immediately above
mutate(is_right = 1) %>%
select(sciname, is_right) %>%
unique()
long_names = spp_key2 %>% filter(taxa_type=="species" & nwords_freeR(sciname)>2) %>% select(sciname, sciname_orig) %>%
separate(sciname, c("spp1", "spp2", "spp3"), " ", remove=F) %>%
mutate(name1 = paste(spp1, spp2, sep=" "),
name2 = paste(spp1, spp3, sp = " ")) %>%
select(sciname, sciname_orig, name1, name2) %>%
reshape2::melt(id.vars = c("sciname", "sciname_orig")) %>%
rename(name = value) %>%
select(-variable) %>%
mutate(name = stringr::str_squish(name)) %>%
left_join(taxa_table, by = c("name" = "sciname")) %>% #and because all_fish doesn't work, this doesn't work
drop_na(is_right) %>%
select(-sciname, -is_right) %>%
rename(sciname2 = name) %>%
distinct(sciname_orig, .keep_all = T)
spp_key3 = spp_key2 %>%
left_join(long_names) %>%
rename(sciname1 = sciname) %>%
mutate(sciname = if_else(is.na(sciname2), sciname1, sciname2)) %>%
select(taxa_type, taxa_level, sciname, sciname_orig, sciname_matched_nwords, match_type)
# Add updated scientific names to data
################################################################################
# Format
data_sci2 <- data_sci1 %>%
# Rename
rename(sciname_orig=sciname) %>%
# Add updated scientific name
left_join(spp_key3 %>% select(taxa_type:sciname_orig), by=c("sciname_orig")) %>%
# Simplify
select(sciname, sciname_orig, taxa_type, taxa_level, everything()) %>%
# Remove columns
select(-sciname_source)
# Inspect
# freeR::complete(data_sci2)
# Confirm that the datasets are the right size
#nrow(data_comm) + nrow(data_sci2) == nrow(data_orig)
# Input scientific name based on common name
################################################################################
##Assign scientific names
sci_common_names <- data_sci2 %>%
select(common_name, sciname, sciname_orig, taxa_type, taxa_level) %>%
distinct(common_name, .keep_all = TRUE) %>%
drop_na(common_name)
# Seperate those without scientific name
data_comm <- data_orig %>%
# Reduce to taxa without taxonomic information
filter(taxa_name_source %in% c("Food name (English)", "Food name (original)")) %>%
left_join(sci_common_names) %>%
select(-taxa_name, -taxa_name_source)
data_comm_sci = data_comm %>%
filter(!is.na(sciname))
data_sci3 = rbind(data_sci2, data_comm_sci) %>%
##Remove non-aquatic animals
filter(!sciname %in% c("Crocothemis servilia",
"Cybister tripunctatus",
"Laccotrephes maculatus",
"Lethocerus indicus",
"Hydrophilus olivaceous",
"Chichorus virginicus",
"Sciania hatei",
"Megaloancistrus aculeatus",
"Lysimachia nummularia",
"Rorippa amphibia",
"Sarcocornia ambigua",
"Najas armata",
"Lactobacillus delbrueckii",
"Ephemeroptera spp.",
"Polyrenus spp.",
"Dangia sp.",
"Chironomus sp")) %>%
##Fix more scinames
mutate(sciname = recode(sciname,
"Chionoectes opilio" = "Chionoecetes opilio",
"Salmophasia phulo" = "Salmostoma phulo",
"Lutianus blackfordii" = "Lutjanus campechanus",
"Somanniathelphusa sexpunctata" = "Sayamia sexpunctata",
"Odohenus rosmarus" = "Odobenus rosmarus",
"Nordotis discus discus" = "Haliotis discus",
"Photololigo chinensis" = "Uroteuthis chinensis",
"Dosinorbis japonicus" = "Dosinia japonica",
"Nordotis gigantea" = "Haliotis gigantea",
"Peronidia venulosa" = "Megangulus venulosus",
"Sulculus diversicolor supertexta" = "Haliotis discus",
"Patinigera magellanica" = "Nacella magellanica",
"Cyparica samplomoneta" = "Cypraea tigris",
"Clarius batrachus" = "Clarias batrachus",
"Pseudolithus senegalensis" = "Pseudotolithus senegalensis",
"Ophiocephalus maruleus" = "Channa marulius",
"Pleurogramma monopterygius" = "Pleurogrammus monopterygius",
"Sanguina sanguine" = "Pseudopusula sanguinea",
"Chrysophrys haffara" = "Rhabdosargus sarba",
"Batillus cornutus" = "Turbo cornutus",
"Nordotis discus" = "Haliotis discus",
"Euphasia superba" = "Euphausia superba",
"Pinctata radiata" = "Pinctada radiata",
"Rastelliger kanagurta" = "Rastrelliger kanagurta",
"Halopeltis wilsonis" = "Rhodymenia wilsonis"))
##Fill in taxonomic informtion
##Load Taxa_table
taxa_table <- readRDS(file=file.path(outdir,"taxa_table.Rds"))
missing_spp_matched_WORMS <- read_csv("data-raw/raw/missing_spp_matched_WORMS.csv")
dta_species = data_sci3 %>%
filter(taxa_level=="species") %>%
select(-c(genus, kingdom:family)) %>%
separate(sciname, c("genus", "spp"), " ", remove=FALSE) %>%
select(-spp) %>%
mutate(genus = tolower(genus)) %>%
left_join(taxa_table)
##Carrect otdated scientific names
spp_missing = dta_species %>%
filter(is.na(family)) %>%
left_join(missing_spp_matched_WORMS %>% select(ScientificName, ScientificName_accepted), by=c("sciname" = "ScientificName")) %>%
rename(sciname_new = ScientificName_accepted) %>%
select(-c(genus, kingdom:family)) %>%
separate(sciname_new, c("genus", "spp"), " ", remove=FALSE) %>%
select(-spp) %>%
mutate(genus = tolower(genus)) %>%
left_join(taxa_table)
spp_corrected = spp_missing %>%
filter(!is.na(family)) %>%
select(-sciname) %>%
rename(sciname = sciname_new)
dta_species = dta_species %>%
filter(!is.na(family)) %>%
rbind(spp_corrected)
##Fill taxonomic information for genus level
dta_genus = data_sci3 %>%
filter(taxa_level=="genus") %>%
select(-c(kingdom:genus)) %>%
separate(sciname, c("genus", "spp"), " ", remove=T) %>%
mutate(sciname = NA,
genus = tolower(genus)) %>%
select(-spp) %>%
filter(!genus %in% c("trypanotonous")) %>%
mutate(genus = recode(genus,
"oncorhyncus" = "oncorhynchus",
"allotheutis" = "alloteuthis",
"spiridia" = "spyridia",
"penaus" = "penaeus",
"mythrax" = "mithrax",
"rajja" = "raja",
"paralichtys" = "paralichthys",
"caldophora" = "cladophora",
"pomadasyas" = "pomadasys",
"bonga" = "ethmalosa",
"sargrassum" = "sargassum",
"clariasgariepinus" = "clarias",
"paralicthys" = "paralichthys",
"liza" = "planiliza",
"platicephalus" = "heptanchus",
"percaflavescens" = "perca",
"callinestes" = "callinectes",
"anadaragranosa" = "tegillarca",
"cynoglossusarel" = "cynoglossus",
"latescalcarifer" = "lates",
"metapenaeusaffinis" = "metapenaeus",
"polynemisindicus" = "Polynemus",
"scromberomoruguttatus" = "scomberomorus",
"canalicculata" = "Pomacea")) %>%
left_join(taxa_table)
genus_missing = dta_genus %>%
filter(is.na(family))
dta_genus = dta_genus %>%
filter(!is.na(family))
##Check if missing are family
genus_missing_family = genus_missing %>%
select(-c(kingdom:family)) %>%
mutate(genus = recode(genus,
"siluroidei" = "siluridae",
"cottoidea" = "cottidae",
"pinnipedia" = "phocidae",
"pleuronectinae" = "pleuronectidae",
"logio" = "loliginidae",
"scomper" = "scombridae",
"salmus" = "salmonidae")) %>%
left_join(taxa_table %>% select(family, order, class, phylum, kingdom) %>% distinct(family, .keep_all=T), by=c("genus" = "family"))
add_family = genus_missing_family %>%
filter(!is.na(order)) %>%
rename(family = genus) %>%
mutate(genus = NA,
taxa_level = "family")
##Check if missing are order
genus_missing_order = genus_missing_family %>%
filter(is.na(order)) %>%
select(-c(kingdom:order)) %>%
mutate(genus = recode(genus,
"brachyura" = "decapoda")) %>%
left_join(taxa_table %>% select(order, class, phylum, kingdom) %>% distinct(order, .keep_all=T), by=c("genus" = "order"))
add_order = genus_missing_order %>%
filter(!is.na(class)) %>%
rename(order = genus) %>%
mutate(genus = NA,
family = NA)
##Check if missing are class
genus_missing_class = genus_missing_order %>%
filter(is.na(class)) %>%
select(-c(kingdom:class)) %>%
mutate(genus = recode(genus,
"lamellibranchia" = "bivalvia",
"oligochaeta" = "polychaeta")) %>%
left_join(taxa_table %>% select(class, phylum, kingdom) %>% distinct(class, .keep_all=T), by=c("genus" = "class"))
add_class = genus_missing_class %>%
filter(!is.na(phylum)) %>%
rename("class" = genus) %>%
mutate(genus = NA,
family = NA,
order = NA)
##Check if missing are phylum
genus_missing_phylum = genus_missing_class %>%
filter(is.na(phylum)) %>%
select(-c(kingdom:phylum)) %>%
left_join(taxa_table %>% select(phylum, kingdom) %>% distinct(phylum, .keep_all=T), by=c("genus" = "phylum"))
add_phylum = genus_missing_phylum %>%
filter(!is.na(kingdom)) %>%
rename("phylum" = genus) %>%
mutate(genus = NA,
family = NA,
order = NA,
class = NA)
still_missing = genus_missing_phylum %>%
filter(is.na(kingdom))
add_common = still_missing %>%
filter(genus %in% c("various")) %>%
mutate(genus = NA,
family = NA,
order = NA,
sciname = NA,
phylum = NA,
class = NA)
##Missing taxa information
#"Salvinia natans", "Azolla pinnata", "Neptunia oleracea",
#"nannochloropsis", "ankistrodesmus", "chlorococcum", "cystoseria"
##Fill taxonomic information for family level taxa
dta_family = data_sci3 %>%
filter(taxa_level=="family") %>%
select(-c(kingdom:family)) %>%
separate(sciname, c("family", "spp"), " ", remove=T) %>%
mutate(sciname = NA,
family = tolower(family),
family = recode(family,
"branchiostegidae" = "malacanthidae",
"aloseinae" = "clupeidae",
"catostominae" = "catostomidae",
"haliotididae" = "haliotidae",
"hippoglossinae" = "pleuronectidae",
"petromyzontinae" = "petromyzontidae")) %>%
left_join(taxa_table %>% select(family, order, class, phylum, kingdom) %>% distinct(family, .keep_all=T)) %>%
select(-spp) %>%
rbind(add_family)
##Other taxa
dta_other = data_sci3 %>%
filter(taxa_level=="other")
##First assign taxa to those that have a genus assigned
dta_other_missing = dta_other %>%
select(sciname, genus) %>%
distinct(sciname, .keep_all = T) %>%
mutate(genus = gsub(',', "", genus),
genus = recode(genus,
"Sander<U+00a0>Lucioperca" = "Sanderlucioperca",
"Morone\nSaxatilis" = "Morone",
"Porphyra/Pyropia" = "Pyropia",
"Tisbe\nSp" = "Tisbe",
"Spirulinales" = "Spirulina",
"Centropristes" = "Centropristis"),
genus = tolower(genus)) %>%
mutate(genus = recode(genus,
"simmered" = "mysis",
"acanthropagrus" = "acanthopagrus",
"corallinales" = "amphora",
"navodon" = "monacanthus",
"sanderlucioperca" = "sander",
"theragra" = "gadus")) %>%
left_join(taxa_table)
dta_other_genus = dta_other_missing %>%
filter(!is.na(family))
##Fill with family
dta_other_missing1 = dta_other_missing %>%
filter(is.na(family)) %>%
select(-c(family:kingdom)) %>%
rename(family = genus) %>%
mutate(family = recode(family,
"clupeinae" = "clupeidae",
"catla" = "cyprinidae",
"order" = "teuthoidea",
"chilina" = "chilinidae",
"phosidae" = "phocidae",
"pleuronectinae" = "pleuronectidae"),
family = case_when(sciname == "Palaemonidae/penaeidae spp." ~ "palaemonidae",
sciname == "Penaeidae, pandalidae" ~ "penaeidae",
sciname == "Loligoidae, ommastrephidae" ~ "ommastrephidae",
sciname == "Bothidae, pleuronectidae" ~ "bothidae",
sciname == "Percichthyidae, centrarchidae" ~ "percichthyidae",
TRUE ~ family)) %>%
left_join(taxa_table %>% select(-genus) %>% drop_na(family) %>% distinct(family, .keep_all=T)) %>%
mutate(genus = NA)
dta_other_family = dta_other_missing1 %>%
filter(!is.na(order))
#Fill with order
dta_other_order = dta_other_missing1 %>%
filter(is.na(order)) %>%
select(-c(order:genus)) %>%
rename(order = family) %>%
mutate(order = recode(order,
"teuthoidea" = "teuthida",
"chilinidae" = "basommatophora",
"caridea" = "decapoda"),
order = case_when(sciname == "Astacus, orconectes,, procambarus spp." ~ "decapoda",
TRUE ~ order)) %>%
left_join(taxa_table %>% select(-genus, -family) %>% drop_na(order) %>% distinct(order, .keep_all=T)) %>%
mutate(genus = NA,
family = NA)
dta_other_taxa = rbind(dta_other_genus, dta_other_family, dta_other_order) %>%
distinct(sciname, .keep_all = T)
dta_other2 = dta_other %>%
select(-c(kingdom:genus)) %>%
left_join(dta_other_taxa)
data_sci4 = rbind(dta_species, dta_genus, dta_family, dta_other2) %>%
mutate(class = if_else(order == "Actinopterygii", "Actinopterygii", class),
order = na_if(order, "Actinopterygii")) %>%
select(-taxa_id, -taxa_db, -taxa_type, -taxa_level) %>%
select(sciname, sciname_orig, genus, family, order, class, phylum, kingdom, common_name, food_name, food_name_orig, everything()) %>%
unique()
##Further clean species without taxa information
# data_comm_Nosci = data_comm %>%
# filter(is.na(sciname)) %>%
# select(-kingdom, -phylum, -taxa_id, -taxa_db, -taxa_type, -taxa_level, -class, -family, -genus, -order, -notes, -sciname, -sciname_orig) %>%
# unique()
##Load Taxa_table
#taxa_table = readRDS("data-raw/taxa-table/taxa_table.Rds")
# ## Using scientific names, find names with corresponding common name in english fishbase db
# com_names_key = com_names_es_key %>%
# filter(!is.na(spec_code)) %>% # filter out names that don't have an associated species code
# rename(SpecCode=spec_code) %>%
# mutate(SpecCode=as.integer(SpecCode)) %>%
# left_join(rfishbase::fb_tbl("comnames") |> filter(Language == "English"), by="SpecCode") %>%
# select(SpecCode, com_name, ComName, PreferredName) %>%
# filter(!is.na(ComName) & PreferredName==1 | SpecCode==8255) %>%
# rename(es_name=com_name, en_name=ComName) %>%
# distinct() %>%
# select(es_name, en_name)
## Make a key of spanish common names to scientific names
com_names_es_key = data_comm %>%
filter(study_id=="LATINFOODS") %>%
separate(food_name_orig,
into=c("ComName"),
sep = "([_;,()%/])",
remove=F) %>%
left_join(rfishbase::fb_tbl("comnames") |> filter(Language == "Spanish"), by="ComName") %>%
left_join(rfishbase::fb_tbl("species"), by="SpecCode") %>%
select("ComName", "Language", "SpecCode", "Species", "Genus") %>%
unique() %>%
janitor::clean_names() %>%
mutate(genus=tolower(genus), com_name=tolower(com_name))%>%
left_join(taxa_table, by="genus") %>%
group_by(com_name) %>%
# mutate based on common taxa, start broad then narrow down
# the detailed taxa info isn't super necessary right now, but maybe will be used later on
mutate(
class = case_when(length(unique(class)) == 1 ~ class,
length(unique(class)) > 1 ~ ""),
order = case_when(class == NA ~ "",
length(unique(order)) == 1 ~ order,
length(unique(order)) > 1 ~ ""),
family = case_when(order == "" ~ "",
length(unique(family)) == 1 ~ family,
length(unique(family)) > 1 ~ ""),
genus = case_when(family == "" ~ "",
length(unique(genus)) == 1 ~ genus,
length(unique(genus)) > 1 ~ ""),
species = case_when(length(unique(spec_code)) == 1 ~ species,
length(unique(spec_code)) > 1 ~ ""),
spec_code = case_when(species == "" ~ "",
species != "" ~ as.character(spec_code)) #remove species code for those that don't have a common species
) %>%
mutate(sciname=case_when(species!="" ~ paste(genus, species), TRUE ~ "")) %>%
mutate_all(na_if,"") %>%
select(-c(spec_code, species, language)) %>%
unique()
data_comm2 = data_comm %>%
filter(is.na(sciname)) %>%
rbind(add_common) %>%
select(-kingdom, -phylum, -taxa_id, -taxa_db, -taxa_type, -taxa_level, -class, -family, -order, -genus) %>%
left_join(com_names_es_key, by=c("common_name"="com_name")) %>%
unique() %>%
mutate(food_name_orig = if_else(is.na(food_name_orig), food_name, food_name_orig),
food_name = enc2native(food_name), #added encoding to native to play nice between windows/mac/linux
food_name = str_to_lower(food_name),
##class
class = case_when(
#Decapoda
str_detect(food_name, paste(c("fish", "char"), collapse = '|')) ~ "actinopterygii",
str_detect(food_name, paste(c("shark", "ray"), collapse = '|')) ~ "chondrichthyes",
str_detect(food_name, paste(c("shellfish", "mollusk"), collapse = '|')) ~ "bivalvia",
str_detect(food_name, paste(c("snail"), collapse = '|')) ~ "gastropoda"),
##Order
order = case_when(
#Decapoda
str_detect(food_name, paste(c("shrimp", "crab", "lobster", "prawn"), collapse = '|')) ~ "decapoda",
#Squid
str_detect(food_name, "squid") ~ "teuthida",
#Herrings
str_detect(food_name, paste(c("herring", "anchovy", "sardine"), collapse = '|')) ~ "clupeiformes",
#Cod
str_detect(food_name, "cod") ~ "gadiformes",
#Turtle
str_detect(food_name, "turtle") ~ "testudines",
#Pike
str_detect(food_name, "pike") ~ "esociformes",
#Carp
str_detect(food_name, "carp") ~ "cypriniformes"),
##Family
family = case_when(
#Shrimps
str_detect(food_name, "shrimp") ~ "penaeidae",
#mussels
str_detect(food_name, "mussel") ~ "mytilidae",
#oysters
str_detect(food_name, "oyster") ~ "ostreidae",
#octopus
str_detect(food_name, "octopus") ~ "octopodidae",
#tuna
str_detect(food_name, "tuna") ~ "scombridae",
#salmon, trout
str_detect(food_name, paste(c("salmon", "trout"), collapse = '|')) ~ "salmonidae",
#flatfish
str_detect(food_name, "flat") ~ "scophthalmidae",
#conch
str_detect(food_name, "conch") ~ "strombidae",
#rockfish
str_detect(food_name, "rockfish") ~ "sebastidae",
#Milkfish
str_detect(food_name, "milk fish") ~ "chanidae",
#Swimming crab
str_detect(food_name, "swimming crab") ~ "portunidae",
#Halibut
str_detect(food_name, "halibut") ~ "pleuronectidae",
#Mackerel
str_detect(food_name, "mackerel") ~ "scombridae",
#Agar seaweed
str_detect(food_name, "agar") ~ "gelideaceae",
#Nori seaweed
str_detect(food_name, "nori") ~ "bangiaceae",
#Grunt
str_detect(food_name, "corocoro") ~ "haemulidae"
),
genus = case_when(
str_detect(food_name, "corocoro") ~ "haemulom"
),
#Species
sciname = case_when(
#Corocoro aka Grunt
str_detect(food_name, "corocoro") ~ "haemulom aurolineatum",
!is.na(sciname.x) ~ sciname.x,
TRUE ~ sciname.y
)
) %>%
select(-c(sciname.x, sciname.y))
afcd_common_family = data_comm2 %>%
filter(!is.na(family)) %>%
select(-class, -order, -phylum, -kingdom) %>%
left_join(taxa_table %>% select(-genus) %>% unique())
afcd_common_order = data_comm2 %>%
filter(is.na(family),
!is.na(order)) %>%
select(-class, -phylum, -kingdom) %>%
left_join(taxa_table %>% select(-genus, -family) %>% unique())
afcd_common_class = data_comm2 %>%
filter(is.na(family),
is.na(order),
!is.na(class)) %>%
select(-phylum, -kingdom) %>%
left_join(taxa_table %>% select(-genus, -family, -order) %>% unique())
afcd_missing = data_comm2 %>%
filter(is.na(family),
is.na(order),
is.na(class))
data_comm_taxa = rbind(afcd_common_family,
afcd_common_order,
afcd_common_class)
data_sci5 = rbind(data_sci4, data_comm_taxa) %>%
mutate(portion_size = "per 100g") %>%
left_join(fao_sau_key,by=c("sciname"="species_merge")) %>%
rename(isscaap_code=isscaap) %>%
select(sciname:food_name_orig,isscaap_code:sau_species_code,everything())
##nutrients missing
# missing_nutrients = data_sci5 %>%
# filter(is.na(nutrient)) %>%
# rbind(data_sci5 %>% filter(nutrient_units %in% c("Not provided in unformatted AFCD", "none"))) %>%
# distinct(nutrient_orig, .keep_all = T) %>%
# select(nutrient_orig, nutrient, nutrient_type, nutrient_desc, nutrient_units)
# Export data with some taxonomic information
saveRDS(data_sci5, file=file.path(outdir, "AFCD_data_taxa.Rds"))
# Export data with complete scientific name
data_sci_only = data_sci5 %>%
drop_na(sciname) %>%
select(
sciname, sciname_orig, genus, family, order, class,
common_name, food_name, food_name_orig,
fao_taxocode,fao_3a_code,sau_species_code,isscaap_code,
everything()) %>%
unique()
saveRDS(data_sci_only, file=file.path(outdir, "AFCD_data_sci.Rds"))
##Export data without scientific names
saveRDS(afcd_missing, file=file.path(outdir, "AFCD_data_comm.Rds"))
##Export data in the wide format (data_taxa)
# data_taxa_wide = data_sci5 %>%
# distinct(sciname, sciname_orig, genus, family, order, class, common_name, food_name, food_name_orig, fct_code_orig, common_name_detailed, food_prep, food_prep_detailed, food_part, food_part_detailed, prod_catg, other_ingredients, study_type, study_id, country, edible_prop, notes, nutrient_type, nutrient, nutrient_orig, nutrient_desc, nutrient_code_fao, nutrient_units, .keep_all = T) %>%
# spread(nutrient, value)
# Export
#saveRDS(data_taxa_wide, file=file.path(outdir, "AFCD_data_taxa_wide.Rds"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.