# Clear workspace
rm(list = ls())
# Setup
################################################################################
# Packages
library(tidyverse)
# Directories
indir <- "data-raw/raw"
outdir <- "data-raw/processed"
plotdir <- "data-raw/figures"
# Resources
# GitHub: https://github.com/zachkoehn/aquatic_foods_nutrient_database
# DataVerse: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/KI0NYM
# Nature: https://www.nature.com/articles/s41586-021-03917-1?proof=t%2Btarget%3D#data-availability
# Read data
data_orig <- read.csv(file.path(indir, "20240724_AFCD.csv"), na.strings = c("", "NA"))
# Read reference key
ref_fct_orig <- readxl::read_excel(file.path(indir, "afcd_references.xlsx"), sheet="fct_references")
ref_peer_orig <- readxl::read_excel(file.path(indir, "afcd_references.xlsx"), sheet="peer_review_references")
# Read column key
col_key_orig <- readxl::read_excel(file.path(indir, "afcd_variable_codex.xlsx"))
# Build reference key
################################################################################
# Format FCT reference key
ref_fct <- ref_fct_orig %>%
# Rename
janitor::clean_names() %>%
rename(study_id=study_id_number,
doi=link_to_dataset,
database=nutrient_database,
units=nutrients_per) %>%
# Add study type
mutate(study_type="Food Composition Table (FCT)") %>%
# Arrange
select(study_type, study_id, citation, doi, database, units, everything()) %>%
# remove old versions of FCTs as new versions are published
filter(
study_id != "USA_USDA_2019"
) %>%
# Remove unneeded columns
select(-c(notes,added_by, already_included, format))
# Inspect
colnames(ref_fct)
table(ref_fct$units)
# Format peer reviewed reference key
ref_peer <- ref_peer_orig %>%
# Rename
janitor::clean_names() %>%
rename(study_id=study_id_number,
doi=study_doi,
country_origin_study=study_region,
citation=study_apa_citation) %>%
# Add study type
mutate(study_type="Peer-reviewed literature") %>%
# Convert study id
mutate(study_id=as.character(study_id)) %>%
# Arrange
select(study_type, study_id, everything()) %>%
# Remove useless columns
select(-x5)
# Inspect
colnames(ref_peer)
# Merge reference key & extract year
ref_key <- bind_rows(ref_peer, ref_fct) %>%
arrange(study_type, study_id) %>%
# extract year
mutate(study_year=str_extract(citation, "\\(\\d{4}\\)"),
study_year=case_when(is.na(study_year) ~ str_extract(citation, "(2\\d{3}|19\\d{2})"), TRUE ~ study_year),
study_year=gsub('(\\(|\\))', "", study_year)
) %>%
select(study_type, study_id, citation, everything())
# Inspect
# freeR::complete(ref_key)
# Export
saveRDS(ref_key, file.path(outdir, "AFCD_reference_key.Rds"))
# Step 1. Rename columns and go from wide to long
################################################################################
##Combine some columns
dta = data_orig %>%
mutate(energy_total_combined = if_else(is.na(Energy_total_metabolizable_calculated_from_the_energy_producing_food_components_original_as_from_source_kcal),
Energy_total_metabolizable_calculated_from_the_energy_producing_food_components_original_as_from_source_kj/4.184,
Energy_total_metabolizable_calculated_from_the_energy_producing_food_components_original_as_from_source_kcal),
protein_total_combined = if_else(is.na(Protein_total_calculated_from_total_nitrogen),
Protein_total_method_of_determination_unknown_or_variable,
Protein_total_calculated_from_protein_nitrogen),
nitrogen_total_combined = if_else(is.na(Nitrogen_total),
Nitrogen_nonprotein,
Nitrogen_total),
nitrogen_total_combined = if_else(is.na(nitrogen_total_combined),
Nitrogen_protein,
nitrogen_total_combined),
vitamin_a_combined = if_else(is.na(Vitamin_a_retinol_activity_equivalent_rae_calculated_by_summation_of_the_vitamin_a_activities_of_retinol_and_the_active_carotenoids),
Retinol,
Vitamin_a_retinol_activity_equivalent_rae_calculated_by_summation_of_the_vitamin_a_activities_of_retinol_and_the_active_carotenoids),
vitamin_a_combined = if_else(is.na(vitamin_a_combined),
0.3*Vitamin_a_international_units_iu_sum_of_carotenoids_usda_indicates_over_estimates_bioavailability,
vitamin_a_combined),
DHA = if_else(is.na(Fatty_acid_20_5), Fatty_acid_20_5_n3, Fatty_acid_20_5),
DHA = if_else(is.na(DHA), Fatty_acid_20_5_cis_n3, DHA),
EPA = if_else(is.na(Fatty_acid_22_6), Fatty_acid_22_6_n3, Fatty_acid_22_6),
EPA = if_else(is.na(EPA), Fatty_acid_22_6_cis_n3, EPA),
ALA = if_else(is.na(Fatty_acid_18_3), Fatty_acid_18_3_n3, Fatty_acid_18_3),
ALA = if_else(is.na(ALA), Fatty_acid_18_3_cis_n3, ALA),
DHA_EPA = if_else(is.na(EPA), DHA, EPA+DHA),
DHA_EPA = if_else(is.na(DHA_EPA), EPA, DHA_EPA),
Country_origin_sample=ifelse(is.na(Country_origin_sample),Country_iso3,Country_origin_sample)
) %>%
select(-Notes) %>%
rename(Notes=Notes_on_laboratory_analysis_methods_to_calculate_the_nutrient_composition_values) %>%
select(
Taxa_name,Kingdom,Class,Order,Family,Genus,Taxa_id,Parts_of_food,Preparation_of_food,
Production_category,Edible_portion_coefficient,Study_id_number,Country_origin_sample,Country_origin_study,
Country_origin_sample_detail,Peer_review,Phylum,
Taxa_db,Country_iso3,Original_fct_food_code,Food_name_in_english,Food_name_in_original_language,Notes,
everything()
)
# Format data
data1 <- dta %>%
# Rename columns
janitor::clean_names() %>%
rename(sciname=taxa_name,
food_part=parts_of_food,
food_prep=preparation_of_food,
prod_catg=production_category,
edible_prop=edible_portion_coefficient,
study_id=study_id_number,
fct_code_orig=original_fct_food_code,
food_name=food_name_in_english,
food_name_orig=food_name_in_original_language
) %>%
# Arrange
select(sciname:food_name_orig,country_origin_sample,country_origin_sample_detail,country_origin_study, notes, everything()) %>%
# Gather nutrients (maintain capitalization)
gather(key="nutrient_orig", value="value", 24:ncol(.)) %>%
mutate(nutrient_orig=stringr::str_to_sentence(nutrient_orig)) %>%
# Reduce to rows with no data and remove old versions of FCTs
filter(
!is.na(value),
study_id != "USA_USDA_2019"
)
# Inspect
# freeR::complete(data1)
# Step 2. Build nutrient key
################################################################################
# Build column key
col_key <- col_key_orig %>%
# Rename
janitor::clean_names() %>%
rename(col_id=x1, col_name=afcd_variable_name, units=unit, fao_code=fao_tagname_if_applicable)
# Build nutrient key
nutr_col_key <- col_key %>%
# Simplify
select(-col_id) %>%
# Reduce to nutrients
filter(units!="none" | is.na(units)) %>%
# Rename
rename(nutrient_orig=col_name) %>%
# Arrange
select(nutrient_orig, units, fao_code, description) %>%
unique()
# Identify nutrients in data
nutr_key_orig <- data1 %>%
# Identify nutrients in dataset
select(nutrient_orig) %>%
unique() %>%
arrange(nutrient_orig) %>%
# Add known meta-data from column key
left_join(nutr_col_key, by="nutrient_orig") %>%
# Format nutrient name
mutate(nutrient=nutrient_orig %>% gsub("_", " ", .)) %>%
# Arrange
select(nutrient_orig, nutrient, units, description, everything())
# Export for formatting outside R
# write.csv(nutr_key_orig, file.path(indir, "AFCD_nutrient_key_work.csv"), row.names = F)
# Step 3. Format data
################################################################################
# Read formatted key
nutr_key_use <- readxl::read_excel(file.path(indir, "AFCD_nutrient_key_work.xlsx"), na="NA")
# Format data some more
data2 <- data1 %>%
# Format scientific name
mutate(sciname=stringr::str_to_sentence(sciname),
sciname=stringr::str_trim(sciname)) %>%
mutate(sciname=recode(sciname,
"Can"="Cancer spp."),
sciname=ifelse(sciname=="Etc.", NA, sciname)) %>%
# Format other taxonomic info
mutate(across(.cols=kingdom:genus, .fns=stringr::str_to_title),
across(.cols=kingdom:genus, .fns=stringr::str_trim)) %>%
# Format ETC in genus
mutate(genus=ifelse(toupper(genus)=="ETC.", NA, genus)) %>%
# Format taxa database
mutate(taxa_db=stringr::str_to_upper(taxa_db)) %>%
# Format food parts
mutate(food_part=gsub("_", " ", food_part)) %>%
# Format food preparation
mutate(food_prep=gsub("_", " ", food_prep)) %>%
# Format production category
mutate(prod_catg=gsub("_", " ", prod_catg)) %>%
# Add reference type
mutate(study_id=ifelse(is.na(study_id), "Not provided in unformatted AFCD", study_id)) %>%
left_join(ref_key %>% select(study_id, study_type), by=c("study_id")) %>%
mutate(study_type=ifelse(is.na(study_type), "Id not in AFCD reference key", study_type)) %>%
# Format IS0
mutate(
country_origin_sample=stringr::str_trim(country_origin_sample),
country_origin_sample=ifelse(is.na(country_origin_sample), "Not provided in unformatted AFCD", country_origin_sample),
country_origin_sample=recode(country_origin_sample,
"SAu"="SAU",
"BNG"="IND", # West Bengal which is part of India - study 1407
"GRB"="GBR", # study 789 mis-recorded
"KHG"="ITA", # study 338 mis-recorded
"MYL"="MYS", # study 1438 mis-recorded
"PNDB"="Pacific Region",
"smiling_cambodia"="KHM",
"smiling_indonesia"="IDN",
"smiling_laos"="LAO",
"smiling_thailand"="THA",
"smiling_vietnam"="VNM",
"unknown (Caspian Sea)"="Caspian Sea",
"unknown"="Unknown",
"POL/ AUS"="POL, AUS",
"FAO.biodiv3"="FAO Biodiv 3",
"FAO.infoods.ufish1"="FAO INFOODS Ufish",
"FAO.infoods.west.africa"="FAO INFOODS West Africa",
"FAO.latinfoods"="FAO Latin Foods",
"USA_USDA_2022"="USA"),
country_origin_sample=str_replace_all(country_origin_sample,", ",","),
country_origin_sample=str_replace_all(country_origin_sample,"_",","),
country_origin_study=str_replace_all(country_origin_study,", ",","),
country_origin_study=str_replace_all(country_origin_study,"_",","),) %>%
# Add country
mutate(country=countrycode::countrycode(country_origin_sample, "iso3c", "country.name")) %>%
mutate(country=ifelse(is.na(country), country_origin_sample, country),
country=recode(country,
"BGD, KHM"="Bangladesh, Cambodia",
"CHN, JPN, KOR"="China, Japan, South Korea",
"CHN, TWN"="China, Taiwan",
"KOR, CHN"="South Korea, China",
"FRA, GBR"="France, Great Britain",
"NOR, FRA, ISL"="Norway, France, Israel",
"POL, AUS"="Poland, Australia")) %>%
# Add nutrients
left_join(nutr_key_use, by=c("nutrient_orig")) %>%
rename(nutrient_units=units, nutrient_desc=description, nutrient_code_fao=fao_code) %>%
# Format nutrient units/description
mutate(nutrient_units=ifelse(is.na(nutrient_units), "Not provided in unformatted AFCD", nutrient_units),
nutrient_desc=ifelse(is.na(nutrient_desc), nutrient, nutrient_desc)) %>%
# Fix up scientific names with "includes"
mutate(sciname=recode(sciname,
"Includes a mix of species belonging to the astacidae"="Astacidae spp.",
"Includes a mix of species belonging to the ommastrephidae family"="Ommastrephidae spp.",
"Includes a mix of species belonging to the palaemonidae family"="Palaemonidae spp."),
sciname=ifelse(grepl("includes", tolower(sciname)), NA, sciname),
genus=ifelse(genus=="Includes", NA, genus)) %>%
# Format scientific name
mutate(sciname_source=ifelse(!is.na(sciname), "Provided",
ifelse(!is.na(genus), "Genus",
ifelse(!is.na(family), "Family",
ifelse(!is.na(order), "Order",
ifelse(!is.na(food_name), "Food name (English)", "Food name (original)")))))) %>%
mutate(sciname=ifelse(sciname_source=="Provided", sciname,
ifelse(sciname_source=="Genus", genus,
ifelse(sciname_source=="Family", family,
ifelse(sciname_source=="Order", order,
ifelse(sciname_source=="Food name (English)", food_name, food_name_orig)))))) %>%
# Rename scientific name columns
rename(taxa_name=sciname, taxa_name_source=sciname_source) %>%
# Arrange
select(taxa_name, taxa_name_source, kingdom:taxa_db,
study_type, study_id, peer_review, country_origin_sample, country_origin_sample_detail,country_origin_study,
prod_catg, food_part, food_prep, food_name, food_name_orig, fct_code_orig, edible_prop, notes,
nutrient_type, nutrient, nutrient_orig, nutrient_desc, nutrient_code_fao, nutrient_units, value, everything()) %>%
# Remove unimportant columns
select(-c(peer_review))
# Inspect scinames with "includes"
data2 %>%
filter(grepl(pattern="includes|Includes", x=taxa_name)) %>% pull(taxa_name) %>% unique() %>% sort()
###Exclude studies with unrealistic values
data2 = data2 %>%
filter(!study_id == "465")
# Export data
################################################################################
# Export data
saveRDS(data2, file=file.path(outdir, "AFCD_data_pass1.Rds"))
# Step 4. Inspect data
################################################################################
# Inspect
# str(data2)
# freeR::complete(data2)
# Inspect taxa
table(data2$kingdom)
sort(unique(data2$phylum))
sort(unique(data2$order))
sort(unique(data2$family))
sort(unique(data2$genus))
table(data2$taxa_db)
# Inspect food parts
table(data2$food_part)
table(data2$food_prep)
table(data2$prod_catg)
# Inspect edible proportions (should be 0-1)
range(data2$edible_prop, na.rm=T)
# Inspect nutrient units
table(data2$nutrient_units)
# Inspect study characteristics
sort(unique(data2$study_id))
# Check study type vs. peer review
# Add peer-review back in to make this work
# table(data2$peer_review)
# data2 %>%
# group_by(study_type, peer_review) %>%
# summarize(n=n())
# Study ids not in key
data2$study_id[!data2$study_id %in% ref_key$study_id] %>% unique() %>% sort()
# Study ids in key not in data
ref_key$study_id[!ref_key$study_id %in% data2$study_id] %>% unique() %>% sort()
# Inspect foods
sort(unique(data2$fct_code_orig))
sort(unique(data2$food_name)) # terrible
sort(unique(data2$food_name_orig)) # terrible
# Inspect countries
sort(unique(data2$country_origin_study))
sort(unique(data2$country_origin_sample))
sort(unique(data2$country))
# Nutrient key
################################################################################
# Build nutrient key
nutr_key <- data2 %>%
# Summarize
group_by(nutrient_type, nutrient, nutrient_units, nutrient_desc, nutrient_code_fao) %>%
summarize(n=n()) %>%
ungroup() %>%
# Remover
filter(nutrient_type!="Non-nutrient")
# Export data
saveRDS(nutr_key, file=file.path(outdir, "AFCD_nutrient_key.Rds"))
write.csv(nutr_key,
file = file.path(indir,"afcd_nutrient_key.csv"),
row.names=FALSE
)
# Inspect
# freeR::complete(nutr_key)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.