knitr::opts_chunk$set(collapse = TRUE,comment = "#>")
This vignette aims to showcase a use case using the 2 main functions of metajam
- download_d1_data
and read_d1_files
using a data processing workflow developed by the NCO synthesis working group Stream Elemental Cycling.
The datasets used are from the LTER site - Luquillo and can be found in the PASTA data repository https://dx.doi.org/doi:10.6073/pasta/f9df56348f510da0113b1e6012fa2967. This data package is a collection of 8 datasets of stream water samples from 8 different locations of the Luquillo Mountains.
Our goal is to read the data for the 8 different sampling sites and aggregate them into one harmonized dataset. We will use the metadata to check if the data structures and units are the same across the 8 different sampling sites before performing the aggregation.
#devtools::install_github("NCEAS/metajam") library(metajam) # For wrangling the data library(readr) library(tidyr) library(dplyr) library(purrr) library(stringr)
# Download the data from DataONE on your local machine data_folder <- "Data_SEC" # Ammonium to Ammoniacal-nitrogen conversion. We will use this conversion later. coeff_conv_NH4_to_NH4N <- 0.7764676534
# Create the local directory to store datasets dir.create(data_folder, showWarnings = FALSE) # Get the datasets unique identifiers test_datasets_listing <- readr::read_csv(system.file("extdata", "LTER-SEC_DatasetsListing_SearchedData.csv", package = "metajam")) # Keep only the LUQ related datasets luq_test_datasets <- test_datasets_listing %>% dplyr::filter(grepl("LUQ", .$`LTER site abbreviation`)) %>% dplyr::select(`LTER site abbreviation`, `Data Repository (PASTA) URL to Archive/Metadata`, `Data Repository (PASTA) URL to File`, `Data Repository (PASTA) Filename`) %>% na.omit() %>% dplyr::arrange(`Data Repository (PASTA) Filename`) # sort the data sets alphabetically ## Batch download the datasets # the tidiest way local_datasets <- purrr::map(.x = luq_test_datasets$`Data Repository (PASTA) URL to File`, .f = ~ download_d1_data(.x, data_folder)) # the apply way # local_datasets <- lapply(luq_test_datasets$`Data Repository (PASTA) URL to File`, download_d1_data, data_folder) # the map way # local_datasets <- map(luq_test_datasets$`Data Repository (PASTA) URL to File`, function(x) {download_d1_data(x, data_folder)})
At this point, you should have all the data and the metadata downloaded inside your main directory; Data_SEC
in this example. metajam
organize the files as follow:
my_data.csv
__full_metadata.xml
: my_data__full_metadata.xml
__summary_metadata.csv
: my_data__summary_metadata.csv
__attribute_metadata.csv
: my_data__attribute_metadata.csv
__attribute_factor_metadata.csv
: my_data__attribute_factor_metadata.csv
# You could list the datasets dowloaded in the `Data_SEC` folder # local_datasets <- dir(data_folder, full.names = TRUE) # or you can directly use the outputed paths from download_d1_data # Read all the datasets and their associated metadata in as a named list luq_datasets <- purrr::map(local_datasets, read_d1_files) %>% purrr::set_names(purrr::map(., ~.x$summary_metadata$value[.x$summary_metadata$name == "File_Name"]))
Is the data structure the same across sampling sites (datasets)? For example, do the datasets all have the same column names?
# list all the attributes attributes_luq <- luq_datasets %>% purrr::map("data") %>% purrr::map(colnames) # Check if they are identical by comparing all against the first site for(ds in names(attributes_luq)) { print(identical(attributes_luq[[1]], attributes_luq[[ds]])) } #> => We are good, same data structure across the sampling sites
Is data reported in identical units? For example, in every dataset is CI reported in microgramsPerLiter?
# List all the units used luq_units <- luq_datasets %>% purrr::map("attribute_metadata") %>% purrr::map(~.[["unit"]]) # Check if they are identical by comparing all against the first site for(us in names(luq_units)) { print(identical(luq_units[[1]], luq_units[[us]])) } #>!!! => The 2 last datasets have different units!!!!!!!!!! # Let's check the differences luq_units_merged <- luq_datasets %>% purrr::map("attribute_metadata") %>% purrr::map(. %>% select(attributeName, unit)) %>% purrr::reduce(full_join, by = "attributeName") ## Rename # Create the new names luq_new_colnames <- names(luq_units) %>% stringr::str_split("[.]") %>% purrr::map(~.[1]) %>% paste("unit", ., sep = "_") # Apply the new names colnames(luq_units_merged) <- c("attributeName", luq_new_colnames)
RioIcacos
and RioMameyesPuenteRoto
, the units used for the gage height ("Gage_Ht") are in feet and not meters like the other sitesRioIcacos
and RioMameyesPuenteRoto
, NH4
and not NH4-N
is measured# fix attribute naming discrepancies -- to be improved # Copy the units for Gage height luq_units_merged <- luq_units_merged %>% dplyr::mutate(unit_RioIcacos = ifelse(test = attributeName == "Gage_Ht", yes = "foot", no = unit_RioIcacos), unit_RioMameyesPuenteRoto = ifelse(test = attributeName == "Gage_Ht", yes = "foot", no = unit_RioMameyesPuenteRoto)) # Copy the units for NH4 luq_units_merged <- luq_units_merged %>% dplyr::mutate(unit_RioIcacos = ifelse(test = attributeName == "NH4-N", yes = "microgramsPerLiter", no = unit_RioIcacos), unit_RioMameyesPuenteRoto = ifelse(test = attributeName == "NH4-N", yes = "microgramsPerLiter", no = unit_RioMameyesPuenteRoto)) # drop the 2 last rows luq_units_merged <- head(luq_units_merged, -2) ### Implement the unit conversion for RioIcacos and RioMameyesPuenteRoto ---- # Simplify naming RioIcacos_data <- luq_datasets$RioIcacos$data RioIcacos_attrmeta <- luq_datasets$RioIcacos$attribute_metadata ## RioIcacos # Fix NAs. In this dataset "-9999" is the missing value code. So we need to replace those with NAs RioIcacos_data <- na_if(RioIcacos_data, "-9999") # Do the unit conversion RioIcacos_data <- RioIcacos_data %>% dplyr::mutate( `Gage_Ht` = `Gage_Ht`* 0.3048) # Update the units column accordingly RioIcacos_attrmeta <- RioIcacos_attrmeta %>% dplyr::mutate(unit = gsub(pattern = "foot", replacement = "meter", x = unit)) # Do the unit conversion for RioIcacos and RioMameyesPuenteRoto - NH4 to NH4-N # Ammonium to Ammoniacal-nitrogen conversion coeff_conv_NH4_to_NH4N <- 0.7764676534 # Unit conversion for RioIcacos and RioMameyesPuenteRoto - NH4 to NH4-N RioIcacos_data <- RioIcacos_data %>% mutate( `NH4-N` = `NH4-N`* coeff_conv_NH4_to_NH4N) # Update the main object luq_datasets$RioIcacos$data <- RioIcacos_data ## RioMameyesPuenteRoto # Simplify naming RioMameyesPuenteRoto_data <- luq_datasets$RioMameyesPuenteRoto$data RioMameyesPuenteRoto_attrmeta <- luq_datasets$RioMameyesPuenteRoto$attribute_metadata #Replace all cells with the missing value code ("-9999") with "NA" RioMameyesPuenteRoto_data <- na_if(RioMameyesPuenteRoto_data, "-9999") #Tidy version of unit conversion RioMameyesPuenteRoto_data <- RioMameyesPuenteRoto_data %>% dplyr::mutate(`Gage_Ht` = `Gage_Ht`* 0.3048) # Update the units column accordingly RioMameyesPuenteRoto_attrmeta <- RioMameyesPuenteRoto_attrmeta %>% dplyr::mutate(unit = gsub(pattern = "foot", replacement = "meter", x = unit)) # Do the unit conversion for RioMameyesPuenteRoto - NH4 to NH4-N #In this dataset the NH4-N column is actually empty, so this is not necessary. But here is how you would do it if you had to. RioMameyesPuenteRoto_data <- RioMameyesPuenteRoto_data %>% dplyr::mutate( `NH4-N` = `NH4-N`* coeff_conv_NH4_to_NH4N) # Update the main object luq_datasets$RioMameyesPuenteRoto$data <- RioMameyesPuenteRoto_data
# bind the sampling sites data into one master dataset for LUQ all_sites_luq <- luq_datasets %>% purrr::map("data") %>% dplyr::bind_rows(.id = "prov") # Replace -9999 with NAs all_sites_luq <- na_if(all_sites_luq, "-9999") # Write as csv write_csv(all_sites_luq, "stream_chem_all_LUQ.csv")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.