Prepare the environment

Reset R's brain

#----------------------------------------------------------
# Start from a clean slate: remove every object that ls() reports in the
# current environment (the workspace when this is run at top level)
# Note: ls() does not list names beginning with a dot by default, and
#       packages that are already attached stay attached
#----------------------------------------------------------
rm(list = ls())

Set working directory

#-------------------------------------------------
# Print the current working directory so that it can be checked against
# the desired location
#-------------------------------------------------
getwd()

#-------------------------------------------------
# If the printed directory is not the desired one, uncomment the
# following command and replace <path_to_dir> with the correct path
# before executing it
#-------------------------------------------------
# setwd("<path_to_dir>")

Load rtry package

#-------------------------------------------------
# Attach the rtry package
# library() stops with an error if the package is not installed,
# so make sure rtry is installed before running this
# Then print the version of the installed rtry package
#-------------------------------------------------
library(rtry)
packageVersion("rtry")

Data pre-processing

Import the TRY data text file

#-------------------------------------------------
# Locate the first sample dataset (data_TRY_15160.txt) in the "testdata"
# folder of the installed rtry package and print its full path
# Note: system.file() returns an empty string if the file cannot be found
#-------------------------------------------------
path_to_data <- system.file(
  "testdata", "data_TRY_15160.txt",
  package = "rtry"
)
path_to_data

#-------------------------------------------------
# Import the TRY data text file at that path and keep it as TRYdata1
#-------------------------------------------------
TRYdata1 <- rtry_import(path_to_data)

There are two ways to view the imported data

# Method 1: Print the first 6 rows of the data using the head() function
head(TRYdata1)

# Method 2: Open the imported data in a data viewer using the View() function
# (the original tutorial notes that the viewer is only available in RStudio)
# View() needs an interactive session, so the call is skipped when the
# script is run non-interactively (e.g. with Rscript)
if (interactive()) {
  View(TRYdata1)
}

Import another TRY dataset and view the data

# Locate the second sample dataset (data_TRY_15161.txt) shipped with rtry
# and print its full path
# Note: path_to_data is reused, so the path of the first dataset is overwritten
path_to_data <- system.file("testdata", "data_TRY_15161.txt", package = "rtry")
path_to_data

# Import the second dataset as TRYdata2 and print its first 6 rows
TRYdata2 <- rtry_import(path_to_data)
head(TRYdata2)

# Open the data viewer only in an interactive session; View() needs one,
# so the call is skipped when the script is run non-interactively
# (e.g. with Rscript)
if (interactive()) {
  View(TRYdata2)
}

Explore the imported data

#-------------------------------------------------
# Explore TRYdata1 with rtry_explore(), grouping by three sets of columns:
#   1. TraitID and TraitName
#   2. AccSpeciesID, AccSpeciesName, TraitID and TraitName
#   3. DataID, DataName, TraitID and TraitName, sorted by TraitID
# Note: An entry whose TraitID is NA is ancillary data
# Each result is printed in full; uncomment the View() line below it to
# open the result in the data viewer
#-------------------------------------------------

# 1. Grouped by trait
TRYdata1_explore_trait <- rtry_explore(
  TRYdata1,
  TraitID, TraitName
)
TRYdata1_explore_trait
# View(TRYdata1_explore_trait)

# 2. Grouped by species and trait
TRYdata1_explore_species <- rtry_explore(
  TRYdata1,
  AccSpeciesID, AccSpeciesName, TraitID, TraitName
)
TRYdata1_explore_species
# View(TRYdata1_explore_species)

# 3. Grouped by data and trait, sorted by TraitID
TRYdata1_explore_anc <- rtry_explore(
  TRYdata1,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
TRYdata1_explore_anc
# View(TRYdata1_explore_anc)
#-------------------------------------------------
# Explore TRYdata2 with the same three groupings
# Note: An entry whose TraitID is NA is ancillary data
#-------------------------------------------------

# Grouped by TraitID and TraitName
TRYdata2_explore_trait <- rtry_explore(
  TRYdata2,
  TraitID, TraitName
)
TRYdata2_explore_trait
# View(TRYdata2_explore_trait)

# Grouped by AccSpeciesID, AccSpeciesName, TraitID and TraitName
TRYdata2_explore_species <- rtry_explore(
  TRYdata2,
  AccSpeciesID, AccSpeciesName, TraitID, TraitName
)
TRYdata2_explore_species
# View(TRYdata2_explore_species)

# Grouped by DataID, DataName, TraitID and TraitName;
# the sortBy argument orders the output by TraitID
TRYdata2_explore_anc <- rtry_explore(
  TRYdata2,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
TRYdata2_explore_anc
# View(TRYdata2_explore_anc)

Bind imported data by rows

# Combine the two imported datasets, TRYdata1 and TRYdata2, by rows
# into a single dataset TRYdata
TRYdata <- rtry_bind_row(TRYdata1, TRYdata2)

# Print the first 6 rows of the combined data TRYdata
# (uncomment the View() line to open it in the data viewer)
head(TRYdata)
# View(TRYdata)

Explore the combined data

#-------------------------------------------------
# Explore the combined data TRYdata with the same three groupings that
# were applied to the individual datasets
# Note: An entry whose TraitID is NA is ancillary data
#-------------------------------------------------

# Grouped by TraitID and TraitName
TRYdata_explore_trait <- rtry_explore(
  TRYdata,
  TraitID, TraitName
)
TRYdata_explore_trait
# View(TRYdata_explore_trait)

# Grouped by AccSpeciesID, AccSpeciesName, TraitID and TraitName
TRYdata_explore_species <- rtry_explore(
  TRYdata,
  AccSpeciesID, AccSpeciesName, TraitID, TraitName
)
TRYdata_explore_species
# View(TRYdata_explore_species)

# Grouped by DataID, DataName, TraitID and TraitName;
# the sortBy argument orders the output by TraitID
TRYdata_explore_anc <- rtry_explore(
  TRYdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
TRYdata_explore_anc
# View(TRYdata_explore_anc)

Select relevant columns

# Option 1: Drop an unwanted column, here V28
# NOTE(review): V28 is presumably an empty artefact column created on
# import - confirm before relying on this
workdata <- rtry_remove_col(TRYdata, V28)

# Option 2: Keep only the columns that are relevant for the later steps
# Note: OrigObsDataID and ErrorRisk are used further below for the
#       duplicate removal and the outlier exclusion
workdata <- rtry_select_col(
  workdata,
  ObsDataID, ObservationID, AccSpeciesID, AccSpeciesName, ValueKindName,
  TraitID, TraitName, DataID, DataName, OriglName, OrigValueStr, OrigUnitStr,
  StdValue, UnitName, OrigObsDataID, ErrorRisk, Comment
)

Select relevant rows

# Step 1: List the available data (DataID, DataName) per trait, sorted by
# TraitID, to identify the relevant ancillary data
workdata_explore_anc <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
workdata_explore_anc
# View(workdata_explore_anc)

# Step 2: Keep all trait records (TraitID > 0) plus the ancillary data
# with one of the following DataIDs
#     59  Latitude
#     60  Longitude
#     61  Altitude
#   6601  Sampling date
#    327  Exposition
#    413  Plant developmental status / plant age / maturity / plant life stage
#   1961  Health status of plants (vitality)
#    113  Reference / source
workdata <- rtry_select_row(
  workdata,
  TraitID > 0 | DataID %in% c(59, 60, 61, 6601, 327, 413, 1961, 113)
)
workdata
# View(workdata)

# Step 3: Explore again to double check that all the traits and the
# necessary ancillary data were selected
workdata_explore_anc <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
workdata_explore_anc
# View(workdata_explore_anc)

Save and load backup data

# Save a copy of the current workdata as workdata_unfiltered,
# to serve as a backup before any data are excluded
workdata_unfiltered <- workdata
# Load the backup: assign workdata_unfiltered back to workdata
# (run directly after the save, this leaves workdata unchanged;
# re-run this line later to return to the unfiltered state)
workdata <- workdata_unfiltered

Exclude (remove) data

Exclude observations of juvenile plants or saplings

#-------------------------------------------------
# Before excluding: look at the plant development status
# Keep only the rows with DataID 413, then explore the unique values of
# OrigValueStr within them, sorted by OrigValueStr
#-------------------------------------------------
tmp_unfiltered <- rtry_select_row(workdata, DataID %in% 413)
tmp_unfiltered <- rtry_explore(
  tmp_unfiltered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = OrigValueStr
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Exclude using both of the following criteria
#   1. DataID equals 413
#   2. OrigValueStr equals "juvenile" or "saplings"
# NOTE(review): baseOn = ObservationID presumably removes every record that
# shares its ObservationID with a matching row - confirm in the
# rtry_exclude documentation
#-------------------------------------------------
workdata <- rtry_exclude(
  workdata,
  (DataID %in% 413) & (OrigValueStr %in% c("juvenile", "saplings")),
  baseOn = ObservationID
)
workdata
# View(workdata)

#-------------------------------------------------
# After excluding: repeat the exploration of DataID 413 to make sure the
# exclusion worked as expected
#-------------------------------------------------
tmp_filtered <- rtry_select_row(workdata, DataID %in% 413)
tmp_filtered <- rtry_explore(
  tmp_filtered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = OrigValueStr
)
tmp_filtered
# View(tmp_filtered)

#-------------------------------------------------
# To further confirm whether the trait and/or ancillary data were also
# removed, explore the whole workdata grouped by DataID, DataName, TraitID
# and TraitName, sorted by TraitID
#-------------------------------------------------
workdata_explore_anc_filtered <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
workdata_explore_anc_filtered
# View(workdata_explore_anc_filtered)

Exclude observations without geo-referenced information and from irrelevant regions

#-------------------------------------------------
# Keep only the geo-referenced observations, i.e. those with
# DataID 59 (Latitude)
# getAncillary = TRUE obtains (keeps) all traits and ancillary data
#-------------------------------------------------
workdata <- rtry_select_row(
  workdata,
  DataID %in% 59,
  getAncillary = TRUE
)

#-------------------------------------------------
# Before excluding: look at the latitude information
# Keep only the rows with DataID 59, then explore the unique values of
# StdValue within them, sorted by StdValue
#-------------------------------------------------
tmp_unfiltered <- rtry_select_row(workdata, DataID %in% 59)
tmp_unfiltered <- rtry_explore(
  tmp_unfiltered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = StdValue
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Exclude observations using the latitude information
# Criteria (both must hold)
#   1. DataID equals 59
#   2. StdValue is smaller than 40 or is NA
#-------------------------------------------------
workdata <- rtry_exclude(
  workdata,
  (DataID %in% 59) & (StdValue < 40 | is.na(StdValue)),
  baseOn = ObservationID
)
workdata
# View(workdata)

#-------------------------------------------------
# After excluding: repeat the exploration of DataID 59 (Latitude), sorted
# by StdValue, to make sure the exclusion worked as expected
#-------------------------------------------------
tmp_filtered <- rtry_select_row(workdata, DataID %in% 59)
tmp_filtered <- rtry_explore(
  tmp_filtered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = StdValue
)
tmp_filtered
# View(tmp_filtered)
#-------------------------------------------------
# Keep only the geo-referenced observations with DataID 60 (Longitude)
# getAncillary = TRUE obtains (keeps) all traits and ancillary data
#-------------------------------------------------
workdata <- rtry_select_row(
  workdata,
  DataID %in% 60,
  getAncillary = TRUE
)

#-------------------------------------------------
# Before excluding: look at the longitude information
# Keep only the rows with DataID 60, then explore the unique values of
# StdValue within them, sorted by StdValue
#-------------------------------------------------
tmp_unfiltered <- rtry_select_row(workdata, DataID %in% 60)
tmp_unfiltered <- rtry_explore(
  tmp_unfiltered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = StdValue
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Exclude observations using the longitude information
# Criteria (both must hold)
#   1. DataID equals 60
#   2. StdValue is smaller than 10, larger than 60, or NA
#-------------------------------------------------
workdata <- rtry_exclude(
  workdata,
  (DataID %in% 60) & (StdValue < 10 | StdValue > 60 | is.na(StdValue)),
  baseOn = ObservationID
)
workdata
# View(workdata)

#-------------------------------------------------
# After excluding: repeat the exploration of DataID 60 (Longitude), sorted
# by StdValue, to make sure the exclusion worked as expected
#-------------------------------------------------
tmp_filtered <- rtry_select_row(workdata, DataID %in% 60)
tmp_filtered <- rtry_explore(
  tmp_filtered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = StdValue
)
tmp_filtered
# View(tmp_filtered)

Exclude non-representative sub-traits

#-------------------------------------------------
# Before excluding: explore the workdata grouped by DataID, DataName,
# TraitID and TraitName, sorted by TraitID
#-------------------------------------------------
tmp_unfiltered <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Exclude using the following criterion
#   1. DataID equals 7222, 7223 or 6598
# Unlike the exclusions above, this one is based on ObsDataID
#-------------------------------------------------
workdata <- rtry_exclude(
  workdata,
  DataID %in% c(7222, 7223, 6598),
  baseOn = ObsDataID
)
workdata
# View(workdata)

#-------------------------------------------------
# After excluding: repeat the exploration to make sure the exclusion
# worked as expected
#-------------------------------------------------
tmp_filtered <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
tmp_filtered
# View(tmp_filtered)

Exclude data according to standard values (StdValue)

#-------------------------------------------------
# Before excluding: look at the SLA information
# Keep only the rows with DataID 6582, 6583 or 6584, then explore the
# unique values of StdValue within them, sorted by StdValue
#-------------------------------------------------
tmp_unfiltered <- rtry_select_row(workdata, DataID %in% c(6582, 6583, 6584))
tmp_unfiltered <- rtry_explore(
  tmp_unfiltered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, UnitName,
  Comment,
  sortBy = StdValue
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Exclude using both of the following criteria
#   1. DataID equals 6582, 6583 or 6584
#   2. StdValue is smaller than 5
#-------------------------------------------------
workdata <- rtry_exclude(
  workdata,
  (DataID %in% c(6582, 6583, 6584)) & (StdValue < 5),
  baseOn = ObsDataID
)
workdata
# View(workdata)

#-------------------------------------------------
# After excluding: repeat the exploration of the SLA information to make
# sure the exclusion worked as expected
#-------------------------------------------------
tmp_filtered <- rtry_select_row(workdata, DataID %in% c(6582, 6583, 6584))
tmp_filtered <- rtry_explore(
  tmp_filtered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, UnitName,
  Comment,
  sortBy = StdValue
)
tmp_filtered
# View(tmp_filtered)

Exclude outliers according to error risk (ErrorRisk)

#-------------------------------------------------
# Before excluding: explore the workdata grouped by DataID, DataName,
# TraitID, TraitName and ErrorRisk, sorted by ErrorRisk
#-------------------------------------------------
tmp_unfiltered <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName, ErrorRisk,
  sortBy = ErrorRisk
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Exclude using the following criterion
#   1. ErrorRisk is larger than or equal to 3
#-------------------------------------------------
workdata <- rtry_exclude(
  workdata,
  ErrorRisk >= 3,
  baseOn = ObsDataID
)
workdata
# View(workdata)

#-------------------------------------------------
# After excluding: repeat the exploration to make sure the exclusion
# worked as expected
#-------------------------------------------------
tmp_filtered <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName, ErrorRisk,
  sortBy = ErrorRisk
)
tmp_filtered
# View(tmp_filtered)

Remove duplicates based on duplicate identifier (OrigObsDataID)

#-------------------------------------------------
# The TRY database provides a duplicate identifier, OrigObsDataID, to mark
# duplicate entries
# This identifier is used within rtry_remove_dup() for duplicates removal
# Note: if the column OrigObsDataID has been removed, the function will not work
#-------------------------------------------------
workdata <- rtry_remove_dup(workdata)

Transform from long-table to wide-table

Select only the traits with numerical values

#-------------------------------------------------
# Keep only the rows that have both a TraitID and a StdValue
# This excludes
#   1. All entries without a TraitID
#   2. Potential categorical traits that don't have a StdValue
#   3. Traits that have not yet been standardized in TRY
# Note: complete.cases() is TRUE only where the value is not missing (NA)
# NOTE(review): the original tutorial describes item 1 as entries with ""
# in TraitID; the test below only catches them if they are NA - confirm
#-------------------------------------------------
num_traits <- rtry_select_row(
  workdata,
  complete.cases(TraitID) & complete.cases(StdValue)
)
num_traits
# View(num_traits)

Select only the relevant columns

# Reduce num_traits to the columns needed for the transformation:
# the observation and species identifiers, the trait identifiers,
# and the standardized value with its unit
num_traits <- rtry_select_col(
  num_traits,
  ObservationID, AccSpeciesID, AccSpeciesName,
  TraitID, TraitName,
  StdValue, UnitName
)
num_traits
# View(num_traits)

Retrieve the necessary ancillary data

#-------------------------------------------------
# To transform the long table into a wide table on traits while keeping
# the ancillary data, the ancillary data has to be added manually as
# additional columns first
# Taking latitude and longitude as an example, this information is
# extracted from workdata using rtry_select_anc()
#-------------------------------------------------

# Latitude (DataID 59): unique value with the corresponding ObservationID
workdata_lat <- rtry_select_anc(workdata, 59)

# Longitude (DataID 60): unique value with the corresponding ObservationID
workdata_lon <- rtry_select_anc(workdata, 60)

#-------------------------------------------------
# Merge the extracted ancillary data with the numerical traits
# Both merges are left joins on ObservationID using rtry_join_left():
# first the latitude, then the longitude
#-------------------------------------------------
num_traits_georef <- rtry_join_left(
  num_traits, workdata_lat,
  baseOn = ObservationID
)
num_traits_georef <- rtry_join_left(
  num_traits_georef, workdata_lon,
  baseOn = ObservationID
)
num_traits_georef
# View(num_traits_georef)

Perform wider transformation

#-------------------------------------------------
# Transform the long table into a wide table
#   names_from:  TraitID, TraitName and UnitName
#   values_from: StdValue
#   values_fn:   the cell values are the mean values calculated for StdValue
#-------------------------------------------------
num_traits_georef_wider <- rtry_trans_wider(
  num_traits_georef,
  names_from = c(TraitID, TraitName, UnitName),
  values_from = c(StdValue),
  values_fn = list(StdValue = mean)
)
num_traits_georef_wider
# View(num_traits_georef_wider)

Export pre-processed TRY data

#-------------------------------------------------
# Write the wide table to the CSV file workdata_wider_traits.csv inside
# the temporary directory of the current R session
# Note: If the specified output directory does not exist, it will be
#       created automatically.
#-------------------------------------------------
output_file <- file.path(tempdir(), "workdata_wider_traits.csv")
rtry_export(num_traits_georef_wider, output_file)


MPI-BGC-Functional-Biogeography/rtry documentation built on Aug. 26, 2023, 7 a.m.