In MPI-BGC-Functional-Biogeography/rtry: Preprocessing Plant Trait Data

Prepare the environment

Reset R's brain

#-------------------------------------------------
# Start from a clean workspace:
# remove every object that ls() lists in the current environment
# NOTE(review): this also deletes unrelated objects already in the session
#-------------------------------------------------
rm(list = ls())

Set working directory

#-------------------------------------------------
# Print the current working directory to confirm it is the intended location
#-------------------------------------------------
getwd()

#-------------------------------------------------
# Working directory not as expected?
# Remove the leading "#" below and replace the placeholder with the real path
#-------------------------------------------------
# setwd("<path_to_dir>")

Load rtry package

#-------------------------------------------------
# Attach the rtry package (it has to be installed beforehand)
# and report which version is in use
#-------------------------------------------------
library(rtry)
packageVersion("rtry")

Data pre-processing

Import the TRY data text file

#-------------------------------------------------
# Locate the first sample dataset in the "testdata" folder of the installed
# rtry package and print its full path
#-------------------------------------------------
path_to_data <- system.file(
  "testdata", "data_TRY_15160.txt",
  package = "rtry"
)
path_to_data

#-------------------------------------------------
# Read the TRY data request file located above into a data frame
#-------------------------------------------------
TRYdata1 <- rtry_import(path_to_data)

There are two ways to view the imported data

# Method 1: Print the first 6 rows of the data using the head() function
head(TRYdata1)

# Method 2: Open the imported data in a data viewer using the View() function.
# View() needs an interactive session (e.g. the data viewer in RStudio), so it
# is only called when one is available; batch runs such as Rscript skip it.
if (interactive()) {
  View(TRYdata1)
}

Import another TRY dataset and view the data

# Locate the second sample dataset shipped with rtry and print its full path
path_to_data <- system.file(
  "testdata", "data_TRY_15161.txt",
  package = "rtry"
)
path_to_data

# Import the second TRY data request file
TRYdata2 <- rtry_import(path_to_data)

# Inspect the result: first 6 rows in the console, full table in the viewer
head(TRYdata2)
# View() needs an interactive session, so guard the call to keep
# non-interactive runs (e.g. Rscript) from failing at this point
if (interactive()) {
  View(TRYdata2)
}

Explore the imported data

#-------------------------------------------------
# Summarise each imported dataset with rtry_explore() using three groupings:
#   1. TraitID and TraitName
#   2. AccSpeciesID, AccSpeciesName, TraitID and TraitName
#   3. DataID, DataName, TraitID and TraitName, sorted by TraitID
# Note: rows whose TraitID is NA are ancillary data rather than trait records
#-------------------------------------------------
# ---- TRYdata1 ----
# Grouping 1: by trait
TRYdata1_explore_trait <- rtry_explore(TRYdata1, TraitID, TraitName)
TRYdata1_explore_trait # Print the entire data frame
# View(TRYdata1_explore_trait)

# Grouping 2: by species and trait
TRYdata1_explore_species <- rtry_explore(
  TRYdata1,
  AccSpeciesID, AccSpeciesName, TraitID, TraitName
)
TRYdata1_explore_species
# View(TRYdata1_explore_species)

# Grouping 3: by data type and trait, ordered by TraitID via sortBy
TRYdata1_explore_anc <- rtry_explore(
  TRYdata1,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
TRYdata1_explore_anc
# View(TRYdata1_explore_anc)

# ---- TRYdata2 ----
# Grouping 1: by trait
TRYdata2_explore_trait <- rtry_explore(TRYdata2, TraitID, TraitName)
TRYdata2_explore_trait
# View(TRYdata2_explore_trait)

# Grouping 2: by species and trait
# (rows with TraitID NA are ancillary data)
TRYdata2_explore_species <- rtry_explore(
  TRYdata2,
  AccSpeciesID, AccSpeciesName, TraitID, TraitName
)
TRYdata2_explore_species
# View(TRYdata2_explore_species)

# Grouping 3: by data type and trait, ordered by TraitID via sortBy
TRYdata2_explore_anc <- rtry_explore(
  TRYdata2,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
TRYdata2_explore_anc
# View(TRYdata2_explore_anc)

Bind imported data by rows

# Stack the two imported datasets on top of each other (row-wise bind)
TRYdata <- rtry_bind_row(TRYdata1, TRYdata2)

# Show the first 6 rows of the combined dataset
head(TRYdata)
# View(TRYdata)

Explore the combined data

# Grouping 1: combined data by TraitID and TraitName
TRYdata_explore_trait <- rtry_explore(TRYdata, TraitID, TraitName)
TRYdata_explore_trait
# View(TRYdata_explore_trait)

# Grouping 2: combined data by AccSpeciesID, AccSpeciesName, TraitID
# and TraitName
# Note: rows whose TraitID is NA are ancillary data rather than trait records
TRYdata_explore_species <- rtry_explore(
  TRYdata,
  AccSpeciesID, AccSpeciesName, TraitID, TraitName
)
TRYdata_explore_species
# View(TRYdata_explore_species)

# Grouping 3: combined data by DataID, DataName, TraitID and TraitName,
# with the output ordered by TraitID through the sortBy argument
TRYdata_explore_anc <- rtry_explore(
  TRYdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
TRYdata_explore_anc
# View(TRYdata_explore_anc)

Select relevant columns

# Option 1: drop an unwanted column by name (here the column V28)
workdata <- rtry_remove_col(TRYdata, V28)

# Option 2: keep only the columns that are needed, listed explicitly
workdata <- rtry_select_col(
  workdata,
  ObsDataID, ObservationID,
  AccSpeciesID, AccSpeciesName,
  ValueKindName,
  TraitID, TraitName,
  DataID, DataName,
  OriglName, OrigValueStr, OrigUnitStr,
  StdValue, UnitName,
  OrigObsDataID, ErrorRisk, Comment
)

Select relevant rows

# Step 1: list the available data types with rtry_explore() to decide which
# ancillary data are relevant
workdata_explore_anc <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
workdata_explore_anc
# View(workdata_explore_anc)

# Step 2: keep every trait record (TraitID > 0) plus the ancillary data with
# these DataIDs:
#   59   Latitude
#   60   Longitude
#   61   Altitude
#   6601 Sampling date
#   327  Exposition
#   413  Plant developmental status / plant age / maturity / plant life stage
#   1961 Health status of plants (vitality)
#   113  Reference / source
workdata <- rtry_select_row(
  workdata,
  TraitID > 0 | DataID %in% c(59, 60, 61, 6601, 327, 413, 1961, 113)
)
workdata
# View(workdata)

# Step 3: explore again to verify that all traits and the required ancillary
# data made it into the selection
workdata_explore_anc <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
workdata_explore_anc
# View(workdata_explore_anc)

Save and load backup data

# Keep a copy of the selected data under the name workdata_unfiltered
# before any exclusion step is applied
workdata_unfiltered <- workdata

# Restore workdata from that copy (re-run this line to undo the exclusions)
workdata <- workdata_unfiltered

Exclude (remove) data

Exclude observations of juvenile plants or saplings

#-------------------------------------------------
# Before excluding: look at the plant development status records (DataID 413)
# and list the distinct OrigValueStr values they contain
#-------------------------------------------------
tmp_unfiltered <- rtry_select_row(workdata, DataID %in% 413)
tmp_unfiltered <- rtry_explore(
  tmp_unfiltered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = OrigValueStr
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Exclusion criteria
# 1. DataID equals 413
# 2. OrigValueStr equals "juvenile" or "saplings"
# NOTE(review): baseOn = ObservationID presumably removes every record that
# shares an ObservationID with a matching row - confirm in the rtry_exclude
# documentation
#-------------------------------------------------
workdata <- rtry_exclude(
  workdata,
  (DataID %in% 413) & (OrigValueStr %in% c("juvenile", "saplings")),
  baseOn = ObservationID
)
workdata
# View(workdata)

#-------------------------------------------------
# After excluding: repeat the exploration of DataID 413 to check that the
# excluded OrigValueStr values are gone
#-------------------------------------------------
tmp_filtered <- rtry_select_row(workdata, DataID %in% 413)
tmp_filtered <- rtry_explore(
  tmp_filtered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = OrigValueStr
)
tmp_filtered
# View(tmp_filtered)

#-------------------------------------------------
# To further confirm whether the related trait and/or ancillary records were
# removed as well, explore the data grouped by DataID, DataName, TraitID and
# TraitName, sorted by TraitID
#-------------------------------------------------
workdata_explore_anc_filtered <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
workdata_explore_anc_filtered
# View(workdata_explore_anc_filtered)

Exclude observations without geo-referenced information and from irrelevant regions

#-------------------------------------------------
# Latitude, step 1: keep only geo-referenced observations, i.e. those that
# have a DataID 59 (Latitude) record
# getAncillary = TRUE keeps all traits and ancillary data of those observations
#-------------------------------------------------
workdata <- rtry_select_row(workdata, DataID %in% 59, getAncillary = TRUE)

#-------------------------------------------------
# Latitude, step 2: explore the latitude records (DataID 59) and list the
# distinct StdValue entries, sorted by StdValue
#-------------------------------------------------
tmp_unfiltered <- rtry_select_row(workdata, DataID %in% 59)
tmp_unfiltered <- rtry_explore(
  tmp_unfiltered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = StdValue
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Latitude, step 3: exclude observations by latitude
# Exclusion criteria
# 1. DataID equals 59
# 2. StdValue is smaller than 40 or is NA
#-------------------------------------------------
workdata <- rtry_exclude(
  workdata,
  (DataID %in% 59) & (StdValue < 40 | is.na(StdValue)),
  baseOn = ObservationID
)
workdata
# View(workdata)

#-------------------------------------------------
# Latitude, step 4: verify the exclusion by exploring the remaining latitude
# records (DataID 59), sorted by StdValue
#-------------------------------------------------
tmp_filtered <- rtry_select_row(workdata, DataID %in% 59)
tmp_filtered <- rtry_explore(
  tmp_filtered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = StdValue
)
tmp_filtered
# View(tmp_filtered)

#-------------------------------------------------
# Longitude, step 1: keep only observations that have a DataID 60 (Longitude)
# record
# getAncillary = TRUE keeps all traits and ancillary data of those observations
#-------------------------------------------------
workdata <- rtry_select_row(workdata, DataID %in% 60, getAncillary = TRUE)

#-------------------------------------------------
# Longitude, step 2: explore the longitude records (DataID 60) and list the
# distinct StdValue entries, sorted by StdValue
#-------------------------------------------------
tmp_unfiltered <- rtry_select_row(workdata, DataID %in% 60)
tmp_unfiltered <- rtry_explore(
  tmp_unfiltered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = StdValue
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Longitude, step 3: exclude observations by longitude
# Exclusion criteria
# 1. DataID equals 60
# 2. StdValue is smaller than 10, larger than 60, or NA
#-------------------------------------------------
workdata <- rtry_exclude(
  workdata,
  (DataID %in% 60) & (StdValue < 10 | StdValue > 60 | is.na(StdValue)),
  baseOn = ObservationID
)
workdata
# View(workdata)

#-------------------------------------------------
# Longitude, step 4: verify the exclusion by exploring the remaining longitude
# records (DataID 60), sorted by StdValue
#-------------------------------------------------
tmp_filtered <- rtry_select_row(workdata, DataID %in% 60)
tmp_filtered <- rtry_explore(
  tmp_filtered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr, StdValue, Comment,
  sortBy = StdValue
)
tmp_filtered
# View(tmp_filtered)

Exclude non-representative sub-traits

#-------------------------------------------------
# Before excluding: explore the data grouped by DataID, DataName, TraitID and
# TraitName, with the output ordered by TraitID
#-------------------------------------------------
tmp_unfiltered <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Exclusion criteria
# 1. DataID equals 7222, 7223 or 6598
# Here the exclusion is based on ObsDataID rather than ObservationID
#-------------------------------------------------
workdata <- rtry_exclude(
  workdata,
  DataID %in% c(7222, 7223, 6598),
  baseOn = ObsDataID
)
workdata
# View(workdata)

#-------------------------------------------------
# After excluding: repeat the exploration to check that the three DataIDs
# no longer appear
#-------------------------------------------------
tmp_filtered <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName,
  sortBy = TraitID
)
tmp_filtered
# View(tmp_filtered)

Exclude data according to standard values (`StdValue`)

#-------------------------------------------------
# Before excluding: pick the SLA records (DataID 6582, 6583 and 6584) and
# list the distinct StdValue entries, sorted by StdValue
#-------------------------------------------------
tmp_unfiltered <- rtry_select_row(workdata, DataID %in% c(6582, 6583, 6584))
tmp_unfiltered <- rtry_explore(
  tmp_unfiltered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr,
  StdValue, UnitName, Comment,
  sortBy = StdValue
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Exclusion criteria
# 1. DataID equals 6582, 6583 or 6584
# 2. StdValue is smaller than 5
#-------------------------------------------------
workdata <- rtry_exclude(
  workdata,
  (DataID %in% c(6582, 6583, 6584)) & (StdValue < 5),
  baseOn = ObsDataID
)
workdata
# View(workdata)

#-------------------------------------------------
# After excluding: repeat the exploration of the SLA records to check that no
# StdValue below 5 remains
#-------------------------------------------------
tmp_filtered <- rtry_select_row(workdata, DataID %in% c(6582, 6583, 6584))
tmp_filtered <- rtry_explore(
  tmp_filtered,
  DataID, DataName, OriglName, OrigValueStr, OrigUnitStr,
  StdValue, UnitName, Comment,
  sortBy = StdValue
)
tmp_filtered
# View(tmp_filtered)

Exclude outliers according to error risk (`ErrorRisk`)

#-------------------------------------------------
# Before excluding: explore the data grouped by DataID, DataName, TraitID,
# TraitName and ErrorRisk, with the output ordered by ErrorRisk
#-------------------------------------------------
tmp_unfiltered <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName, ErrorRisk,
  sortBy = ErrorRisk
)
tmp_unfiltered
# View(tmp_unfiltered)

#-------------------------------------------------
# Exclusion criteria
# 1. ErrorRisk is larger than or equal to 3
#-------------------------------------------------
workdata <- rtry_exclude(workdata, ErrorRisk >= 3, baseOn = ObsDataID)
workdata
# View(workdata)

#-------------------------------------------------
# After excluding: repeat the exploration to check that no ErrorRisk of 3 or
# more remains
#-------------------------------------------------
tmp_filtered <- rtry_explore(
  workdata,
  DataID, DataName, TraitID, TraitName, ErrorRisk,
  sortBy = ErrorRisk
)
tmp_filtered
# View(tmp_filtered)

Remove duplicates based on duplicate identifier (`OrigObsDataID`)

#-------------------------------------------------
# The TRY database marks duplicate entries with the identifier OrigObsDataID
# rtry_remove_dup() relies on this identifier to remove the duplicates
# Note: the function will not work if the column OrigObsDataID was removed
#-------------------------------------------------
workdata <- rtry_remove_dup(workdata)

Transform from long-table to wide-table

Select only the traits with numerical values

#-------------------------------------------------
# Keep only rows where both TraitID and StdValue are present. This drops
# 1. All entries without a TraitID (NA)
# 2. Potential categorical traits that don't have a StdValue
# 3. Traits that have not yet been standardized in TRY
# Note: complete.cases() is TRUE only where the value is not missing
#-------------------------------------------------
num_traits <- rtry_select_row(
  workdata,
  complete.cases(TraitID) & complete.cases(StdValue)
)
num_traits
# View(num_traits)

Select only the relevant columns

# Reduce the numerical traits to the columns needed for the transformation
num_traits <- rtry_select_col(
  num_traits,
  ObservationID,
  AccSpeciesID, AccSpeciesName,
  TraitID, TraitName,
  StdValue, UnitName
)
num_traits
# View(num_traits)

Retrieve the necessary ancillary data

#-------------------------------------------------
# A wide table built on traits keeps ancillary data only if they are added as
# extra columns beforehand, which has to be done manually.
# Latitude and longitude serve as the example here: both are extracted from
# workdata with rtry_select_anc()
#-------------------------------------------------
# Latitude (DataID 59): unique value together with its ObservationID
workdata_lat <- rtry_select_anc(workdata, 59)

# Longitude (DataID 60): unique value together with its ObservationID
workdata_lon <- rtry_select_anc(workdata, 60)

#-------------------------------------------------
# Attach the extracted ancillary data to the numerical traits with two
# successive left joins (rtry_join_left) on ObservationID
#-------------------------------------------------
num_traits_georef <- rtry_join_left(
  num_traits, workdata_lat,
  baseOn = ObservationID
)
num_traits_georef <- rtry_join_left(
  num_traits_georef, workdata_lon,
  baseOn = ObservationID
)
num_traits_georef
# View(num_traits_georef)

Perform wider transformation

#-------------------------------------------------
# Reshape from long to wide:
#   - new column names come from TraitID, TraitName and UnitName
#   - cell values come from StdValue, aggregated with mean()
#-------------------------------------------------
num_traits_georef_wider <- rtry_trans_wider(
  num_traits_georef,
  names_from = c(TraitID, TraitName, UnitName),
  values_from = c(StdValue),
  values_fn = list(StdValue = mean)
)
num_traits_georef_wider
# View(num_traits_georef_wider)

Export pre-processed TRY data

#-------------------------------------------------
# Export the wide table into a CSV file inside the session's temporary
# directory (tempdir())
# Note: If the specified output directory does not exist, it will be created
#       automatically.
#-------------------------------------------------
# Use `<-` for assignment, consistent with the rest of this script
output_file <- file.path(tempdir(), "workdata_wider_traits.csv")
rtry_export(num_traits_georef_wider, output_file)

MPI-BGC-Functional-Biogeography/rtry documentation built on Aug. 26, 2023, 7 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

MPI-BGC-Functional-Biogeography/rtry
Preprocessing Plant Trait Data

In MPI-BGC-Functional-Biogeography/rtry: Preprocessing Plant Trait Data

Prepare the environment

Reset R's brain

Set working directory

Load rtry package

Data pre-processing

Import the TRY data text file

Explore the imported data

Bind imported data by rows

Explore the combined data

Select relevant columns

Select relevant rows

Save and load backup data

Exclude (remove) data

Exclude observations of juvenile plants or saplings

Exclude observations without geo-referenced information and from irrelevant regions

Exclude non-representative sub-traits

Exclude data according to standard values (`StdValue`)

Exclude outliers according to error risk (`ErrorRisk`)

Remove duplicates based on duplicate identifier (`OrigObsDataID`)

Transform from long-table to wide-table

Select only the traits with numerical values

Select only the relevant columns

Retrieve the necessary ancillary data

Perform wider transformation

Export pre-processed TRY data

R Package Documentation

Browse R Packages

We want your feedback!

MPI-BGC-Functional-Biogeography/rtry Preprocessing Plant Trait Data

In MPI-BGC-Functional-Biogeography/rtry: Preprocessing Plant Trait Data

Prepare the environment

Reset R's brain

Set working directory

Load rtry package

Data pre-processing

Import the TRY data text file

Explore the imported data

Bind imported data by rows

Explore the combined data

Select relevant columns

Select relevant rows

Save and load backup data

Exclude (remove) data

Exclude observations of juvenile plants or saplings

Exclude observations without geo-referenced information and from irrelevant regions

Exclude non-representative sub-traits

Exclude data according to standard values (StdValue)

Exclude outliers according to error risk (ErrorRisk)

Remove duplicates based on duplicate identifier (OrigObsDataID)

Transform from long-table to wide-table

Select only the traits with numerical values

Select only the relevant columns

Retrieve the necessary ancillary data

Perform wider transformation

Export pre-processed TRY data

R Package Documentation

Browse R Packages

We want your feedback!

MPI-BGC-Functional-Biogeography/rtry
Preprocessing Plant Trait Data

Exclude data according to standard values (`StdValue`)

Exclude outliers according to error risk (`ErrorRisk`)

Remove duplicates based on duplicate identifier (`OrigObsDataID`)