#' After appending annual REddyProc output files using 'FS_fAppendREddyProcOutputs', this function will join the 'night' and 'day' method files into a single record, as well as joining raw data (for met variables), and FLUXNET2015 data products (when applicable).
#' @export
#' @title Build complete dataset from: (1) stacked REddyProc 'night' and 'day' output files, (2) raw (half-)hourly data (includes met variables), and (3) FLUXNET2015 data products (if supplied).
#' @param inpath_REP character string, navigate to the directory containing stacked REddyProc output files (must contain the following subdirectory names: '1_after-night-part/' and '2_after-day-part/')
#' @param inpath_raw character string, navigate to the directory containing formatted raw output datasets (formatted using function 'FS_fRawDataProcessing') (e.g. 'AMF_US-NR1_1998-2019-raw_data_for_plotting.txt')
#' @param inpath_FLX character string, navigate to the directory containing FLUXNET2015 data products (formatted using function 'FS_fRawDataProcessing')
#' @param outpath character string, set the directory by which complete output dataframes will be exported
#' @param site.meta dataframe, site metadata file contained in FluxSynthU
#' @param export.files logical, if TRUE (default) exports dataframes as text files in the outpath directory.
#' @importFrom stringr str_remove str_detect str_remove
#' @importFrom patchwork plot_annotation
#' @importFrom lubridate month day decimal_date
FS_fBuildCompleteOutput <- function(inpath_REP, inpath_raw, inpath_FLX = NULL, outpath, site.meta, export.files = TRUE) {
if (!dir.exists(outpath)) {
message(paste('Warning! No such directory exists!'))
x <- askYesNo(paste("Would you like to create a new directory in: ",outpath))
if (x) {
dir.create(file.path(paste0(outpath)), showWarnings = FALSE)
}
}
# find all data files
all.files <- list.files(paste0(inpath_REP), recursive = TRUE)
# extract paths for FLUXNET2015, AMERIFLUX, and NEON data
fluxnet.files <- all.files[grep(pattern = '\\FLX', all.files)]
amf.files <- all.files[grep(pattern = '\\AMF', all.files)]
neon.files <- all.files[grep(pattern = '\\NEO', all.files)]
# combine the flux datasets
parent.files <- c(fluxnet.files, amf.files, neon.files)
# separate into night vs. day partitioned
night.parent.files <- parent.files[grep(paste("night"), parent.files)]
day.parent.files <- parent.files[grep(paste("day"), parent.files)]
night.files <- sapply(strsplit(night.parent.files, "/\\s*"), tail, 1)
day.files <- sapply(strsplit(day.parent.files, "/\\s*"), tail, 1)
files <- c(night.files, day.files)
# list of all available sites with database type (eg, 'AMF_CA-Qcu')
db.and.sites <- unique(substr(x = files, start = 1, stop = 10))
# matching database for each
all.site.db <- substr(x = files, start = 1,stop = 3)
neon_index <- all.site.db %in% "NEO"
# all available site names
flx.amf.site.names <- substr(x = files[!neon_index], start = 5,stop = 10)
neon.site.names <- substr(x = files[neon_index], start = 5,stop = 8)
all.site.names <- c(flx.amf.site.names, neon.site.names)
# update db.and.sites to fix NEON names
db.and.sites[neon_index] <- gsub("_N", "", db.and.sites[neon_index])
# give a description for your routine:
routine_description <- "Combine 'day' and 'night' partitioned products into single dataframe"
# select one or more sites
choices <- fSiteSelect(db.and.sites, multi = TRUE, descrip = routine_description)
if (length(choices[["AMF"]]) > 0) {
AMF_selected <- paste0("AMF_", choices[["AMF"]])
} else {AMF_selected <- NULL}
if (length(choices[["FLX"]]) > 0) {
FLX_selected <- paste0("FLX_", choices[["FLX"]])
} else {FLX_selected <- NULL}
if (length(choices[["NEO"]]) > 0) {
NEO_selected <- paste0("NEO_", choices[["NEO"]])
} else {NEO_selected <- NULL}
# Includes database name in site selection
selected.db_sites <- c(AMF_selected, FLX_selected, NEO_selected)
# Omits database name in site selection
selected.sites <- c(choices[["AMF"]], choices[["FLX"]], choices[["NEO"]])
for (i in 1:length(selected.db_sites)) {
# The current site (and database)
db_site <- selected.db_sites[i]
# Extract the database name and site name from the 'db_site' string
db <- substr(x = db_site, start = 1, stop = 3)
site <- str_remove(db_site, paste0(db, "_"))
# Get site metadata
site.info <- fGetSiteInfo(dat = site.meta,
site = site,
db = db,
site_col = "site_ID",
db_col = "database",
site_name_col = "site_Name",
start_yr_col = "start_year",
end_yr_col = "end_year",
exclude_yr_col = "exclude_years",
lat_col = "lat",
long_col = "long",
country_col = "country",
IGBP_col = "IGBP",
elev_col = "elev_orig",
MAT_col = "MAT_orig",
MAP_col = "MAP_orig")
message('')
message('Processing site (', min(which(selected.sites %in% site)), '/',
length(selected.sites), '):\t', site, ', ', db, " (", site.info$country, ")")
message('\t\tName: ', site.info$info)
message('\t\tlat: ', round(site.info$lat,3), '\tlong: ', round(site.info$long,3))
message('\t\tMAT: ', round(site.info$MAT,1), '°C\tMAP: ', round(site.info$MAP,0), 'mm')
message('\t\tForest Type: ', site.info$IGBP)
message('')
message(' ', 'Years of record: ', site.info$start_year, '-', site.info$end_year)
message('')
message(' ', 'Navigating to data directory: ')
message(' ', paste0(inpath_REP))
site.files <- parent.files[grep(paste(site, collapse="|"), parent.files)]
night.site.files <- night.files[grep(paste(site, collapse="|"), night.files)]
day.site.files <- day.files[grep(paste(site, collapse="|"), day.files)]
night.subdirectory <- "1_after-night-part/"
day.subdirectory <- "2_after-day-part/"
message('')
message(' ', 'Reading stacked REddyProc output files (night/day methods) ')
dat.NT <- read.table(paste(inpath_REP, night.subdirectory, night.site.files, sep=""), header=TRUE)
dat.DT <- read.table(paste(inpath_REP, day.subdirectory, day.site.files, sep=""), header=TRUE)
# Note: the '_gf' suffix on all data columns is meant to indicate that these data have been gapfilled by KS
# Join the two gap-filled/partitioned datasets (night/day)
dat.gapfilled <- dat.NT %>%
left_join(dat.DT, by = c("Year", "DoY", "Hour"))
# Remove large objects
rm(dat.NT, dat.DT)
## Below we're using the function fTimestampQC to determine which days in a given year have incomplete or duplicate data
# This should pick up dataframes that contain anomalies in the timestamp due to Daylight Savings Time, as well as individual (half-)hours that may be duplicated or missing.
timestamp_test_gapfilled_data <- dat.gapfilled %>%
fTimestampQC(., "Year", "DoY", "Hour") %>%
mutate(no.flagged = sapply(strsplit(.$`DoY flagged`,","),FUN=function(x){length(x[!is.na(x)])}),
flag = no.flagged > 1)
if (any(timestamp_test_gapfilled_data$flag)) {
message('')
message(' ', "Check gapfilled flux data for possible missing/duplicate rows!")
message('')
print(timestamp_test_gapfilled_data)
}
message('')
message(' ', 'Navigating to data directory: ')
message(' ', paste0(inpath_raw))
## Load your original (non-gapfilled) data that contains additional meteorological variables, and append to dat_list_full
all.files.met_data <- list.files(paste(inpath_raw), recursive = FALSE)
site.file.met_data <- all.files.met_data[grep(pattern = site, all.files.met_data)]
message('')
message(' ', 'Reading raw (half-)hourly flux data')
dat.met_data <- read.table(paste(inpath_raw, site.file.met_data, sep=""), header=TRUE)
# drop k_datenum, k_dd, k_fracyr (we'll create those again later)
dat.met_data <- dat.met_data %>%
subset(., select = -c(k_datenum, k_dd, k_fracyr) ) %>%
rename("Year"=k_YY, "DoY"=k_doy, "Hour"=k_HH)
timestamp_test_met_data <- dat.met_data %>%
fTimestampQC(., "Year", "DoY", "Hour") %>%
mutate(no.flagged = sapply(strsplit(.$`DoY flagged`,","),FUN=function(x){length(x[!is.na(x)])}),
flag = no.flagged > 1)
if (any(timestamp_test_met_data$flag)) {
message('')
message(' ', "Check meteorological data for possible missing/duplicate rows!")
message('')
print(timestamp_test_met_data)
}
# For FLUXNET2015 sites only, we also want to include the original gap-filled data provided by FLUXNET
if (is.character(inpath_FLX) & db == "FLX") {
all.files.FLX_products <- list.files(paste(inpath_FLX), recursive = FALSE)
site.file.FLX_products <- all.files.FLX_products[grep(pattern = site, all.files.FLX_products)]
dat.FLX_products <- read.table(paste(inpath_FLX, site.file.FLX_products, sep=""), header=TRUE)
timestamp_test_FLX_data <- dat.FLX_products %>%
fTimestampQC(., "Year", "DoY", "Hour") %>%
mutate(no.flagged = sapply(strsplit(.$`DoY flagged`,","),FUN=function(x){length(x[!is.na(x)])}),
flag = no.flagged > 1)
if (any(timestamp_test_FLX_data$flag)) {
message('')
message(' ', "Check FLUXNET2015 data for possible missing/duplicate rows!")
message('')
print(timestamp_test_FLX_data)
}
} else if (is.null(inpath_FLX)) {
message('')
message(' ', "Path to FLUXNET2015 data products not supplied in arguments!")
message(' ', "FLUXNET2015 data products will not be added!")
message('')
# Add the FLUXNET2015 products dataframe to dat_list_full
dat.FLX_products <- NULL
} else {
# Add the FLUXNET2015 products dataframe to dat_list_full
dat.FLX_products <- NULL
}
## At this point we have 3 datasets stored in a list for each site:
# 1. Appended files from REddyProc (gapfilled and partitioned using both night and day methods)
# head(dat.gapfilled)
# 2. FLUXNET2015 data products (if applicable)
# head(dat.FLX_products)
# 3. Meteorological data that has NOT been gapfilled (includes original columns from source data, but parsed by me)
# head(dat.met_data)
# Each dataset has 3 datetime columns that overlap (and can thus be used to join): Year, DoY, and Hour
# Each dataset also differs in either a prefix (e.g. 'k_' in the meteorological dataset) or suffix (e.g. '_FLX' in the FLUXNET2015 products dataset)..
# .. which denotes its origin:
# prefix 'k_' - contains non-gapfilled data including a number of meteorological variables that might be of interest
# suffix '_FLX' - contains gapfilled/partitioned data products straight from FLUXNET2015
# suffix '_gf' - contains data that was gapfilled by the fluxsynth routine. (maybe we change the suffix)
# The meteorological dataset (prefix 'k_') contains other useful information, specifically datetime columns in local time for each site (k_POSIXdate_local)..
# .. as well as POSIX class dates (k_POSIXdate_plotting).
# For this reason, we will join all data relative to this dataset
dat <- dat.met_data %>%
left_join(dat.gapfilled, by = c('Year', 'DoY', 'Hour'))
if (db == "FLX") {
dat <- dat %>%
left_join(dat.FLX_products, by = c('Year', 'DoY', 'Hour'))
}
YY <- dat$Year
MM <- month(as.Date(dat$k_POSIXdate_local, tz = site.info$tz_name))
DD <- day(as.Date(dat$k_POSIXdate_local, tz = site.info$tz_name))
# new variables (convert decimal hours to hours and minutes)
HH <- dat[,'Hour']%/%1
MIN <- (dat[,'Hour'] - dat[,'Hour']%/%1)*60
dat$k_datenum <- fDatenum(YY, MM, DD, HH, MIN, 0)
dat$k_dd <- dat$k_datenum - fDatenum(YY, 1, 1, 0, 0, 0) + 1 # equates to 1 for the 1st day of the year, 2 for the 2nd and so on
dat$k_fracyr <- decimal_date(as.Date(dat$k_POSIXdate_plotting, tz = "UTC"))
# You'll notice that the time strings match the original data, and are now set to the appropriate timezone (by coordinates).
# Let's reorder our columns a bit
dat <- dat %>%
select(Year:k_POSIXdate_plotting, k_datenum:k_fracyr, NEE_NT_U50_gf:VPD_DT_gf, everything())
start_year <- unique(dat$Year)[1]
end_year <- tail(dat$Year,1)
# Sometimes a single row will be added at the end of the data.
# Since this affects the naming scheme (re: years of record), we remove this last row
test_end_year <- subset(dat, Year == end_year)
if (nrow(test_end_year) == 1) {
dat <- head(dat,-1)
end_year <- tail(dat$Year,1)
}
if (export.files) {
message('')
message(' ', 'Exporting complete dataset to: ')
message(' ', paste0(outpath))
outfile <- sprintf('%s_%s-%d-%d-Complete-Output.txt', db, site, start_year, end_year)
write.table(dat, file = paste0(outpath, outfile),
sep = "\t", row.names = FALSE,
col.names = TRUE,
append = FALSE)
}
} # end site loop
} # end function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.