R/FS_fBuildCompleteOutput.R

#' After appending annual REddyProc output files with 'FS_fAppendREddyProcOutputs', this function joins the 'night' and 'day' partitioning-method files into a single record, and also joins the raw (half-)hourly data (for met variables) and FLUXNET2015 data products (when supplied).

#' @export
#' @title Build complete dataset from: (1) stacked REddyProc 'night' and 'day' output files, (2) raw (half-)hourly data (includes met variables), and (3) FLUXNET2015 data products (if supplied).
#' @param inpath_REP character string, path to the directory containing stacked REddyProc output files (must contain the subdirectories '1_after-night-part/' and '2_after-day-part/')
#' @param inpath_raw character string, path to the directory containing formatted raw output datasets (formatted using function 'FS_fRawDataProcessing') (e.g. 'AMF_US-NR1_1998-2019-raw_data_for_plotting.txt')
#' @param inpath_FLX character string, path to the directory containing FLUXNET2015 data products (formatted using function 'FS_fRawDataProcessing'); if NULL (default), FLUXNET2015 products are not joined
#' @param outpath character string, the directory to which complete output dataframes will be exported
#' @param site.meta dataframe, site metadata file contained in FluxSynthU
#' @param export.files logical, if TRUE (default), exports dataframes as text files to the outpath directory.


#' @importFrom stringr str_remove str_detect
#' @importFrom dplyr left_join mutate rename select everything
#' @importFrom magrittr %>%
#' @importFrom patchwork plot_annotation
#' @importFrom lubridate month day decimal_date
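#' @examples
#' # A minimal usage sketch: the directory paths and the 'site_metadata' object are
#' # illustrative placeholders; substitute your own directories and site metadata.
#' \dontrun{
#' FS_fBuildCompleteOutput(inpath_REP = "data/REddyProc_stacked/",
#'                         inpath_raw = "data/raw_formatted/",
#'                         inpath_FLX = "data/FLUXNET2015_formatted/",
#'                         outpath = "output/complete/",
#'                         site.meta = site_metadata,
#'                         export.files = TRUE)
#' }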


FS_fBuildCompleteOutput <- function(inpath_REP, inpath_raw, inpath_FLX = NULL, outpath, site.meta, export.files = TRUE) {

  if (!dir.exists(outpath)) {
    message('Warning! No such directory exists: ', outpath)
    x <- askYesNo(paste("Would you like to create a new directory in:", outpath))

    if (isTRUE(x)) {
      dir.create(outpath, showWarnings = FALSE, recursive = TRUE)
    }
  }


  # find all data files
  all.files <- list.files(inpath_REP, recursive = TRUE)

  # extract paths for FLUXNET2015, AMERIFLUX, and NEON data
  fluxnet.files <- all.files[grep('FLX', all.files)]
  amf.files <- all.files[grep('AMF', all.files)]
  neon.files <- all.files[grep('NEO', all.files)]

  # combine the flux datasets
  parent.files <- c(fluxnet.files, amf.files, neon.files)

  # separate into night vs. day partitioned
  night.parent.files <- parent.files[grep("night", parent.files)]
  day.parent.files <- parent.files[grep("day", parent.files)]

  night.files <- basename(night.parent.files)
  day.files <- basename(day.parent.files)

  files <- c(night.files, day.files)

  # list of all available sites with database type (e.g. 'AMF_CA-Qcu')
  db.and.sites <- unique(substr(x = files, start = 1, stop = 10))

  # matching database for each
  all.site.db <- substr(x = files, start = 1,stop = 3)

  neon_index <- all.site.db %in% "NEO"

  # all available site names
  flx.amf.site.names <- substr(x = files[!neon_index], start = 5,stop = 10)
  neon.site.names <- substr(x = files[neon_index], start = 5,stop = 8)
  all.site.names <- c(flx.amf.site.names, neon.site.names)

  # update db.and.sites to fix NEON names
  db.and.sites[neon_index] <- gsub("_N", "", db.and.sites[neon_index])
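  # (NEON site IDs are only 4 characters, so the 10-character chunk above carries two extra
  #  characters; the gsub() call assumes these are '_N', e.g. an illustrative 'NEO_HARV_N'
  #  becomes 'NEO_HARV')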



  # give a description for your routine:
  routine_description  <-  "Combine 'day' and 'night' partitioned products into single dataframe"

  # select one or more sites
  choices <- fSiteSelect(db.and.sites, multi = TRUE, descrip = routine_description)

  if (length(choices[["AMF"]]) > 0) {
    AMF_selected <- paste0("AMF_", choices[["AMF"]])
  } else {AMF_selected <- NULL}

  if (length(choices[["FLX"]]) > 0) {
    FLX_selected <- paste0("FLX_", choices[["FLX"]])
  } else {FLX_selected <- NULL}

  if (length(choices[["NEO"]]) > 0) {
    NEO_selected <- paste0("NEO_", choices[["NEO"]])
  } else {NEO_selected <- NULL}

  # Includes database name in site selection
  selected.db_sites <- c(AMF_selected, FLX_selected, NEO_selected)

  # Omits database name in site selection
  selected.sites <- c(choices[["AMF"]], choices[["FLX"]], choices[["NEO"]])



  for (i in seq_along(selected.db_sites)) {


    # The current site (and database)
    db_site <- selected.db_sites[i]

    # Extract the database name and site name from the 'db_site' string
    db <- substr(x = db_site, start = 1, stop = 3)
    site <- str_remove(db_site, paste0(db, "_"))

    # Get site metadata
    site.info <- fGetSiteInfo(dat = site.meta,
                              site = site,
                              db = db,
                              site_col = "site_ID",
                              db_col = "database",
                              site_name_col = "site_Name",
                              start_yr_col = "start_year",
                              end_yr_col = "end_year",
                              exclude_yr_col = "exclude_years",
                              lat_col = "lat",
                              long_col = "long",
                              country_col = "country",
                              IGBP_col = "IGBP",
                              elev_col = "elev_orig",
                              MAT_col = "MAT_orig",
                              MAP_col = "MAP_orig")

    message('')
    message('Processing site (', i, '/',
            length(selected.sites), '):\t', site, ', ', db, " (", site.info$country, ")")
    message('\t\tName: ', site.info$info)
    message('\t\tlat: ', round(site.info$lat,3), '\tlong: ', round(site.info$long,3))
    message('\t\tMAT: ', round(site.info$MAT,1), '°C\tMAP: ', round(site.info$MAP,0),  'mm')
    message('\t\tForest Type: ', site.info$IGBP)

    message('')
    message('  ', 'Years of record: ', site.info$start_year, '-', site.info$end_year)

    message('')
    message('  ', 'Navigating to data directory:  ')
    message('    ', paste0(inpath_REP))



    site.files <- parent.files[grep(site, parent.files, fixed = TRUE)]
    night.site.files <- night.files[grep(site, night.files, fixed = TRUE)]
    day.site.files <- day.files[grep(site, day.files, fixed = TRUE)]

    night.subdirectory <- "1_after-night-part/"
    day.subdirectory <- "2_after-day-part/"


    message('')
    message('  ', 'Reading stacked REddyProc output files (night/day methods)  ')

    dat.NT <- read.table(paste0(inpath_REP, night.subdirectory, night.site.files), header = TRUE)
    dat.DT <- read.table(paste0(inpath_REP, day.subdirectory, day.site.files), header = TRUE)

    # Note: the '_gf' suffix on all data columns is meant to indicate that these data have been gapfilled by KS
    # Join the two gap-filled/partitioned datasets (night/day)
    dat.gapfilled <- dat.NT %>%
      left_join(dat.DT, by = c("Year", "DoY", "Hour"))
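
    # 'dat.gapfilled' now holds one row per (half-)hour, with the night-method ('_NT_')
    # and day-method ('_DT_') partitioned columns side by side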

    # Remove large objects
    rm(dat.NT, dat.DT)


    ## Below we're using the function fTimestampQC to determine which days in a given year have incomplete or duplicate data
    # This should pick up dataframes that contain anomalies in the timestamp due to Daylight Savings Time, as well as individual (half-)hours that may be duplicated or missing.
    timestamp_test_gapfilled_data <- dat.gapfilled %>%
      fTimestampQC(., "Year", "DoY", "Hour") %>%
      mutate(no.flagged = sapply(strsplit(.$`DoY flagged`,","),FUN=function(x){length(x[!is.na(x)])}),
             flag = no.flagged > 1)

    if (any(timestamp_test_gapfilled_data$flag)) {
      message('')
      message('  ', "Check gapfilled flux data for possible missing/duplicate rows!")
      message('')
      print(timestamp_test_gapfilled_data)
    }




    message('')
    message('  ', 'Navigating to data directory:  ')
    message('    ', paste0(inpath_raw))

    ## Load your original (non-gapfilled) data that contains additional meteorological variables, and append to dat_list_full
    all.files.met_data <- list.files(inpath_raw, recursive = FALSE)
    site.file.met_data <- all.files.met_data[grep(site, all.files.met_data, fixed = TRUE)]


    message('')
    message('  ', 'Reading raw (half-)hourly flux data')



    dat.met_data <- read.table(paste0(inpath_raw, site.file.met_data), header = TRUE)



    # drop k_datenum, k_dd, k_fracyr (we'll create those again later)
    dat.met_data <- dat.met_data %>%
      subset(., select = -c(k_datenum, k_dd, k_fracyr) ) %>%
      rename("Year"=k_YY, "DoY"=k_doy, "Hour"=k_HH)


    timestamp_test_met_data <- dat.met_data %>%
      fTimestampQC(., "Year", "DoY", "Hour") %>%
      mutate(no.flagged = sapply(strsplit(.$`DoY flagged`,","),FUN=function(x){length(x[!is.na(x)])}),
             flag = no.flagged > 1)

    if (any(timestamp_test_met_data$flag)) {
      message('')
      message('  ', "Check meteorological data for possible missing/duplicate rows!")
      message('')
      print(timestamp_test_met_data)
    }


    # For FLUXNET2015 sites only, we also want to include the original gap-filled data provided by FLUXNET
    if (is.character(inpath_FLX) && db == "FLX") {

      all.files.FLX_products <- list.files(inpath_FLX, recursive = FALSE)
      site.file.FLX_products <- all.files.FLX_products[grep(site, all.files.FLX_products, fixed = TRUE)]

      dat.FLX_products <- read.table(paste0(inpath_FLX, site.file.FLX_products), header = TRUE)

      timestamp_test_FLX_data <- dat.FLX_products %>%
        fTimestampQC(., "Year", "DoY", "Hour") %>%
        mutate(no.flagged = sapply(strsplit(.$`DoY flagged`,","),FUN=function(x){length(x[!is.na(x)])}),
               flag = no.flagged > 1)

      if (any(timestamp_test_FLX_data$flag)) {
        message('')
        message('  ', "Check FLUXNET2015 data for possible missing/duplicate rows!")
        message('')
        print(timestamp_test_FLX_data)
      }


    } else if (is.null(inpath_FLX)) {
      message('')
      message('  ', "Path to FLUXNET2015 data products not supplied in arguments!")
      message('  ', "FLUXNET2015 data products will not be added!")
      message('')


      # No FLUXNET2015 products to join for this site
      dat.FLX_products <- NULL

    } else {

      # No FLUXNET2015 products to join for this site
      dat.FLX_products <- NULL
    }


    ## At this point we have up to 3 datasets for each site:
    # 1. Appended files from REddyProc (gapfilled and partitioned using both night and day methods)
    # head(dat.gapfilled)

    # 2. FLUXNET2015 data products (if applicable)
    # head(dat.FLX_products)

    # 3. Meteorological data that has NOT been gapfilled (includes original columns from source data, but parsed by me)
    # head(dat.met_data)


    # Each dataset has 3 datetime columns that overlap (and can thus be used to join): Year, DoY, and Hour
    # Each dataset also differs in either a prefix (e.g. 'k_' in the meteorological dataset) or suffix (e.g. '_FLX' in the FLUXNET2015 products dataset)..
    # .. which denotes its origin:
    # prefix 'k_' - contains non-gapfilled data including a number of meteorological variables that might be of interest
    # suffix '_FLX' - contains gapfilled/partitioned data products straight from FLUXNET2015
    # suffix '_gf' - contains data that was gapfilled by the fluxsynth routine. (maybe we change the suffix)

    # The meteorological dataset (prefix 'k_') contains other useful information, specifically datetime columns in local time for each site (k_POSIXdate_local)..
    # .. as well as POSIX class dates (k_POSIXdate_plotting).

    # For this reason, we will join all data relative to this dataset

    dat <- dat.met_data %>%
      left_join(dat.gapfilled, by = c('Year', 'DoY', 'Hour'))

    if (db == "FLX") {
      dat <- dat %>%
        left_join(dat.FLX_products, by = c('Year', 'DoY', 'Hour'))
    }



    YY <- dat$Year
    MM <- month(as.Date(dat$k_POSIXdate_local, tz = site.info$tz_name))
    DD <- day(as.Date(dat$k_POSIXdate_local, tz = site.info$tz_name))

    # new variables (convert decimal hours to hours and minutes)
    HH <- dat[, 'Hour'] %/% 1
    MIN <- (dat[, 'Hour'] %% 1) * 60
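    # e.g. Hour = 9.5 gives HH = 9 and MIN = 30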


    dat$k_datenum <- fDatenum(YY, MM, DD, HH, MIN, 0)
    dat$k_dd <- dat$k_datenum - fDatenum(YY, 1, 1, 0, 0, 0) + 1    # equates to 1 for the 1st day of the year, 2 for the 2nd and so on
    dat$k_fracyr <- decimal_date(as.Date(dat$k_POSIXdate_plotting, tz = "UTC"))
    # You'll notice that the time strings match the original data, and are now set to the appropriate timezone (by coordinates).

    # Let's reorder our columns a bit
    dat <- dat %>%
      select(Year:k_POSIXdate_plotting, k_datenum:k_fracyr, NEE_NT_U50_gf:VPD_DT_gf, everything())

    start_year <- unique(dat$Year)[1]
    end_year <- tail(dat$Year,1)


    # Sometimes a single row will be added at the end of the data.
    # Since this affects the naming scheme (re: years of record), we remove this last row
    test_end_year <- subset(dat, Year == end_year)

    if (nrow(test_end_year) == 1) {
      dat <- head(dat,-1)
      end_year <- tail(dat$Year,1)
    }


    if (export.files) {
      message('')
      message('  ', 'Exporting complete dataset to:  ')
      message('    ', paste0(outpath))

      outfile <- sprintf('%s_%s-%d-%d-Complete-Output.txt', db, site, start_year, end_year)
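      # e.g. 'AMF_US-NR1-1998-2019-Complete-Output.txt'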

      write.table(dat, file = paste0(outpath, outfile),
                  sep = "\t", row.names = FALSE,
                  col.names = TRUE,
                  append = FALSE)

    }



  } # end site loop


} # end function