R/FS_fAppendREddyProcOutputs.R


#' This function appends the annual dataframes produced via FS_fProcessingThruREddyProc (and package REddyProc).
#' Annual files are stacked and joined to a continuous (half-)hourly time series for the entire record of the site, with NAs assigned to gaps.

#' @export
#' @title Append annual REddyProc output files
#' @param inpath character string, path to the directory containing REddyProc output files (must contain the subdirectories '1_after-night-part/' and '2_after-day-part/')
#' @param outpath character string, path to the directory to which appended dataframes are exported (must contain the subdirectories '1_after-night-part/' and '2_after-day-part/')
#' @param site.meta dataframe, site metadata file contained in FluxSynthU
#' @param export.files logical, if TRUE (default) exports dataframes as text files in the outpath directory.
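#'
#' @examples
#' \dontrun{
#' # Illustrative sketch only: the paths and the metadata object below are
#' # assumptions, not values shipped with the package.
#' FS_fAppendREddyProcOutputs(inpath = "./output/REddyProc/",
#'                            outpath = "./output/REddyProc_appended/",
#'                            site.meta = site.meta,
#'                            export.files = TRUE)
#' }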


#' @importFrom stringr str_remove str_detect
#' @importFrom dplyr %>% bind_rows group_by select rename left_join filter
#' @importFrom patchwork plot_annotation




# Author: Kenny Smith

# Version notes -----------------------------------------------------------
# v5.0.KS
# 201005
# Major overhaul for more flexible coding. Functionally there is nothing new here, but the code is better organized and relies less on hard-coding



FS_fAppendREddyProcOutputs <- function(inpath, outpath, site.meta, export.files = TRUE) {


  if (!dir.exists(outpath)) {
    message('Warning! No such directory exists: ', outpath)
    x <- askYesNo(paste("Would you like to create a new directory in: ", outpath))

    # askYesNo() returns NA if the prompt is dismissed, so guard with isTRUE()
    if (isTRUE(x)) {
      dir.create(paste0(outpath, "1_after-night-part/"), showWarnings = FALSE, recursive = TRUE)
      dir.create(paste0(outpath, "2_after-day-part/"), showWarnings = FALSE, recursive = TRUE)
    }
  }

  ## Timestamp handling
  # Create the 'ideal' dataframe first (including leap days), then use left_join to unite the two dfs.
  # This will append the relevant info you need (GPP, PPFD) to a full time series, leaving NAs for DoY = 366

  # Create a vector of half-hourly timestamps over a 50-year window (1990-2040)
  # NOTE: Timestamps are generated in UTC, so the machine's timezone is irrelevant; the POSIX string is only used to derive Year, DoY, and Hour for matching against dat
  halfhours_null <- seq(as.POSIXct("1990-01-01 00:00:00", tz = "UTC"), as.POSIXct("2040-01-01 00:00:00", tz = "UTC"), by = "30 mins")


  # Convert timestamp vector into a dataframe, and then parse date into Year, DoY, and Hour
  date.ref <- data.frame(halfhours_null)
  date.ref <- transform(date.ref,
                        Year = lubridate::year(halfhours_null),
                        DoY = lubridate::yday(halfhours_null),
                        Hour = as.numeric(format(halfhours_null, format = "%H")) + (lubridate::minute(halfhours_null)/60))
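
  # For orientation, the first rows of date.ref now look like this (illustrative):
  #   halfhours_null        Year  DoY  Hour
  #   1990-01-01 00:00:00   1990    1   0.0
  #   1990-01-01 00:30:00   1990    1   0.5
  #   1990-01-01 01:00:00   1990    1   1.0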

  # remove unnecessary objects
  rm(halfhours_null)

  ## File Import
  # find all raw data files (including rejected sites)
  all.files <- list.files(inpath, recursive = TRUE)

  # extract paths for FLUXNET2015, AMERIFLUX, and NEON data
  fluxnet.files <- all.files[grep(pattern = 'FLX', all.files)]
  amf.files <- all.files[grep(pattern = 'AMF', all.files)]
  neon.files <- all.files[grep(pattern = 'NEO', all.files)]


  # concatenate the flux datasets
  parent.files <- c(fluxnet.files, amf.files, neon.files)

  # separate into night- vs. day-partitioned files
  night.parent.files <- parent.files[grep("night", parent.files)]
  day.parent.files <- parent.files[grep("day", parent.files)]

  # strip directory components, keeping only the file names
  night.files <- basename(night.parent.files)
  day.files <- basename(day.parent.files)

  files <- c(night.files, day.files)

  # list of all available sites with database type (e.g., 'AMF_CA-Qcu')
  db.and.sites <- unique(substr(x = files, start = 1, stop = 10))

  # matching database for each file
  all.site.db <- substr(x = files, start = 1, stop = 3)

  neon_index <- all.site.db %in% "NEO"

  # all available site names
  flx.amf.site.names <- substr(x = files[!neon_index], start = 5, stop = 10)
  neon.site.names <- substr(x = files[neon_index], start = 5, stop = 8)
  all.site.names <- c(flx.amf.site.names, neon.site.names)

  # update db.and.sites to fix NEON names (trim the trailing '_N' picked up by the fixed-width slice)
  db.and.sites[neon_index] <- gsub("_N$", "", db.and.sites[neon_index])
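  # (NEON site codes are 4 characters, so the 10-character slice above carries two
  # extra characters; e.g., a hypothetical slice 'NEO_ABBY_N' is trimmed to 'NEO_ABBY'.)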



  # set a description for your routine:
  routine_description <- "Append annual REddyProc output files into single dataframe"

  # select one or more sites
  choices <- fSiteSelect(db.and.sites, descrip = routine_description)

  if (length(choices[["AMF"]]) > 0) {
    AMF_selected <- paste0("AMF_", choices[["AMF"]])
  } else {AMF_selected <- NULL}

  if (length(choices[["FLX"]]) > 0) {
    FLX_selected <- paste0("FLX_", choices[["FLX"]])
  } else {FLX_selected <- NULL}

  if (length(choices[["NEO"]]) > 0) {
    NEO_selected <- paste0("NEO_", choices[["NEO"]])
  } else {NEO_selected <- NULL}

  # Includes database name in site selection
  selected.db_sites <- c(AMF_selected, FLX_selected, NEO_selected)

  # Omits database name in site selection
  selected.sites <- c(choices[["AMF"]], choices[["FLX"]], choices[["NEO"]])



  for (i in seq_along(selected.db_sites)) {
    # The current site (and database)
    db_site <- selected.db_sites[i]

    # Extract the database name and site name from the 'db_site' string
    db <- substr(x = db_site, start = 1, stop = 3)
    site <- str_remove(db_site, paste0(db, "_"))


    # Get site metadata
    site.info <- fGetSiteInfo(dat = site.meta,
                              site = site,
                              db = db,
                              site_col = "site_ID",
                              db_col = "database",
                              site_name_col = "site_Name",
                              start_yr_col = "start_year",
                              end_yr_col = "end_year",
                              exclude_yr_col = "exclude_years",
                              lat_col = "lat",
                              long_col = "long",
                              country_col = "country",
                              IGBP_col = "IGBP",
                              elev_col = "elev_orig",
                              MAT_col = "MAT_orig",
                              MAP_col = "MAP_orig")

    message('')
    message('Processing site (', i, '/',
            length(selected.sites), '):\t', site, ', ', db, " (", site.info$country, ")")
    message('\t\tName: ', site.info$info)
    message('\t\tlat: ', round(site.info$lat,3), '\tlong: ', round(site.info$long,3))
    message('\t\tMAT: ', round(site.info$MAT,1), '°C\tMAP: ', round(site.info$MAP,0),  'mm')
    message('\t\tForest Type: ', site.info$IGBP)

    message('')
    message('  ', 'Years of record: ', site.info$start_year, '-', site.info$end_year)

    message('')
    message('  ', 'Navigating to data directory:  ')
    message('    ', paste0(inpath))


    site.files <- parent.files[grep(site, parent.files, fixed = TRUE)]
    years_of_record <- as.numeric(gsub(".*-(\\d{4})-.+", "\\1", site.files)) ## Captures the 4-digit year flanked by hyphens in each file name
    start_year <- min(years_of_record)
    end_year <- max(years_of_record)


    # Store all years for a given site into a list
    dat.list <- list()


    message('')
    message('  ', 'Storing REddyProc output data in a list...')
    message('  ', 'Data imported from:')
    message('    ', paste0(inpath))


    # Small loop that combines the two datasets (night and day).
    # If either doesn't exist, the append can't work, so an error is reported for that year.
    # Assign the two partitioning methods to an object
    partition.method <- c("night", "day")


    for (j in seq_along(partition.method)) {
      method <- partition.method[j]

      message('')
      message('  ', 'Stacking REddyProc output data: ', method, ' method')
      message('')


      # create progress bar
      pb <- txtProgressBar(min = start_year, max = end_year, style = 3)
      for (process_year in start_year:end_year) {
        tryCatch(
          expr = {
            Sys.sleep(0.1)

            #+++ Load data with 1 header and 1 unit row from (tab-delimited) text file

            if (method == 'night') {
              infile <- sprintf("%s_%s-%d-after-night-part.txt", db, site, process_year)
              subdirectory <- "1_after-night-part/"
            } else if (method == 'day') {
              infile <- sprintf("%s_%s-%d-after-day-part.txt", db, site, process_year)
              subdirectory <- "2_after-day-part/"
            }

            dat.list[[process_year]] <- read.table(paste0(inpath, subdirectory, infile), header = TRUE)[-1, ]
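            # Columns read in as factors (stringsAsFactors defaults to TRUE under R < 4.0,
            # and the dropped unit row makes numeric columns look like strings) are
            # converted via as.character() first, recovering values rather than level codes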
            index <- sapply(dat.list[[process_year]], is.factor)
            dat.list[[process_year]][index] <- lapply(dat.list[[process_year]][index], function(x) as.numeric(as.character(x)))

            # update progress bar
            setTxtProgressBar(pb, process_year)},

          # If loop fails for a particular year,
          error = function(e){
            message("")
            message("    * NO data available for process year: ", process_year)
            message("      (NAs will be assigned to data during missing years)")
            message("")
            print(e)
          }
        )
      }
      close(pb)


      message('')
      message('  ', 'Appending years into dataframe:')
      message('  ', 'NOTE: As of 08/26/2020, the following variables are being extracted from output files:')
      message('  ', '\tNEE_U50, GPP_U50, Reco_U50, PPFD, SW_IN, Tair, Tsoil, VPD')



      if (method == 'night') {
        dat <- dat.list %>%
          bind_rows(.id = "source") %>%
          group_by(source) %>%
          select(source, Year, DoY, Hour,
                 NEE_U50_f, GPP_U50_f, Reco_U50, PPFD_f,
                 Rg_f, Tair_f, Tsoil_f, VPD_f) %>% # Here you can add other variables of interest
          rename('NEE_NT_U50_gf'='NEE_U50_f', 'GPP_NT_U50_gf'='GPP_U50_f', 'Reco_NT_U50_gf'='Reco_U50', 'PPFD_NT_gf'='PPFD_f',
                 'SW_IN_NT_gf'='Rg_f', 'Tair_NT_gf'='Tair_f', 'Tsoil_NT_gf'='Tsoil_f', 'VPD_NT_gf'='VPD_f')
      } else if (method == 'day') {
        dat <- dat.list %>%
          bind_rows(.id = "source") %>%
          group_by(source) %>%
          select(source, Year, DoY, Hour,
                 NEE_U50_f, GPP_DT_U50, Reco_DT_U50, PPFD_f,
                 Rg_f, Tair_f, Tsoil_f, VPD_f) %>% # Here you can add other variables of interest
          rename('NEE_DT_U50_gf'='NEE_U50_f', 'GPP_DT_U50_gf'='GPP_DT_U50', 'Reco_DT_U50_gf'='Reco_DT_U50', 'PPFD_DT_gf'='PPFD_f',
                 'SW_IN_DT_gf'='Rg_f', 'Tair_DT_gf'='Tair_f', 'Tsoil_DT_gf'='Tsoil_f', 'VPD_DT_gf'='VPD_f')
      }




      message('')
      message('  ', 'Parsing datetime strings...')

      # Convert dat to a dataframe, and replace the missing-value code (-9999) with NA
      dat <- as.data.frame(dat)
      dat[dat == -9999] <- NA


      # Append data to full year timestamps ----------------------------------------------------
      df <- date.ref %>%
        left_join(dat, by = c("Year", "DoY", "Hour")) %>%
        filter(Year >= start_year & Year <= end_year) %>% # Constrains the dataframe to years where you have data
        select(-c(source, halfhours_null))


      # At this point we can be pretty darn sure that there aren't any data gaps, and since we indexed our data to the Year,DoY,Hour cols, we don't have to worry about timezones either.
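
      # (Optional sanity check, a minimal sketch: confirm the joined series contains no
      # duplicated Year/DoY/Hour combinations before export. Uncomment to enable.)
      # stopifnot(!any(duplicated(df[, c("Year", "DoY", "Hour")])))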


      ## Export data

      if (export.files) {


        if (method == 'night') {
          outfile <- sprintf('%s_%s-%d-%d-after-night-part.txt', db, site, start_year, end_year)
        } else if (method == 'day') {
          outfile <- sprintf('%s_%s-%d-%d-after-day-part.txt', db, site, start_year, end_year)
        }

        message('')
        message('  ', 'Exporting stacked data to:  ')
        message('    ', paste0(outpath, subdirectory))

        write.table(df, file = paste0(outpath, subdirectory, outfile),
                    sep = "\t", row.names = FALSE,
                    col.names = TRUE,
                    append = FALSE)

      }


    } # end partition loop


  } # end site loop


} # end function