R/FS_fExtractAllCriticalDates.R

#' After calculating GPPsat using 'FS_fCalculateGPPandNEESatParameters', this function extracts critical dates (start/end of season, SOS/EOS) from the GPPsat time series.
#' Additionally, it calculates the start and end of winter (SOW/EOW) based on the period in which the ecosystem no longer accumulates C (the GPP-free period), using cumulative GPP as the flux parameter.
#'
#' @export
#' @title Analyze critical dates (SOS/EOS and SOW/EOW) from GPPsat and cumuGPP time series.
#'
#' @param inpath_GPPsat character string, navigate to the directory containing GPPsat and associated fit parameters
#' @param inpath_complete_output character string, navigate to the directory containing stacked complete output datasets (specific formatting required)
#' @param outpath_critdates character string, specify path to where you'd like to export a dataframe of critical dates (includes all sites on the same df)
#' @param outpath_plots character string, specify path to where you'd like to export figures
#' @param site.meta dataframe, site metadata file contained in FluxSynthU
#' @param span numeric, set the span for time series smoothing (defaults to 0.075)
#' @param create.plots logical, if TRUE (default) exports 4 figures
#' @param export.files logical, if TRUE (default) exports dataframes as text files in the outpath directory.
#' @param batch_process logical, if TRUE, batch processes all sites within the inpath directory (defaults to FALSE).
#' @param exclude_bad_yrs logical, if TRUE, excludes user-defined years from the analysis (defaults to FALSE).
#'
#' @importFrom stringr str_remove str_detect
#' @importFrom lubridate decimal_date
#' @importFrom pracma circshift
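#'
#' @examples
#' \dontrun{
#' # Usage sketch: the paths and the 'site.meta' object below are hypothetical
#' # placeholders; point them at your own FluxSynthU directories and metadata.
#' FS_fExtractAllCriticalDates(inpath_GPPsat = "output/GPPsat/",
#'                             inpath_complete_output = "output/complete_output/",
#'                             outpath_critdates = "output/critdates/",
#'                             outpath_plots = "output/plots/",
#'                             site.meta = site.meta,
#'                             span = 0.075,
#'                             create.plots = TRUE,
#'                             export.files = TRUE,
#'                             batch_process = FALSE,
#'                             exclude_bad_yrs = FALSE)
#' }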
FS_fExtractAllCriticalDates <- function(inpath_GPPsat, inpath_complete_output, outpath_critdates,
                                        outpath_plots, site.meta, span = 0.075,
                                        create.plots = TRUE, export.files = TRUE,
                                        batch_process = FALSE, exclude_bad_yrs = FALSE) {

  # File handling and user site selection -----------------------------------
  if (!dir.exists(outpath_critdates)) {
    message('Warning! No such directory exists: ', outpath_critdates)
    x <- askYesNo(paste0("Would you like to create a new directory in: ", outpath_critdates))

    # askYesNo returns NA if the user cancels, so test with isTRUE
    if (isTRUE(x)) {
      dir.create(outpath_critdates, showWarnings = FALSE, recursive = TRUE)
    }
  }

  if (!dir.exists(outpath_plots)) {
    message('Warning! No such directory exists: ', outpath_plots)
    x <- askYesNo(paste0("Would you like to create a new directory in: ", outpath_plots))

    if (isTRUE(x)) {
      dir.create(outpath_plots, showWarnings = FALSE, recursive = TRUE)
    }
  }






  # find all GPPsat time series data
  all.files <- list.files(inpath_GPPsat, recursive = TRUE)

  # extract paths for FLUXNET2015, AMERIFLUX, and NEON data
  # (file names begin with a 3-letter database prefix, so the patterns are anchored to the start of the string)
  fluxnet.files <- all.files[grep(pattern = '^FLX', all.files)]
  amf.files <- all.files[grep(pattern = '^AMF', all.files)]
  neon.files <- all.files[grep(pattern = '^NEO', all.files)]

  # concatenate the three flux datasets
  files <- c(fluxnet.files, amf.files, neon.files)

  # list of all available sites with database type (eg, 'AMF_CA-Qcu')
  db.and.sites <- unique(substr(x = files, start = 1, stop = 10))

  # matching database for each
  all.site.db <- substr(x = files, start = 1, stop = 3)

  neon_index <- all.site.db %in% "NEO"
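
  # NEON site codes are 4 characters (e.g. 'BART'), whereas AMF/FLX codes are 6 (e.g. 'CA-Qcu'),
  # hence the different substring bounds below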

  # all available site names
  flx.amf.site.names <- substr(x = files[!neon_index], start = 5, stop = 10)
  neon.site.names <- substr(x = files[neon_index], start = 5, stop = 8)
  all.site.names <- c(flx.amf.site.names, neon.site.names)

  # update db.and.sites to fix NEON names
  db.and.sites[neon_index] <- gsub("_N", "", db.and.sites[neon_index])


  # Select site(s) or batch process all sites at once
  if (batch_process) {
    selected.db_sites <- db.and.sites
    selected.sites <- all.site.names
  } else {


    # give a description for your routine:
    routine_description  <-  "Extract SOS/EOS dates from GPPsat and cumulative fluxes"

    # select one or more sites
    choices <- fSiteSelect(db.and.sites, descrip = routine_description)

    if (length(choices[["AMF"]]) > 0) {
      AMF_selected <- paste0("AMF_", choices[["AMF"]])
    } else {AMF_selected <- NULL}

    if (length(choices[["FLX"]]) > 0) {
      FLX_selected <- paste0("FLX_", choices[["FLX"]])
    } else {FLX_selected <- NULL}

    if (length(choices[["NEO"]]) > 0) {
      NEO_selected <- paste0("NEO_", choices[["NEO"]])
    } else {NEO_selected <- NULL}

    # Includes database name in site selection
    selected.db_sites <- c(AMF_selected, FLX_selected, NEO_selected)

    # Omits database name in site selection
    selected.sites <- c(choices[["AMF"]], choices[["FLX"]], choices[["NEO"]])

  }


  # Store all sites in their own database-specific list (these will be appended later)
  # dat.list is the master list
  dat.list <- list()

  dat.list$AMF <- list()
  dat.list$FLX <- list()
  dat.list$NEO <- list()


  # Later we're going to export all the critical date information across all sites, so we need a place to store the critical date datasets ONLY
  CritDates.list <- list()





  # Begin looping thru sites ------------------------------------------------
  for (i in seq_along(selected.db_sites)) {

    # Identify site name and available data
    # The current site (and database)
    db_site <- selected.db_sites[i]

    # Extract the database name and site name from the 'db_site' string
    db <- substr(x = db_site, start = 1, stop = 3)
    site <- str_remove(db_site, paste0(db, "_"))


    # Get site metadata
    site.info <- fGetSiteInfo(dat = site.meta,
                              site = site,
                              db = db,
                              site_col = "site_ID",
                              db_col = "database",
                              site_name_col = "site_Name",
                              start_yr_col = "start_year",
                              end_yr_col = "end_year",
                              exclude_yr_col = "exclude_years",
                              lat_col = "lat",
                              long_col = "long",
                              country_col = "country",
                              IGBP_col = "IGBP",
                              elev_col = "elev_orig",
                              MAT_col = "MAT_orig",
                              MAP_col = "MAP_orig")

    message('')
    message('Processing site (', i, '/',
            length(selected.sites), '):\t', site, ', ', db, " (", site.info$country, ")")
    message('\t\tName: ', site.info$info)
    message('\t\tlat: ', round(site.info$lat,3), '\tlong: ', round(site.info$long,3))
    message('\t\tMAT: ', round(site.info$MAT,1), '°C\tMAP: ', round(site.info$MAP,0),  'mm')
    message('\t\tForest Type: ', site.info$IGBP)

    message('')
    message('  ', 'Years of record: ', site.info$start_year, '-', site.info$end_year)

    message('')
    message('  ', 'Reading (half-)hourly data from:  ' )
    message('    ', paste0(inpath_complete_output))






    # Load (half-)hourly datasets ---------------------------------------------
    # List all the files in the complete output datasets folder
    complete_HH_files <- list.files(inpath_complete_output, recursive = TRUE)

    # Find the output dataset for the current site
    complete_HH_site_file <- complete_HH_files[grepl(db_site, complete_HH_files)]

    # Load the complete (half-)hourly output dataset (removed again at the end of the site loop)
    complete_HH_dataset <- read.table(paste0(inpath_complete_output, complete_HH_site_file), header = TRUE)

    # Make sure k_POSIXdate_plotting is a date column:
    complete_HH_dataset$k_POSIXdate_plotting <- as.Date(complete_HH_dataset$k_POSIXdate_plotting)

    # Add db and Site as cols to complete_HH_dataset
    complete_HH_dataset <- complete_HH_dataset %>%
      mutate(db = db, site = site) %>%
      select(db, site, everything())

    # Store the complete half-hourly dataset to your site list
    dat.list[[db]][[site]]$Data$`(Half-)hourly site data` <- complete_HH_dataset

    rm(complete_HH_files, complete_HH_site_file)




    # Start/End of Season (SOS/EOS) - Setup -----------------------------------
    # Specify the name of your GPPsat datafile for this site
    infile <- all.files[grep(site, all.files, fixed = TRUE)]

    message('')
    message('  ', 'Reading GPPsat data from:  ')
    message('    ', paste0(inpath_GPPsat))


    # Read data
    dat <- read.table(paste0(inpath_GPPsat, infile), header = TRUE)

    # Pull out years of record for this site
    years_of_record <- unique(dat$Year)
    start_year <- years_of_record[1]
    end_year <- tail(years_of_record,1)

    # Store the GPPsat dataset to your site list
    dat.list[[db]][[site]]$Data$`Daily GPPsat data` <- dat





    # SOS/EOS - Loop thru GPPsat time series by process_year ------------------
    message('')
    message('    ', 'Subsetting GPPsat data by process_year...')


    # In a loop, subset dat by process_year, centered on DoY 190 (+/- 233 days, a full calendar year + 100 days)
    # Store in list: GPPsat_TimeSeries_by_process_year_ls
    GPPsat_TimeSeries_by_process_year_ls <- list()

    pb <- txtProgressBar(min = 0, max = length(years_of_record), initial = 0, style = 3)
    k <- 1

    for (process_year in years_of_record) { # begin year loop (GPPsat)

      # Assign a numeric value for the current iteration (k)
      k <- which(years_of_record %in% process_year)

      tryCatch(
        expr = {
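          # Brief pause, presumably so the console progress bar has time to render between iterations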
          Sys.sleep(0.1)

          # Determine the dates to trim your dataset by
          # NOTE: we are processing sites on a per-year basis, but we often need to include data that extends beyond the calendar year
          # Here we're allowing a certain number of buffer days to be added

          # Add specified amount of buffer days to the year (0 would be a normal 365/6 day year, 50 would add 50 days to the beginning and end of the year or 100 total)
          buffer_days <- 50
          # Specify the DoY you would like to center the axis on
          center_DoY <- 190

          trim_dates <- dat %>%
            subset(., Year == process_year) %>%
            subset(., DoY == center_DoY) %>%
            summarize(start = as.Date(k_POSIXdate_plotting) - ((366/2) + buffer_days),
                      end = as.Date(k_POSIXdate_plotting) + ((366/2) + buffer_days))


          # Subset dat by trim dates and calculate a revised column for cumu.DoY (after dropping the old one)
          df <- dat %>%
            subset(., as.Date(k_POSIXdate_plotting) >= trim_dates$start & as.Date(k_POSIXdate_plotting) < trim_dates$end) %>%
            select(!cumu.DoY) %>%
            mutate(cumu.DoY = seq_len(nrow(.))) %>%
            select(c("Year", "DoY", "cumu.DoY", "fracyr", "k_POSIXdate_plotting", "NEEsat_fixed_NEE", "n_interval", "n_NEE_possible",
                     "n_NEE_actual", "pct_avail_NEE")) # Pick out x and y data


          # Create a new dataframe that omits mismatched pairs (x = value, y = NA)
          # Omitting this section causes problems with curve fitting
          df.minusNA <- df[complete.cases(df),]



          # Fit a LOESS smoother to the time series data ('span' sets the fraction of data used in each local fit)
          mod1 <- loess(data = df.minusNA, NEEsat_fixed_NEE ~ cumu.DoY, span = span)

          # Model predictions over the full subset, including days where NEEsat is NA (this smooths out your trend line)
          pred1 <- predict(mod1, df$cumu.DoY)

          # Create a dataframe matching the predicted values to every cumu.DoY in the subset
          loess.line <- data.frame(cumu.DoY = df$cumu.DoY, pred1 = pred1)

          # Join the predicted fit estimates to your data
          df.pred <- df %>%
            left_join(loess.line, by = "cumu.DoY") %>%
            mutate(resid = NEEsat_fixed_NEE - pred1)


          ## NOTE!
          # Here we should add a bootstrapping routine to estimate uncertainty around our fitted spline
          df.pred <-  df.pred %>%
            select(!fracyr) %>%
            mutate(fracyr_null = fPOSIX_to_fracyr_null(k_POSIXdate_plotting, process_year), # Create a new time column that's a fractional year (But not indexed by year)
                   process_year = process_year)


          ## NOTE: fracyr_null is expressed as a decimal date where the process year equals 1, the following year equals 2, and the previous year equals 0
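          ## (e.g., 1 July of the process year maps to ~1.5, and 1 July of the previous year to ~0.5)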

          # Add each subsetted dataset to the GPPsat_TimeSeries_by_process_year_ls
          GPPsat_TimeSeries_by_process_year_ls[[as.character(process_year)]] <- df.pred

          # Remove large objects
          rm(df, df.minusNA, mod1, pred1, loess.line, df.pred, trim_dates)


        },
        # If loop fails for a particular year,
        error = function(e){
          message("")
          message("    * ERROR on process year: ", process_year)
          message("      (NAs will be assigned to data during missing years)")
          message("")
          print(e)
        }
      )

      # Step up progress bar
      setTxtProgressBar(pb,k)

    } # end year loop (GPPsat)

    rm(k, process_year, pb)



    # SOS/EOS - Append GPPsat by process_year into dataframe ------------------
    message('')
    message('    ', 'Appending grouped GPPsat data into single dataframe...')

    # Append the individual process years into a single dataframe
    # IMPORTANT NOTE: This is not a true time-series dataframe since it contains repeat values (i.e. each process year contains data from the year before and after)
    GPPsat_TimeSeries_by_process_year_df <- bind_rows(GPPsat_TimeSeries_by_process_year_ls, .id = "process_year")

    # Add columns for site and db, and reorder columns
    GPPsat_TimeSeries_by_process_year_df <- GPPsat_TimeSeries_by_process_year_df %>%
      mutate(site = site, ## IMPORTANT: 'site' column in previous versions was capitalized (Site)
             db = db) %>%
      rename(GPPsat_pred = pred1,    ## IMPORTANT: 'pred1' and 'resid' were changed to 'GPPsat_pred' and 'GPPsat_resid' (accordingly)
             GPPsat_resid = resid) %>%
      select(db, site, process_year, k_POSIXdate_plotting, Year, fracyr_null, DoY, cumu.DoY, GPPsat_pred, GPPsat_resid, NEEsat_fixed_NEE:pct_avail_NEE) %>%
      mutate(process_year = as.numeric(process_year),
             k_POSIXdate_plotting = as.Date(k_POSIXdate_plotting))

    rm(GPPsat_TimeSeries_by_process_year_ls)
    # We now have a dataframe of GPPsat that is structured in a way that allows us to analyze SOS/EOS









    # SOS/EOS - Analyze start/end of season from GPPsat -----------------------

    message('')
    message('    ', 'Extracting SOS/EOS dates based on continuous GPPsat...')

    # Use fExtract_SOSEOS_bySite to examine SOS/EOS critical threshold dates for each site
    CritDates_SOS.EOS <- fExtract_SOSEOS_bySite(POSIX = GPPsat_TimeSeries_by_process_year_df$k_POSIXdate_plotting,
                                        cumu.DoY = GPPsat_TimeSeries_by_process_year_df$cumu.DoY,
                                        proc_yr = GPPsat_TimeSeries_by_process_year_df$process_year,
                                        GPP = GPPsat_TimeSeries_by_process_year_df$GPPsat_pred)

    CritDates_SOS.EOS <- CritDates_SOS.EOS$CritDates_long




    # BREAK -------------------------------------------------------------------
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #



    # Start/End of Winter (SOW/EOW) - Setup -----------------------------------
    ## Subset (half-)hourly data for analysis of GPP-free period (determine end of current winter (ECW) and start of next winter (SNW))
    ## Store in list: CumulativeFlux_TimeSeries_by_process_year_ls

    # Goal: calculate cumulative GPP/NEE for the period of a process_year +/- 190 days, as well as the first derivative of cumulative GPP.
    # Using a defined threshold for the first derivative of cumulative GPP, we determine the GPP-free period as that in which C gain (cumulative GPP) over time is ~0.

    message('')
    message('    ', 'Setting up analysis of cumulative GPP to determine winter critical dates...')

    # First select the flux columns of interest. In this case we're using (half-)hourly NEE, Reco, and GPP that were gap-filled and partitioned by KS using the FluxSynth routine
    HH_fluxes_df <- complete_HH_dataset %>%
      select(db:Hour, k_POSIXdate_plotting, k_fracyr, NEE_NT_U50_gf, GPP_NT_U50_gf, Reco_NT_U50_gf )

    # Convert NEE values to positive C gain, and Reco values to negative C gain
    HH_fluxes_df$NEE_NT_U50_gf <- -1*HH_fluxes_df$NEE_NT_U50_gf
    HH_fluxes_df$Reco_NT_U50_gf <- -1*HH_fluxes_df$Reco_NT_U50_gf

    # Convert (half-)hourly data to mean daily (DD)
    DD_fluxes_df <- HH_fluxes_df %>%
      group_by(db, site, Year, DoY, k_POSIXdate_plotting, k_fracyr) %>%
      summarise_at(c("NEE_NT_U50_gf", "GPP_NT_U50_gf", "Reco_NT_U50_gf"), mean, na.rm = TRUE) %>%
      ungroup()


    # Convert NaNs to NA (mean() returns NaN for days with no non-missing values, even with na.rm = TRUE)
    DD_fluxes_df[sapply(DD_fluxes_df, is.nan)] <- NA



    # SOW/EOW - Loop thru cumulative fluxes time series by process_year -------
    # Initialize empty lists
    CumulativeFlux_TimeSeries_by_process_year_ls <- list()
    winterCritDates_by_process_year_ls <- list()


    # Reset process_year
    process_year <- years_of_record[1]

    message('')
    message('    ', 'Subsetting GPP data by process_year...')

    pb <- txtProgressBar(min = 0, max = length(years_of_record), initial = 0, style = 3)
    k <- 1

    for (process_year in years_of_record) { # begin year loop (fluxes)

      k <- which(years_of_record %in% process_year)


      tryCatch(
        expr = {
          Sys.sleep(0.1)

          # Add specified amount of buffer days to the year (0 would be a normal 365/6 day year, 190 would add 190 days to the beginning AND end of the year or 380 total added days)
          buffer_days <- 190
          # Specify the DoY you would like to center the axis on
          center_DoY <- 190



          # Determine the dates to trim your dataset by
          trim_dates <- DD_fluxes_df %>%
            subset(., Year == process_year) %>%
            subset(., DoY == center_DoY) %>%
            summarize(start = as.Date(k_POSIXdate_plotting) - ((366/2) + buffer_days),
                      end = as.Date(k_POSIXdate_plotting) + ((366/2) + buffer_days))


          # Subset the daily fluxes by trim dates and add a cumu.DoY counter (1..n over the subset)
          DD_fluxes_subset <- DD_fluxes_df %>%
            subset(., as.Date(k_POSIXdate_plotting) >= trim_dates$start & as.Date(k_POSIXdate_plotting) < trim_dates$end) %>%
            mutate(cumu.DoY = seq_len(nrow(.)))

          # Goal: anchor the cumulative day count to the calendar of the first year in the subset,
          # so counting effectively begins on Jan 1 of that year (e.g., if the subset starts on
          # DoY 154, cumu.DoY_raw begins at 154). The existing DoY column supplies the starting value.
          start_DoY <- DD_fluxes_subset$DoY[1]
          no_DoY <- nrow(DD_fluxes_subset)

          # Add a column for cumulative day of year based on start_DoY (cumu.DoY_raw)
          DD_fluxes_subset$cumu.DoY_raw <- seq(from = start_DoY, length.out = no_DoY)




          DD_fluxes_subset_cumu <- DD_fluxes_subset %>%
            rename("NEE" = NEE_NT_U50_gf, "GPP" = GPP_NT_U50_gf, "Reco" = Reco_NT_U50_gf) %>%      # Rename the column headers for NEE, GPP, and Reco
            mutate(NEE_zeroes = NEE, GPP_zeroes = GPP, Reco_zeroes = Reco) %>%        # Duplicate the flux columns and rename using the suffix '_zeroes'
            replace_na(list(NEE_zeroes = 0, GPP_zeroes = 0, Reco_zeroes = 0)) %>%      # Replace NAs in the 'XX_zeroes' columns with 0
            group_by(site) %>%
            mutate(cumu.GPP_raw = cumsum(GPP_zeroes),      # Calculate cumulative GPP/NEE for the subset time frame
                   cumu.GPP_raw_NA = cumu.GPP_raw,
                   cumu.NEE_raw = cumsum(NEE_zeroes),
                   cumu.NEE_raw_NA = cumu.NEE_raw)



          # Check if there are any cumulative GPP values that need to be replaced with NA
          NA.index_GPP <- is.na(DD_fluxes_subset_cumu$GPP)

          if (sum(NA.index_GPP) > 0) {
            DD_fluxes_subset_cumu$cumu.GPP_raw_NA[NA.index_GPP] <- NA
          }

          # Do the same for NEE
          NA.index_NEE <- is.na(DD_fluxes_subset_cumu$NEE)

          if (sum(NA.index_NEE) > 0) {
            DD_fluxes_subset_cumu$cumu.NEE_raw_NA[NA.index_NEE] <- NA
          }


          # 'circshift' comes from the pracma package (see @importFrom above)

          # Calculate the first derivatives for cumulative GPP/NEE
          DD_fluxes_subset_deriv <- DD_fluxes_subset_cumu %>%
            mutate(deriv.GPP_raw_NA = cumu.GPP_raw_NA - circshift(cumu.GPP_raw_NA, 1),
                   deriv.NEE_raw_NA = cumu.NEE_raw_NA - circshift(cumu.NEE_raw_NA, 1))
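
          # The first derivative is the day-over-day difference: circshift(x, 1) lags the series by one
          # position (wrapping the last value around to the front), which is why row 1 is set to NA below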




          # Assign NAs to the first row position for deriv(GPP/NEE)
          DD_fluxes_subset_deriv$deriv.GPP_raw_NA[1] <- NA
          DD_fluxes_subset_deriv$deriv.NEE_raw_NA[1] <- NA


          # View simple plots of cumulative NEE/GPP and the first derivative of cumulative GPP

          # cumsum.plot.simple <- ggplot(data = DD_fluxes_subset_deriv, aes(x = k_fracyr)) +
          #   geom_line(aes( y = cumu.NEE_raw_NA), color = "black") +
          #   geom_line(aes( y = cumu.GPP_raw_NA/10), color = "darkgreen") +
          #   scale_y_continuous(
          #
          #     # Features of the first axis
          #     name = expression(atop("cumulative NEE",paste("(g C ", m^-2,")"))),
          #
          #     # Add a second axis and specify its features
          #     sec.axis = sec_axis(~.*10, name=expression(atop("cumulative GPP",paste("(g C ", m^-2,")"))))
          #   )
          #
          # deriv.plot.simple <- ggplot(data = DD_fluxes_subset_deriv, aes(x = k_fracyr, y = deriv.GPP_raw_NA)) +
          #   geom_point(color = "darkgreen") +
          #   scale_y_continuous(name = "deriv (cumulative GPP)")
          #
          # print(cumsum.plot.simple)
          # print(deriv.plot.simple)
          #
          # rm(cumsum.plot.simple, deriv.plot.simple)




          # Smooth out the first derivative of cumulative GPP using a moving-average filter (we may want to revisit how we do this)
          # 'windowSize' is the TOTAL length of the moving-average window: with a centered filter, each
          # smoothed value is the mean of the current day plus roughly half the window on either side
          # (for windowSize = 14, about the 7 days before and 6 days after)
          windowSize <- 14
          smooth_filter <- rep(1/windowSize, windowSize)
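
          # For instance (a hypothetical check), rep(1/5, 5) applied with stats::filter(..., sides = 2)
          # replaces each value with the mean of itself and its two neighbors on each side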

          # Note: specifying sides = 2 centers the filter, eliminating the need to circshift at this point
          DD_fluxes_subset_deriv$deriv.GPP_smooth <- stats::filter(DD_fluxes_subset_deriv$deriv.GPP_raw_NA, smooth_filter, sides = 2)

          # Convert the smoothed derivative column to numeric (stats::filter returns a 'ts' object)
          DD_fluxes_subset_deriv$deriv.GPP_smooth <- as.numeric(DD_fluxes_subset_deriv$deriv.GPP_smooth)


          DD_fluxes_subset_deriv <-  DD_fluxes_subset_deriv %>%
            select(!k_fracyr) %>%
            mutate(fracyr = decimal_date(k_POSIXdate_plotting),
                   fracyr_null = fPOSIX_to_fracyr_null(k_POSIXdate_plotting, process_year), # Create a new time column that's a fractional year (But not indexed by year)
                   process_year = as.numeric(process_year)) %>%
            select(db, site, process_year, Year:k_POSIXdate_plotting, fracyr:fracyr_null, everything())


          # Append the daily accumulated fluxes dataframe to your empty list (by process_year)
          CumulativeFlux_TimeSeries_by_process_year_ls[[as.character(process_year)]] <- DD_fluxes_subset_deriv




          # Test to determine the beginning and end of winter for each process_year (which will include the previous year's winter as well as the current year)
          # test1 is true for each entry if 5 in a row are true, centered on the entry
          GPP_deriv_thresh <- 0.75
          tmp <- DD_fluxes_subset_deriv$deriv.GPP_smooth < GPP_deriv_thresh
          test1 <- circshift(tmp, 2) & circshift(tmp, 1) & tmp & circshift(tmp, -1) & circshift(tmp, -2)
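
          # i.e., test1[i] is TRUE only when tmp[i-2], tmp[i-1], tmp[i], tmp[i+1], and tmp[i+2] are all TRUE
          # (indices wrap circularly at the ends of the series)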


          # GPPend_test marks the beginning of the GPP-free season:
          # TRUE where the 3 entries of test1 before the current one are FALSE
          # and the current entry plus the 13 that follow (14 in a row) are TRUE
          prior3_false <- Reduce(`&`, lapply(1:3, function(l) !circshift(test1, l)))
          next14_true <- test1 & Reduce(`&`, lapply(1:13, function(l) circshift(test1, -l)))
          GPPend_test <- prior3_false & next14_true


          # GPPstart_test marks the end of the GPP-free season:
          # TRUE where the current entry plus the 13 before it (14 in a row) are TRUE
          # and the 3 entries that follow are FALSE
          prior14_true <- test1 & Reduce(`&`, lapply(1:13, function(l) circshift(test1, l)))
          next3_false <- Reduce(`&`, lapply(1:3, function(l) !circshift(test1, -l)))
          GPPstart_test <- prior14_true & next3_false



          # Define end of winter (EOW) as the date at which the prior 14 days are BELOW the 1st-derivative threshold while the following 3 days are ABOVE it
          EOW <- DD_fluxes_subset_deriv %>%
            select(db:fracyr_null, cumu.DoY, cumu.DoY_raw, cumu.GPP_raw_NA, deriv.GPP_smooth) %>%
            cbind("GPPstart" = GPPstart_test) %>%
            subset(., !is.na(GPPstart) &
                     GPPstart == TRUE) %>%
            mutate(CritThreshold = "EOW") %>%
            select(-GPPstart)

          # Define start of winter (SOW) as the date at which the prior 3 days are ABOVE a 1st derivative threshold while the following 14 days are BELOW the threshold
          SOW <- DD_fluxes_subset_deriv %>%
            select(db:fracyr_null, cumu.DoY, cumu.DoY_raw, cumu.GPP_raw_NA, deriv.GPP_smooth) %>%
            cbind("GPPend" = GPPend_test) %>%
            subset(., !is.na(GPPend) &
                     GPPend == TRUE) %>%
            mutate(CritThreshold = "SOW") %>%
            select(-GPPend)


          max_deriv.GPP_smooth <- DD_fluxes_subset_deriv %>%
            filter(Year == process_year) %>%
            filter(deriv.GPP_smooth == max(deriv.GPP_smooth, na.rm = TRUE)) %>%
            select(db:fracyr_null, cumu.DoY, cumu.DoY_raw, cumu.GPP_raw_NA, deriv.GPP_smooth) %>%
            mutate(CritThreshold = "maxGPP_accum")

          # Goal: find first ECW/SNW dates relative to the peak of GPP accumulation
          # Exclude any EOW values that occur AFTER the maximum GPP 1st derivative (as well as SOW values that occur BEFORE the maximum GPP 1st derivative)
          EOW.before.max <- EOW[EOW$k_POSIXdate_plotting < max_deriv.GPP_smooth$k_POSIXdate_plotting,]
          SOW.after.max <- SOW[SOW$k_POSIXdate_plotting > max_deriv.GPP_smooth$k_POSIXdate_plotting,]

          # Row bind the date of peak GPP accumulation rate with your SOW/EOW estimates and arrange by date
          CritDates.tmp <- rbind(EOW.before.max, max_deriv.GPP_smooth, SOW.after.max) %>%
            arrange(k_POSIXdate_plotting)


          # Row positions immediately before and after the 'maxGPP_accum' row; these neighbors
          # are the candidate end-of-current-winter (ECW) and start-of-next-winter (SNW) dates
          test_for_ECW.SNW <- outer(which(CritDates.tmp$CritThreshold == "maxGPP_accum"), c(-1,1), `+`) %>%
            as.vector() %>%
            unique()

          # Subset the critical dates by ECW/SNW
          CritDates.process_year <- CritDates.tmp[test_for_ECW.SNW,]


          # Remove row if it contains only NAs
          CritDates.process_year <- CritDates.process_year[rowSums(is.na(CritDates.process_year)) != ncol(CritDates.process_year), ]


          # Sometimes this routine will fail to detect critical dates for the current process year.
          # When this is the case, return a row of NAs for the process year.
          if (nrow(CritDates.process_year) == 0) {

            CritDates.process_year[1,] <- NA
            CritDates.process_year$db <- db
            CritDates.process_year$site <- site
            CritDates.process_year$process_year <- process_year
            CritDates.process_year$Year <- process_year



            winterCritDates_by_process_year_ls[[as.character(process_year)]] <- CritDates.process_year

          } else {
            # Recode the CritDates column, renaming EOW to ECW, and SOW to SNW
            CritDates.process_year$CritThreshold <- recode(CritDates.process_year$CritThreshold, EOW = "ECW", SOW ="SNW")

            # Reorder columns
            CritDates.process_year <- CritDates.process_year %>%
              select(db:Year, process_year, CritThreshold, everything())

            winterCritDates_by_process_year_ls[[as.character(process_year)]] <- CritDates.process_year
          }



        },
        # If loop fails for a particular year,
        error = function(e){
          message("")
          message("    * ERROR on process year: ", process_year)
          message("      (NAs will be assigned to data during missing years)")
          message("")
          print(e)
        }
      )


      # Step up progress bar
      setTxtProgressBar(pb,k)


    } # end year loop (fluxes)

    rm(k, process_year, pb)



    # SOW/EOW - Append cumulative fluxes by process_year into dataframe -------
    message('')
    message('    ', 'Appending annual fluxes into single dataframe...')


    # Append the individual process years into a single dataframe
    # IMPORTANT NOTE: This is not a true time-series dataframe since it contains repeat values (i.e. each process year contains data from the year before and after)
    CumulativeFlux_TimeSeries_by_process_year_df <- bind_rows(CumulativeFlux_TimeSeries_by_process_year_ls, .id = "process_year")



    # reorder columns
    dat_SOW.EOW <- CumulativeFlux_TimeSeries_by_process_year_df %>%
      select(db, site, process_year, Year, k_POSIXdate_plotting, fracyr, fracyr_null, DoY, cumu.DoY, everything()) %>%
      mutate(process_year = as.numeric(process_year))

    # We now have a dataframe of cumulative GPP/NEE organized by process_year (note: each process_year contains > 365 days, so there ARE repeat rows in this df)



    # Do the same thing for the winter critical dates list
    CritDates_SOW.EOW <- bind_rows(winterCritDates_by_process_year_ls, .id = "process_year") %>%
      select(db, site, process_year, Year, k_POSIXdate_plotting, fracyr, fracyr_null, DoY, cumu.DoY, everything()) %>%
      mutate(process_year = as.numeric(process_year))


    rm(CumulativeFlux_TimeSeries_by_process_year_ls, winterCritDates_by_process_year_ls)






    # Organize your output datasets -------------------------------------------
    # reorder columns
    dat_SOS.EOS <- GPPsat_TimeSeries_by_process_year_df %>%
      select(db, site, process_year, Year, k_POSIXdate_plotting, fracyr_null, DoY, cumu.DoY, GPPsat_pred, GPPsat_resid, NEEsat_fixed_NEE:pct_avail_NEE)

    # Specify years to omit by using the site.info lookup
    if (exclude_bad_yrs) {
      omit_years <- as.numeric(unlist(strsplit(site.info$exclude_years, split=",")))
    } else {
      omit_years <- NULL
    }




    # Subset your data by omitting "bad" years
    dat_SOS.EOS <- subset(dat_SOS.EOS, !(process_year %in% omit_years))
    CritDates_SOS.EOS <- subset(CritDates_SOS.EOS, !(process_year %in% omit_years))

    dat_SOW.EOW <- subset(dat_SOW.EOW, !(process_year %in% omit_years))
    CritDates_SOW.EOW <- subset(CritDates_SOW.EOW, !(process_year %in% omit_years))



    # Perform quality check on critical date estimates ------------------------
    # Calculate data availability around each critical date, as well as z-scores
    CritDates_QC_SOS.EOS <- fCritDates_QC(y = dat_SOS.EOS$NEEsat_fixed_NEE,
                                         y.POSIX = as.Date(dat_SOS.EOS$k_POSIXdate_plotting),
                                         y.ProcYr = as.numeric(dat_SOS.EOS$process_year),
                                         crit.ID = CritDates_SOS.EOS$CritThreshold,
                                         crit.POSIX = as.Date(CritDates_SOS.EOS$k_POSIXdate_plotting),
                                         crit.ProcYr = as.numeric(CritDates_SOS.EOS$process_year),
                                         buffer = 10)


    CritDates_QC_SOW.EOW <- fCritDates_QC(y = dat_SOW.EOW$deriv.GPP_raw_NA,
                                          y.POSIX = as.Date(dat_SOW.EOW$k_POSIXdate_plotting),
                                          y.ProcYr = as.numeric(dat_SOW.EOW$process_year),
                                          crit.ID = CritDates_SOW.EOW$CritThreshold,
                                          crit.POSIX = as.Date(CritDates_SOW.EOW$k_POSIXdate_plotting),
                                          crit.ProcYr = as.numeric(CritDates_SOW.EOW$process_year),
                                          buffer = 10)





    CritDates_QC <- CritDates_QC_SOS.EOS %>%
      bind_rows(CritDates_QC_SOW.EOW) %>%
      arrange(process_year, match(CritThreshold, c("ECW","SOS10","SOS25","SOS50","Peak_GPPsat","EOS50","EOS25","EOS10","SNW")))


    # Append the GPPsat curve data to dat.list (indexed by site and db)
    dat.list[[db]][[site]]$Data$`Critical dates` <- CritDates_QC

    # Push your critical date data to its own list:
    CritDates.list[[db]][[site]] <- dat.list[[db]][[site]]$Data$`Critical dates`






    # Load meteorological data ------------------------------------------------
    ## The complete half-hourly dataset for the site (loaded above) includes *some* MET data
    message('')
    message('    ', 'Examining meteorological data...')

    # Now we want to extract meteorological variables from the complete half-hourly dataset and store these as a new dataframe
    df_met_gapfilled <- complete_HH_dataset %>%
      select(Year:k_fracyr, PPFD_NT_gf:VPD_NT_gf, PPFD_DT_gf:VPD_DT_gf)

    df_met_notgapfilled <- complete_HH_dataset %>%
      select(Year:k_fracyr, k_LE:k_PPFD)


    # Summarize met data into daytime (10 am - 2 pm) means
    complete_HH_dataset_MEANs <- complete_HH_dataset %>%
      select(!c(db, site, k_POSIXdate_local, k_datenum:k_fracyr)) %>%
      subset(., Hour >= 10 & Hour < 14) %>%
      group_by(Year, DoY, k_POSIXdate_plotting) %>%
      summarize_all(~ mean(., na.rm = TRUE)) %>%   # note: funs() is deprecated in dplyr >= 0.8
      select(!Hour)




    # Store the gapfilled means and non-gapfilled means in separate datasets
    df_met_gapfilled_MEANs <- complete_HH_dataset_MEANs %>%
      select(Year:k_POSIXdate_plotting, PPFD_NT_gf:VPD_NT_gf, PPFD_DT_gf:VPD_DT_gf)

    df_met_notgapfilled_MEANs <- complete_HH_dataset_MEANs %>%
      select(Year:k_POSIXdate_plotting, k_LE:k_PPFD)





    # Join summary data tables to site.dat
    site.dat <- dat_SOS.EOS %>%
      left_join(complete_HH_dataset_MEANs, by = c("Year", "DoY", "k_POSIXdate_plotting"))


    ## Calculate accumulated days above 5°C
    ## (note: this counts days with mean Tair >= 5°C; it does not sum temperature excesses as true growing degree days would)
    # Create a summary table of mean DAILY temperature
    Tair_mean <- df_met_gapfilled %>%
      select(Year, DoY, Hour, Tair_NT_gf) %>%
      group_by(Year, DoY) %>%
      summarize(Tair_NT_mean = mean(Tair_NT_gf, na.rm = T)) %>%
      mutate(gdd_test = (Tair_NT_mean >= 5) * 1)   # Logical test on mean daily temperature: 1 if Tair >= 5°C, 0 otherwise

    # Convert all NAs in the gdd_test column to 0 (they cannot contribute to the accumulated degree days)
    Tair_mean$gdd_test[is.na(Tair_mean$gdd_test)] <- 0

    # Calculate accumulated degree days for each year
    Tair_ADD <- Tair_mean %>%
      group_by(Year) %>%
      mutate(ADD = cumsum(gdd_test))
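
    # e.g., if 120 of the first 200 days of a year have mean Tair >= 5°C, ADD on DoY 200 is 120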

    # Calculate mean annual temperature for each year
    Tair_MAT <- df_met_gapfilled %>%
      select(Year, DoY, Hour, Tair_NT_gf) %>%
      group_by(Year) %>%
      summarize(Tair_MAT = mean(Tair_NT_gf, na.rm = T))


    # Convert NaNs to NA
    # ('is.nan' is an internal generic, so defining an 'is.nan.data.frame' method lets it apply column-wise to data frames)
    is.nan.data.frame <- function(x)
      do.call(cbind, lapply(x, is.nan))

    Tair_ADD[is.nan(Tair_ADD)] <- NA
    Tair_MAT[is.nan(Tair_MAT)] <- NA


    rm(complete_HH_dataset, complete_HH_dataset_MEANs, df_met_gapfilled, df_met_gapfilled_MEANs, df_met_notgapfilled, df_met_notgapfilled_MEANs)






    # Plotting routine --------------------------------------------------------
    if (create.plots) {

      export.plots <- TRUE   # currently hard-coded: plots are always exported when they are created

      message('')
      message('    ', 'Preparing plots for export...')
      # SOS/EOS Plots
      # plot_facets: facets the GPPsat time series according to process year (note: process_year includes the year of interest plus buffer days before/after)


      plot_facets <- fPaginateGPPsatFacets(GPP = as.numeric(site.dat$NEEsat_fixed_NEE),
                                           GPP.pred = as.numeric(site.dat$GPPsat_pred),
                                           GPP.POSIX = as.Date(site.dat$k_POSIXdate_plotting),
                                           GPP.ProcYr = as.numeric(site.dat$process_year),

                                           crit.ID = CritDates_SOS.EOS$CritThreshold,
                                           crit.POSIX = as.Date(CritDates_SOS.EOS$k_POSIXdate_plotting),
                                           crit.ProcYr = as.numeric(CritDates_SOS.EOS$process_year),
                                           crit.GPP = as.numeric(CritDates_SOS.EOS$GPP_pred),

                                           ncol = 4, nrow = 4, site.info = site.info, span = span)



      # plot_multiyear: plots the GPPsat time series with all process_years overlaid together
      plot_multiyear <- fPlotMultiyearGPPsat(GPP = as.numeric(site.dat$NEEsat_fixed_NEE),
                                             GPP.pred = as.numeric(site.dat$GPPsat_pred),
                                             GPP.POSIX = as.Date(site.dat$k_POSIXdate_plotting),
                                             GPP.ProcYr = as.numeric(site.dat$process_year),

                                             crit.ID = CritDates_SOS.EOS$CritThreshold,
                                             crit.POSIX = as.Date(CritDates_SOS.EOS$k_POSIXdate_plotting),
                                             crit.ProcYr = as.numeric(CritDates_SOS.EOS$process_year),
                                             crit.GPP = as.numeric(CritDates_SOS.EOS$GPP_pred),

                                             span = span)




      # plot_zscores: distribution plot of z-scores for each of the 7 critical threshold dates (SOS/EOS/peak GPPsat)
      # NOTE: z-scores are calculated using "fracyr_null" as the centered time variable
      plot_zscores <- fPlotCritDate_Zscores(CritDates_QC_SOS.EOS)

      # plot_composite: combines plot_multiyear and plot_zscores
      plot_composite <- plot_multiyear + plot_zscores  + plot_layout(ncol=2)

      # Add site metadata to composite plot
      plot_composite <- fAddSiteMeta2Plot(wrap_elements(plot_composite), site = site.info$site,
                                          db = site.info$db,
                                          longname = site.info$info,
                                          country = site.info$country,
                                          lat = site.info$lat,
                                          long = site.info$long,
                                          MAT = site.info$MAT,
                                          MAP = site.info$MAP,
                                          IGBP = site.info$IGBP)

      # Store plots in your list
      dat.list[[db]][[site]]$Plots$`Faceted GPPsat time series` <- plot_facets
      dat.list[[db]][[site]]$Plots$`Multiyear GPPsat and z-score distribution` <- plot_composite



      # plot_GPPsat_plus_met
      # Creating composite plots of annual time series (466-d year) for 16 variables, including GPPsat with critical dates added
      # The goal is for the user to select any 15 variables of interest to compare to the GPPsat time series.
      # These will be exported as a single pdf with each page being a different process year

      # Select 15 additional variables to facet with GPPsat
      vars_of_interest <- c("k_LE",
                            "k_H",
                            "k_Rnet",
                            "SW_IN_NT_gf",
                            "k_SW_out",
                            "k_LW_in",
                            "k_LW_out",
                            "k_albedo",
                            "Tair_NT_gf",
                            "Tsoil_NT_gf",
                            "k_SWC",
                            "k_RH",
                            "VPD_NT_gf",
                            "k_ustar",
                            "PPFD_NT_gf")



      CritDates_SOS.EOS <- CritDates_SOS.EOS %>%
        mutate(DoY = yday(k_POSIXdate_plotting),
               fracyr_null = fPOSIX_to_fracyr_null(as.Date(k_POSIXdate_plotting), as.numeric(as.character(process_year))))


      plot_GPPsat_plus_met <- fPlotGPPsat_plus_VariablesOfInterest(dat = site.dat, critdat = CritDates_SOS.EOS,
                                                                   site.info, years_of_record = years_of_record, vars = vars_of_interest, span = span)

      # Store plots in your list
      dat.list[[db]][[site]]$Plots$`GPPsat time series with selected meteorological vectors` <- plot_GPPsat_plus_met


      # Plot cumulative GPP/NEE with start and end of winter dates shown
      plot_cumuGPP_byYear <- fPlotCumuGPPbyYear(dat = dat_SOW.EOW, critdat = CritDates_SOW.EOW, site.info, years_of_record)





      # Plot export -------------------------------------------------------------
      # Export critical threshold plots
      if (export.plots) {
        # export plots


        ## Send exported plots to subdirectories named by date of creation
        dir.create(paste0(outpath_plots, Sys.Date()), showWarnings = FALSE)

        # We also want to separate our plots into subdirectories of their own
        dir.create(paste0(outpath_plots, Sys.Date(), "/1_MultiyearGPPsat_plusZscores"), showWarnings = FALSE)
        dir.create(paste0(outpath_plots, Sys.Date(), "/2_FacetedGPPsat_byYear"), showWarnings = FALSE)
        dir.create(paste0(outpath_plots, Sys.Date(), "/3_MultipageGPPsat_plusMetVars"), showWarnings = FALSE)
        dir.create(paste0(outpath_plots, Sys.Date(), "/4_cumuGPPTimeSeries"), showWarnings = FALSE)

        timestamped.outpath.plots <- paste0(outpath_plots, Sys.Date())


        message('')
        message('  ', 'Exporting plots to:  ')
        message('    ', paste0(outpath_plots))
        message('')
        message('    ', 'Exporting plot: 1/4')

        # Export plot_composite (which includes both the multiyear GPPsat time series and z-score distribution figures)
        ggsave(path = paste0(timestamped.outpath.plots, "/1_MultiyearGPPsat_plusZscores"),
               filename = sprintf("%s_%s_%d-%d-%s", db, site, years_of_record[1], tail(years_of_record, 1),
                                  "MultiyearGPPsat_plusZscores.pdf"),
               plot = plot_composite, width = 16, height = 6, dpi = 300)

        message('')
        message('    ', 'Exporting plot: 2/4')

        # Export plot_facets (time-series of GPPsat split into facets by process_year)
        ggsave(path = paste0(timestamped.outpath.plots, "/2_FacetedGPPsat_byYear"),
               filename = sprintf("%s_%s_%d-%d-%s", db, site, years_of_record[1], tail(years_of_record, 1),
                                  "FacetedGPPsat_byYear.pdf"),
               plot = plot_facets, width = 12, height = 9, dpi = 300)

        message('')
        message('    ', 'Exporting plot: 3/4')

        # Export plot_GPPsat_plus_met (annual time-series of GPPsat with 15 additional variables of interest that the user specifies)
        pdf(paste0(timestamped.outpath.plots, "/3_MultipageGPPsat_plusMetVars", "/", sprintf("%s_%s_%d-%d-%s",db, site, years_of_record[1], tail(years_of_record,1), "MultipageGPPsat_plusMetVars.pdf")),
            width = 16, height = 12)
        invisible(lapply(plot_GPPsat_plus_met, print))
        dev.off()


        message('')
        message('    ', 'Exporting plot: 4/4')
        pdf(paste0(timestamped.outpath.plots, "/4_cumuGPPTimeSeries", "/", sprintf("%s_%s_%d-%d-%s",db, site, years_of_record[1], tail(years_of_record,1), "cumuGPPTimeSeries.pdf")),
            width = 10, height = 8)
        invisible(lapply(plot_cumuGPP_byYear, print))
        dev.off()


      }



      # Remove large objects
      rm(plot_composite, plot_facets, plot_GPPsat_plus_met, plot_multiyear, plot_zscores, plot_cumuGPP_byYear)

    } # end 'if (create.plots)'





  } # end site loop


  # Bind all sites for a given database into a dataframe
  FLX_dat <- bind_rows(CritDates.list$FLX, .id = "Site")
  AMF_dat <- bind_rows(CritDates.list$AMF, .id = "Site")
  NEO_dat <- bind_rows(CritDates.list$NEO, .id = "Site")

  # Bind all databases into a single dataframe. That's it!
  final_dat <- bind_rows(list(FLX=FLX_dat, AMF=AMF_dat, NEO=NEO_dat), .id = 'Database') %>%
    rename(db = "Database", site = "Site") %>%
    mutate(DoY = yday(k_POSIXdate_plotting),
           fracyr_null = fPOSIX_to_fracyr_null(k_POSIXdate_plotting, process_year),
           CritThreshold_KS_qc = NA,
           CritThreshold_auto_qc = NA) %>%
    select(db, site, process_year, k_POSIXdate_plotting, DoY, fracyr_null, everything()) %>%
    arrange(db, site, process_year, match(CritThreshold, c("ECW","SOS10","SOS25","SOS50","Peak_GPPsat","EOS50","EOS25","EOS10","SNW")))

  final_dat <- final_dat[!is.na(final_dat$CritThreshold),]

  # Rule 1 - duplicated dates
  # This can occur when there is not sufficient data at the start or end of the process_year, leading the program to select SOS/EOS points that overlap.
  SOSEOS_dupes <- final_dat %>%
    filter(CritThreshold %in% c("SOS10","SOS25","SOS50","Peak_GPPsat","EOS50","EOS25","EOS10")) %>%
    group_by(db, site, process_year, k_POSIXdate_plotting) %>%
    filter(n()>1) %>%
    mutate(dupe = "Y")

  # Rule 2 - start of season dates that occur after DoY 190 / end of season dates that occur before DoY 190
  # This can occur when there is insufficient data early or late in the year
  # Note: DoY 190 == fracyr_null 1.516
  # Any SOS's that occur after DoY 190 (fracyr_null = 1.516) are suspicious
  late_SOS <- final_dat %>%
    filter(CritThreshold %in% c("SOS10","SOS25","SOS50")) %>%
    group_by(db, site, process_year) %>%
    filter(fracyr_null > 1.516) %>%
    mutate(late_SOS = "Y")

  # Any EOS's that occur before DoY 190 (fracyr_null = 1.516) are suspicious
  early_EOS <- final_dat %>%
    filter(CritThreshold %in% c("EOS50","EOS25","EOS10")) %>%
    group_by(db, site, process_year) %>%
    filter(fracyr_null < 1.516) %>%
    mutate(early_EOS = "Y")


  # Rule 3 - interpolated peak GPPsat estimates
  # This can occur when estimating the date of peak GPPsat from a loess fit line where there is no surrounding data.
  # The column 'pct_avail_aroundCritDate' tells the user the proportion of actual datapoints around each date.
  # Here we flag any peakGPPsat dates that have lower than 70% data representation (0.70).
  # Note: because the estimate of peak GPPsat is so crucial to determining SOS/EOS, you should throw out any years that get flagged here.
  interp_GPPmax <- final_dat %>%
    filter(CritThreshold %in% c("Peak_GPPsat")) %>%
    group_by(db, site, process_year) %>%
    filter(pct_avail_aroundCritDate < 0.70) %>%
    mutate(interp_GPPmax = "Y")



  SOSEOS_rules <- final_dat %>%
    filter(CritThreshold %in% c("SOS10","SOS25","SOS50","Peak_GPPsat","EOS50","EOS25","EOS10")) %>%
    left_join(SOSEOS_dupes) %>%
    left_join(late_SOS) %>%
    left_join(early_EOS) %>%
    left_join(interp_GPPmax) %>%
    group_by(db, site, process_year)


  # Use logical indexing to exclude YEARS where suspicious or interpolated GPPsat peaks are detected
  flag_bad_CritDates <- SOSEOS_rules %>%
    group_by(db, site, process_year) %>%
    mutate(CritThreshold_auto_qc = ifelse('Y' %in% interp_GPPmax, 0, CritThreshold_auto_qc))

  # Flag (qc = 0) critical dates that are statistical outliers (|Z| > 2.6) or violate Rules 1-3
  flag_bad_CritDates$CritThreshold_auto_qc[which(abs(flag_bad_CritDates$Z) > 2.6)] <- 0
  flag_bad_CritDates$CritThreshold_auto_qc[which(flag_bad_CritDates$dupe == 'Y')] <- 0
  flag_bad_CritDates$CritThreshold_auto_qc[which(flag_bad_CritDates$late_SOS == 'Y')] <- 0
  flag_bad_CritDates$CritThreshold_auto_qc[which(flag_bad_CritDates$early_EOS == 'Y')] <- 0

  # Any critical date not flagged above is marked as good (qc = 1)
  flag_bad_CritDates$CritThreshold_auto_qc[is.na(flag_bad_CritDates$CritThreshold_auto_qc)] <- 1

  final_dat_updatedSOSEOS <- final_dat %>%
    select(!CritThreshold_auto_qc) %>%
    left_join(select(flag_bad_CritDates, !c(dupe:interp_GPPmax)))



  # Export data (with timestamp) --------------------------------------------
  if (export.files) {
    ## Export datafile

    message('')
    message('  ', 'Exporting CritDate data to:  ')
    message('    ', paste0(outpath_critdates))
    message('')

    date_of_analysis <- Sys.Date()

    no.sites <- length(selected.sites)

    outfile <- sprintf('%s_CritDates_%d_sites.txt', date_of_analysis, no.sites)

    write.table(final_dat_updatedSOSEOS, file = paste0(outpath_critdates, outfile),
                sep = "\t", row.names = FALSE,
                col.names = TRUE,
                append = FALSE)

  } # end if (export.files)



} # end function