FluxSynthU: Suite of functions to analyze eddy covariance data

Documented in FS_fExtractWinterFluxes

#' Subset flux data for the GPP-free period only

#' @export
#' @title Subset winter fluxes.
#'
#' @param path_to_fluxdat character string, navigate to the directory containing the complete output datasets
#' @param path_to_critdates character string, navigate to the directory containing critical date files (must have dates associated with end of current winter (ECW) and start of next winter (SNW))
#' @param outpath character string, specify path to where you'd like to export the winter-only output files
#' @param site.meta dataframe, site metadata file contained in FluxSynthU
#' @param export.files logical, if TRUE (default) exports dataframes as text files in the outpath directory.
#' @param batch_process logical, if TRUE, batch processes all sites within the inpath directory (defaults to FALSE).
#' @param qc choices: 'auto', 'none' or 'specified', If the user selects 'auto', the data will be subset according to quality check flags that were automatically generated during Critical Date estimation. If 'specified' the user must provide a column name containing quality check flags (0 = bad, 1 = good). If 'none' the data is not subset. 
#' @param qc_col character, specify the name of a column containing qc flags (0 = bad, 1 = good). Defaults to NULL.

#' @importFrom stringr str_remove str_detect
#' @importFrom lubridate interval days ymd 
#' @importFrom dplyr mutate filter select arrange rename left_join bind_rows group_by
#' @importFrom tibble enframe   
#' @importFrom tidyr unnest  


FS_fExtractWinterFluxes <- function(path_to_fluxdat, path_to_critdates, outpath, site.meta, export.files = TRUE, batch_process = FALSE, qc = c('auto', 'none', 'specified'), qc_col = NULL) {
  if (!dir.exists(outpath)) {
    message('')
    message(paste('Warning! No such directory exists!'))
    x <- askYesNo(paste("Would you like to create a new directory in: ",outpath))
    message('')
    
    if (x) {
      dir.create(file.path(outpath), showWarnings = FALSE, recursive = TRUE)
    }
  }
  
  
  CritDates.infile <- tail(list.files(path_to_critdates, pattern = ".txt"),1)
  

  dat.CritDates.tmp <- read.table(paste0(path_to_critdates, CritDates.infile), stringsAsFactors = FALSE, header = TRUE)
  
  
  # Here we're going to use dat.CritDates as a lookup table to subset our data based on the GPP-free period
  dat.CritDates <- dat.CritDates.tmp %>%
    filter(!(site == "CA-SF1" | site == "CA-SF2" | site == "US-Me2" | site == "US-Me5")) %>% # Filter out sites with little to no useable data: "CA-SF1"  "US-Me2"  "US-Me5"
    merge(select(site.meta, c(database, site_ID, MAT_orig)), by.x = c("db", "site"), by.y = c("database", "site_ID")) %>%  # Merge site metadata with the dat.fluxsynth_filtered dataframe
    mutate(MAT_bin = cut_width(MAT_orig, width = 3)) # Add a grouping variable for MAT (to bin sites into MAT categories/levels)

  
  
  
  
  
  # Select sites to process -------------------------------------------------
  site.list <- unique(dat.CritDates$site)
  temp_df <- dat.CritDates %>%
    mutate(db_site = paste0(db, "_", site)) 
  
  db_site.list <-  unique(temp_df$db_site)
  
  routine_description <- "Select which sites you would like to extract winter fluxes from"
  
  choices <- FluxSynthU::fSiteSelect(db_site.list, descrip = routine_description)
  
  
  
  
  if (length(choices[["AMF"]]) > 0) {
    AMF_selected <- paste0("AMF_", choices[["AMF"]])
  } else {AMF_selected <- NULL}
  
  if (length(choices[["FLX"]]) > 0) {
    FLX_selected <- paste0("FLX_", choices[["FLX"]])
  } else {FLX_selected <- NULL}
  
  if (length(choices[["NEO"]]) > 0) {
    NEO_selected <- paste0("NEO_", choices[["NEO"]])
  } else {NEO_selected <- NULL}
  
  # Includes database name in site selection
  selected.db_sites <- c(AMF_selected, FLX_selected, NEO_selected)
  
  # Omits database name in site selection
  selected.sites <- c(choices[["AMF"]], choices[["FLX"]], choices[["NEO"]])
  
  rm(dat.CritDates.tmp, temp_df)
  


  # Make sure the 'k_POSIXdate_plotting' column is in date format  
  dat.CritDates$k_POSIXdate_plotting <- as.Date(dat.CritDates$k_POSIXdate_plotting)
  
  
  
  ## Quality check behavior
  if (qc == 'none') { # when qc == 'none', all estimates of SOS/EOS and ECW/SNW are included (even questionable ones)
    dat.CritDates_good <- dat.CritDates
  } else if (qc == 'auto') { # when qc == 'auto', critical date estimates are subset by the CritThreshold_auto_qc column, which automatically removes dates that are suspicious
    dat.CritDates_good <- subset(dat.CritDates, CritThreshold_auto_qc != 0 )
  } else if (qc == 'specified') { # when qc == 'specified', the user must supply a column name containing their own quality check info (0 = bad, 1 = good). RECOMMENDED
    
    if (is.null(qc_col)) { # NOTE: the user must supply a qc column name, otherwise the data will be subset using the CritThreshold_auto_qc column
      message('')
      message('Warning: No qc column name specified! Defaulting to auto_qc column')
      message('')
      
      dat.CritDates_good <- subset(dat.CritDates, CritThreshold_auto_qc != 0 )
    } else {
      dat.CritDates_good <- subset(dat.CritDates, dat.CritDates[,qc_col] == 1 )
    }
    
  }
  
  
  rm(dat.CritDates)
  
  
  for (i in 1:length(selected.db_sites)) {
    
    # The current site (and database)
    db_site <- selected.db_sites[i]
    
    # Extract the database name and site name from the 'db_site' string
    db <- substr(x = db_site, start = 1, stop = 3)
    site <- str_remove(db_site, paste0(db, "_"))

    # Get site metadata
    site.info <- fGetSiteInfo(dat = site.meta,
                              site = site,
                              db = db,
                              site_col = "site_ID",
                              db_col = "database",
                              site_name_col = "site_Name",
                              start_yr_col = "start_year",
                              end_yr_col = "end_year",
                              exclude_yr_col = "exclude_years",
                              lat_col = "lat",
                              long_col = "long",
                              country_col = "country",
                              IGBP_col = "IGBP",
                              elev_col = "elev_orig",
                              MAT_col = "MAT_orig",
                              MAP_col = "MAP_orig")
    
    
    message('')
    message('')
    message('Processing site (', min(which(selected.sites %in% site)), '/',
            length(selected.sites), '):')
    message('\t\t', site, ', ', db, " (", site.info$country, ")")
    message('\t\tName: ', site.info$info)
    message('\t\tlat: ', round(site.info$lat,3), '\tlong: ', round(site.info$long,3))
    message('\t\tMAT: ', round(site.info$MAT,1), '°C\tMAP: ', round(site.info$MAP,0),  'mm')
    message('\t\tForest Type: ', site.info$IGBP)
    
    message('')
    message('  ', 'Years of record: ', site.info$start_year, '-', site.info$end_year)
    
    
    
    
    # Step 1. Import (half-)hourly data
    # Load (half-)hourly dataset -----------------------------------------------------
    ## Load complete half-hourly dataset for site (includes *some* MET data )
    
    
    # Create a string matching the site to its database (ex: AMF_US-NR1)
    db_site <- paste(db, "_", site, sep ="")
    
    # List all the files in the complete output datasets folder
    complete_HH_files <- list.files(paste(path_to_fluxdat), recursive = TRUE) 
    
    # Find the output dataset for the current site
    complete_HH_site_file <- complete_HH_files[grepl(db_site,  complete_HH_files)]
    
    
    message('')
    message('  ', 'Reading (half-)hourly data from:  ' )
    message('    ', path_to_fluxdat)
    
    
    # Load complete output half-hourly dataset (be sure to delete later)
    complete_HH_dataset <- read.table(paste(path_to_fluxdat, complete_HH_site_file, sep=""), header=TRUE)
    
    # Make sure k_POSIXdate_plotting is a date column:
    complete_HH_dataset$k_POSIXdate_plotting <- as.Date(complete_HH_dataset$k_POSIXdate_plotting)
    
    # Add db and site as cols to complete_HH_dataset
    complete_HH_dataset <- complete_HH_dataset %>%
      mutate(db = db, site = site) %>%
      select(db, site, everything())

    
    
    # Don't worry too much about this line - this is necessary when you want to subset a column that has the same name as an object
    # So if you have an object called 'site' and a column named 'site' and you try to subset the column by the object, it will fail.
    # This creates a temporary object that doesn't have the same name
    site_i <- site
    
    WinterCritDates_bySite <- dat.CritDates_good %>%
      filter(., site == site_i &
               CritThreshold %in% c("ECW", "SNW")) %>%
      arrange(k_POSIXdate_plotting) 
    
    
    if (nrow(WinterCritDates_bySite) == 0) {
      message("    * No winter critical dates found for site: ", site)
      next
    }
    
    
    
    # Within each site, we're going to loop through winter years
    # first we'll define our winter years as unique process_years within each site
    # then we'll perform a test: each winter year must begin with a SNW from the prev year and an ECW for the current year
    #  if only one (or neither) of these values is available, the loop skips to the next process year
    
    
    years_of_record <- unique(WinterCritDates_bySite$process_year)
    start_year <- years_of_record[1]
    end_year <- tail(years_of_record, 1)
    
    
    site.data.list <- list()
    
    message('')
    message('  ', 'Looping through site years and subsetting flux data based on winter CritDates...')
    
 
    for (winter_year in start_year:end_year) {
      
      tryCatch(
        expr = {
          
          # Create a temporary dataframe in which you subset only the critical dates for the current 'winter year' as well as the previous year's winter
          WinterCritDates_tmp <- WinterCritDates_bySite %>%
            filter(., process_year %in% c(winter_year - 1, winter_year))
          
          
          
          # Determine the start of winter for the current winter period
          # This should fall on the previous process year (but not always)
          SOW_winter_year <- WinterCritDates_tmp[which(WinterCritDates_tmp$CritThreshold == "SNW")[1],]
          
          # Determine the end of winter for the current winter period
          # This always falls on the current process year
          EOW_winter_year <- WinterCritDates_tmp[tail(which(WinterCritDates_tmp$CritThreshold == "ECW"),1),]
          
          
          # Test whether SOW_winter_year occcurs AFTER EOW_winter_year (in normal years this is NOT the case)
          # If this test evaluates to TRUE, move to the next year
          if (SOW_winter_year$k_POSIXdate_plotting > EOW_winter_year$k_POSIXdate_plotting) {
            message("    * Incomplete winter on process year: ", winter_year)
            next
          }
          
          
          # Bind these two dates and arrange according to the POSIXdate column
          WinterCritDates_byYear  <- rbind(SOW_winter_year, EOW_winter_year) %>%
            arrange(k_POSIXdate_plotting)
          
          
          # In some cases, we may have a viable 'start of winter' but no corresponding 'end of winter' date (or vice versa)
          # Since winter must have a beginning and end, these cases prevent us from analyzing the data and we should skip to the next year
          if (nrow(WinterCritDates_byYear) < 2) {
            message("    * Incomplete winter on process year: ", winter_year)
            next
          }
          
          
          # Determine the length of winter (no. days) for the given process year
          no.days_in_winter <- interval(SOW_winter_year$k_POSIXdate_plotting, EOW_winter_year$k_POSIXdate_plotting, tzone = "UTC")/days(1)
          
          # Divide this period into 5 subintervals, and calculate the length of days
          no.days_per_interval <- round(no.days_in_winter/5)
          
          # Determine the start date of each interval
          WinterCritDates_byInterval <- seq(ymd(SOW_winter_year$k_POSIXdate_plotting), EOW_winter_year$k_POSIXdate_plotting, by = no.days_per_interval)
          
          # Append the date corresponding to the end of winter
          WinterCritDates_byInterval <- c(WinterCritDates_byInterval, EOW_winter_year$k_POSIXdate_plotting)
          
          int1_latefall <- seq(WinterCritDates_byInterval[1], WinterCritDates_byInterval[2]-1, by = "days")
          int2_earlywint <- seq(WinterCritDates_byInterval[2], WinterCritDates_byInterval[3]-1, by = "days")
          int3_midwint <- seq(WinterCritDates_byInterval[3], WinterCritDates_byInterval[4]-1, by = "days")
          int4_latewint <- seq(WinterCritDates_byInterval[4], WinterCritDates_byInterval[5]-1, by = "days")
          
          # NOTE: Since we're rounding our subintervals to the nearest day, it's unlikely that each subinterval will have the exact same vector dimensions
          # To deal with this, we'll add any dangling dates to the final interval (early spring)
          int5_earlyspr <- seq(WinterCritDates_byInterval[5], tail(WinterCritDates_byInterval,1), by = "days")
          
          
          # Create a new dataframe containing the sequence of dates for each winter subinterval
          WinterIntervals_byYear <- lst(int1_latefall, int2_earlywint, int3_midwint, int4_latewint, int5_earlyspr) %>%
            enframe %>%
            unnest(cols = c("name", "value")) %>%
            rename("interval" = name, "k_POSIXdate_plotting" = value) %>%
            as.data.frame()
          
          
          # Take your complete_HH_dataset and join it to your WinterIntervals_byYear dataframe
          df_winter_HH <- WinterIntervals_byYear %>%
            left_join(complete_HH_dataset, by = "k_POSIXdate_plotting") %>%
            select(interval, db:k_POSIXdate_local, k_POSIXdate_plotting, everything())
          
          
          # Store the no. of days in winter to a list, indexed by process year
          site.data.list[[as.character(winter_year)]] <- df_winter_HH
          
          rm(df_winter_HH)
          
        }, # end TryCatch expr
        
        # If loop fails for a particular year, 
        error = function(e){
          message("    * Caught an error on process year: ", winter_year)
          print(e)
        }
        
      ) # end TryCatch
      
      
      
      
    } # end winter_year loop
    
    rm(complete_HH_dataset)
    
    
    message('')
    message('  ', 'Appending site years into a dataframe...')
    
    site.data.df <- site.data.list %>% 
      bind_rows(.id = "source") %>% 
      group_by(source) %>%
      rename('winter_year' = source)
    
    
    
    if (export.files) {
      message('')
      message('  ', 'Exporting winter flux dataframes to:  ')
      message('    ', outpath)
      message('')
      
      outfile <- sprintf('%s_%s-%d-%d_WinterOnly-Output.txt', db, site, start_year, end_year)
      
      write.table(site.data.df, file = paste0(outpath, outfile), 
                  sep = "\t", row.names = FALSE, 
                  col.names = TRUE,
                  append = FALSE)
    }
    
    rm(start_year, end_year, years_of_record)
    
    
  } # end site loop
  
  
}