FluxSynthU: Suite of functions to analyze eddy covariance data

Documented in FS_fCalculateGPPandNEESatParameters

#' After building complete output datasets using 'FS_fBuildCompleteOutput', this function will calculate GPPsat parameters using a series of light-response functions across a moving window of specified day length.

#' @export
#' @title Calculate light-saturated GPP across moving window.
#' @param window numeric, specify the day length by which the moving window will subset data for light response curves
#' @param inpath character string, navigate to the directory containing stacked complete output datasets (specific formatting required)
#' @param outpath character string, set the directory by which GPPsat parameters will be exported
#' @param site.meta dataframe, site metadata file contained in FluxSynthU
#' @param batch_process logical, if TRUE processes all sites in inpath directory (defaults to FALSE)
#' @param export.files logical, if TRUE (default) exports dataframes as text files in the outpath directory.


#' @importFrom stringr str_remove str_detect


FS_fCalculateGPPandNEESatParameters <- function(window = 5, inpath, outpath, site.meta, batch_process = FALSE, export.files = TRUE) {

  if (!dir.exists(outpath)) {
    message(paste('Warning! No such directory exists!'))
    x <- askYesNo(paste("Would you like to create a new directory in: ",outpath))

    if (x) {
      dir.create(file.path(paste0(outpath)), showWarnings = FALSE)
    }
  }


  # File import -------------------------------------------------------------

  # find all raw data files (including rejected sites)
  all.files <- list.files(paste(inpath), recursive = TRUE)

  # extract paths for FLUXNET2015 and AMERIFLUX data
  fluxnet.files <- all.files[grep(pattern = '\\FLX', all.files)]
  amf.files <- all.files[grep(pattern = '\\AMF', all.files)]
  neon.files <- all.files[grep(pattern = '\\NEO', all.files)]


  # concatenate the two flux datasets
  files <- c(fluxnet.files, amf.files, neon.files)


  # list of all available sites with database type (eg, 'AMF_CA-Qcu')
  db.and.sites <- unique(substr(x = files, start = 1,stop = 10))

  # matching database for each
  all.site.db <- substr(x = files, start = 1,stop = 3)

  neon_index <- all.site.db %in% "NEO"

  # all available site names
  flx.amf.site.names <- substr(x = files[!neon_index], start = 5,stop = 10)
  neon.site.names <- substr(x = files[neon_index], start = 5,stop = 8)
  all.site.names <- c(flx.amf.site.names, neon.site.names)

  # update db.and.sites to fix NEON names
  db.and.sites[neon_index] <- gsub("_N", "", db.and.sites[neon_index])




  if (batch_process) {
    selected.db_sites <- db.and.sites
    selected.sites <- all.site.names
  } else {


    # give a description for your routine:
    routine_description  <-  "Calculate GPPsat using light response curves"

    # select one or more sites
    choices <- fSiteSelect(db.and.sites, descrip = routine_description)

    if (length(choices[["AMF"]]) > 0) {
      AMF_selected <- paste0("AMF_", choices[["AMF"]])
    } else {AMF_selected <- NULL}

    if (length(choices[["FLX"]]) > 0) {
      FLX_selected <- paste0("FLX_", choices[["FLX"]])
    } else {FLX_selected <- NULL}

    if (length(choices[["NEO"]]) > 0) {
      NEO_selected <- paste0("NEO_", choices[["NEO"]])
    } else {NEO_selected <- NULL}

    # Includes database name in site selection
    selected.db_sites <- c(AMF_selected, FLX_selected, NEO_selected)

    # Omits database name in site selection
    selected.sites <- c(choices[["AMF"]], choices[["FLX"]], choices[["NEO"]])

  }



  # Loop through sites and extract light response fits ----------------------

  for (i in 1:length(selected.db_sites)) {

    # Identify site name and available data
    # The current site (and database)
    db_site <- selected.db_sites[i]

    # Extract the database name and site name from the 'db_site' string
    db <- substr(x = db_site, start = 1, stop = 3)
    site <- str_remove(db_site, paste0(db, "_"))


    # Get site metadata
    site.info <- fGetSiteInfo(dat = site.meta,
                              site = site,
                              db = db,
                              site_col = "site_ID",
                              db_col = "database",
                              site_name_col = "site_Name",
                              start_yr_col = "start_year",
                              end_yr_col = "end_year",
                              exclude_yr_col = "exclude_years",
                              lat_col = "lat",
                              long_col = "long",
                              country_col = "country",
                              IGBP_col = "IGBP",
                              elev_col = "elev_orig",
                              MAT_col = "MAT_orig",
                              MAP_col = "MAP_orig")

    message('')
    message('Processing site (', min(which(selected.sites %in% site)), '/',
            length(selected.sites), '):\t', site, ', ', db, " (", site.info$country, ")")
    message('\t\tName: ', site.info$info)
    message('\t\tlat: ', round(site.info$lat,3), '\tlong: ', round(site.info$long,3))
    message('\t\tMAT: ', round(site.info$MAT,1), '°C\tMAP: ', round(site.info$MAP,0),  'mm')
    message('\t\tForest Type: ', site.info$IGBP)

    message('')
    message('  ', 'Years of record: ', site.info$start_year, '-', site.info$end_year)

    message('')
    message('  ', 'Reading stacked output data from:  ')
    message('    ', paste0(inpath))


    infile <- all.files[grep(paste(site, collapse="|"), all.files)]
    dat <- read.table(paste(inpath, infile, sep=""), header=TRUE)

    years_of_record <- unique(dat$Year)
    start_year <- years_of_record[1]
    end_year <- tail(years_of_record,1)



    # Generate light response curves and extract fit statistics ---------------
    # Determine the total number of days in dataset
    message('')
    message('  ','Generating light response curves and extracting fit statistics...')
    message('')

    # Calculate cumulative number of days in dataset (cumu.DoY)

    # Join the timestamp dataframe to your data
    dat <- dat %>%
      mutate(int.date = Year*1000 + DoY,
             cumu.DoY = cumsum(!duplicated(int.date))) %>% # Calculates cumulative DoY through all years
      select(-c(int.date))

    totdays <- floor(tail(unique(dat$cumu.DoY),1))

    # Create empty dataframe to house results from looping routine
    # First, provide column names

    # 200826 KS
    ## NOTE: Added new columns to record the number of possible datapoints in each interval (n_interval), the number of possible GPP/NEE data as well as the data that are subset (actual);
    # Finally the fraction of GPP/NEE that is available in each interval is also recorded (pct_avail_GPP, pct_avail_NEE)
    headers <- c(
      "Year", "DoY", "fracyr", "cumu.DoY", "k_POSIXdate_plotting", # date columns
      "n_interval", "n_GPP_possible", "n_GPP_omitted", "n_GPP_actual", "pct_avail_GPP",  "m_lin_GPP", "R2_lin_GPP", "adj.R2_lin_GPP", "RMSE_lin_GPP", "pval_lin_GPP",                   # GPP fit parameters for linear model (lin)
      "a_nls_GPP", "b_nls_GPP", "R2_nls_GPP", "adj.R2_nls_GPP", "RMSE_nls_GPP",                               # GPP fit parameters for nonlinear least squares (nls)
      "a_nls_fixed_GPP", "b_nls_fixed_GPP", "R2_nls_fixed_GPP", "adj.R2_nls_fixed_GPP", "RMSE_nls_fixed_GPP", # GPP fit parameters for nonlinear least squares (nls) with fixed b
      "GPPsat_GPP", "GPPsat_fixed_GPP", # summary info for GPP
      "n_NEE_possible", "n_NEE_omitted", "n_NEE_actual", "pct_avail_NEE", "m_lin_NEE", "R2_lin_NEE", "adj.R2_lin_NEE", "RMSE_lin_NEE", "pval_lin_NEE",                   # NEE fit parameters for linear model (lin)
      "a_nls_NEE", "b_nls_NEE", "R2_nls_NEE", "adj.R2_nls_NEE", "RMSE_nls_NEE",                               # NEE fit parameters for nonlinear least squares (nls)
      "a_nls_fixed_NEE", "b_nls_fixed_NEE", "R2_nls_fixed_NEE", "adj.R2_nls_fixed_NEE", "RMSE_nls_fixed_NEE", # NEE fit parameters for nonlinear least squares (nls) with fixed b
      "NEEsat_NEE", "NEEsat_fixed_NEE") # summary info for NEE

    df.param <- as.data.frame(matrix(nrow = totdays, ncol = length(headers)))

    colnames(df.param) <- headers



    pb <- txtProgressBar(min = 0, max = totdays, initial = 0, style = 3) # show progress bar for loop
    start_day <- 1

    for (j in 1:totdays) {

      end_day <- start_day + window - 1
      logical_index_for_time_interval <-  dat$cumu.DoY >= start_day & dat$cumu.DoY < end_day + 1

      # Extract datetime info for window of interest
      df.param[j,"Year"] = floor(mean(dat$Year[logical_index_for_time_interval]))
      df.param[j,"DoY"] = head(dat$DoY[logical_index_for_time_interval],1)
      df.param[j,"fracyr"] = mean(dat$k_fracyr[logical_index_for_time_interval])
      df.param[j,"cumu.DoY"] = head(dat$cumu.DoY[logical_index_for_time_interval],1)
      df.param[j,"k_POSIXdate_plotting"] = as.character(head(dat$k_POSIXdate_plotting[logical_index_for_time_interval],1))

      # subset data for this time interval
      dat.interval <- subset(dat, logical_index_for_time_interval)



      # Runs the light response function that extracts goodness of fit statistics
      # Note: tryCatch will continue the routine even if errors are encountered
      mod <- tryCatch(fGPPandNEELightRespCoeff(dat.interval, PPFD_col='PPFD_NT_gf', GPP_col='GPP_NT_U50_gf', NEE_col='NEE_NT_U50_gf'), error=function(err) NA)


      # calculate number of rows in dat.interval (include NAs)
      points_possible <- nrow(dat.interval)

      # Calculate the number of possible GPP/NEE datapoints, the number of actual GPP/NEE datapoints included in the fit model, and the percent of available data (accounting for removal of flux values at night)
      n_GPP_possible <-  as.numeric(mod["n_GPP_tot"])
      n_GPP_actual <-  as.numeric(mod["n_GPP_included"])
      n_GPP_omitted <- n_GPP_possible - n_GPP_actual

      pct_avail_GPP <- n_GPP_actual/(points_possible - n_GPP_omitted)

      n_NEE_possible <-  as.numeric(mod["n_NEE_tot"])
      n_NEE_actual <-  as.numeric(mod["n_NEE_included"])
      n_NEE_omitted <- n_NEE_possible - n_NEE_actual

      pct_avail_NEE <- n_NEE_actual/(points_possible - n_NEE_omitted)


      # Append fit statistics to empty dataframe
      df.param[j,"n_interval"] = points_possible
      df.param[j,"n_GPP_possible"] = n_GPP_possible
      df.param[j,"n_GPP_omitted"] = n_GPP_omitted
      df.param[j,"n_GPP_actual"] = n_GPP_actual
      df.param[j,"pct_avail_GPP"] = pct_avail_GPP

      df.param[j,"n_NEE_possible"] = n_NEE_possible
      df.param[j,"n_NEE_omitted"] = n_NEE_omitted
      df.param[j,"n_NEE_actual"] = n_NEE_actual
      df.param[j,"pct_avail_NEE"] = pct_avail_NEE


      # linear GPP
      df.param[j,"m_lin_GPP"] = mod["m_lin_GPP"]
      df.param[j,"R2_lin_GPP"] = mod["R2_lin_GPP"]
      df.param[j,"adj.R2_lin_GPP"] = mod["adj.R2_lin_GPP"]
      df.param[j,"RMSE_lin_GPP"] = mod["RMSE_lin_GPP"]
      df.param[j,"pval_lin_GPP"] = mod["pval_lin_GPP"]

      # nonlinear GPP with both a and b allowed to vary
      df.param[j,"a_nls_GPP"] = mod["a_nls_GPP"]
      df.param[j,"b_nls_GPP"] = mod["b_nls_GPP"]
      df.param[j,"R2_nls_GPP"] = mod["R2_nls_GPP"]
      df.param[j,"adj.R2_nls_GPP"] = mod["adj.R2_nls_GPP"]
      df.param[j,"RMSE_nls_GPP"] = mod["RMSE_nls_GPP"]

      # nonlinear GPP with fixed b
      df.param[j,"a_nls_fixed_GPP"] = mod["a_nls_fixed_GPP"]
      df.param[j,"b_nls_fixed_GPP"] = mod["b_nls_fixed_GPP"]
      df.param[j,"R2_nls_fixed_GPP"] = mod["R2_nls_fixed_GPP"]
      df.param[j,"adj.R2_nls_fixed_GPP"] = mod["adj.R2_nls_fixed_GPP"]
      df.param[j,"RMSE_nls_fixed_GPP"] = mod["RMSE_nls_fixed_GPP"]

      # linear NEE
      df.param[j,"m_lin_NEE"] = mod["m_lin_NEE"]
      df.param[j,"R2_lin_NEE"] = mod["R2_lin_NEE"]
      df.param[j,"adj.R2_lin_NEE"] = mod["adj.R2_lin_NEE"]
      df.param[j,"RMSE_lin_NEE"] = mod["RMSE_lin_NEE"]
      df.param[j,"pval_lin_NEE"] = mod["pval_lin_NEE"]

      # nonlinear NEE with both a and b allowed to vary
      df.param[j,"a_nls_NEE"] = mod["a_nls_NEE"]
      df.param[j,"b_nls_NEE"] = mod["b_nls_NEE"]
      df.param[j,"R2_nls_NEE"] = mod["R2_nls_NEE"]
      df.param[j,"adj.R2_nls_NEE"] = mod["adj.R2_nls_NEE"]
      df.param[j,"RMSE_nls_NEE"] = mod["RMSE_nls_NEE"]

      # nonlinear NEE with fixed b
      df.param[j,"a_nls_fixed_NEE"] = mod["a_nls_fixed_NEE"]
      df.param[j,"b_nls_fixed_NEE"] = mod["b_nls_fixed_NEE"]
      df.param[j,"R2_nls_fixed_NEE"] = mod["R2_nls_fixed_NEE"]
      df.param[j,"adj.R2_nls_fixed_NEE"] = mod["adj.R2_nls_fixed_NEE"]
      df.param[j,"RMSE_nls_fixed_NEE"] = mod["RMSE_nls_fixed_NEE"]

      # Step up progress bar and start day
      setTxtProgressBar(pb,j)
      start_day <- start_day + 1
      # code below is for debugging with plots
      # if (mean(dat01$DoY[logical_index_for_time_interval]) > 180) {
      #     plot(dat01$PPFD_f, dat01$GPP_U05_f, col="black")  # plot full year light response
      #     points(x,y,col="red")   # light response for current interval only
      #     tmp <- 33
      #}


    } # end window loop

    message('')
    message('  ', 'Calculating GPPsat and NEEsat')
    # Calculate GPPsat using the derived 'a' and 'b' parameters
    df.param$GPPsat_GPP <- (df.param$a_nls_GPP*1800)/(1 + df.param$b_nls_GPP*1800)
    df.param$GPPsat_fixed_GPP <- (df.param$a_nls_fixed_GPP*1800)/(1 + df.param$b_nls_fixed_GPP*1800)
    # Calculate NEEsat using the derived 'a' and 'b' parameters
    df.param$NEEsat_NEE <- (df.param$a_nls_NEE*1800)/(1 + df.param$b_nls_NEE*1800)
    df.param$NEEsat_fixed_NEE <- (df.param$a_nls_fixed_NEE*1800)/(1 + df.param$b_nls_fixed_NEE*1800)



    if (export.files) {

      message('')
      message('  ', 'Exporting GPPsat fit parameters to:  ')
      message('    ', paste0(outpath))
      message('')

      outfile <- sprintf('%s_%s-%d-%d_GPPsat.txt', db, site, start_year, end_year)

      write.table(df.param, file = paste0(outpath, outfile),
                  sep = "\t", row.names = FALSE,
                  col.names = TRUE,
                  append = FALSE)


    }


  } # end site loop


} # end function