# R/fCritDates_QC.R

#' @title Quality check critical dates (SOS/EOS and SOW/EOW) from GPPsat and cumuGPP time series
#'
#' @description Takes a data time series and a set of critical-date (SOS/EOS) estimates, and returns
#'   summary statistics for each estimate (e.g. a z-score and the fraction of available data around each date).
#'
#' @export
#'
#' @param y numeric vector, data column
#' @param y.POSIX POSIX date vector, the POSIX date timestamp for variable y
#' @param y.ProcYr numeric vector, a vector containing rowwise identification of the process_year for variable y
#' @param crit.ID character vector, contains the names of each critical threshold (e.g. SOS10, SOS25)
#' @param crit.POSIX POSIX date vector, the POSIX dates for each critical threshold
#' @param crit.ProcYr numeric vector, a vector containing rowwise identification of the process_year (must be the same length as crit.ID and crit.POSIX)
#' @param buffer numeric, buffer range (in days) over which the availability of data around each SOS/EOS is assessed (e.g. buffer = 10 counts the non-missing data within a 21-day window, the day of interest +/- 10 days)
#'
#' @return A dataframe of the critical dates with added columns \code{pct_avail_aroundCritDate} (fraction of non-missing data within the buffer window) and \code{Z} (z-score of each date across process years, based on its null-year fractional date)
#'
#' @importFrom dplyr %>% arrange bind_rows group_by left_join mutate select summarize ungroup
#' @importFrom lubridate decimal_date
#' @importFrom stats complete.cases sd
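#' @examples
#' \dontrun{
#' # Illustrative sketch only: the data objects below (df_gpp, df_crit) are hypothetical,
#' # assuming a daily GPPsat series and previously estimated critical dates per process_year.
#' qc <- fCritDates_QC(y           = df_gpp$GPPsat,
#'                     y.POSIX     = df_gpp$k_POSIXdate_plotting,
#'                     y.ProcYr    = df_gpp$process_year,
#'                     crit.ID     = df_crit$CritThreshold,
#'                     crit.POSIX  = df_crit$k_POSIXdate_plotting,
#'                     crit.ProcYr = df_crit$process_year,
#'                     buffer      = 10)
#' head(qc)
#' }
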
fCritDates_QC <- function (y, y.POSIX, y.ProcYr, crit.ID, crit.POSIX, crit.ProcYr, buffer) {

  df.dat <- data.frame(process_year         = y.ProcYr,
                       k_POSIXdate_plotting = y.POSIX,
                       y                    = y,
                       stringsAsFactors     = FALSE)


  df.crit <- data.frame(process_year         = crit.ProcYr,
                        k_POSIXdate_plotting = crit.POSIX,
                        CritThreshold        = crit.ID,
                        stringsAsFactors     = FALSE)

  rm(y, y.POSIX, y.ProcYr, crit.ID, crit.POSIX, crit.ProcYr)


  CritThresholdNames <- df.crit$CritThreshold

  # If all of the threshold names are NA
  if (all(is.na(CritThresholdNames))) {
    df.crit$pct_avail_aroundCritDate <- NA
    df.crit$Z <- NA
    return(df.crit)

  } else {
    # Remove NAs
    df.crit <- df.crit[!is.na(CritThresholdNames),]


    # Extract names for each SOS/EOS metric
    df.crit_id <- unique(df.crit$CritThreshold)


    # Create an empty list to store results in
    list_k <- list()


    # Loop through each SOS/EOS index (n = 7, including date of maxGPPsat)
    for (k in seq_along(df.crit_id)) {

      # Specify the name of each SOS/EOS metric
      ID <- df.crit_id[k]

      # Subset your df.crit dataframe and extract only the SOS/EOS stat of interest
      df_k <- df.crit %>%
        dplyr::filter(CritThreshold == ID)

      # Join the crit dates to your GPPsat data
      df.dat_k <- df.dat %>%
        left_join(df_k, by = c("process_year", "k_POSIXdate_plotting"))

      # Extract the row position of each SOS/EOS date (one per process year)
      rows_k <- which(df.dat_k$CritThreshold == ID)


      # For each SOS/EOS estimate, build the row positions of the buffer window
      # (the critical date row +/- `buffer` rows), dropping positions that fall outside the data range
      rows_k_wbuffer <- unlist(lapply(rows_k, function(r) (r - buffer):(r + buffer)))
      rows_k_wbuffer <- rows_k_wbuffer[rows_k_wbuffer >= 1 & rows_k_wbuffer <= nrow(df.dat_k)]

      # Subset your data within the buffer range of SOS/EOS "k" for each process_year
      df.dat_k_wbuffer <- df.dat_k[rows_k_wbuffer, ]

      # Count the number of non-missing "y" datapoints within each buffer window
      summary_k <- df.dat_k_wbuffer %>%
        group_by(process_year) %>%
        summarize(n_avail = sum(!is.na(y)))

      # Calculate the fraction of available data within each buffer window
      summary_k$pct_avail_aroundCritDate <- summary_k$n_avail / ((2 * buffer) + 1)
      # Label each row with the current SOS/EOS ID
      summary_k$CritThreshold <- ID

      # Drop the raw count column
      summary_k <- summary_k %>% select(!n_avail)

      # Append this summary table to your list according to ID
      list_k[[ID]] <- summary_k

    }


    # Take your list elements and bind them into a single dataframe (each already carries its CritThreshold ID)
    df <- bind_rows(list_k)

    # Calculate z-scores for each SOS/EOS estimate
    df_zscore <- df.crit[complete.cases(df.crit), ] %>% # Note: we want to eliminate rows that do not have an estimate for SOS/EOS
      arrange(CritThreshold, process_year) %>%
      mutate(fracyr = decimal_date(k_POSIXdate_plotting),
             fracyr_null = fPOSIX_to_fracyr_null(k_POSIXdate_plotting, process_year)) %>%
      group_by(CritThreshold) %>%
      # Within each CritThreshold, calculate z-scores across process years using fracyr_null as the datetime coordinate
      mutate(Z = (fracyr_null - mean(fracyr_null)) / sd(fracyr_null)) %>%
      ungroup()


    # Join your two dfs (z-scores and % data availability around SOS/EOS estimates) to your df.crit dataframe
    df.crit <- df.crit %>%
      left_join(df, by = c("process_year", "CritThreshold")) %>%
      left_join(select(df_zscore, process_year, CritThreshold, Z), by = c("process_year", "CritThreshold"))

    return(df.crit)
  }



}
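
# A minimal sketch (guarded so it never runs when the package is loaded) of the z-score
# standardization used inside fCritDates_QC, with hypothetical null-year fractional dates
# for three process years:
if (FALSE) {
  fracyr_null <- c(0.35, 0.38, 0.41)                        # hypothetical SOS timing for 3 process years
  Z <- (fracyr_null - mean(fracyr_null)) / sd(fracyr_null)  # same standardization as in fCritDates_QC
  round(Z, 2)                                               # -1, 0, 1
}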