#' This function takes two dataframes and returns summary statistics around each estimate of SOS/EOS (e.g. z-score and data representation):
#' @export
#' @title Quality check critical dates (SOS/EOS and SOW/EOW) from GPPsat and cumuGPP time series.
#'
#'
#' @param y numeric vector, data column
#' @param y.POSIX POSIX date vector, the POSIX date timestamp for variable y
#' @param y.ProcYr numeric vector, a vector containing rowwise identification of the process_year for variable y
#' @param crit.ID character vector, contains the names of each critical threshold (e.g. SOS10, SOS25)
#' @param crit.POSIX POSIX date vector, the POSIX dates for each critical threshold
#' @param crit.ProcYr numeric vector, a vector containing rowwise identification of the process_year (must be equal length as critID and critPOSIX)
#' @param buffer numeric, buffer range (in days) by which the availability of data around each SOS/EOS will be assessed (e.g. buffer = 10 will calculate the number of actual data within a 21-day window, the day of interest +/- 10 days)
fCritDates_QC <- function (y, y.POSIX, y.ProcYr, crit.ID, crit.POSIX, crit.ProcYr, buffer) {
df.dat <- data.frame(y.ProcYr, y.POSIX, y, stringsAsFactors = FALSE)
names(df.dat) <- c("process_year", "k_POSIXdate_plotting", "y")
df.crit <- data.frame(crit.ProcYr, crit.POSIX, crit.ID, stringsAsFactors = FALSE)
names(df.crit) <- c("process_year", "k_POSIXdate_plotting", "CritThreshold")
rm(y, y.POSIX, y.ProcYr, crit.ID, crit.POSIX, crit.ProcYr)
CritThresholdNames <- df.crit$CritThreshold
# If all of the threshold names are NA
if (all(is.na(CritThresholdNames))) {
df.crit$pct_avail_aroundCritDate <- NA
df.crit$Z <- NA
return(df.crit)
} else {
# Remove NAs
df.crit <- df.crit[!is.na(CritThresholdNames),]
# Extract names for each SOS/EOS metric
df.crit_id <- unique(df.crit$CritThreshold)
# Create an empty list to store results in
list_k <- list()
# Loop through each SOS/EOS index (n = 7, including date of maxGPPsat)
for (k in seq_along(df.crit_id)) {
# Specify the name of each SOS/EOS metric
ID <- df.crit_id[k]
# Subset your df.crit dataframe and extract only the SOS/EOS stat of interest
df_k <- df.crit %>%
subset(., df.crit$CritThreshold == ID)
# Join the crit dates to your GPPsat data
df.dat_k <- df.dat %>%
left_join(df_k, by = c("process_year", "k_POSIXdate_plotting"))
# Extract the exact indexed row number for each SOS/EOS by process year
rows_k <- df.dat_k %>%
as.data.frame() %>%
subset(., df.dat_k$CritThreshold == ID) %>%
rownames() %>%
as.numeric()
# Create a vector of starting row positions for each SOS/EOS estimate w/buffer (this is the row position of SOS/EOS minus the length of the buffer)
rows_k_start <- rows_k - buffer
# Create an empty vector to store the row names as a string
rows_k_wbuffer <- vector()
# For each starting position (i.e. the critical date minus the buffer length), generate a sequence of numbers that is twice the length of the buffer, plus 1.
# So if you have a 10 day buffer, this will generate the row positions of the SOS/EOS date +/- 10 rows around it.
for (m in seq_along(rows_k_start)) {
rows_k_wbuffer[m] = paste(seq(rows_k_start[m], length.out = (buffer*2)+1), collapse = ":")
}
# Subset your data within the buffer range of SOS/EOS "k" for each process_year
df.dat_k_wbuffer <- df.dat_k[c(unlist(strsplit(rows_k_wbuffer, split = ":"))),]
# Summarize the number of actual "y" datapoints within each buffer window
summary_k <- df.dat_k_wbuffer %>%
group_by(process_year) %>%
summarize_at(vars(y), list(~sum(!is.na(.))))
# Calculate the percentage of available data within each buffer window
summary_k[,sprintf("pct_avail_aroundCritDate", ID)] <- summary_k[,"y"]/((2*buffer)+1)
# Provide a column name for the df.crit IDs
summary_k[,sprintf("%s", "CritThreshold")] <- ID
# Drop the no.datapoints column
summary_k <- summary_k %>% select(!y)
# Append this summary table to your list according to ID
list_k[[ID]] <- summary_k
}
# Take your list elements and bind them into a single dataframe
df <- bind_rows(list_k, .id = "CritThreshold")
# Calculate z-scores for each SOS/EOS estimate
df_zscore <- df.crit[complete.cases(df.crit),] %>% # Note: we want to eliminate rows that do not have an estimate for SOS/EOS
arrange(.[,"CritThreshold"], process_year) %>%
mutate(fracyr = decimal_date(k_POSIXdate_plotting),
fracyr_null = fPOSIX_to_fracyr_null(k_POSIXdate_plotting, process_year)) %>%
group_by(CritThreshold) %>%
# After grouping by process year and CritThreshold value, we want to calculate z-scores using fracyr_null as the datetime coordinate
mutate(mean_fracyr_null = mean(fracyr_null), Z = (fracyr_null - mean(fracyr_null))/sd(fracyr_null)) %>%
select(!mean_fracyr_null)
# Join your two dfs (z-scores and % data availability around SOS/EOS estimates) to your df.crit dataframe
df.crit <- df.crit %>%
left_join(df, by = c("process_year", "CritThreshold")) %>%
left_join(select(df_zscore, process_year, CritThreshold, Z), by = c("process_year", "CritThreshold"))
return(df.crit)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.