#' Average raw hourly data by year or month-year
#'
#' @param df Data frame containing the variables \code{date}, \code{site_code}, \code{value}.
#'
#' @param avg.time Character vector specifying the time period over which to average the data. Option are 'year' or
#' 'month' (to return the annual/monthly average)
#'
#' @param statistic The statistic to use for averaging. Options are 'mean' or 'median'.
#'
#' @param sites Logical indicating whether averaging should be grouped by site (TRUE) or data should be averaged over all sites (FALSE).
#'
#' @return Data frame of annual/monthly average concentration data
average_data <- function(df, avg.time = "year", statistic = "mean", sites = FALSE, date.format = FALSE){
if(nrow(df) > 1){
df.out <- df %>%
dplyr::mutate(date = if(avg.time == "year"){
lubridate::year(date)
} else if(avg.time == "month"){
format(lubridate::date(date), "%Y-%m")
} else if(avg.time == "day"){
lubridate::date(date)
}) %>%
dplyr::group_by(date) %>%
dplyr::mutate(n = length(unique(site_code))) %>%
dplyr::ungroup()
if(sites == FALSE){
df.out <- df.out %>%
dplyr::group_by(date) %>%
dplyr::summarise(av_value = if(statistic == "median"){
median(value, na.rm = TRUE)}
else if(statistic == "mean"){
av_value = mean(value, na.rm = TRUE)},
n = unique(n)) %>%
dplyr::ungroup()
} else{
df.out <- df.out %>%
dplyr::group_by(date, site_code) %>%
dplyr::summarise(av_value = if(statistic == "median"){
median(value, na.rm = TRUE)}
else if(statistic == "mean"){
av_value = mean(value, na.rm = TRUE)},
n = unique(n)) %>%
dplyr::ungroup()
}
if(date.format == TRUE){
df.out <- df.out %>%
dplyr::mutate(date = if(avg.time == "year"){
paste0(date, "-01-01")
} else if(avg.time == "month"){
paste0(date, "-01")
} else if(avg.time == "day"){
date
}) %>%
dplyr::mutate(date = lubridate::ymd(date))
}
return(df.out)
} else{
df.out <- data.frame("date" = as.POSIXct(character()),
"av_value" = numeric(),
"n" = numeric())
return(df.out)
}
}
#' Calculate average pollutant ratios using linear model
#'
#' @param df.list List of data frames of observation data for each pollutant in the ratio. First element of list should be
#' data frame for the pollutant which is the numerator in the ratio (second element should be for the denominator pollutant).
#' Both data frames should contain the variables \code{date}, \code{site_code}, \code{value}.
#'
#' @param avg.time Character vector specifying the time period over which to average the data. Option are 'year' or
#' 'month' (to return the annual/monthly average)
#'
#' @param statistic The statistic to use for averaging. Options are 'mean' or 'median'.
#'
#' @param sites Logical indicating whether averaging should be grouped by site (TRUE) or data should be averaged over all sites (FALSE).
#'
#' @return Data frame of annual/monthly average concentration data for the pollutant ratio
calculate_pollutant_ratio <- function(df.list, avg.time, statistic, sites = FALSE){
obs.num <- df.list[[1]]
obs.den <- df.list[[2]]
obs.num$variable <- "var1" # var1 = variable on numerator of ratio
obs.den$variable <- "var2" # var2 = variable on denominator of ratio
sites.int <- intersect(unique(obs.num$site_code), unique(obs.den$site_code)) # sites measuring both var1 and var2
# Create var1/var2 variable (coefficient of the linear model of var1 hourly concentration as a function of
# var2 hourly concentration)
obs <- obs.num %>%
dplyr::bind_rows(obs.den) %>% # row-bind observations data frames of var1 and var2
dplyr::distinct(site_code, date, variable, .keep_all = TRUE) %>%
dplyr::filter(site_code %in% sites.int) %>% # filter to only include sites with both var1 and var2 measurements
tidyr::spread(variable, value) %>%
dplyr::mutate(date = if(avg.time == "year"){ # convert date to year/year-month
lubridate::year(date)
} else if(avg.time == "month"){
format(lubridate::date(date), "%Y-%m")
} else if(avg.time == "week"){
format(paste0(lubridate::year(date), "-", lubridate::week(date)))
} else if(avg.time == "day"){
lubridate::date(date)
}) %>%
dplyr::group_by(site_code) %>%
dplyr::filter(!(all(is.na(var1))) & !(all(is.na(var2)))) %>% # remove NA values
dplyr::group_by(date) %>%
dplyr::mutate(n = length(unique(site_code))) %>% # number of sites in each avg.time
dplyr::ungroup() %>%
dplyr::group_by(date,n, site_code) %>% # calculate linear model of var1 as function of var2 for each year
do(model = try(lm(var1 ~ var2, data = .), silent = TRUE)) %>%
dplyr::mutate(av_value = tryCatch({summary(model)$coeff[2]
}, error = function(e){
NA}),
n = unique(n)) %>% # extract gradient of linear model as var1/var2
dplyr::select(-model)
if(sites == FALSE){
obs <- obs %>%
dplyr::group_by(date) %>%
dplyr::summarise(av_value = if(statistic == "median"){
median(av_value, na.rm = TRUE)}
else if(statistic == "mean"){
av_value = mean(av_value, na.rm = TRUE)},
n = unique(n))
}
obs <- obs %>%
dplyr::mutate(date = if(avg.time == "year"){
paste0(date, "-01-01")
} else if(avg.time == "month"){
paste0(date, "-01")
} else if(avg.time == "day"){
date
}) %>%
dplyr::mutate(date = lubridate::ymd(date))
return(obs)
}
#' Check that a character string is in correct date format
#'
#' @param date.string The character string of the date to be tested for whether the format is correct (\%Y-\%m-\%d)
#'
#' @return Logical value indicating whether the string is in the form YYYY-mm-dd (TRUE) or not (FALSE)
check_date_format <- function(date.string){
# Checks that the input character string is in the format %Y-%m-%d
out <- stringr::str_detect(date.string, "[0-2][0-9][0-9][0-9]-[0-1][0-9]-[0-3][0-9]")
return(out)
}
#' Plot average trends in pollutant concentration
#'
#' @param df Data frame of average pollutant concentrations containing the variables \code{date}, \code{site_code}, \code{av_value}.
#'
#' @param sites Logical indicating whether to plot trends by site (TRUE) or average trend over all sites (FALSE).
#'
#' @param poll The pollutant for which to plot the trends.
#'
#' @param stat The statistic which was used for averaging (for the y axis label)
#'
#' @param smooth.method The smoothing method to use in the \code{geom_smooth} function when plotting trends.
#'
#' @param start.date,end.date A string in the format "YYYY-MM-DD" specifying the starting date and the ending date for the trends.
#'
#' @return A plot of the average trend in \code{poll} concentration over the period \code{start.date} - code{end.date} either
#' for each monitoring site or averaged over all monitoring sites
trends_plots_helper <- function(df, sites, poll, stat, start.date, end.date){
if(nrow(df) <= 5){
smooth.method <- "gam"
} else{
smooth.method <- "loess"
}
plot.colour <- viridis::inferno(1, begin=0.3, end=0.8)
plot <- ggplot2::ggplot(df, ggplot2::aes(x = date, y = av_value)) +
ggplot2::geom_point() +
ggplot2::xlab("Year") +
ggplot2::ylab(openair::quickText(paste0(stringr::str_to_title(stat), " ", poll, " concentration (ugm-3)"))) +
ggplot2::theme_minimal()
if(sites == TRUE){
plot <- plot +
ggplot2::geom_smooth(size = 2,
method = smooth.method,
color = "black") +
ggplot2::geom_smooth(ggplot2::aes(color = site_code), se = FALSE, show.legend = FALSE,
method = smooth.method,
span = 1, size = 0.5) +
viridis::scale_colour_viridis(option = "inferno", begin = 0.3, end = 0.8, discrete = TRUE)
} else{
plot <- plot +
ggplot2::geom_smooth(method = "loess",
color = plot.colour,
fill = plot.colour,
alpha=0.25) +
ggplot2::geom_text(ggplot2::aes(label = n), vjust = 1.3, size = 3)
}
return(plot)
}
#' Check function arguments
#'
#' @param obs,pollutant,window.width,stat,start.date,end.date,data.capture,smooth.method,parallel Pass the arguments to the
#' main function directly to the \code{check_arguments} function for checking.
#'
#' @return If one of the arguments violates the argument requirements, will return an informative error. Otherwise, the function
#' does nothing.
check_arguments <- function(obs = NULL,
pollutant = NULL,
window.width = NULL,
stat = NULL,
start.date = NULL,
end.date = NULL,
parallel = NULL,
avg.ts = NULL){
# 1. obs
if(!(is.null(obs))){
if(!(stringr::str_detect(pollutant, "/"))){
if(!(all(c("date", "site_code", "value") %in% colnames(obs)))){
stop("Required columns missing from data frame. Please ensure 'obs' input has the following columns:'site_code', 'date', 'value'.")
}
} else{
if(!(is.list(obs))){
stop("For this pollutant, the input data frame 'obs' must be a list of data frames of observations data for each pollutant in the ratio (in the order of ratio notation).")
}
if(!(all(c("date", "site_code", "value") %in% colnames(obs[[1]]))) & all(c("date", "site_code", "value") %in% colnames(obs[[2]]))){
stop("The input data frames in the 'obs' list must have the following columns: 'site_code', 'date', 'value'.")
}
}
}
# 2. pollutant
if(!(is.null(pollutant))){
if(!(is.character(pollutant))){
stop("The 'pollutant' argument must be a character string specifying the name of the pollutant of interest.")
}
}
# 4. stat
if(!(is.null(stat))){
if(!(stat %in% c("median", "mean"))){
stop("The 'stat' argument must be one of the following: 'median', 'mean'.")
}
}
# 5. start/end.date
if(!(is.null(start.date)) & !(is.null(end.date))){
if(!(check_date_format(start.date) & check_date_format(end.date))){ # pattern matching to check input dates are in correct format
stop("The 'start.date' and 'end.date' arguments must be character strings in the format '%Y-%m-%d'.")
}
}
# 8. parallel
if(!(is.null(parallel))){
if(!(is.logical(parallel))){
stop("The 'parallel' argument must be a logical.")
}
}
# 9. avg.ts
if(!(is.null(avg.ts))){
if(!(avg.ts %in% c("year", "month"))){
stop("The 'avg.ts' argument must be one of the following: 'year', 'month'.")
}
}
}
#' Extract site codes of monitoring sites open throughout the duration of the moving window
#'
#' @param df Data frame of observations data from which to extract site codes
#'
#' @param resolution The time resolution for which to apply filtering. I.e. if \code{resolution} is "month", the function
#' will select and return monitoring sites measuring over a certain threshold of months during the moving window period.
#' Options: "month", "day"
#'
#' @param data.capture The data capture threshold to apply to filtering the sites. A value of 1 means that the site must
#' have measurements for every month/day within the moving window, while a value of 0 means no filtering takes place at all.
#' A value of 0.9 means that the site must have measurements for at least 90% of the months/days spanned by the moving window.
#'
#' @return A vector of site codes for monitoring sites open for the entire duration of the moving window (as specified by
#' the filtering parameters in the arguments).
sites_open_throughout_window <- function(df, resolution = "month", data.capture = 1){
if(resolution == "month"){
dates.window <- unique(format(as.Date(df$date), "%Y-%m"))
} else if(resolution == "day"){
dates.window <- unique(lubridate::date(df$date))
}
# Vector of sites open over entire duration of moving window (i.e. measuring in each month of window)
open.sites <- df %>%
dplyr::mutate(date = if(resolution == "month"){
format(as.Date(date), "%Y-%m")
} else if(resolution == "day"){
lubridate::date(date)
}) %>%
dplyr::select(date, site_code) %>%
dplyr::distinct() %>%
dplyr::group_by(site_code) %>%
dplyr::summarise(n = n()) %>%
dplyr::ungroup() %>%
dplyr::filter(n >= (length(dates.window))*data.capture) %>%
dplyr::select(site_code) %>%
dplyr::pull()
return(open.sites)
}
#' Return the site codes of the 'long term sites' (recorded measurements during every year of the period specified in the
#' function call with >= \code{data.capture} % data capture).
#'
#' @param df Data frame of observation data from which to get long term sites
#'
#' @param pollutant Pollutant of interest
#'
#' @param start,end Starting and ending year of period of interest (e.g. 2000, 2017)
#'
#' @param resolution Resolution at which to determine whether a site is long term. E.g. for a resolution of 'year', to qualify
#' as 'long term', a monitoring site must have measurements of the pollutant of interest in every year between \code{start}
#' and \code{end}. For a resolution of 'month', the site must have data measured during **each month** between \code{start}
#' and \code{end}.
#'
#' @param data.capture Data capture threshold to apply to 'long term sites' data. To qualify as a long term site,
#' the site must (in addition to having measurements in every year/month/day of the period) have >= \code{data.capture}
#' % data capture over the period of analysis (\code{start} - \code{end}). Default is 90% data capture.
#'
#' @return Vector of site codes of sites meeting the 'long term' criterion (measurements during every year/month/day of the
#' period of interest, and sufficient data capture over period of interest).
get_longterm_sites <- function(df, pollutant, start, end, resolution = "year", data.capture = 90){
start.date <- paste0(start, "-01-01")
end.date <- paste0(end, "-12-31")
if(resolution == "year"){
dates <- start:end
} else if(resolution == "month"){
dates <- seq.Date(as.Date(start.date), as.Date(end.date), by = "1 month") %>% format("%Y-%m")
} else if(resolution == "day"){
dates <- seq.Date(as.Date(start.date), as.Date(end.date), by = "1 day")
} else if(resolution == "hour"){
dates <- seq.POSIXt(lubridate::ymd(start.date, tz = "GMT"), lubridate::ymd(end.date, tz = "GMT"), by = "1 hour")
} else{
stop("The input time resolution is not supported. Supported resolutions are: 'day', 'month', 'year'.")
}
# Vector of sites open over entire duration of moving window (i.e. measuring in each month of window)
longterm.sites <- df %>%
dplyr::filter(date >= lubridate::ymd(start.date) & date <= lubridate::ymd(end.date),
variable == pollutant) %>%
dplyr::mutate(date = if(resolution == "month"){
format(as.Date(date), "%Y-%m")
} else if(resolution == "day"){
lubridate::date(date)
} else if(resolution == "year"){
lubridate::year(date)
}) %>%
dplyr::select(date, site_code) %>%
dplyr::distinct() %>%
dplyr::group_by(site_code) %>%
dplyr::mutate(longterm = dplyr::case_when(all(dates %in% date) ~ "yes",
TRUE ~ "no")) %>%
dplyr::ungroup() %>%
dplyr::filter(longterm == "yes") %>%
dplyr::select(site_code) %>%
dplyr::pull() %>%
unique()
longterm.data <- df %>%
filter(site_code %in% longterm.sites) %>%
data_capture(threshold = data.capture, start.date = start.date, end.date = end.date)
longterm.sites <- longterm.sites %>% .[. %in% unique(longterm.data$site_code)]
return(longterm.sites)
}
#' Define the n moving windows of the rolling regressions
#'
#' @param window.width Width of the moving window
#'
#' @param start.date,end.date Starting and ending dates of the entire period of analysis (e.g. "2000-01-01", "2017-12-31").
#'
#' @return Vector of starting dates of the n moving windows (in Date data type).
define_moving_windows <- function(window.width, obs, start.date, end.date, avg.ts){
start <- lubridate::ymd(start.date)
end <- lubridate::ymd(end.date)
if(is.list(obs)){
obs.all <- dplyr::bind_rows(obs)
} else{
obs.all <- obs
}
measurements <- obs.all %>%
dplyr::filter(!(is.na(value))) %>%
dplyr::mutate(year = lubridate::year(date)) %>%
dplyr::group_by(year) %>%
dplyr::mutate(n = nrow(.)) %>%
dplyr::ungroup() %>%
dplyr::select(year, n) %>%
dplyr::distinct() %>%
dplyr::filter(n > 0) %>%
dplyr::arrange(year)
start <- lubridate::ymd(paste0(max(c(lubridate::year(start), min(measurements$year)), na.rm = T), "-01-01"))
end <- lubridate::ymd(paste0(min(c(lubridate::year(end), max(measurements$year)), na.rm = T), "-12-31"))
# Define the n moving windows
if(avg.ts == "year"){
interval <- "1 year"
} else if(avg.ts == "month"){
interval <- "1 month"
}
window1 <- seq.Date(start, end, by = interval)
window1 <- window1[1:(length(window1) - (window.width-1))] # make sure that start date of final moving window is the time period end - (moving window width - 1)
return(window1)
}
#' Worker function for calculating rolling trend over a single window (i.e. one of the moving windows)
#'
#' @param d1 Date of the starting year of the moving window for which to calculate the rolling trend
#'
#' @param pollutant Pollutant for which to calculate rolling trend
#'
#' @param window.width Width of the moving window
#'
#' @param stat Statistic used to average data (mean, median)
#'
#' @import lubridate
#'
#' @return Dataframe of rolling trend for the moving window.
rolling_worker <- function(d1, obs, pollutant, window.width, stat, avg.ts, verbose){
# Define start, and and range of the moving window
time <- window.width - 1
start <- d1
end <- d1
if(avg.ts == "year"){
lubridate::year(end) <- lubridate::year(end) + time # create 'end' object to specify end of moving window where end = i + (n-1)
end <- end %>% lubridate::year() %>% paste0("-12-31") %>% lubridate::ymd()
} else if(avg.ts == "month"){
end <- end %m+% months(time)
end <- end %>%
format("%Y-%m") %>%
paste("-", lubridate::days_in_month(end), sep="") %>%
lubridate::ymd()
}
range <- paste0(as.character(start), " - ", as.character(end)) # date range of window
if(verbose == TRUE) print(paste0("Calculating rolling trend for the window: ", range))
empty.df <- data.frame(date = numeric(),
av_value = numeric(),
trend_difference = numeric(),
n = integer(),
moving_window = character(),
window_width = numeric(),
trend = numeric())
# Conditional branching - different averaging functions for pollutants and pollutant ratios
if(!(stringr::str_detect(pollutant, "/"))){
longterm.dat <- obs %>%
dplyr::filter(date >= start, date <= end) # filter to include dates within moving window
if(nrow(longterm.dat) > 0){
open.sites <- sites_open_throughout_window(longterm.dat,
resolution = ifelse(avg.ts == "year", "month", "day"),
data.capture = 1)
longterm.dat <- longterm.dat %>%
dplyr::filter(site_code %in% open.sites) # filter to only include sites open for entire duration of window
if(nrow(longterm.dat) == 0){
return(empty.df)
} else{
longterm.dat <- longterm.dat %>%
average_data(avg.time = avg.ts, statistic = stat, sites = FALSE, date.format = TRUE) # calculate annual average concentration
}
} else{
return(empty.df)
}
} else{
longterm.dat <- purrr::map(obs, function(x) x %>% dplyr::filter(date >= start, date <= end)) # filter to include dates within moving window)
if(all(sapply(longterm.dat, nrow) > 0)){ # check that neither data frame in list is empty
open.sites <- purrr::map(longterm.dat, sites_open_throughout_window,
resolution = ifelse(avg.ts == "year", "month", "day"),
data.capture = 1) %>%
{Reduce(intersect, .)}
longterm.dat <- purrr::map(longterm.dat, function(x) x[x$site_code %in% open.sites, ])
if(all(sapply(longterm.dat, nrow) == 0)){
return(empty.df)
} else{
longterm.dat <- longterm.dat %>%
calculate_pollutant_ratio(avg.time = avg.ts, statistic = stat)
}
} else{
return(empty.df)
}
}
longterm.dat <- longterm.dat %>%
dplyr::mutate(moving_window = range, # moving window variable (date range and number of sites)
window_width = window.width) # window width variable (n)
# Rolling regression
if(nrow(longterm.dat) > 1){
# Convert dates to different scale (relative to start of window - i.e. start from 1 = start)
if(avg.ts == "year"){
start.yr <- lubridate::year(start)
longterm.dat <- longterm.dat %>%
dplyr::mutate(x = lubridate::year(date) - start.yr)
} else if(avg.ts == "month"){
seq.months <- seq.Date(start, end, "1 month")
key <- data.frame("x" = 1:length(seq.months), "date" = seq.months)
longterm.dat <- dplyr::left_join(longterm.dat, key, by = "date")
}
# Linear regression
lm.trend <- lm(av_value ~ x, data = longterm.dat) # fit linear model (rolling regression)
trend <- as.numeric(lm.trend$coefficients[2]) # extract linear model coefficient (beta_i)
# Add regression coefficient to output data frame
longterm.dat <- longterm.dat %>%
dplyr::mutate(trend = rep(trend, times = nrow(longterm.dat))) %>% # append linear coefficient to data frame
dplyr::mutate(date = as.Date(date))
} else{
trend <- NA # if there are no long term sites, will return NA in the trend column
longterm.dat <- longterm.dat %>%
dplyr::mutate(trend = rep(trend, times = nrow(longterm.dat))) %>%
dplyr::mutate(moving_window = as.character(moving_window),
trend = as.numeric(trend)) %>% # mutate columns into same classes as those with data (for purrr::map_dfr rowbind operation with other moving window data)
dplyr::mutate(date = as.Date(date))
}
return(longterm.dat)
}
#' Filter observations data to only include sites with sufficient data capture
#'
#' @param data A data frame of observation data with the columns: site_code, date, value (i.e. the same as
#' the column names in the observations table of the air_quality_data database).
#'
#' @param threshold A value between 0-100 specifying the data capture threshold by which to filter the data
#'
#' @param start.date,end.date A string in the format "YYYY-MM-DD" specifying the starting date and the ending date between
#' which to measure data capture.
#'
#' @return Data frame of observation data filtered by data capture over the defined period
#'
#' @import dplyr
data_capture <- function(data, threshold, start.date = NULL, end.date = NULL) {
# Colnames case
colnames(data) <- tolower(colnames(data))
# Check function arguments
args <- c("site_code", "date", "value")
if(!(all(args %in% colnames(data)))){
stop("The input data frame must have the columns 'site code', 'date', 'value'.")
}
if(!is.numeric(threshold)){
stop("The 'threshold' argument must be numeric.")
}
if(!(threshold >= 0 & threshold <= 100)) {
stop("The 'threshold' argument must be between 0 and 100.")
}
# Check start.date and end.date
if(!(is.null(start.date))){
if(!is.character(start.date)){
stop("start.date (if specified) must be a character string.")
}
}
if(!(is.null(end.date))){
if(!is.character(end.date)){
stop("end.date (if specified) must be a character string.")
}
}
# Convert start.date and end.date arguments to datetime format
if(!(is.null(start.date))) start.date <- paste0(start.date, "00:00:00") %>% lubridate::ymd_hms()
if(!(is.null(end.date))) end.date <- paste0(end.date, "00:00:00") %>% lubridate::ymd_hms()
# Define find.time.interval functions from openair
find.time.interval <- function(dates) {
## could have several sites, dates may be unordered
## find the most common time gap in all the data
dates <- unique(dates) ## make sure they are unique
# work out the most common time gap of unique, ordered dates
id <- which.max(table(diff(as.numeric(unique(dates[order(dates)])))))
seconds <- as.numeric(names(id))
if ("POSIXt" %in% class(dates)) seconds <- paste(seconds, "sec")
if (class(dates)[1] == "Date") {
seconds <- seconds * 3600 * 24
seconds <- paste(seconds, "sec")
}
seconds
}
# Convert data capture to a number between 0 and 1
threshold <- threshold / 100
# Split data by site code
dat <- split(data, data$site_code)
fun <- function(data){
# if one line, just return
if (nrow(data) < 2) return(data)
## time zone of data
TZ <- attr(data$date, "tzone")
if (is.null(TZ)) TZ <- "GMT" ## as it is on Windows for BST
if(TZ == "") TZ <- "GMT"
## function to fill missing data gaps
## assume no missing data to begin with
## If start.date and end.date ARE NOT defined in function call, set them as the max and min dates
## of the date range in the data set (if they are defined, leave unchanged and pad data between the
## specified dates).
## pad out missing data
if(is.null(start.date)) start.date <- min(data$date, na.rm = TRUE)
if(is.null(end.date)) end.date <- max(data$date, na.rm = TRUE)
## interval in seconds
interval <- find.time.interval(data$date)
## equivalent number of days, used to refine interval for month/year
days <- as.numeric(strsplit(interval, split = " ")[[1]][1]) /
24 / 3600
## find time interval of data
if (class(data$date)[1] == "Date") {
interval <- paste(days, "day")
} else {
## this will be in seconds
interval <- find.time.interval(data$date)
}
## better interval, most common interval in a year
if (days == 31) interval <- "month"
if (days %in% c(365, 366)) interval <- "year"
## only pad if there are missing data
if (length(unique(diff(data$date))) != 1L) {
all.dates <- data.frame(date = seq(start.date, end.date, by = interval))
data <- data %>% dplyr::full_join(all.dates, by = "date")
}
# add missing site codes
data$site_code <- rep(unique(data$site_code[!is.na(data$site_code)]), times = nrow(data))
# if the data capture for this site code is < the specified data capture, delete it
if((sum(is.na(data$value))/length(data$value)) <= 1 - threshold) {data.out <- data
} else data.out <- data.frame()
return(data.out)
}
# Apply function to all data frames of different site codes and recombine
data.out <- list()
data.out <- purrr::map(dat, fun)
data <- dplyr::bind_rows(data.out)
# Remove NA values from 'values' column
if(nrow(data) > 0) data <- data[!is.na(data$value), ]
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.