#' Calculate completeness as percentage of non-NA values
#'
#' @param x vector of values
#' @param logical if TRUE, x must be a logical vector with FALSE representing NA
#'
#' @return completeness in percent rounded to one decimal place
#' @export
percent_complete <- function(x,logical=FALSE) {
# mapping to logical: FALSE means NA, TRUE means DATA
if (!logical) {
x <- !is.na(x)
}
round(sum(x) / length(x) * 100 ,1)
}
#' Assert completeness by threshold in percent of of non-NA values
#'
#' @param x vector of values
#' @param na.rm completeness percentage below which the input should be considered NA
#'
#' @return TRUE if less than na.rm percent of values are NA, FALSE otherwise
#' @export
#'
is_complete <- function(x,na.rm) {
if (na.rm < 100) {
return(percent_complete(x) >= na.rm)
} else {
return(!anyNA(x))
}
}
#' Calculate Longest Contiguous Stretch of NAs
#'
#' @param x vector of values
#' @param logical if TRUE, x must be a logical vector with FALSE representing NA
#'
#' @return length of longest contiguous Stretch of NAs
#' @export
#'
#' @importFrom magrittr "%>%"
longest_gap <- function(x,logical=FALSE) {
# mapping to logical: FALSE means NA, TRUE means DATA
if (!logical) {
x <- !is.na(x)
}
if (all(x)) {
return(0)
} else {
# invert NA/DATA so na.contiguos returns longest gap
return(x %>% na_if(TRUE) %>% na.contiguous() %>% length)
}
}
#' Calculate Gap Statistics for different time ranges
#'
#' Given a univariate daily time series, the following statistics are computed for the specified date ranges:
#' * Completeness Percentage, daily (c_d): Percentage of non-NA values in the time series
#' * Completeness Percentage, monthly (c_m): Percentage of non-NA values after aggregating to monthly
#' * Longest Gap,monthly (l_g): Longest consecutive stretch of NA values after aggregating to monthly
#'
#' The time ranges are specified in years as "start-end", with "2000-2009" meaning from 2000-01-01 to 2009-12-31.
#'
#' @param x daily timeseries as tibble with Date and data columns named with the stationcode
#' @param date_ranges character vector of date ranges
#' @param completeness_threshold completeness percentage below which a month is marked as NA when calculating monthly statistics (c_m, l_g)
#' @param md metadata table (tibble) with stationcode column
#'
#' @return the metadata table `md` with the following vars added: stationcode + c_d, c_m, l_g for each date range
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom lubridate make_date
#' @import zoo dplyr hydroTSM
gap_statistics <- function(x, date_ranges, completeness_threshold=100,md=cl_catchment_md) {
gaps_statistics_single <- function(x,date_range,completeness_threshold) {
range <- strsplit(date_range,"-")[[1]]
start <- lubridate::make_date(range[1])
end <- lubridate::make_date(range[2],12,31)
x %<>% dplyr::filter(Date >= start, Date <= end) %>%
as.data.frame() %>%
zoo::read.zoo()
x_m <- hydroTSM::daily2monthly(x,FUN=is_complete,na.rm=completeness_threshold)
compl_d <- apply(x, 2 ,percent_complete)
compl_m <- apply(x_m, 2, percent_complete, logical=TRUE)
lg_m <- apply(x_m, 2 ,longest_gap, logical = TRUE)
list_names <- paste(c("c_d","c_m","l_g"),date_range)
return(setNames(list(compl_d,compl_m,lg_m),list_names))
}
all_ranges <- lapply(date_ranges,gaps_statistics_single,x=x,
completeness_threshold=completeness_threshold)
dplyr::as_tibble(unlist(all_ranges,recursive = FALSE))%>%
dplyr::mutate(stationcode=colnames(x)[-1]) %>%
dplyr::left_join(md,.) %>%
dplyr::select(-starts_with("lc"),-starts_with("wu"),everything())
}
#' Filter on gap statistics for a given date range
#'
#' Extract rows matching the given gap statistics criteria created with [gap_statistics()].
#'
#' @param x tibble containing columns with gaps statistics (c_d, c_m and l_g)
#' @param date_range date range for the gap statistics, character formatted as "start-end" in years
#' @param c_d_min minimum daily completeness
#' @param c_m_min minimum monthly completeness
#' @param lg_max maximum gap length
#'
#' @return x filtered according to c_d_min, c_m_min and lg_max for the given date range
#' @export
#' @importFrom magrittr "%>%"
#' @import dplyr
gap_filter <- function(x,date_range, c_d_min = 80, c_m_min = 80, lg_max = 6) {
lg <- as.name(paste("l_g",date_range))
c_d <- as.name(paste("c_d",date_range))
c_m <- as.name(paste("c_m",date_range))
x %>% dplyr::filter((!!lg) <= lg_max,
(!!c_d) >= c_d_min,
(!!c_m) >= c_m_min)
}
#' Filter times series by stationcode and/or date range
#'
#' @param x daily timeseries (tibble) with Date and data columns named with the stationcode
#' @param md metadata table (tibble) with a stationcode column containing the stations to be extracted
#' @param date_range date range (character) formatted as "start-end" in years, optional
#'
#' @return x subset according to md and date_range (tibble)
#' @export
#'
#' @importFrom lubridate make_date
#' @import dplyr
filter_ts_from_md <- function(x, md=NULL, date_range=NULL) {
if (!is.null(date_range)) {
range <- strsplit(date_range, "-")[[1]]
start <- lubridate::make_date(range[1])
end <- lubridate::make_date(range[2], 12, 31)
x <- dplyr::filter(x, Date >= start, Date <= end)
}
if (!is.null(md)) {
sc <- as.character(pull(md, stationcode))
x <- dplyr::select(x, Date, one_of(sc))
}
return(x)
}
#' Write out multivariate time series into separate files per variable
#'
#' If a metadata table is given, only the stationcodes present in the stationcode column will be written out.
#' If a date range is given, the time series is first subset accordingly (e.g. "2000-2004" translates to 2000-01-01-2004-12-31),
#' and files will be written into a sub-directory of `outdir` named by `date_range`.
#'
#' @param x daily timeseries (tibble) with Date and data columns named with the stationcode
#' @param outdir directory path to write
#' @param md metadata table (tibble) with a stationcode column containing the stations to be used, optional
#' @param date_range date range (character) formatted as "start-end" in years, optional
#'
#' @return invisible, writes variables of `x` into seperate files named <stationcode>.csv (delimited by space) into `outdir`.
#' @export
#' @import dplyr readr
write_univariate <- function(x, outdir, md=NULL, date_range=NULL) {
write_single <- function(x, sc, outdir) {
outfile <- file.path(outdir, paste0(sc,".csv"))
sc <- as.name(sc)
x <- dplyr::select(x, Date, !!sc)
readr::write_delim(x, outfile)
}
if (!is.null(date_range)) {
outdir <- file.path(outdir, date_range)
}
if (!dir.exists(outdir)) {
dir.create(outdir, recursive=TRUE)
}
x <- filter_ts_from_md(x, md, date_range)
if (!is.null(md)) {
sc <- as.character(dplyr::pull(md, stationcode))
} else {
sc <- colnames(x)[-1]
}
sapply(sc, write_single, x=x, outdir=outdir)
invisible()
}
#' Extract gap filtered time series
#'
#' The metadata table must have gap statistics columns matching the date range(s), as created by `gap_statistics`.
#'
#'
#' @param x x daily timeseries (tibble) with Date and data columns named with the stationcode
#' @param md metadata table (tibble) with a stationcode column containing the stations to be used
#' @param date_ranges date range(s) (character or character vector) formatted as "start-end" in years
#' @param c_d_min minimum daily completeness
#' @param c_m_min minimum monthly completeness
#' @param lg_max maximum gap length
#' @param excelfile file path to write excel table with metadata per date range as workbooks, optional
#' @param outdir directory path for saving timeseries into univariate files (see `write_univariate`), optional
#'
#' @return a list of two objects:
#' * streamflow: a list of daily time series, named by date ranges
#' * metadata: a list of metadata tables, named by date ranges
#' @export
#'
#' @import openxlsx
extract_ts <- function(x=cl_streamflow_mm, md, date_ranges, c_d_min = 80,
c_m_min = 80, lg_max = 6, excelfile=NULL, outdir=NULL){
# list of metadata tables with the selected stations per date range
md_ranges_filtered <- sapply(date_ranges, gap_filter, x = md, lg_max = lg_max,
c_d_min = c_d_min, c_m_min = c_m_min, simplify = FALSE)
# create list of discharge timeseries with the selected stations for each date range
dc_filtered <- mapply(filter_ts_from_md,
md_ranges_filtered, names(md_ranges_filtered),
MoreArgs=list(x=x),SIMPLIFY=FALSE)
# export selected metadata per date range into excel workbooks
if (!is.null(excelfile)) {
md_excel <- md_ranges_filtered
md_excel$all<- md
openxlsx::write.xlsx(md_excel, excelfile, asTable=TRUE,
firstRow=TRUE, colWidths="auto")
}
# write out each time series into seperate files, seperate directory per date range
# can be easily read into zoo with: read.zoo(filepath,header=TRUE,drop=FALSE)
if (!is.null(outdir)) {
mapply(write_univariate,
md_ranges_filtered, names(md_ranges_filtered),
MoreArgs=list(x=x,outdir=file.path(outdir)))
}
return(list(streamflow=dc_filtered,metadata=md_ranges_filtered))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.