Nothing
#' Sample a daily time series at a set frequency
#'
#' Sample a daily water quality time series at a set monthly frequency
#'
#' @param dat_in input \code{\link[base]{data.frame}} that is returned from \code{\link{lnres_sim}} or \code{\link{all_sims}}
#' @param unit chr string indicating sampling unit, must be year, quarter, month, week, or yday for equivalent lubridate function
#' @param irregular logical indicating if monthly sampling is done randomly within each \code{unit}, otherwise the first value is returned
#' @param missper numeric from 0-1 indicating percentage of observations used for test dataset
#' @param blck numeric indicating block size for resampling test dataset, see details
#' @param blckper logical indicating if the value passed to \code{blck} is a proportion of \code{missper}, i.e., blocks are to be sized as a percentage of the total size of the missing data
#'
#' @details This function is intended for sampling a simulated daily time series of water quality that is returned by \code{\link{lnres_sim}} or \code{\link{all_sims}}.
#'
#' The \code{missper} argument is used to create a test dataset as a proportion of all observations in the sub-sampled output dataset. The test dataset is created with random block sampling appropriate for time series. Block sampling of the output dataset occurs until the number of unique observations is equal to the percentage defined by \code{missper}. Overlap of blocks are not doubly considered towards the observation counts to satisfy \code{missper}, i.e., sets of continuous observations longer than \code{blck} can be returned because of sampling overlap. Setting \code{blck = 1} and \code{blockper = FALSE} is completely random sampling for missing data. Values for \code{blck} must be 1 or greater if \code{blockper = FALSE} and 1 or less if \code{blckper = T}. If \code{blck = 1} and \code{blckper = T}, the missing data will be one continuous block.
#'
#' @return Original data frame with rows subset based on number of desired monthly samples. If \code{missper > 0}, a list is returned where the first element is the index values for the test dataset and the second is the complete subsampled dataset.
#'
#' @export
#'
#' @import dplyr
#'
#' @seealso \code{\link{lnres_sim}}, \code{\link{all_sims}}
#'
#' @examples
#' \dontrun{
#' ## example data
#' data(daydat)
#'
#' ## simulate
#' tosamp <- all_sims(daydat)
#'
#' ## sample
#' samp_sim(tosamp)
#'
#' ## sample and create test dataset
#' # test dataset is 30% size of monthly subsample using block sampling with size = 4
#' samp_sim(tosamp, missper = 0.3, blck = 4)
#' }
samp_sim <- function(dat_in, unit = 'month', irregular = TRUE, missper = 0, blck = 1, blckper = FALSE){
# sanity check
if(!unit %in% c('year', 'quarter', 'month', 'week', 'yday'))
stop('unit must year, quarter, month, week, or yday')
# get sampling unit
uni_str <- paste0('lubridate::', unit, '(dat_in$date)')
units <- eval(parse(text = uni_str))
# get indices in dat_in for sampling, only if not by day
if(unit != 'yday'){
inds <- data.frame(inds = 1:nrow(dat_in), year = lubridate::year(dat_in$date), units = units) %>%
group_by(year, units) %>%
summarize(inds = ifelse(irregular, sample(inds, 1), inds[1])) %>%
.$inds
} else {
inds <- 1:nrow(dat_in)
}
# output
out <- dat_in[inds, ]
# create missing data with random block sampling if missing percentage greater than zero
if(missper > 0){
# sanity checks
if(missper > 1)
stop('missper must be between 0 - 1')
if(blck < 1 & !blckper)
stop('block must be at least one if blckper = F')
if(blck > 1 & blckper)
stop('block must be less than one if blckper = T')
# number of samples to get and sample pool
torm <- round(nrow(out) * missper)
pool <- 1:nrow(out)
# vectorized sampling for blk = 1
if(blck == 1 & blckper == FALSE){
smps <- sample(pool, torm, replace = FALSE) %>%
sort
# otherwise start creating blocks
} else {
# change block proportion to counts if blckper is T
if(blckper)
blck <- round(torm * blck)
# get number of samples for initial grab
blck_sd <- floor(torm/blck)
# initial grab
smps <- sample(pool, blck_sd, replace = F) %>%
sapply(function(x) x:(x + blck - 1)) %>%
c %>%
unique %>%
.[. <= nrow(out)] %>%
sort
# adjust sampling pool and number of samples left
pool <- pool[!pool %in% smps]
lft <- torm - length(smps)
# continue sampling one block at a time until enough samples in missper
while(lft > 0){
# take one sample with block size as minimum between block of samples left
smps_tmp <- sample(pool, 1, replace = F) %>%
.:(. + pmin(lft, blck) - 1)
# append new sample to initial grab
smps <- c(smps, smps_tmp) %>%
unique %>%
sort %>%
.[. <= nrow(out)]
# update samples left and sample pool
lft <- torm - length(smps)
pool <- pool[!pool %in% smps]
}
}
# out is now a list with block samples and subsampled data
out <- list(smps = smps, alldat = out)
}
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.