Nothing
#' Sample time series data
#'
#' Sample time series using completely at random (MCAR) or at random (MAR)
#'
#' @param datin input numeric vector
#' @param smps chr string of sampling type to use, options are \code{"mcar"} or \code{"mar"}
#' @param repetition numeric for repetitions to be done for each missPercent value
#' @param b numeric indicating the total amount of missing data as a percentage to remove from the complete time series
#' @param blck numeric indicating block sizes as a proportion of the sample size for the missing data
#' @param blckper logical indicating if the value passed to \code{blck} is a proportion of \code{missper}, i.e., blocks are to be sized as a percentage of the total size of the missing data
#' @param plot logical indicating if a plot is returned showing the sampled data, plots only the first repetition
#'
#' @return Input data with \code{NA} values for the sampled observations if \code{plot = FALSE}, otherwise a plot showing the missing observations over the complete dataset.
#'
#' The missing data if \code{smps = 'mar'} are based on random sampling by blocks. The start location of each block is random and overlapping blocks are not counted uniquely for the required sample size given by \code{b}. Final blocks are truncated to ensure the correct value of \code{b} is returned. Blocks are fixed at 1 if the proportion is too small, in which case \code{"mcar"} should be used. Block sizes are also truncated to the required sample size if the input value is too large if \code{blckper = FALSE}. For the latter case, this is the same as setting \code{blck = 1} and \code{blckper = TRUE}.
#'
#' For all cases, the first and last observation will never be removed to allow comparability of interpolation schemes. This is especially relevant for cases when \code{b} is large and \code{smps = 'mar'} is used. For example, \code{method = na.approx} will have rmse = 0 for a dataset where the removed block includes the last n observations. This result could provide misleading information in comparing methods.
#'
#' @export
#'
#' @import dplyr ggplot2
#'
#' @examples
#' a <- rnorm(1000)
#'
#' # default sampling
#' sample_dat(a)
#'
#' # use mar sampling
#' sample_dat(a, smps = 'mar')
#'
#' # show a plot of one repetition
#' sample_dat(a, plot = TRUE)
#'
#' # show a plot of one repetition, mar sampling
#' sample_dat(a, smps = 'mar', plot = TRUE)
#'
#' # change plot aesthetics
#' library(ggplot2)
#' p <- sample_dat(a, plot = TRUE)
#' p + scale_colour_manual(values = c('black', 'grey'))
#' p + theme_minimal()
#' p + ggtitle('Example of simulating missing data')
sample_dat <- function(datin, smps = 'mcar', repetition = 10, b = 10, blck = 50, blckper = TRUE, plot = FALSE){
# sanity checks
if(!smps %in% c('mcar', 'mar'))
stop('smps must be mcar or mar')
# upper sample value
upp <- length(datin) -1
# sample to take for missing data given x
pool <- 2:upp
torm <- round(length(datin) * b/100)
out <- vector('list', length = repetition)
# sampling complately at random
if(smps == 'mcar'){
# label for plot
lab <- paste0('b = ', b, ', smps = "', smps, '"')
for(i in 1:repetition){
c <- sample(pool, torm, replace = FALSE)
datsmp <- datin
datsmp[c] <- NA
out[i] <- data.frame(datsmp)
}
}
# sampling at random
if(smps == 'mar'){
# label for plot
lab <- paste0('b = ', b, ', smps = "', smps, '", blck = ', blck, ', blckper = ', blckper)
# block size
if(blckper){
if(blck > 100 | blck < 0)
stop('block must be between 0 - 100 if blckper = T')
blck <- pmax(1, round(torm * blck/100))
} else {
if(blck < 1)
stop('block must be at least one if blckper = F')
blck <- pmin(torm, blck)
}
# get number of samples for initial grab
blck_sd <- floor(torm/blck)
# create samples for each repetition
for(i in 1:repetition){
# pool is the number of obs up to the max minus blck size
# ensures that those on the right do not overlap the end
pool <- 2:(length(datin) - blck + 1)
# initial grab
grbs <- sample(pool, blck_sd, replace = F) %>%
sapply(function(x) x:(x + blck - 1)) %>%
c %>%
unique %>%
.[. <= upp] %>%
sort
# adjust sampling pool and number of samples left
pool <- 2:upp %>%
.[!. %in% grbs]
lft <- torm - length(grbs)
# continue sampling one block at a time until enough samples in missper
while(lft > 0){
# take one sample with block size as minimum between block of samples left
grbs_tmp <- sample(pool, 1, replace = F) %>%
.:(. + pmin(lft, blck) - 1)
# append new sample to initial grab
grbs <- c(grbs, grbs_tmp) %>%
unique %>%
sort %>%
.[. <= upp]
# update samples left and sample pool
lft <- torm - length(grbs)
pool <- pool[!pool %in% grbs]
}
# append for each repetition
datsmp <- datin
datsmp[grbs] <- NA
out[i] <- data.frame(datsmp)
}
}
# outplot for plot, otherwise return sampled data
if(plot){
miss <- is.na(out[[1]])
toplo <- data.frame(x = 1:length(datin), y = datin, col = 'Retained', stringsAsFactors = FALSE)
toplo$col[miss] <- 'Removed'
p <- ggplot(toplo, aes(x = x, y = y, shape = col, colour = col)) +
scale_shape_manual(values = c(21, 16)) +
scale_colour_manual(values = c('black', '#00BFC4')) +
geom_point(alpha = 0.75) +
theme_bw() +
theme(
legend.title = element_blank(),
plot.title = element_text(size = 12),
legend.position = 'top',
legend.key = element_blank()
) +
ggtitle(lab)
return(p)
}
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.