Nothing
#' Remove error values manually
#'
#' @description `remove_manually()` removes unreasonable values manually by
#' indicating specific timestamps.
#'
#' @inheritParams n_valid
#' @param vctr_time A timestamp vector of class POSIXct or POSIXlt.
#' @param vctr_target A vector of a targeted time series to be checked. The
#' length of the time series must be the same as that of `vctr_time`.
#' @param vctr_time_err A timestamp vector of class POSIXct or POSIXlt,
#' indicating specific error timings.
#'
#' @returns
#' A vector of cleaned time series. The length of the time series is the same
#' as the input time series. The data points at the indicated time points by
#' `vctr_time_err` are replaced with the error label specified in `label_err`.
#'
#' @examples
#' ## Load data
#' data(dt_noisy)
#' time <- dt_noisy$time[12097:14400]
#' target <- dt_noisy$dt[12097:14400]
#' time_err <- seq(as.POSIXct("2013/06/27 18:00", tz = "Etc/GMT-8"),
#' as.POSIXct("2013/06/27 22:30", tz = "Etc/GMT-8"),
#' by = "30 min")
#'
#' ## Remove error values
#' result <-
#' remove_manually(vctr_time = time, vctr_target = target,
#' vctr_time_err = time_err)
#'
#' @author Yoshiaki Hata
#'
#' @export
remove_manually <-
function(vctr_time, vctr_target, vctr_time_err, label_err = -9999) {
## avoid "No visible binding for global variable" notes
time <- NULL
target <- NULL
target_cleaned <- NULL
df <- data.frame(time = vctr_time,
target = vctr_target)
df %>%
dplyr::transmute(target_cleaned = ifelse(time %in% vctr_time_err,
label_err, target)) %>%
dplyr::pull(target_cleaned) %>%
return()
}
#' Remove outliers by absolute limits
#'
#' @description `check_absolute_limits()` removes out-of-range values by
#' setting lower and upper limits.
#'
#' @inheritParams n_valid
#' @param vctr_target A vector of a targeted time series to be checked.
#' @param thres_al_min A threshold value for the input time series to define
#' the lower limit. Default is 3.0. The data points with values below the
#' threshold are considered outliers and removed. The unit of the threshold
#' must match that of the input time series.
#' @param thres_al_max A threshold value for the input time series to define
#' the upper limit. Default is 50.0. The data points with values above the
#' threshold are considered outliers and removed. The unit of the threshold
#' must match that of the input time series.
#'
#' @returns
#' A vector of cleaned time series. The length of the time series is the same
#' as the input time series. The data points with values below `thres_al_min`
#' or above `thres_al_max` are replaced with the error label specified in
#' `label_err`.
#'
#' @examples
#' ## Load data
#' data(dt_noisy)
#' target <- dt_noisy$dt
#'
#' ## Remove out-of-range values
#' result <- check_absolute_limits(vctr_target = target)
#'
#' @author Yoshiaki Hata
#'
#' @export
check_absolute_limits <-
function(vctr_target, thres_al_min = 3.0, thres_al_max = 50.0,
label_err = -9999) {
## avoid "No visible binding for global variable" notes
. <- NULL
vctr_target %>%
dplyr::na_if(label_err) %>%
ifelse(. < thres_al_min | . > thres_al_max, label_err, .) %>%
tidyr::replace_na(label_err) %>%
return()
}
#' Modify short-term drifts
#'
#' @description `modify_short_drift()` corrects short-term drifts by adjusting
#' the 5th and 95th percentiles of the drifted time series to those of
#' the reference time series.
#'
#' @details
#' The short-term drift correction is to correct sudden changes in the average
#' in the time series over a short period (hours to days) specified by
#' `vctr_time_drft_head` and `vctr_time_drft_tail`. Multiple short-term drifts
#' can be corrected at once using this function.This procedure uses a reference
#' period, which is defined to consist of the three days (default) before and
#' after the occurrence of the anomaly. Then, the anomalous time series is
#' standardized so that the 5th and 95th percentile values of the anomalous and
#' reference (non-anomalous) time series match over this period. These
#' percentile values are used instead of the maximum and minimum values to
#' ensure robustness against possible outliers in the original or reference
#' time series.
#'
#' @inheritParams get_interval
#' @inheritParams remove_manually
#' @param vctr_time_drft_head A timestamp vector of class POSIXct or POSIXlt,
#' indicating when each drift starts.
#' @param vctr_time_drft_tail A timestamp vector of class POSIXct or POSIXlt,
#' indicating when each drift ends. The length of the time series must be the
#' same as that of `vctr_time_drft_head`.
#' @param n_day_ref A positive integer representing the number of days to be
#' referenced before and after the anomaly period. Default is 3 (days).
#'
#' @returns
#' A vector of the drift-corrected time series. The length of the time series
#' is the same as the input time series.
#'
#' @examples
#' ## Load data
#' data(dt_noisy)
#' time <- dt_noisy$time[11931:12891]
#' target <- dt_noisy$dt[11931:12891]
#' time_drft_head <- time[1]
#' time_drft_tail <- time[148]
#'
#' ## Correct a short-term drift
#' result <-
#' modify_short_drift(vctr_time = time, vctr_target = target,
#' vctr_time_drft_head = time_drft_head,
#' vctr_time_drft_tail = time_drft_tail)
#'
#' @author Yoshiaki Hata
#'
#' @include utils.R
#'
#' @importFrom rlang :=
#'
#' @export
modify_short_drift <-
function(vctr_time, vctr_target, vctr_time_drft_head, vctr_time_drft_tail,
n_day_ref = 3, label_err = -9999) {
## avoid "No visible binding for global variable" notes
. <- NULL
time <- NULL
dT <- NULL
dT_drft_mod <- NULL
dT_mod <- NULL
## check input values
interval_time <- get_interval(vctr_time)
timezone <- lubridate::tz(vctr_time[1])
data_dT <- data.frame(time = vctr_time,
dT = vctr_target,
dT_mod = vctr_target)
if(length(vctr_time_drft_head) != length(vctr_time_drft_tail)) {
stop("Drift periods must be specified by both start and end time stamps")
}
for (i in 1:length(vctr_time_drft_head)) {
colname_drft <- paste0("dT_drft_", i)
colname_ref <- paste0("dT_ref_", i)
data_dT <-
data_dT %>%
dplyr::mutate(!!colname_drft :=
ifelse(dplyr::between(time,
vctr_time_drft_head[[i]],
vctr_time_drft_tail[[i]]),
dT, label_err),
!!colname_ref :=
ifelse(dplyr::between(time,
vctr_time_drft_head[[i]] -
lubridate::days(n_day_ref),
vctr_time_drft_head[[i]] -
interval_time) |
dplyr::between(time,
vctr_time_drft_tail[[i]] +
interval_time,
vctr_time_drft_tail[[i]] +
lubridate::days(n_day_ref)),
dT, label_err)) %>%
dplyr::mutate(dplyr::across(tidyselect::where(is.numeric),
~dplyr::na_if(., label_err)))
## calculate 5th and 95th percentile for drift correction
vctr_drft <- data_dT %>% dplyr::pull(., !!colname_drft)
vctr_ref <- data_dT %>% dplyr::pull(., !!colname_ref)
q05_drft <-
stats::quantile(vctr_drft, 0.05, na.rm = TRUE, names = FALSE)
q95_drft <-
stats::quantile(vctr_drft, 0.95, na.rm = TRUE, names = FALSE)
q05_ref <- stats::quantile(vctr_ref, 0.05, na.rm = TRUE, names = FALSE)
q95_ref <- stats::quantile(vctr_ref, 0.95, na.rm = TRUE, names = FALSE)
## drift correction
data_dT <-
data_dT %>%
dplyr::mutate(dT_drft_mod =
(get(colname_drft) - q05_drft) /
(q95_drft - q05_drft) * (q95_ref - q05_ref) + q05_ref,
dT_mod =
ifelse(dplyr::between(time,
vctr_time_drft_head[[i]],
vctr_time_drft_tail[[i]]),
dT_drft_mod, dT_mod))
}
tidyr::replace_na(data_dT$dT_mod, label_err) %>% return()
}
#' Filter high frequency noise by Gaussian filter
#'
#' @description `filter_highfreq_noise()` filters a time series with a specific
#' period by convolving it with a Gaussian window, removing high-frequency
#' noise.
#'
#' @inheritParams modify_short_drift
#' @param vctr_time_noise A timestamp vector of class POSIXct or POSIXlt,
#' indicating when high-frequency noise exists in the targeted time series.
#' @param wndw_size_noise A positive integer indicating the number of data
#' points included in a moving Gaussian window for the high-frequency noise
#' filtering. The default is 13, meaning that the window size is 6.5 hours if
#' the time interval of the input timestamp is 30 minutes.
#' @param inv_sigma_noise A positive value defining a Gaussian window width for
#' the high-frequency noise filtering. The width of the Gaussian window is
#' inversely proportional to this parameter. Default is 0.01.
#'
#' @returns
#' A vector of the noise-filtered time series. The length of the time series is
#' the same as the input time series.
#'
#' @examples
#' ## Create data
#' time <- seq(as.POSIXct("2026/01/01"), length.out = 360, by = "1 day")
#' x <- seq(1, 360)
#' target <- sin(x / 180 * pi) + stats::rnorm(length(x), sd = 0.2)
#' time_noise <-
#' seq(as.POSIXct("2026/01/01"), as.POSIXct("2026/09/01"), by = "1 day")
#'
#' ## Filter noise
#' result <-
#' filter_highfreq_noise(vctr_time = time, vctr_target = target,
#' vctr_time_noise = time_noise)
#'
#' @author Yoshiaki Hata
#'
#' @importFrom rlang :=
#'
#' @export
filter_highfreq_noise <-
function(vctr_time, vctr_target, vctr_time_noise,
wndw_size_noise = 13, inv_sigma_noise = 0.01, label_err = -9999) {
## avoid "No visible binding for global variable" notes
dT <- NULL
dT_noise <- NULL
time <- NULL
dT_filtered <- NULL
dT_mod <- NULL
g_filter <- gsignal::gaussian(wndw_size_noise, inv_sigma_noise)
g_filter <- g_filter / sum(g_filter)
data_dT <- data.frame(time = vctr_time,
dT = vctr_target)
## interpolate data gaps temporarily for convolution
data_dT <-
data_dT %>%
dplyr::mutate(dT_noise = ifelse(dT == label_err, NA, dT),
dT_noise = zoo::na.approx(dT_noise, na.rm = FALSE)) %>%
tidyr::fill(dT_noise, .direction = "updown") %>%
dplyr::mutate(dT_filtered = gsignal::conv(dT_noise, g_filter,
shape = "same"),
dT_mod = ifelse(time %in% vctr_time_noise, dT_filtered, dT),
dT_mod = ifelse(dT == label_err, label_err, dT_mod))
return(data_dT$dT_mod)
}
#' Remove outliers by Z-score time series
#'
#' @description `remove_zscore_outlier()` detects and removes outlier values by
#' converting an original time series into a Z-score time series using a
#' moving window.
#'
#' @details
#' The input time series is standardized using a moving window, and the data
#' values are converted to Z-scores. In this step, the width of the moving
#' window is set to 15 days by default, centered on the target time point,
#' and standardization is performed individually for each time point in the
#' time series. The threshold of the Z-score absolute value (default: 5 as
#' specified by 'thres_z') is set, and data points outside that range are
#' removed as outliers. After the outliers have been removed, the Z-score is
#' returned to the original value using the original mean and standard
#' deviation time series, and standardization is performed again using a moving
#' window to remove additional outliers. These procedures are repeated until
#' either no more outliers are removed or the maximum number of iterations
#' (default 10) is reached.
#'
#' Users can define sub-periods across the entire time series using
#' `vctr_time_prd_tail`, and the Z-score conversion is performed in each
#' sub-period separately. This separated conversion is useful when the input
#' time series suddenly changes its nature, such as after a sensor replacement.
#'
#' In some cases, for sap flow measurements, the input dT (the temperature
#' difference between sap flow probes) time series may yield a signal that is
#' attenuated for only a short period, for example, when rainfall continues for
#' days, causing the moving window mean (or standard deviation) to increase
#' (or decrease). In such cases, standardization will cause the Z-score time
#' series immediately before and after the rainfall to be unnaturally
#' distorted, hindering the construction of the random forest model. If
#' `modify_z` is `TRUE`, after the outlier removal, this function modifies the
#' Z-score time series for periods when the moving window average has an upward
#' peak, and the moving window standard deviation has a downward peak
#' simultaneously. First, the average and standard deviation time series are
#' interpolated if they contain missing values. Second, they are smoothed by
#' convolution with a user-specified Gaussian window, defined by the parameters
#' `wndw_size_conv` and `inv_sigma_conv`. Third, the first-order and
#' second-order differences of both smoothed time series are calculated, which
#' determine the upward peak positions of the average and the downward peak
#' positions of the standard deviation. Fourth, possible signal attenuation
#' periods are determined based on these peak positions. The start and end of
#' the periods are defined by the timings when the first-order differenced
#' standard deviation time series changes its sign before and after each peak.
#' Fifth, the final attenuation periods are selected if the average of the
#' ratio of the standard deviation at the detected attenuation peak to that at
#' the beginning and end of the attenuation period is below the threshold value
#' specified by `thres_ratio`. Optionally, users can specify the periods to be
#' modified by `vctr_time_zmod`. Sixth, the average and standard deviation time
#' series during the attenuation periods are deleted and linearly interpolated.
#' Finally, the modified Z-score time series is calculated using these average
#' and standard deviation time series.
#'
#' @inheritParams remove_manually
#' @param vctr_time_prd_tail A timestamp vector of class POSIXct or POSIXlt,
#' indicating the end timings of each sub-period. Note that users must not
#' include the final timestamp for the entire time series. For instance, if
#' users want to split the entire measurement period into three sub-periods,
#' they only need to specify the end time stamps of the first two sub-periods.
#' Default is `NULL`.
#' @param n_calc_max A positive integer indicating the maximum number of
#' Z-score outlier detection iterations. Default is 10.
#' @param wndw_size_z A positive integer indicating the number of data points
#' included in a moving window for the Z-score outlier removal. The default is
#' 48 * 15, meaning that the window size is 15 days if the time interval of
#' the input timestamp is 30 minutes.
#' @param min_n_wndw_z A positive integer indicating the minimum number of data
#' points for calculating statistics using a moving window (default is 5) for
#' the Z-score outlier removal. If the number of data points is less than this
#' threshold, the statistics are not calculated in the window.
#' @param thres_z A positive threshold value for the Z-score time series to
#' define outliers. Default is 5.0. The data points with Z-scores (absolute
#' values) above the threshold are considered outliers and removed.
#' @param modify_z A boolean. If `TRUE`, conduct Z-score short attenuation
#' correction; else, the correction is not applied. Default is `FALSE`.
#' @param vctr_time_zmod Only valid if `modify_z` is `TRUE`. A timestamp vector
#' of class POSIXct or POSIXlt, indicating the timings when the short-term
#' signal attenuation correction is applied. Default is `NULL`.
#' @param wndw_size_conv Only valid if `modify_z` is `TRUE`. A positive integer
#' indicating the number of data points included in a moving window for the
#' short-term signal attenuation detection. The default is 48 * 15, meaning
#' that the window size is 15 days if the time interval of the input timestamp
#' is 30 minutes.
#' @param inv_sigma_conv Only valid if `modify_z` is `TRUE`. A positive value
#' defining a Gaussian window width for the short-term signal attenuation
#' detection. The width of the Gaussian window is inversely proportional to
#' this parameter. Default is 0.01.
#' @param thres_ratio Only valid if `modify_z` is `TRUE`. A positive threshold
#' value of the ratio for determining whether the signal attenuation
#' correction is applied to each detected attenuation period. The ratio
#' represents the average of the standard deviation at the detected
#' attenuation peak relative to that at the beginning and end of the
#' attenuation period. If the ratio is below this threshold value, the
#' correction is applied. Default is 0.5.
#'
#' @returns
#' A data frame with columns below:
#'
#' * The first column, `time`, gives the same timestamp as `vctr_time`.
#'
#' * The second column, `cleaned`, gives the cleaned time series after replacing
#' the detected outliers with the value specified by `label_err`.
#'
#' * The third column, `z_cleaned`, gives the Z-score of the input time series
#' after removing the detected outliers.
#'
#' * The fourth column, `avg_cleaned`, gives the moving window average of the
#' input time series after removing the detected outliers.
#'
#' * The fifth column, `sd_cleaned`, gives the moving window standard
#' deviation of the input time series after removing the detected outliers.
#'
#' * The sixth column, `flag_out` gives a flag variable time series indicating
#' the status of the cleaned time series (0: the input data point is not
#' originally missing and not detected as an outlier; 1: the input data point
#' is not originally missing but detected as an outlier; 2: the input data
#' point is originally missing).
#'
#' @examples
#' ## Load data
#' data(dt_noisy)
#' time <- dt_noisy$time[12097:14400]
#' target <- dt_noisy$dt[12097:14400]
#'
#' ## Remove outliers
#' result <- remove_zscore_outlier(vctr_time = time, vctr_target = target)
#'
#' @author Yoshiaki Hata
#'
#' @include utils.R
#'
#' @export
remove_zscore_outlier <-
function(vctr_time, vctr_target, vctr_time_prd_tail = NULL,
wndw_size_z = 48 * 15, min_n_wndw_z = 5, thres_z = 5.0,
n_calc_max = 10, modify_z = FALSE, vctr_time_zmod = NULL,
wndw_size_conv = 48 * 15, inv_sigma_conv = 0.01, thres_ratio = 0.5,
label_err = -9999) {
## avoid "No visible binding for global variable" notes
time <- NULL
dT <- NULL
dT_mod <- NULL
dT_avg <- NULL
dT_sd <- NULL
dT_z <- NULL
flag_out_z_cum <- NULL
dT_mod_prd <- NULL
dT_n_prd <- NULL
dT_avg_prd <- NULL
dT_sd_prd <- NULL
dT_z_prd <- NULL
dT_n <- NULL
dT_z_ori <- NULL
flag_out_z <- NULL
ratio <- NULL
time_start <- NULL
time_end <- NULL
message("Z-score outlier detection started")
data_dT <- data.frame(time = vctr_time,
dT = vctr_target)
data_dT <-
data_dT %>%
dplyr::mutate(dT_mod = dT,
dT_avg = NA,
dT_sd = NA,
dT_z = NA,
flag_out_z_cum = ifelse(dT == label_err, 2, 0))
n_outliers <- 10^6
n_calc <- 0
## standardization for each sub-period
while (n_outliers > 0 & n_calc <= n_calc_max) {
n_calc <- n_calc + 1
if(!is.null(vctr_time_prd_tail)) {
for (i in 1:(length(vctr_time_prd_tail) + 1)) {
if(i == 1) {
data_dT <-
data_dT %>%
dplyr::mutate(dT_mod_prd = ifelse(time <= vctr_time_prd_tail[[i]],
dT_mod, label_err))
} else if(i == length(vctr_time_prd_tail) + 1) {
data_dT <-
data_dT %>%
dplyr::mutate(dT_mod_prd =
ifelse(time > vctr_time_prd_tail[[i - 1]],
dT_mod, label_err))
} else {
data_dT <-
data_dT %>%
dplyr::mutate(dT_mod_prd =
ifelse(time > vctr_time_prd_tail[[i - 1]] &
time <= vctr_time_prd_tail[[i]],
dT_mod, label_err))
}
data_dT <-
data_dT %>%
dplyr::mutate(dplyr::across(tidyselect::where(is.numeric),
~dplyr::na_if(., label_err))) %>%
dplyr::mutate(dT_n_prd = zoo::rollapply(dT_mod_prd,
width = wndw_size_z,
FUN = n_valid, fill = NA,
partial = TRUE),
dT_avg_prd = zoo::rollapply(dT_mod_prd,
width = wndw_size_z,
FUN = mean, fill = NA,
na.rm = TRUE,
partial = TRUE),
dT_avg_prd = ifelse(dT_n_prd < min_n_wndw_z, NA,
dT_avg_prd),
dT_sd_prd = zoo::rollapply(dT_mod_prd,
width = wndw_size_z,
FUN = stats::sd, fill = NA,
na.rm = TRUE,
partial = min_n_wndw_z),
dT_sd_prd = ifelse(dT_n_prd < min_n_wndw_z, NA,
dT_sd_prd),
dT_z_prd = (dT_mod_prd - dT_avg_prd) / dT_sd_prd,
dT_avg = ifelse(!is.na(dT_mod_prd), dT_avg_prd,
dT_avg),
dT_sd = ifelse(!is.na(dT_mod_prd), dT_sd_prd, dT_sd),
dT_z = ifelse(!is.na(dT_mod_prd), dT_z_prd, dT_z))
}
} else {
data_dT <-
data_dT %>%
dplyr::mutate(dplyr::across(tidyselect::where(is.numeric),
~dplyr::na_if(., label_err))) %>%
dplyr::mutate(dT_n = zoo::rollapply(dT_mod,
width = wndw_size_z,
FUN = n_valid, fill = NA,
partial = TRUE),
dT_avg = zoo::rollapply(dT_mod,
width = wndw_size_z,
FUN = mean, fill = NA,
na.rm = TRUE,
partial = TRUE),
dT_avg = ifelse(dT_n < min_n_wndw_z | is.na(dT_mod),
NA, dT_avg),
dT_sd = zoo::rollapply(dT_mod,
width = wndw_size_z,
FUN = stats::sd, fill = NA,
na.rm = TRUE,
partial = min_n_wndw_z),
dT_sd = ifelse(dT_n < min_n_wndw_z | is.na(dT_mod),
NA, dT_sd),
dT_z = (dT_mod - dT_avg) / dT_sd)
}
if(n_calc == 1) {
data_dT <-
data_dT %>%
dplyr::mutate(dT_z_ori = dT_z)
}
data_dT <-
data_dT %>%
dplyr::mutate(flag_out_z = ifelse(abs(dT_z) > thres_z, 1, 0),
flag_out_z = ifelse(is.na(dT_z), 0, flag_out_z),
flag_out_z_cum = ifelse(flag_out_z == 1, flag_out_z,
flag_out_z_cum),
dT_z = ifelse(flag_out_z == 1, NA, dT_z),
dT_mod = dT_z * dT_sd + dT_avg)
n_outliers <- data_dT %>% dplyr::pull(flag_out_z) %>% sum()
if(n_outliers > 1 & n_calc < n_calc_max) {
message("--- ", n_outliers,
" Z-score outliers were detected")
} else if(n_outliers == 1 & n_calc < n_calc_max) {
message("--- 1 Z-score outlier was detected")
} else if(n_outliers > 1 & n_calc == n_calc_max) {
message("--- Z-score outliers still exist but finish the detection")
} else {
message("--- No Z-score outlier was detected")
}
}
## correct Z-score time series during short attenuation periods
if(modify_z == TRUE) {
if(is.null(vctr_time_zmod)) {
df_attn <-
check_short_attenuation(data_dT$time, data_dT$dT_z, data_dT$dT_avg,
data_dT$dT_sd, wndw_size_conv,
inv_sigma_conv, label_err)
n_attn <-
df_attn %>%
dplyr::filter(ratio < thres_ratio) %>%
nrow()
if(n_attn > 0) {
vctr_time_attn_head <-
df_attn %>%
dplyr::filter(ratio < thres_ratio) %>%
dplyr::pull(time_start)
vctr_time_attn_tail <-
df_attn %>%
dplyr::filter(ratio < thres_ratio) %>%
dplyr::pull(time_end)
vctr_ratio <-
df_attn %>%
dplyr::filter(ratio < thres_ratio) %>%
dplyr::pull(ratio)
message("--- Short attenuation periods were detected at:")
for (i_attn in 1:n_attn) {
message(" [", vctr_time_attn_head[[i_attn]], ", ",
vctr_time_attn_tail[[i_attn]], "], ratio = ",
vctr_ratio[[i_attn]])
data_dT <-
data_dT %>%
dplyr::mutate(dT_avg =
ifelse(time >= vctr_time_attn_head[[i_attn]] &
time <= vctr_time_attn_tail[[i_attn]],
NA, dT_avg),
dT_avg = zoo::na.approx(dT_avg, na.rm = FALSE),
dT_sd =
ifelse(time >= vctr_time_attn_head[[i_attn]] &
time <= vctr_time_attn_tail[[i_attn]],
NA, dT_sd),
dT_sd = zoo::na.approx(dT_sd, na.rm = FALSE),
dT_z = (dT_mod - dT_avg) / dT_sd)
}
message("--- Z-score modification during attenuation periods finished")
}
} else {
data_dT <-
data_dT %>%
dplyr::mutate(dT_avg =
ifelse(time %in% vctr_time_zmod, NA, dT_avg),
dT_avg = zoo::na.approx(dT_avg, na.rm = FALSE),
dT_sd =
ifelse(time %in% vctr_time_zmod, NA, dT_sd),
dT_sd = zoo::na.approx(dT_sd, na.rm = FALSE),
dT_z = (dT_mod - dT_avg) / dT_sd)
message("--- Z-score modification during attenuation periods finished")
}
}
message("Z-score outlier detection finished")
data_dT %>%
tidyr::replace_na(replace = list(dT_mod = label_err,
dT_avg = label_err,
dT_sd = label_err,
dT_z = label_err)) %>%
dplyr::select(time, dT_mod, dT_z, dT_avg, dT_sd, flag_out_z_cum) %>%
dplyr::rename(cleaned = dT_mod,
z_cleaned = dT_z,
avg_cleaned = dT_avg,
sd_cleaned = dT_sd,
flag_out = flag_out_z_cum) %>%
return()
}
#' Define reference values of average and standard deviation
#'
#' @description `calc_ref_stats()` determines reference values of average and
#' standard deviation for the entire period by calculating the median of
#' these statistical values for the first several days in each sub-period.
#'
#' @inheritParams remove_zscore_outlier
#' @param wndw_size_ref A positive integer indicating the number of data points
#' included in calculating the average and standard deviation for their
#' reference value determination. The default is 48 * 15, meaning that the
#' first 15 days of each sub-period are used in the calculation when the time
#' interval of the input timestamp is 30 minutes.
#'
#' @returns
#' A vector of two components. The first one is the reference average, and the
#' second one is the reference standard deviation. The unit of these values is
#' the same as that of the input time series.
#'
#' @examples
#' ## Load data
#' data(dt_noisy)
#' time <- dt_noisy$time[11931:12891]
#' target <- dt_noisy$dt[11931:12891]
#' time_prd_tail <- time[148]
#'
#' ## Calculate reference values of average and standard deviation
#' result <-
#' calc_ref_stats(vctr_time = time, vctr_target = target,
#' vctr_time_prd_tail = time_prd_tail)
#'
#' @author Yoshiaki Hata
#'
#' @seealso retrieve_ts
#'
#' @include utils.R
#'
#' @export
calc_ref_stats <-
function(vctr_time, vctr_target, vctr_time_prd_tail = NULL,
wndw_size_ref = 48 * 15, label_err = -9999) {
## avoid "No visible binding for global variable" notes
time <- NULL
dT <- NULL
dT_prd <- NULL
## check input values
interval_time <- get_interval(vctr_time)
timezone <- lubridate::tz(vctr_time[1])
data_dT <-
data.frame(time = vctr_time,
dT = vctr_target) %>%
dplyr::mutate(dT = ifelse(dT == label_err, NA, dT))
time_head_ref_1st <-
data_dT %>%
dplyr::filter(!is.na(dT)) %>%
dplyr::slice_head() %>%
dplyr::pull(time)
vctr_dT_avg_ref <- rep(NA, length(vctr_time_prd_tail) + 1)
vctr_dT_sd_ref <- rep(NA, length(vctr_time_prd_tail) + 1)
for (i in 1:(length(vctr_time_prd_tail) + 1)) {
if(i == 1) {
vctr_dT_prd <-
data_dT %>%
dplyr::mutate(dT_prd =
ifelse(time >= time_head_ref_1st &
time < time_head_ref_1st +
lubridate::minutes(interval_time *
wndw_size_ref),
dT, NA)) %>%
dplyr::pull(dT_prd)
vctr_dT_avg_ref[[i]] <- mean(vctr_dT_prd, na.rm = TRUE)
vctr_dT_sd_ref[[i]] <- stats::sd(vctr_dT_prd, na.rm = TRUE)
} else {
vctr_dT_prd <-
data_dT %>%
dplyr::mutate(dT_prd =
ifelse(time > vctr_time_prd_tail[[i - 1]] &
time <= vctr_time_prd_tail[[i - 1]] +
lubridate::minutes(interval_time *
wndw_size_ref),
dT, NA)) %>%
dplyr::pull(dT_prd)
vctr_dT_avg_ref[[i]] <- mean(vctr_dT_prd, na.rm = TRUE)
vctr_dT_sd_ref[[i]] <- stats::sd(vctr_dT_prd, na.rm = TRUE)
}
}
dT_avg_ref <- stats::median(vctr_dT_avg_ref, na.rm = TRUE)
dT_sd_ref <- stats::median(vctr_dT_sd_ref, na.rm = TRUE)
return(c(dT_avg_ref, dT_sd_ref))
}
#' Retrieve time series in its original units
#'
#' @description `retrieve_ts()` converts a standardized Z-score time series
#' into a time series in its original units using specific average and
#' standard deviation time series.
#'
#' @details
#' Retrieving a time series with its original units is conducted by multiplying
#' a Z-score by the standard deviation, followed by adding the average. If the
#' average and standard deviation time series are the same as those in
#' converting the original time series into the Z-score time series, the
#' original values with the original average and standard deviation are
#' retrieved. If reference values of the average and/or standard deviation are
#' used, the output time series are detrended and/or applied to signal damping
#' correction.
#'
#' @inheritParams calc_ref_stats
#' @param vctr_target_z A vector of Z-score time series to be converted.
#' Missing values must be gap-filled previously.
#' @param vctr_target_avg A vector of average time series. Missing values are
#' acceptable but automatically gap-filled by interpolation during the
#' retrieving process. The length of the vector must match that of
#' `vctr_target_z`. The unit of the time series must match that of time series
#' to be output. Default is `NA`.
#' @param vctr_target_sd A vector of standard deviation time series. Missing
#' values are acceptable but automatically gap-filled by interpolation during
#' the retrieving process. The length of the vector must match that of
#' `vctr_target_z`. The unit of the time series must match that of time series
#' to be output. Default is `NA`.
#' @param detrend A boolean. If `TRUE`, detrending is applied and the reference
#' average specified by `avg_ref` is used to convert Z-score time series into
#' the time series with the reference average in its original units; else, the
#' detrending is not applied, and the average time series specified by
#' `vctr_target_avg` is used in the conversion. Default is `FALSE`.
#' @param correct_damping A boolean. If `TRUE`, the signal damping correction
#' is applied and the reference standard deviation specified by `sd_ref` is
#' used to convert Z-score time series into the time series in its original
#' units with the reference standard deviation; else, the correction is not
#' applied, and the standard deviation time series specified by
#' `vctr_target_sd` is used in the conversion. Default is `FALSE`.
#' @param avg_ref Only valid if `detrend` is `TRUE`. A numeric value
#' representing the reference average. A vector of reference average time
#' series is also acceptable, but the length of the vector must match that of
#' `vctr_target_z`, and the unit of the time series must match that of time
#' series to be output. Default is `NULL`.
#' @param sd_ref Only valid if `correct_damping` is `TRUE`. A positive numeric
#' value representing the reference standard deviation. A vector of reference
#' standard deviation time series is also acceptable, but the length of the
#' vector must match that of `vctr_target_z`, and the unit of the time series
#' must match that of time series to be output. Default is `NULL`.
#'
#' @returns
#' A vector of the retrieved time series. The length of the vector is the same
#' as `vctr_target_z`.
#'
#' @examples
#' ## Create data
#' target <- seq(1, 10)
#' target_avg <- rep(mean(target), 10)
#' target_sd <- rep(stats::sd(target), 10)
#' target_z <- (target - target_avg) / target_sd
#'
#' ## Retrieve time series in its original units
#' result <-
#' retrieve_ts(vctr_target_z = target_z, vctr_target_avg = target_avg,
#' vctr_target_sd = target_sd)
#'
#' @author Yoshiaki Hata
#'
#' @seealso calc_ref_stats
#'
#' @export
retrieve_ts <-
function(vctr_target_z, vctr_target_avg = NA, vctr_target_sd = NA,
detrend = FALSE, correct_damping = FALSE,
avg_ref = NULL, sd_ref = NULL, label_err = -9999) {
## avoid "No visible binding for global variable" notes
avg_target <- NULL
sd_target <- NULL
avg_target_gf <- NULL
sd_target_gf <- NULL
message("Time series retrieval started")
## check input
if(detrend == TRUE & correct_damping == TRUE &
(is.null(avg_ref) | is.null(sd_ref))) {
stop("Both the reference average and the reference standard deviation \n
must be provided to apply detrending and signal damping correction.")
}
if(detrend == TRUE & correct_damping == FALSE &
(is.null(avg_ref) | length(vctr_target_sd) == 1)) {
stop("Both the reference average and the original standard deviation \n
time series must be provided to apply detrending and retrieval.")
}
if(detrend == FALSE & correct_damping == TRUE &
(length(vctr_target_avg) == 1 | is.null(sd_ref))) {
stop("Both the original average time series and the reference standard \n
deviation time series must be provided to apply signal damping \n
and retrieval.")
}
if(detrend == FALSE & correct_damping == FALSE &
(length(vctr_target_avg) == 1 | length(vctr_target_sd) == 1)) {
stop("Both the original average and standard deviation time series \n
must be provided to apply retrieval.")
}
df_rt <-
data.frame(target_z = vctr_target_z,
avg_target = vctr_target_avg,
sd_target = vctr_target_sd) %>%
dplyr::mutate(avg_target = ifelse(avg_target == label_err, NA,
avg_target),
sd_target = ifelse(sd_target == label_err, NA, sd_target),
avg_target_gf = zoo::na.approx(avg_target, na.rm = FALSE),
sd_target_gf = zoo::na.approx(sd_target,
na.rm = FALSE)) %>%
tidyr::fill(c(avg_target_gf, sd_target_gf), .direction = "updown")
if(correct_damping == TRUE) {
vctr_target <- vctr_target_z * sd_ref
message("--- Signal damping correction finished")
} else {
vctr_target <- vctr_target_z * df_rt$sd_target_gf
message("--- Signal damping correction was not applied")
}
if(detrend == TRUE) {
vctr_target <- vctr_target + avg_ref
message("--- Detrending finished")
} else {
vctr_target <- vctr_target + df_rt$avg_target_gf
message("--- Detrending was not applied")
}
message("Time series retrieval finished")
return(vctr_target)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.