R/dm.na.interpolation.R

Defines functions .dm_interp_empty_plot .dm_interp_time_step_seconds .dm_interp_expand_gap_info .dm_interp_extract_parts plot_dm_assessment plot_dm_interpolation plot_dm_gaps plot.dm_na_interpolation dm.na.interpolation

Documented in dm.na.interpolation plot_dm_assessment plot_dm_gaps plot_dm_interpolation plot.dm_na_interpolation

#' @title Detection and interpolation of missing values in dendrometer data
#'
#' @description
#' Detects missing timestamps (based on provided resolution), inserts rows with \code{NA},
#' and optionally interpolates missing values using either cubic spline interpolation
#' (\code{\link[zoo:na.spline]{na.spline}}) or seasonal decomposition interpolation
#' (\code{\link[forecast:na.interp]{na.interp}}).
#'
#' Optionally, the function can test the reliability of interpolation for increasing
#' gap lengths (e.g., 6 hours to 3 days) using real data. This allows the user to assess
#' how well a given interpolation method recovers missing values over different durations.
#'
#' @param df A data frame with the first column as datetime (\code{"yyyy-mm-dd HH:MM:SS"},
#'   or POSIXct/Date), followed by dendrometer values.
#' @param resolution Integer. Temporal resolution in minutes (e.g., 60 for hourly data).
#' @param fill Logical. If \code{TRUE}, missing values will be interpolated.
#' @param method Character. Interpolation method (only used if \code{fill = TRUE}):
#'   \itemize{
#'     \item \code{"spline"} --- cubic spline (\code{zoo::na.spline}).
#'     \item \code{"seasonal"} --- seasonal decomposition (\code{forecast::na.interp}).
#'   }
#' @param assess Logical. If \code{TRUE}, run a simulation-based test to evaluate how well
#'   interpolation performs at different gap lengths (on existing data).
#' @param assess_lengths_hours Integer vector. The gap durations (in hours) to test during
#'   assessment. Default: \code{seq(6, 72, by = 6)}.
#' @param assess_samples_per_length Integer. Number of artificial gaps per gap length
#'   per series. Default: \code{10}.
#' @param assess_buffer_hours Integer. Minimum number of hours of valid data required
#'   before and after the artificial gap to allow valid assessment. Default: \code{6}.
#' @param assess_seed Integer or \code{NULL}. Random seed to ensure reproducibility
#'   of sampling windows. Default: \code{NULL}.
#' @param verbose Logical. If \code{TRUE}, prints messages during filling and testing.
#'
#' @details
#' The assessment simulates interpolation over artificial gaps of different durations.
#' For each dendrometer series and each gap length:
#' \enumerate{
#'   \item Random windows (with clean data) are selected.
#'   \item Data in the window is temporarily set to \code{NA}.
#'   \item The gap is interpolated using the selected method.
#'   \item The predicted values are compared to the original (true) values using:
#'     \itemize{
#'       \item MAPE --- Mean Absolute Percentage Error.
#'       \item MdAPE --- Median Absolute Percentage Error.
#'       \item RMSE_pct --- Root Mean Square Error (\%).
#'       \item Bias_pct --- Mean Percentage Error (\%).
#'       \item Max_diff_abs --- Mean absolute difference in daily maximum.
#'       \item Min_diff_abs --- Mean absolute difference in daily minimum.
#'       \item Time_max_diff_h --- Mean absolute difference in time of daily maximum (hours).
#'       \item Time_min_diff_h --- Mean absolute difference in time of daily minimum (hours).
#'     }
#' }
#'
#' @return
#' A list of class \code{"dm_na_interpolation"} with components:
#' \describe{
#'   \item{\code{$data}}{Data frame containing the original or gap-filled series.}
#'   \item{\code{$gap_info}}{Table describing timestamp gaps and internal NA runs.}
#'   \item{\code{$assessment}}{A tibble summarizing interpolation accuracy for each
#'   series and gap length, or \code{NULL} when \code{assess = FALSE}.}
#'   \item{\code{$filled}}{Logical indicating whether interpolation was performed.}
#'   \item{\code{$assessed}}{Logical indicating whether interpolation assessment was performed.}
#'   \item{\code{$method}}{Interpolation method used.}
#'   \item{\code{$resolution}}{Temporal resolution in minutes.}
#' }
#'
#' @examples
#' \donttest{
#' library(dendRoAnalyst)
#' data(nepa17)
#'
#' #res0 <- dm.na.interpolation(nepa17[1:1000, ], resolution = 60)
#' #plot(res0)
#' #plot(res0, type = "gaps")
#'
#' #res1 <- dm.na.interpolation(
#' #  nepa17[1:1000, ],
#' #  resolution = 60,
#' #  fill = TRUE,
#' #  method = "spline"
#' #)
#' #plot(res1)
#' #plot(res1, type = "gaps")
#' #plot(res1, type = "interpolation", original = nepa17[1:1000, ])
#'
#' #res2 <- dm.na.interpolation(
#' #  nepa17[1:1000, ],
#' #  resolution = 60,
#' #  fill = TRUE,
#' #  method = "seasonal",
#' #  assess = TRUE
#' # )
#' #plot(res2)
#' #plot(res2, type = "assessment", metric = "MdAPE")
#' }
#'
#' @importFrom stats ts median
#' @importFrom lubridate ymd_hms minutes
#' @importFrom dplyr mutate rename select bind_cols first lead last left_join summarise group_by arrange bind_rows relocate
#' @importFrom tibble as_tibble tibble
#' @importFrom zoo na.spline
#' @importFrom forecast na.interp
#' @importFrom utils head
#' @export
dm.na.interpolation <- function(df, resolution, fill = FALSE, method = "spline",
                                assess = FALSE,
                                assess_lengths_hours = seq(6, 72, by = 6),
                                assess_samples_per_length = 10,
                                assess_buffer_hours = 6,
                                assess_seed = NULL,
                                verbose = FALSE) {

  to_posix <- function(x) {
    if (inherits(x, "POSIXct")) return(x)
    if (inherits(x, "Date")) return(as.POSIXct(x))
    lubridate::ymd_hms(x)
  }
  TIME <- time_min <- diff_time <- start_time <- end_time <- type <- series <- prev_time <- next_time <- NULL
  start_missing <- end_missing <- n_consecutive <- duration_min <- duration_hr <- Series <- n_cases <- n_points <- MAPE <- MdAPE <- NULL
  RMSE_pct <- Bias_pct <- Max_diff_abs <- Min_diff_abs <- Time_max_diff_h <- Time_min_diff_h  <- NULL

  make_output <- function(data, gap_info, assessment = NULL,
                          filled = FALSE, assessed = FALSE) {
    out <- list(
      data = tibble::as_tibble(data),
      gap_info = tibble::as_tibble(gap_info),
      assessment = if (is.null(assessment)) NULL else tibble::as_tibble(assessment),
      filled = filled,
      assessed = assessed,
      method = method,
      resolution = resolution
    )
    class(out) <- "dm_na_interpolation"
    out
  }

  replace_na_with_spline <- function(column) {
    if (!is.numeric(column) || !anyNA(column)) return(column)
    if (sum(!is.na(column)) < 2) return(column)

    sp <- tryCatch(
      zoo::na.spline(column),
      error = function(e) column
    )

    if (length(sp) == length(column)) {
      column[is.na(column)] <- sp[is.na(column)]
    }
    column
  }

  replace_na_with_seasonality <- function(column, s) {
    if (!is.numeric(column) || !anyNA(column)) return(column)
    if (sum(!is.na(column)) < 2) return(column)

    tryCatch(
      as.numeric(forecast::na.interp(stats::ts(column, frequency = s))),
      error = function(e) column
    )
  }

  pct_errors <- function(truth, fit) {
    pe  <- 100 * (fit - truth) / truth
    ape <- abs(pe)
    c(
      MAPE     = mean(ape, na.rm = TRUE),
      MdAPE    = stats::median(ape, na.rm = TRUE),
      RMSE_pct = sqrt(mean(pe^2, na.rm = TRUE)),
      Bias_pct = mean(pe, na.rm = TRUE)
    )
  }

  quiet <- function(x) suppressWarnings(suppressMessages(x))

  mean_or_na <- function(x) {
    if (length(x) == 0 || all(is.na(x))) return(NA_real_)
    mean(x, na.rm = TRUE)
  }

  find_na_runs <- function(time_vec, x, series_name) {
    na_flag <- is.na(x)

    if (!any(na_flag)) {
      return(tibble::tibble(
        series = character(0),
        start_time = as.POSIXct(character()),
        end_time = as.POSIXct(character()),
        n_consecutive = integer(0)
      ))
    }

    r <- rle(na_flag)
    ends <- cumsum(r$lengths)
    starts <- c(1, utils::head(ends, -1) + 1)
    sel <- which(r$values)

    if (!length(sel)) {
      return(tibble::tibble(
        series = character(0),
        start_time = as.POSIXct(character()),
        end_time = as.POSIXct(character()),
        n_consecutive = integer(0)
      ))
    }

    tibble::tibble(
      series = series_name,
      start_time = time_vec[starts[sel]],
      end_time = time_vec[ends[sel]],
      n_consecutive = r$lengths[sel]
    )
  }

  if (!(method %in% c("spline", "seasonal"))) {
    stop("The method must be either 'spline' or 'seasonal'.")
  }

  df <- tibble::as_tibble(df)
  names(df)[1] <- "TIME"
  df$TIME <- to_posix(df$TIME)

  if (any(is.na(df$TIME))) {
    stop("The datetime column could not be fully converted to POSIXct.")
  }

  df2 <- df %>%
    dplyr::mutate(
      time_min = as.integer(difftime(TIME, dplyr::first(TIME), units = "mins")),
      diff_time = dplyr::lead(time_min) - time_min
    )

  gap_rows <- which(df2$diff_time > resolution)

  if (length(gap_rows) > 0L) {
    ts_gap_info <- tibble::tibble(
      type = "timestamp_gap",
      series = "ALL",
      prev_time = df2$TIME[gap_rows],
      next_time = df2$TIME[gap_rows] + lubridate::minutes(df2$diff_time[gap_rows]),
      start_missing = df2$TIME[gap_rows] + lubridate::minutes(resolution),
      end_missing = df2$TIME[gap_rows] + lubridate::minutes(df2$diff_time[gap_rows] - resolution),
      n_consecutive = as.integer(df2$diff_time[gap_rows] / resolution - 1L),
      duration_min = as.integer(df2$diff_time[gap_rows] - resolution),
      duration_hr = (df2$diff_time[gap_rows] - resolution) / 60
    )
  } else {
    ts_gap_info <- tibble::tibble(
      type = character(0),
      series = character(0),
      prev_time = as.POSIXct(character()),
      next_time = as.POSIXct(character()),
      start_missing = as.POSIXct(character()),
      end_missing = as.POSIXct(character()),
      n_consecutive = integer(0),
      duration_min = integer(0),
      duration_hr = numeric(0)
    )
  }

  df <- dplyr::select(df2, -time_min, -diff_time)

  if (nrow(ts_gap_info) > 0L) {
    x1 <- dplyr::first(df$TIME)
    x2 <- dplyr::last(df$TIME)
    full_time <- tibble::tibble(
      TIME = seq(from = x1, to = x2, by = paste(resolution, "mins"))
    )
    df_work <- dplyr::left_join(full_time, df, by = "TIME")
  } else {
    df_work <- df
  }

  na_runs_list <- lapply(names(df)[-1], function(nm) {
    find_na_runs(df$TIME, df[[nm]], nm)
  })
  na_runs_info <- dplyr::bind_rows(na_runs_list)

  if (nrow(na_runs_info)) {
    na_runs_info <- na_runs_info %>%
      dplyr::mutate(
        type = "internal_na",
        prev_time = as.POSIXct(NA),
        next_time = as.POSIXct(NA),
        duration_min = NA_integer_,
        duration_hr = NA_real_
      ) %>%
      dplyr::rename(
        start_missing = start_time,
        end_missing = end_time
      ) %>%
      dplyr::relocate(
        type, series, prev_time, next_time, start_missing, end_missing,
        n_consecutive, duration_min, duration_hr
      )
  }

  gap_info <- dplyr::bind_rows(ts_gap_info, na_runs_info)

  if (nrow(ts_gap_info) == 0L && nrow(na_runs_info) == 0L) {
    message("There is no gap in your dendrometer data.")
    return(make_output(
      data = df,
      gap_info = gap_info,
      assessment = NULL,
      filled = FALSE,
      assessed = FALSE
    ))
  }

  if (nrow(ts_gap_info) == 0L && nrow(na_runs_info) > 0L) {
    message("There is no gap in timestamp but there are internal NA values.")
    print(gap_info)

    if (!fill) {
      return(make_output(
        data = df,
        gap_info = gap_info,
        assessment = NULL,
        filled = FALSE,
        assessed = FALSE
      ))
    }
  }

  if (nrow(ts_gap_info) > 0L) {
    print(gap_info)

    if (!fill) {
      return(make_output(
        data = df_work,
        gap_info = gap_info,
        assessment = NULL,
        filled = FALSE,
        assessed = FALSE
      ))
    }
  }

  if (verbose) {
    message("Filling missing values using method = '", method, "'.")
  }

  if (nrow(ts_gap_info) > 0L) {
    if (max(ts_gap_info$n_consecutive, na.rm = TRUE) >= (1440 / resolution)) {
      warning("There is a timestamp gap lasting more than 24 h; interpolation may be unreliable.")
    }
  }

  if (method == "spline") {
    df_vals <- lapply(df_work[-1], replace_na_with_spline)
  } else {
    s <- 1440 / resolution
    df_vals <- lapply(df_work[-1], function(col) replace_na_with_seasonality(col, s))
  }

  df_filled <- dplyr::bind_cols(df_work[1], tibble::as_tibble(df_vals))

  if (!assess) {
    return(make_output(
      data = df_filled,
      gap_info = gap_info,
      assessment = NULL,
      filled = TRUE,
      assessed = FALSE
    ))
  }

  if (!is.null(assess_seed)) {
    set.seed(assess_seed)
  }

  if (verbose) {
    message(
      "Running self test on artificial gaps: ",
      paste(assess_lengths_hours, collapse = ", "),
      " hours"
    )
  }

  time_dates <- as.Date(df_filled$TIME)
  time_hours <- as.numeric(format(df_filled$TIME, "%H")) +
    as.numeric(format(df_filled$TIME, "%M")) / 60 +
    as.numeric(format(df_filled$TIME, "%S")) / 3600

  circular_hour_diff <- function(x, y) {
    d <- abs(x - y)
    pmin(d, 24 - d)
  }

  fast_daily_extrema <- function(date_vec, hour_vec, value_vec) {
    udates <- unique(date_vec)

    out <- lapply(udates, function(dd) {
      idx <- which(date_vec == dd)
      vv <- value_vec[idx]
      hh <- hour_vec[idx]

      if (length(vv) == 0 || all(is.na(vv))) {
        return(data.frame(
          DATE = dd,
          Min = NA_real_,
          Time_min_h = NA_real_,
          Max = NA_real_,
          Time_max_h = NA_real_
        ))
      }

      i_min <- which.min(replace(vv, is.na(vv), Inf))[1]
      i_max <- which.max(replace(vv, is.na(vv), -Inf))[1]

      data.frame(
        DATE = dd,
        Min = min(vv, na.rm = TRUE),
        Time_min_h = hh[i_min],
        Max = max(vv, na.rm = TRUE),
        Time_max_h = hh[i_max]
      )
    })

    do.call(rbind, out)
  }

  daily_case_differences_fast <- function(idx_days, truth, fit) {
    out_na <- c(
      Max_diff_abs = NA_real_,
      Min_diff_abs = NA_real_,
      Time_max_diff_h = NA_real_,
      Time_min_diff_h = NA_real_
    )

    if (!any(idx_days)) {
      return(out_na)
    }

    truth_daily <- fast_daily_extrema(
      date_vec = time_dates[idx_days],
      hour_vec = time_hours[idx_days],
      value_vec = truth[idx_days]
    )

    fit_daily <- fast_daily_extrema(
      date_vec = time_dates[idx_days],
      hour_vec = time_hours[idx_days],
      value_vec = fit[idx_days]
    )

    merged <- merge(
      truth_daily,
      fit_daily,
      by = "DATE",
      suffixes = c("_truth", "_fit")
    )

    if (nrow(merged) == 0) {
      return(out_na)
    }

    c(
      Max_diff_abs = mean(abs(merged$Max_fit - merged$Max_truth), na.rm = TRUE),
      Min_diff_abs = mean(abs(merged$Min_fit - merged$Min_truth), na.rm = TRUE),
      Time_max_diff_h = mean(
        circular_hour_diff(merged$Time_max_h_fit, merged$Time_max_h_truth),
        na.rm = TRUE
      ),
      Time_min_diff_h = mean(
        circular_hour_diff(merged$Time_min_h_fit, merged$Time_min_h_truth),
        na.rm = TRUE
      )
    )
  }

  impute_with_gap <- function(series, gap_idx, meth) {
    tmp <- series
    tmp[gap_idx] <- NA_real_

    if (meth == "spline") {
      tryCatch(
        as.numeric(quiet(zoo::na.spline(tmp))),
        error = function(e) tmp
      )
    } else {
      s <- 1440 / resolution
      tryCatch(
        as.numeric(quiet(forecast::na.interp(stats::ts(tmp, frequency = s)))),
        error = function(e) tmp
      )
    }
  }

  pts_per_hour <- max(1L, round(60 / resolution))
  buffer_pts <- max(1L, round(assess_buffer_hours * pts_per_hour))
  n_total <- nrow(df_filled)

  max_rows <- (ncol(df_filled) - 1L) *
    length(assess_lengths_hours) *
    assess_samples_per_length

  stat_rows <- vector("list", max_rows)
  row_id <- 0L

  for (j in seq.int(2, ncol(df_filled))) {
    series_name <- names(df_filled)[j]
    y_full <- df_filled[[j]]

    ok_full <- is.finite(y_full)
    cs_ok <- c(0L, cumsum(ok_full))

    for (gap_h in assess_lengths_hours) {
      gap_pts <- max(1L, round(gap_h * pts_per_hour))

      start_min <- buffer_pts + 1L
      start_max <- n_total - gap_pts - buffer_pts + 1L

      if (start_max < start_min) {
        if (verbose) {
          message("No valid windows for ", series_name, " at ", gap_h, " h.")
        }
        next
      }

      starts <- seq.int(start_min, start_max)

      win_start <- starts - buffer_pts
      win_end <- starts + gap_pts - 1L + buffer_pts
      required_n <- gap_pts + 2L * buffer_pts

      finite_counts <- cs_ok[win_end + 1L] - cs_ok[win_start]
      valid_starts <- starts[finite_counts == required_n]

      if (length(valid_starts) == 0L) {
        if (verbose) {
          message("No valid windows for ", series_name, " at ", gap_h, " h.")
        }
        next
      }

      starts_to_use <- if (length(valid_starts) > assess_samples_per_length) {
        sample(valid_starts, assess_samples_per_length, replace = FALSE)
      } else {
        valid_starts
      }

      for (i in starts_to_use) {
        idx_gap <- i:(i + gap_pts - 1L)

        y_imputed <- impute_with_gap(y_full, idx_gap, method)
        errs_all <- pct_errors(y_full[idx_gap], y_imputed[idx_gap])

        affected_dates <- unique(time_dates[idx_gap])
        idx_days <- time_dates %in% affected_dates

        daily_errs <- daily_case_differences_fast(
          idx_days = idx_days,
          truth = y_full,
          fit = y_imputed
        )

        row_id <- row_id + 1L
        stat_rows[[row_id]] <- data.frame(
          Series = series_name,
          gap_h = gap_h,
          n_cases = 1L,
          n_points = length(idx_gap),
          MAPE = errs_all["MAPE"],
          MdAPE = errs_all["MdAPE"],
          RMSE_pct = errs_all["RMSE_pct"],
          Bias_pct = errs_all["Bias_pct"],
          Max_diff_abs = daily_errs["Max_diff_abs"],
          Min_diff_abs = daily_errs["Min_diff_abs"],
          Time_max_diff_h = daily_errs["Time_max_diff_h"],
          Time_min_diff_h = daily_errs["Time_min_diff_h"],
          stringsAsFactors = FALSE
        )
      }
    }
  }

  if (row_id == 0L) {
    warning("Assessment produced no results (insufficient clean windows).")
    assessment <- tibble::tibble()
  } else {
    raw_stats <- do.call(rbind, stat_rows[seq_len(row_id)])

    assessment <- as.data.frame(raw_stats) %>%
      dplyr::group_by(Series, gap_h) %>%
      dplyr::summarise(
        n_cases = sum(n_cases),
        n_points = sum(n_points),
        MAPE = mean_or_na(MAPE),
        MdAPE = mean_or_na(MdAPE),
        RMSE_pct = mean_or_na(RMSE_pct),
        Bias_pct = mean_or_na(Bias_pct),
        Max_diff_abs = mean_or_na(Max_diff_abs),
        Min_diff_abs = mean_or_na(Min_diff_abs),
        Time_max_diff_h = mean_or_na(Time_max_diff_h),
        Time_min_diff_h = mean_or_na(Time_min_diff_h),
        .groups = "drop"
      ) %>%
      dplyr::arrange(Series, gap_h)
  }

  make_output(
    data = df_filled,
    gap_info = gap_info,
    assessment = assessment,
    filled = TRUE,
    assessed = TRUE
  )
}


#' @title Plot method for dendrometer NA interpolation results
#'
#' @description
#' S3 plot method for objects returned by \code{\link{dm.na.interpolation}}.
#'
#' @param x Object returned by \code{dm.na.interpolation()}.
#' @param type Character. Plot type:
#'   \code{"gaps"}, \code{"interpolation"}, or \code{"assessment"}.
#'   If omitted, the default is:
#'   \itemize{
#'     \item \code{"assessment"} when assessment exists,
#'     \item \code{"interpolation"} when filled data exist,
#'     \item \code{"gaps"} when gaps exist but data were not filled,
#'     \item otherwise \code{"interpolation"}.
#'   }
#' @param series Optional character vector of series names to plot.
#' @param metric Character assessment metric used when \code{type = "assessment"}.
#' @param original Optional original input data frame used before interpolation.
#' @param start Optional start datetime for subsetting.
#' @param end Optional end datetime for subsetting.
#' @param free_y Logical. If \code{TRUE}, facets use free y scales.
#' @param ncol Integer. Number of facet columns.
#' @param facet Logical. If \code{TRUE}, facet assessment plots by series.
#' @param empty_gaps Character. Behavior when \code{type = "gaps"} but no gaps are
#'   present: \code{"plot"} for an informative empty plot, or \code{"error"}.
#' @param ... Further arguments.
#'
#' @return A \code{ggplot2} object.
#'
#' @examples
#' \donttest{
#' library(dendRoAnalyst)
#' data(nepa17)
#'
#' ## No gaps: defaults to time-series plot
#' #res0 <- dm.na.interpolation(nepa17[1:1000, ], resolution = 60)
#' #plot(res0)
#' #plot(res0, type = "gaps")
#' #plot(res0, type = "gaps", empty_gaps = "error")
#'
#' ## Filled data
#' #res1 <- dm.na.interpolation(
#' #  nepa17[1:1000, ],
#' #  resolution = 60,
#' #  fill = TRUE,
#' #  method = "spline"
#' #)
#' #plot(res1)
#' #plot(res1, type = "gaps")
#' #plot(res1, type = "interpolation", original = nepa17[1:1000, ])
#'
#' ## Assessed data
#' #res2 <- dm.na.interpolation(
#' #  nepa17[1:1000, ],
#' #  resolution = 60,
#' #  fill = TRUE,
#' #  method = "seasonal",
#' #  assess = TRUE
#' #)
#' #plot(res2)
#' #plot(res2, type = "assessment", metric = "MdAPE")
#' }
#'
#' @method plot dm_na_interpolation
#' @export
plot.dm_na_interpolation <- function(x,
                                     type = NULL,
                                     series = NULL,
                                     metric = "MdAPE",
                                     original = NULL,
                                     start = NULL,
                                     end = NULL,
                                     free_y = TRUE,
                                     ncol = 1,
                                     facet = TRUE,
                                     empty_gaps = c("plot", "error"),
                                     ...) {
  if (!inherits(x, "dm_na_interpolation")) {
    stop("x must be an object returned by dm.na.interpolation().")
  }

  empty_gaps <- match.arg(empty_gaps)

  has_assessment <- isTRUE(x$assessed) &&
    !is.null(x$assessment) &&
    nrow(x$assessment) > 0

  has_gaps <- !is.null(x$gap_info) &&
    nrow(x$gap_info) > 0

  if (is.null(type)) {
    if (has_assessment) {
      type <- "assessment"
    } else if (isTRUE(x$filled)) {
      type <- "interpolation"
    } else if (has_gaps) {
      type <- "gaps"
    } else {
      type <- "interpolation"
    }
  }

  type <- match.arg(type, choices = c("gaps", "interpolation", "assessment"))

  if (type == "gaps") {
    return(plot_dm_gaps(
      x = x,
      series = series,
      empty_gaps = empty_gaps
    ))
  }

  if (type == "interpolation") {
    return(plot_dm_interpolation(
      x = x,
      original = original,
      series = series,
      start = start,
      end = end,
      free_y = free_y,
      ncol = ncol
    ))
  }

  if (type == "assessment") {
    if (!has_assessment) {
      stop("Assessment plot is only available when dm.na.interpolation(..., assess = TRUE) produced results.")
    }

    return(plot_dm_assessment(
      x = x,
      metric = metric,
      series = series,
      facet = facet
    ))
  }
}


#' @title Plot detected gaps in dendrometer data
#'
#' @param x Object returned by \code{dm.na.interpolation()}.
#' @param series Optional character vector of series names to plot.
#' @param empty_gaps Character. Behavior when no gaps are present:
#'   \code{"plot"} for an informative empty plot, or \code{"error"}.
#'
#' @return A \code{ggplot2} object.
#' @examples
#' \donttest{
#' #res <- dm.na.interpolation(nepa17[1:1000, ], resolution = 60)
#' #plot_dm_gaps(res)
#' }
#' @export
plot_dm_gaps <- function(x, series = NULL, empty_gaps = c("plot", "error")) {
  empty_gaps <- match.arg(empty_gaps)
  start_missing <- end_missing <- type <- NULL
  parts <- .dm_interp_extract_parts(x)
  data <- parts$data
  gap_info <- parts$gap_info

  if (is.null(gap_info) || nrow(gap_info) == 0) {
    if (empty_gaps == "error") {
      stop("No gap information available.")
    }
    return(.dm_interp_empty_plot("No gaps detected in the data."))
  }

  series_all <- names(data)[-1]

  if (is.null(series)) {
    series <- series_all
  } else {
    series <- intersect(series, series_all)
    if (!length(series)) {
      stop("None of the requested 'series' were found in the result.")
    }
  }

  gap_plot <- .dm_interp_expand_gap_info(gap_info, series_all = series_all)
  gap_plot <- gap_plot[gap_plot$series %in% series, , drop = FALSE]

  if (nrow(gap_plot) == 0) {
    if (empty_gaps == "error") {
      stop("No gaps available for the selected series.")
    }
    return(.dm_interp_empty_plot("No gaps available for the selected series."))
  }

  gap_plot$series <- factor(gap_plot$series, levels = rev(series))

  ggplot2::ggplot(gap_plot) +
    ggplot2::geom_segment(
      ggplot2::aes(
        x = start_missing,
        xend = end_missing,
        y = series,
        yend = series,
        colour = type
      ),
      linewidth = 5,
      alpha = 0.6,
      lineend = "butt"
    ) +
    ggplot2::geom_point(
      ggplot2::aes(x = start_missing, y = series, colour = type),
      size = 1.5
    ) +
    ggplot2::labs(
      x = "Time",
      y = "Series",
      colour = "Gap type",
      title = "Detected gaps in dendrometer data"
    ) +
    ggplot2::theme_bw()
}


#' @title Plot original and interpolated dendrometer series
#'
#' @param x Object returned by \code{dm.na.interpolation()}.
#' @param original Optional original input data frame used in
#'   \code{dm.na.interpolation()}.
#' @param series Optional character vector of series names to plot.
#' @param start Optional start datetime for plotting subset.
#' @param end Optional end datetime for plotting subset.
#' @param free_y Logical. If \code{TRUE}, each facet gets its own y-scale.
#' @param ncol Integer. Number of facet columns.
#'
#' @return A \code{ggplot2} object.
#' @examples
#' \donttest{
#' #res <- dm.na.interpolation(
#' #  nepa17[1:1000, ],
#' #  resolution = 60,
#' #  fill = TRUE,
#' #  method = "spline"
#' #)
#' #plot_dm_interpolation(res, original = nepa17[1:1000, ])
#' }
#' @export
plot_dm_interpolation <- function(x, original = NULL, series = NULL,
                                  start = NULL, end = NULL,
                                  free_y = TRUE, ncol = 1) {
  TIME <- xmin <- xmax <- type <- filled <- NULL
  parts <- .dm_interp_extract_parts(x)
  plot_data <- parts$data
  gap_info <- parts$gap_info

  plot_data <- tibble::as_tibble(plot_data)
  names(plot_data)[1] <- "TIME"

  series_all <- names(plot_data)[-1]

  if (is.null(series)) {
    series <- series_all
  } else {
    series <- intersect(series, series_all)
    if (!length(series)) {
      stop("None of the requested 'series' were found in the result.")
    }
  }

  plot_data <- plot_data[, c("TIME", series), drop = FALSE]

  if (!is.null(start)) {
    plot_data <- plot_data[plot_data$TIME >= as.POSIXct(start), , drop = FALSE]
  }
  if (!is.null(end)) {
    plot_data <- plot_data[plot_data$TIME <= as.POSIXct(end), , drop = FALSE]
  }

  filled_long <- tidyr::pivot_longer(
    plot_data,
    cols = -TIME,
    names_to = "series",
    values_to = "filled"
  )

  if (!is.null(original)) {
    original <- tibble::as_tibble(original)
    names(original)[1] <- "TIME"

    miss_series <- setdiff(series, names(original))
    if (length(miss_series)) {
      stop(
        "These series are missing in 'original': ",
        paste(miss_series, collapse = ", ")
      )
    }

    original <- dplyr::left_join(
      plot_data["TIME"],
      original[, c("TIME", series), drop = FALSE],
      by = "TIME"
    )

    if (!is.null(start)) {
      original <- original[original$TIME >= as.POSIXct(start), , drop = FALSE]
    }
    if (!is.null(end)) {
      original <- original[original$TIME <= as.POSIXct(end), , drop = FALSE]
    }

    original_long <- tidyr::pivot_longer(
      original,
      cols = -TIME,
      names_to = "series",
      values_to = "original"
    )

    plot_df <- dplyr::left_join(
      filled_long,
      original_long,
      by = c("TIME", "series")
    )

    plot_df$imputed <- is.na(plot_df$original) & !is.na(plot_df$filled)

  } else {
    plot_df <- filled_long
    plot_df$original <- NA_real_
    plot_df$imputed <- FALSE

    if (!is.null(gap_info) && nrow(gap_info) > 0) {
      gap_plot <- .dm_interp_expand_gap_info(gap_info, series_all = series_all)
      gap_plot <- gap_plot[gap_plot$series %in% series, , drop = FALSE]

      if (!is.null(start)) {
        gap_plot <- gap_plot[gap_plot$end_missing >= as.POSIXct(start), , drop = FALSE]
      }
      if (!is.null(end)) {
        gap_plot <- gap_plot[gap_plot$start_missing <= as.POSIXct(end), , drop = FALSE]
      }

      for (i in seq_len(nrow(gap_plot))) {
        sel <- plot_df$series == gap_plot$series[i] &
          plot_df$TIME >= gap_plot$start_missing[i] &
          plot_df$TIME <= gap_plot$end_missing[i]
        plot_df$imputed[sel] <- TRUE
      }
    }
  }

  gap_rect <- NULL
  if (!is.null(gap_info) && nrow(gap_info) > 0) {
    gap_rect <- .dm_interp_expand_gap_info(gap_info, series_all = series_all)
    gap_rect <- gap_rect[gap_rect$series %in% series, , drop = FALSE]

    if (!is.null(start)) {
      gap_rect <- gap_rect[gap_rect$end_missing >= as.POSIXct(start), , drop = FALSE]
    }
    if (!is.null(end)) {
      gap_rect <- gap_rect[gap_rect$start_missing <= as.POSIXct(end), , drop = FALSE]
    }

    if (nrow(gap_rect) > 0) {
      step_sec <- .dm_interp_time_step_seconds(plot_data$TIME)
      gap_rect$xmin <- gap_rect$start_missing - step_sec / 2
      gap_rect$xmax <- gap_rect$end_missing + step_sec / 2
    }
  }

  title_txt <- if (isTRUE(x$filled)) {
    "Original and interpolated dendrometer series"
  } else {
    "Dendrometer series"
  }

  p <- ggplot2::ggplot(plot_df, ggplot2::aes(x = TIME)) +
    ggplot2::theme_bw() +
    ggplot2::labs(
      x = "Time",
      y = "Dendrometer value",
      title = title_txt
    )

  if (!is.null(gap_rect) && nrow(gap_rect) > 0) {
    p <- p +
      ggplot2::geom_rect(
        data = gap_rect,
        ggplot2::aes(
          xmin = xmin, xmax = xmax,
          ymin = -Inf, ymax = Inf,
          fill = type
        ),
        inherit.aes = FALSE,
        alpha = 0.12
      )
  }

  if (any(!is.na(plot_df$original))) {
    p <- p +
      ggplot2::geom_line(
        ggplot2::aes(y = original),
        linewidth = 0.35,
        colour = "grey45",
        na.rm = TRUE
      ) +
      ggplot2::geom_line(
        ggplot2::aes(y = filled),
        linewidth = 0.45,
        colour = "#1F78B4",
        na.rm = TRUE
      ) +
      ggplot2::geom_point(
        data = plot_df[plot_df$imputed, , drop = FALSE],
        ggplot2::aes(y = filled),
        colour = "#D7301F",
        size = 1.1,
        na.rm = TRUE
      )
  } else {
    p <- p +
      ggplot2::geom_line(
        ggplot2::aes(y = filled),
        linewidth = 0.45,
        colour = "#1F78B4",
        na.rm = TRUE
      )

    if (any(plot_df$imputed)) {
      p <- p +
        ggplot2::geom_point(
          data = plot_df[plot_df$imputed, , drop = FALSE],
          ggplot2::aes(y = filled),
          colour = "#D7301F",
          size = 1.1,
          na.rm = TRUE
        )
    }
  }

  p +
    ggplot2::facet_wrap(
      ~series,
      scales = if (free_y) "free_y" else "fixed",
      ncol = ncol
    ) +
    ggplot2::labs(fill = "Gap type")
}


#' @title Plot interpolation assessment metrics
#'
#' @param x Object returned by \code{dm.na.interpolation()} with
#'   \code{assess = TRUE}.
#' @param metric Character. One of:
#'   \code{"MAPE"}, \code{"MdAPE"}, \code{"RMSE_pct"}, \code{"Bias_pct"},
#'   \code{"Max_diff_abs"}, \code{"Min_diff_abs"},
#'   \code{"Time_max_diff_h"}, \code{"Time_min_diff_h"}.
#' @param series Optional character vector of series names to plot.
#' @param facet Logical. If \code{TRUE}, create one panel per series.
#'
#' @return A \code{ggplot2} object.
#' @examples
#' \donttest{
#' #res <- dm.na.interpolation(
#' #  nepa17[1:1000, ],
#' #  resolution = 60,
#' #  fill = TRUE,
#' #  method = "seasonal",
#' #  assess = TRUE
#' #)
#' #plot_dm_assessment(res, metric = "MdAPE")
#' }
#' @export
plot_dm_assessment <- function(x, metric = "MdAPE", series = NULL, facet = TRUE) {
  gap_h <- value <- Series <- NULL
  parts <- .dm_interp_extract_parts(x)
  assessment <- parts$assessment

  if (is.null(assessment) || nrow(assessment) == 0) {
    stop("No assessment found. Run dm.na.interpolation(..., assess = TRUE).")
  }

  valid_metrics <- c(
    "MAPE", "MdAPE", "RMSE_pct", "Bias_pct",
    "Max_diff_abs", "Min_diff_abs",
    "Time_max_diff_h", "Time_min_diff_h"
  )

  if (!(metric %in% valid_metrics)) {
    stop(
      "Invalid metric. Choose one of: ",
      paste(valid_metrics, collapse = ", ")
    )
  }

  assessment <- tibble::as_tibble(assessment)

  if (!is.null(series)) {
    assessment <- assessment[assessment$Series %in% series, , drop = FALSE]
    if (nrow(assessment) == 0) {
      stop("No matching series found in assessment.")
    }
  }

  assessment$value <- assessment[[metric]]

  p <- ggplot2::ggplot(
    assessment,
    ggplot2::aes(x = gap_h, y = value, group = Series, colour = Series)
  ) +
    ggplot2::geom_line(linewidth = 0.5, na.rm = TRUE) +
    ggplot2::geom_point(size = 1.8, na.rm = TRUE) +
    ggplot2::theme_bw() +
    ggplot2::labs(
      x = "Artificial gap length (hours)",
      y = metric,
      colour = "Series",
      title = paste("Interpolation assessment:", metric)
    )

  if (facet) {
    p + ggplot2::facet_wrap(~Series, scales = "free_y")
  } else {
    p
  }
}

# -------------------------------------------------------------------------
# Internal helpers
# ----------------------------------------------------------------
.dm_interp_extract_parts <- function(x) {
  if (!inherits(x, "dm_na_interpolation")) {
    stop("x must be an object returned by dm.na.interpolation().")
  }

  list(
    data = tibble::as_tibble(x$data),
    gap_info = x$gap_info,
    assessment = x$assessment
  )
}


.dm_interp_expand_gap_info <- function(gap_info, series_all) {
  if (is.null(gap_info) || nrow(gap_info) == 0) {
    return(tibble::tibble(
      type = character(0),
      series = character(0),
      start_missing = as.POSIXct(character()),
      end_missing = as.POSIXct(character()),
      n_consecutive = integer(0)
    ))
  }

  out <- list()
  k <- 0L

  ts_rows <- gap_info[gap_info$type == "timestamp_gap", , drop = FALSE]
  if (nrow(ts_rows) > 0) {
    for (i in seq_len(nrow(ts_rows))) {
      for (s in series_all) {
        k <- k + 1L
        out[[k]] <- tibble::tibble(
          type = ts_rows$type[i],
          series = s,
          start_missing = ts_rows$start_missing[i],
          end_missing = ts_rows$end_missing[i],
          n_consecutive = ts_rows$n_consecutive[i]
        )
      }
    }
  }

  na_rows <- gap_info[gap_info$type == "internal_na", , drop = FALSE]
  if (nrow(na_rows) > 0) {
    na_rows <- na_rows[na_rows$series %in% series_all, , drop = FALSE]
    if (nrow(na_rows) > 0) {
      for (i in seq_len(nrow(na_rows))) {
        k <- k + 1L
        out[[k]] <- tibble::tibble(
          type = na_rows$type[i],
          series = na_rows$series[i],
          start_missing = na_rows$start_missing[i],
          end_missing = na_rows$end_missing[i],
          n_consecutive = na_rows$n_consecutive[i]
        )
      }
    }
  }

  if (!length(out)) {
    return(tibble::tibble(
      type = character(0),
      series = character(0),
      start_missing = as.POSIXct(character()),
      end_missing = as.POSIXct(character()),
      n_consecutive = integer(0)
    ))
  }

  dplyr::bind_rows(out)
}


.dm_interp_time_step_seconds <- function(time_vec) {
  if (length(time_vec) < 2) {
    return(0)
  }
  stats::median(diff(as.numeric(time_vec)), na.rm = TRUE)
}


.dm_interp_empty_plot <- function(label = "No gaps detected in the data.") {
  ggplot2::ggplot() +
    ggplot2::annotate("text", x = 0, y = 0, label = label, size = 5) +
    ggplot2::xlim(-1, 1) +
    ggplot2::ylim(-1, 1) +
    ggplot2::labs(
      title = "Detected gaps in dendrometer data",
      x = NULL,
      y = NULL
    ) +
    ggplot2::theme_void()
}

Try the dendRoAnalyst package in your browser

Any scripts or data that you put into this service are public.

dendRoAnalyst documentation built on May 20, 2026, 5:07 p.m.